In [1]:
!pip install numpy==1.26.0
!pip install gensim --upgrade --force-reinstall

Collecting numpy==1.26.0
  Downloading numpy-1.26.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.5/58.5 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m49.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
Successfully installed numpy-1.26.0
Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip uninstall numba -y
!pip install numba==0.60.0

Found existing installation: numba 0.60.0
Uninstalling numba-0.60.0:
  Successfully uninstalled numba-0.60.0
Collecting numba==0.60.0
  Downloading numba-0.60.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.7 kB)
Downloading numba-0.60.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.7/3.7 MB[0m [31m40.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numba
Successfully installed numba-0.60.0


In [4]:
!pip install tensorflow==2.18.0



In [1]:
import os
#os.kill(os.getpid(), 9)  # Restart the runtime to apply changes

In [1]:
import numpy
import gensim
import tensorflow
import numba

print(f"NumPy version: {numpy.__version__}")
print(f"Gensim version: {gensim.__version__}")
print(f"TensorFlow version: {tensorflow.__version__}")
print(f"Numba version: {numba.__version__}")

NumPy version: 1.26.4
Gensim version: 4.3.3
TensorFlow version: 2.18.0
Numba version: 0.60.0


In [2]:
import torch
torch.cuda.empty_cache()  # Releases unoccupied cached memory
torch.cuda.memory_summary(device=None, abbreviated=False)  # To check memory status



Data Preparation & Embeddings

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import re
import os

# Force GPU usage

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

print("All packages imported successfully!")

Using device: cuda
All packages imported successfully!


Load & Preprocess Dataset

In [10]:
def load_and_preprocess_data(file_path):
    columns = ['polarity', 'id', 'date', 'query', 'user', 'text']

        # Adding 'error_bad_lines' to skip problematic rows and 'on_bad_lines' for pandas >=1.3.0
    try:
        df = pd.read_csv(file_path, encoding='latin-1', names=columns, on_bad_lines='skip', skiprows=1,low_memory=False)
    except:
        df = pd.read_csv(file_path, encoding='latin-1', names=columns, error_bad_lines=False,skiprows=1,)  # For older versions of pandas

    df = df[['polarity', 'text']]

    label_mapping = {0: 0, 2: 1, 4: 2}
    df['polarity'] = df['polarity'].map(label_mapping)

    def clean_text(text):
        text = str(text).lower()
        text = re.sub(r'https?://\S+|www\.\S+', '', text)
        text = re.sub(r'[^a-z0-9 ]', '', text)
        return text

    df['text'] = df['text'].apply(clean_text)
    return df

file_path = '/content/drive/MyDrive/Analytics Vidhya/Generative AI/NLP using PyTorch/archive/training.1600000.processed.noemoticon.csv'
data = load_and_preprocess_data(file_path)

invalid_data = data[(data['polarity'] < 0) | (data['polarity'] > 2)]
print(invalid_data)

X = data['text'].values
y = data['polarity'].values

train_texts, test_texts, train_labels, test_labels = train_test_split(X, y, test_size=0.1, random_state=42)
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.1, random_state=42)

# Displaying dataset information
print(f"Total samples: {len(data)}")
print(f"Training samples: {len(train_texts)}")
print(f"Validation samples: {len(val_texts)}")
print(f"Testing samples: {len(test_texts)}")

Empty DataFrame
Columns: [polarity, text]
Index: []
Total samples: 1048572
Training samples: 849342
Validation samples: 94372
Testing samples: 104858


In [13]:
from transformers import BertTokenizer, BertForSequenceClassification
import gensim.downloader as api

# Initialize BERT tokenizer
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def encode_texts(texts, max_length=128):
    input_ids, attention_masks = [], []

    for text in texts:
        encoded = bert_tokenizer.encode_plus(text, add_special_tokens=True, max_length=max_length, padding='max_length',
                                             return_attention_mask=True, truncation=True)
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    return torch.tensor(input_ids).to(device), torch.tensor(attention_masks).to(device)

# Encode BERT Inputs
train_inputs, train_masks = encode_texts(train_texts)
val_inputs, val_masks = encode_texts(val_texts)
test_inputs, test_masks = encode_texts(test_texts)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [15]:
# ===========================
# Feature Engineering - GloVe Embeddings (GPU Optimized)
# ===========================

glove_model = api.load('glove-twitter-100')
embedding_dim = 100
max_seq_length = 128
batch_size = 2000  # Initial Batch Size (Will Adjust if GPU Memory is Low)
embedding_cache = {}
os.makedirs('embedding_batches', exist_ok=True)

def check_gpu_memory():
    # Check available GPU memory
    if torch.cuda.is_available():
        gpu_memory = torch.cuda.get_device_properties(0).total_memory
        allocated_memory = torch.cuda.memory_allocated(0)
        free_memory = gpu_memory - allocated_memory
        return free_memory / (1024 ** 3)  # Convert to GB
    return None

def create_embedding_matrix(texts, batch_name):
    global batch_size
    total_texts = len(texts)

    # Adjust batch size based on available GPU memory
    free_memory = check_gpu_memory()
    if free_memory and free_memory < 2:  # If less than 2GB is free, reduce batch size
        batch_size = max(batch_size // 2, 128)  # Don't go below 128 to avoid slow processing
        print(f"Low GPU Memory - Adjusting batch size to {batch_size}")

    for batch_start in range(0, total_texts, batch_size):
        batch_texts = texts[batch_start:batch_start + batch_size]
        batch_embeddings = []

        for text in batch_texts:
            words = text.split()[:max_seq_length]
            embeddings = []

            for word in words:
                if word in embedding_cache:
                    embeddings.append(embedding_cache[word])
                elif word in glove_model:
                    embedding = glove_model[word]
                    embedding_cache[word] = embedding
                    embeddings.append(embedding)
                else:
                    embedding = np.zeros(embedding_dim)
                    embedding_cache[word] = embedding
                    embeddings.append(embedding)

            if len(embeddings) < max_seq_length:
                embeddings.extend([np.zeros(embedding_dim)] * (max_seq_length - len(embeddings)))

            batch_embeddings.append(torch.tensor(np.array(embeddings), dtype=torch.float32, device=device))

        batch_file_name = f"embedding_batches/{batch_name}_batch_{batch_start // batch_size}.pt"
        torch.save(torch.stack(batch_embeddings), batch_file_name)
        print(f"Saved {batch_file_name} successfully.")


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

create_embedding_matrix(train_texts, 'train')
create_embedding_matrix(val_texts, 'val')
create_embedding_matrix(test_texts, 'test')

print("Embedding Generation & Saving Completed Successfully!")

Saved embedding_batches/train_batch_0.pt successfully.
Saved embedding_batches/train_batch_1.pt successfully.
Saved embedding_batches/train_batch_2.pt successfully.
Saved embedding_batches/train_batch_3.pt successfully.
Saved embedding_batches/train_batch_4.pt successfully.
Saved embedding_batches/train_batch_5.pt successfully.
Saved embedding_batches/train_batch_6.pt successfully.
Saved embedding_batches/train_batch_7.pt successfully.
Saved embedding_batches/train_batch_8.pt successfully.
Saved embedding_batches/train_batch_9.pt successfully.
Saved embedding_batches/train_batch_10.pt successfully.
Saved embedding_batches/train_batch_11.pt successfully.
Saved embedding_batches/train_batch_12.pt successfully.
Saved embedding_batches/train_batch_13.pt successfully.
Saved embedding_batches/train_batch_14.pt successfully.
Saved embedding_batches/train_batch_15.pt successfully.
Saved embedding_batches/train_batch_16.pt successfully.
Saved embedding_batches/train_batch_17.pt successfully.
Sa

In [16]:
import os  # Importing the 'os' module
import torch
from torch.utils.data import TensorDataset, DataLoader
import gc

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def load_embedding_batch(batch_path):
    """
    Load a single batch from disk to avoid memory overload.
    """
    return torch.load(batch_path, map_location=device)

def prepare_word2vec_dataloader(batch_paths, labels, batch_size=64):
    """
    Prepare DataLoader that loads one batch at a time during training to avoid memory overload.
    """
    for i, batch_path in enumerate(batch_paths):
        # Dynamically load embeddings from disk, one batch at a time
        embeddings = load_embedding_batch(batch_path)
        num_embeddings = embeddings.size(0)

        # Calculate label indices for the current batch
        start_idx = i * num_embeddings
        end_idx = min(start_idx + num_embeddings, len(labels))

        # Load labels for the current batch
        batch_labels = torch.tensor(labels[start_idx:end_idx], device=device)

        # If embeddings and labels mismatch, adjust them
        if len(batch_labels) != num_embeddings:
            batch_labels = batch_labels[:num_embeddings]

        # Create DataLoader for the current batch
        dataset = TensorDataset(embeddings, batch_labels)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

        yield dataloader  # Yield DataLoader to avoid memory overload

        # Release memory
        del embeddings, batch_labels, dataset, dataloader
        torch.cuda.empty_cache()
        gc.collect()

def load_embedding_batches(batch_name):
    """
    Load all embedding batch file paths for a given batch name (train, val, test).
    """
    batch_files = sorted([f for f in os.listdir('embedding_batches') if f.startswith(batch_name)])
    batch_paths = [os.path.join('embedding_batches', file) for file in batch_files]
    return batch_paths

# Load batch paths
train_embedding_paths = load_embedding_batches('train')
val_embedding_paths = load_embedding_batches('val')
test_embedding_paths = load_embedding_batches('test')

print('Batches Loaded Successfully - Ready for Processing!')

Batches Loaded Successfully - Ready for Processing!


In [17]:
# ===========================
# Prepare DataLoaders with Generator Approach
# ===========================

# Train DataLoader Generator
train_loader_w2v = prepare_word2vec_dataloader(train_embedding_paths, train_labels)

# Validation DataLoader Generator
val_loader_w2v = prepare_word2vec_dataloader(val_embedding_paths, val_labels)

# Test DataLoader Generator
test_loader_w2v = prepare_word2vec_dataloader(test_embedding_paths, test_labels)

print('DataLoaders Prepared Successfully (Lazy Loading Enabled)')


DataLoaders Prepared Successfully (Lazy Loading Enabled)


In [18]:
# ===========================
# Model Definitions (BERT, RNN, LSTM, GRU)
# ===========================

# ===========================
# BERT Model
# ===========================

class BERTSentimentClassifier(nn.Module):
    def __init__(self):
        super(BERTSentimentClassifier, self).__init__()
        self.bert = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return outputs.logits


# ===========================
# Base Model for RNN, LSTM, and GRU
# ===========================

class BaseRNNModel(nn.Module):
    def __init__(self, embedding_matrix):
        super(BaseRNNModel, self).__init__()
        vocab_size, embedding_dim = embedding_matrix.shape
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix), freeze=False)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(128, 3)

    def forward(self, x):
        x = self.embedding(x)
        x = self.dropout(x)
        return x


# ===========================
# RNN Model
# ===========================

class RNNSentimentClassifier(BaseRNNModel):
    def __init__(self, embedding_matrix):
        super(RNNSentimentClassifier, self).__init__(embedding_matrix)
        self.rnn = nn.RNN(embedding_matrix.shape[1], 128, batch_first=True)

    def forward(self, x):
        x = super().forward(x)
        x, _ = self.rnn(x)
        x = x[:, -1, :]
        return self.fc(x)


# ===========================
# LSTM Model
# ===========================

class LSTMSentimentClassifier(BaseRNNModel):
    def __init__(self, embedding_matrix):
        super(LSTMSentimentClassifier, self).__init__(embedding_matrix)
        self.lstm = nn.LSTM(embedding_matrix.shape[1], 128, batch_first=True)

    def forward(self, x):
        x = super().forward(x)
        x, (h_n, c_n) = self.lstm(x)
        x = x[:, -1, :]
        return self.fc(x)


# ===========================
# GRU Model
# ===========================

class GRUSentimentClassifier(BaseRNNModel):
    def __init__(self, embedding_matrix):
        super(GRUSentimentClassifier, self).__init__(embedding_matrix)
        self.gru = nn.GRU(embedding_matrix.shape[1], 128, batch_first=True)

    def forward(self, x):
        x = super().forward(x)
        x, h_n = self.gru(x)
        x = x[:, -1, :]
        return self.fc(x)


print('Model Implementation Completed Successfully!')

Model Implementation Completed Successfully!


In [19]:
def get_memory_usage():
    return psutil.virtual_memory().used / (1024 ** 2)

def log_time_and_memory(start_time, start_memory):
    end_time = time.time()
    end_memory = get_memory_usage()
    time_taken = end_time - start_time
    memory_used = end_memory - start_memory
    return time_taken, memory_used


def plot_comparative_barchart(results_df):
    metrics = ['Accuracy', 'F1-Score', 'Precision', 'Recall', 'ROC-AUC']
    results_df[metrics].plot(kind='bar', figsize=(12, 6))
    plt.title('Model Performance Comparison')
    plt.xlabel('Models')
    plt.ylabel('Scores')
    plt.legend(loc='upper right')
    plt.show()


def plot_confusion_matrix(cm, model_name):
    plt.figure(figsize=(6, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'{model_name} - Confusion Matrix')
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.show()

In [20]:
# # ===========================
# # Model Training and Evaluation with GPU Utilization
# # ===========================


def train_model(model, train_loader, val_loader, optimizer, criterion, epochs=5):
     model.to(device)
     model.train()
     training_losses = []
     validation_losses = []

     for epoch in range(epochs):
         total_loss = 0
         correct_predictions = 0

         for batch in train_loader:
             inputs, masks, labels = [x.to(device) for x in batch]  # Unpack inputs, masks, and labels
             optimizer.zero_grad()
             outputs = model(inputs, attention_mask=masks)
             loss = criterion(outputs, labels)
             loss.backward()
             optimizer.step()

             total_loss += loss.item()
             preds = torch.argmax(outputs, dim=1)
             correct_predictions += torch.sum(preds == labels).item()

         accuracy = correct_predictions / len(train_loader.dataset)
         training_losses.append(total_loss / len(train_loader))

         # Validation
         model.eval()
         val_loss = 0
         with torch.no_grad():
             for val_batch in val_loader:
                 val_inputs, val_masks, val_labels = [x.to(device) for x in val_batch]
                 val_outputs = model(val_inputs, attention_mask=val_masks)
                 val_loss += criterion(val_outputs, val_labels).item()

         validation_losses.append(val_loss / len(val_loader))
         print(f"Epoch [{epoch+1}/{epochs}], Loss: {total_loss:.4f}, Validation Loss: {val_loss:.4f}, Accuracy: {accuracy:.4f}")

     return training_losses, validation_losses


def evaluate_model(model, test_loader, criterion):
     model.to(device)
     model.eval()
     total_loss = 0
     all_preds, all_labels = [], []

     with torch.no_grad():
         for batch in test_loader:
             inputs, masks, labels = [x.to(device) for x in batch]
             outputs = model(inputs, attention_mask=masks)
             loss = criterion(outputs, labels)
             total_loss += loss.item()

             preds = torch.argmax(outputs, dim=1)
             all_preds.extend(preds.cpu().numpy())
             all_labels.extend(labels.cpu().numpy())

     accuracy = accuracy_score(all_labels, all_preds)
     f1 = f1_score(all_labels, all_preds, average='weighted')
     precision = precision_score(all_labels, all_preds, average='weighted')
     recall = recall_score(all_labels, all_preds, average='weighted')
     cm = confusion_matrix(all_labels, all_preds)

     print(f"Test Loss: {total_loss:.4f}, Accuracy: {accuracy:.4f}, F1-Score: {f1:.4f}")
     return accuracy, f1, precision, recall, cm

print("Model Training and Evaluation Code Integrated Successfully!")

Model Training and Evaluation Code Integrated Successfully!


In [23]:
# ===========================
# Model Training and Evaluation with GPU Utilization
# ===========================
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


def train_model(model, train_loader, val_loader, optimizer, criterion, epochs=5):
    model.to(device)
    model.train()
    training_losses = []
    validation_losses = []

    for epoch in range(epochs):
        total_loss = 0
        correct_predictions = 0

        for batch_idx, batch in enumerate(train_loader):  # Add batch_idx
            inputs, labels = batch  # Assuming your train_loader provides (inputs, labels)
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)  # Removed attention_mask as it's not needed for RNN/LSTM/GRU
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            _, preds = torch.max(outputs, 1)  # Get predicted labels
            correct_predictions += torch.sum(preds == labels).item()

        accuracy = correct_predictions / len(train_loader.dataset)  # Assuming train_loader has dataset attribute
        training_losses.append(total_loss / len(train_loader))

        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for val_batch_idx, val_batch in enumerate(val_loader):  # Add val_batch_idx
                val_inputs, val_labels = val_batch  # Assuming val_loader provides (inputs, labels)
                val_inputs, val_labels = val_inputs.to(device), val_labels.to(device)

                val_outputs = model(val_inputs)  # Removed attention_mask
                val_loss += criterion(val_outputs, val_labels).item()

        validation_losses.append(val_loss / len(val_loader))  # Assuming val_loader has dataset attribute
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {total_loss:.4f}, Validation Loss: {val_loss:.4f}, Accuracy: {accuracy:.4f}")

    return training_losses, validation_losses


def evaluate_model(model, test_loader, criterion):
    model.to(device)
    model.eval()
    total_loss = 0
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch_idx, batch in enumerate(test_loader):  # Add batch_idx
            inputs, labels = batch  # Assuming your test_loader provides (inputs, labels)
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = model(inputs)  # Removed attention_mask
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            _, preds = torch.max(outputs, 1)  # Get predicted labels
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='weighted')
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    cm = confusion_matrix(all_labels, all_preds)

    print(f"Test Loss: {total_loss:.4f}, Accuracy: {accuracy:.4f}, F1-Score: {f1:.4f}")
    return accuracy, f1, precision, recall, cm

print("Model Training and Evaluation Code Integrated Successfully!")

Model Training and Evaluation Code Integrated Successfully!


Model Training & Evaluation - Calling Functions

In [26]:
# ===========================
# DataLoader Preparation
# ===========================

from torch.utils.data import TensorDataset, DataLoader

def prepare_dataloader(inputs, masks, labels, batch_size=16):
    # Convert inputs, masks, and labels to tensors and ensure they're on CPU before sending to DataLoader
        inputs = torch.tensor(inputs, dtype=torch.long)
        masks = torch.tensor(masks, dtype=torch.long)
        labels = torch.tensor(labels, dtype=torch.long)

    # Verify label range
        if not torch.all((labels >= 0) & (labels < 3)):
          print("Warning: Some labels are outside the expected range [0, 1, 2].")

        dataset = TensorDataset(inputs, masks, labels)
        return DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [None]:
# ===========================
# Prepare DataLoaders for BERT
# ===========================

# Generate DataLoader objects for training, validation, and testing datasets
train_loader = prepare_dataloader(train_inputs.cpu(), train_masks.cpu(), train_labels)
val_loader = prepare_dataloader(val_inputs.cpu(), val_masks.cpu(), val_labels)
test_loader = prepare_dataloader(test_inputs.cpu(), test_masks.cpu(), test_labels)

print("DataLoaders Prepared Successfully!")

# ===========================
# Train and Evaluate BERT Model (Using Mixed Precision Training)
# ===========================

model_name = 'BERT'
model = BERTSentimentClassifier().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
criterion = nn.CrossEntropyLoss()
scaler = torch.amp.GradScaler() # Initialize Gradient Scaler for AMP

training_losses = []
validation_losses = []
model.train()

for epoch in range(5):
    total_loss = 0
    correct_predictions = 0

    for batch in train_loader:
        inputs, masks, labels = [x.to(device) for x in batch]
        optimizer.zero_grad()

        with torch.amp.autocast(device_type='cuda'):
            outputs = model(inputs, attention_mask=masks)
            loss = criterion(outputs, labels)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()
        preds = torch.argmax(outputs, dim=1)
        correct_predictions += torch.sum(preds == labels).item()

    accuracy = correct_predictions / len(train_loader.dataset)
    training_losses.append(total_loss / len(train_loader))
    print(f"Epoch [{epoch+1}/5], Loss: {total_loss:.4f}, Accuracy: {accuracy:.4f}")

# ===========================
# Evaluation
# ===========================
accuracy, f1, precision, recall, cm = evaluate_model(model, test_loader, criterion)

# ===========================
# Store Results
# ===========================
results = {}
results[model_name] = {
    'Accuracy': accuracy,
    'F1-Score': f1,
    'Precision': precision,
    'Recall': recall,
    'Confusion Matrix': cm,
    'Training Losses': training_losses
}

print(f"{model_name} training and evaluation completed successfully!")

  inputs = torch.tensor(inputs, dtype=torch.long)
  masks = torch.tensor(masks, dtype=torch.long)


DataLoaders Prepared Successfully!


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch [1/5], Loss: 15461.3591, Accuracy: 0.8783


In [None]:
results = {}

model_name = 'RNN'
embedding_matrix = torch.tensor(next(iter(train_loader_w2v))[0].detach().cpu().numpy())
model = RNNSentimentClassifier(embedding_matrix).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

# Training
training_losses, validation_losses = [], []

for train_dataloader in train_loader_w2v:
    train_loss, val_loss = train_model(
        model,
        train_dataloader,
        val_loader_w2v,
        optimizer,
        criterion,
        epochs=5
    )
    training_losses.extend(train_loss)
    validation_losses.extend(val_loss)

# Evaluation
all_accuracies, all_f1_scores, all_precisions, all_recalls, all_cms = [], [], [], [], []

for test_dataloader in test_loader_w2v:
    accuracy, f1, precision, recall, cm = evaluate_model(
        model,
        test_dataloader,
        criterion
    )
    all_accuracies.append(accuracy)
    all_f1_scores.append(f1)
    all_precisions.append(precision)
    all_recalls.append(recall)
    all_cms.append(cm)

# Aggregate Metrics
accuracy = sum(all_accuracies) / len(all_accuracies)
f1 = sum(all_f1_scores) / len(all_f1_scores)
precision = sum(all_precisions) / len(all_precisions)
recall = sum(all_recalls) / len(all_recalls)
cm = sum(all_cms)

# Store Results
results[model_name] = {
    'Accuracy': accuracy,
    'F1-Score': f1,
    'Precision': precision,
    'Recall': recall,
    'Confusion Matrix': cm,
    'Training Losses': training_losses,
    'Validation Losses': validation_losses
}

print(f"{model_name} training and evaluation completed successfully!")

In [None]:
# ===========================
# Prepare DataLoaders for BERT
# ===========================

# Generate DataLoader objects for training, validation, and testing datasets
train_loader = prepare_dataloader(train_inputs.cpu(), train_masks.cpu(), train_labels)
val_loader = prepare_dataloader(val_inputs, val_masks, val_labels)
test_loader = prepare_dataloader(test_inputs, test_masks, test_labels)

print("DataLoaders Prepared Successfully!")

# ===========================
# Train and Evaluate BERT Model (Using Mixed Precision Training)
# ===========================

model_name = 'BERT'
model = BERTSentimentClassifier().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
criterion = nn.CrossEntropyLoss()
scaler = torch.amp.GradScaler() # Initialize Gradient Scaler for AMP

training_losses = []
validation_losses = []
model.train()

for epoch in range(5):
    total_loss = 0
    correct_predictions = 0

    for batch in train_loader:
        inputs, masks, labels = [x.to(device) for x in batch]
        optimizer.zero_grad()

        with torch.amp.autocast(device_type='cuda'):
            outputs = model(inputs, attention_mask=masks)
            loss = criterion(outputs, labels)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()
        preds = torch.argmax(outputs, dim=1)
        correct_predictions += torch.sum(preds == labels).item()

    accuracy = correct_predictions / len(train_loader.dataset)
    training_losses.append(total_loss / len(train_loader))
    print(f"Epoch [{epoch+1}/5], Loss: {total_loss:.4f}, Accuracy: {accuracy:.4f}")

# ===========================
# Evaluation
# ===========================
accuracy, f1, precision, recall, cm = evaluate_model(model, test_loader, criterion)

# ===========================
# Store Results
# ===========================
results = {}
results[model_name] = {
    'Accuracy': accuracy,
    'F1-Score': f1,
    'Precision': precision,
    'Recall': recall,
    'Confusion Matrix': cm,
    'Training Losses': training_losses
}

print(f"{model_name} training and evaluation completed successfully!")

  inputs = torch.tensor(inputs, dtype=torch.long)


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Ensure you have a 'results' dictionary if not already initialized
if 'results' not in globals():
    results = {}

model_name = 'LSTM'

# Retrieve the embedding matrix from the first batch of train_loader_w2v
embedding_matrix = torch.tensor(next(iter(train_loader_w2v))[0].detach().cpu().numpy())
model = LSTMSentimentClassifier(embedding_matrix).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

# Training
training_losses, validation_losses = [], []

for train_dataloader in train_loader_w2v:
    train_loss, val_loss = train_model(
        model,
        train_dataloader,
        val_loader_w2v,
        optimizer,
        criterion,
        epochs=5
    )
    training_losses.extend(train_loss)
    validation_losses.extend(val_loss)

# Evaluation
all_accuracies, all_f1_scores, all_precisions, all_recalls, all_cms = [], [], [], [], []

for test_dataloader in test_loader_w2v:
    accuracy, f1, precision, recall, cm = evaluate_model(
        model,
        test_dataloader,
        criterion
    )
    all_accuracies.append(accuracy)
    all_f1_scores.append(f1)
    all_precisions.append(precision)
    all_recalls.append(recall)
    all_cms.append(cm)

# Aggregate Metrics
accuracy = sum(all_accuracies) / len(all_accuracies)
f1 = sum(all_f1_scores) / len(all_f1_scores)
precision = sum(all_precisions) / len(all_precisions)
recall = sum(all_recalls) / len(all_recalls)
cm = sum(all_cms)

# Store Results
results[model_name] = {
    'Accuracy': accuracy,
    'F1-Score': f1,
    'Precision': precision,
    'Recall': recall,
    'Confusion Matrix': cm,
    'Training Losses': training_losses,
    'Validation Losses': validation_losses
}

print(f"{model_name} training and evaluation completed successfully!")

In [None]:
# Ensure you have a 'results' dictionary if not already initialized
if 'results' not in globals():
    results = {}

model_name = 'GRU'

# Retrieve the embedding matrix from the first batch of train_loader_w2v
embedding_matrix = torch.tensor(next(iter(train_loader_w2v))[0].detach().cpu().numpy())
model = GRUSentimentClassifier(embedding_matrix).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

# Training
training_losses, validation_losses = [], []

for train_dataloader in train_loader_w2v:
    train_loss, val_loss = train_model(
        model,
        train_dataloader,
        val_loader_w2v,
        optimizer,
        criterion,
        epochs=5
    )
    training_losses.extend(train_loss)
    validation_losses.extend(val_loss)

# Evaluation
all_accuracies, all_f1_scores, all_precisions, all_recalls, all_cms = [], [], [], [], []

for test_dataloader in test_loader_w2v:
    accuracy, f1, precision, recall, cm = evaluate_model(
        model,
        test_dataloader,
        criterion
    )
    all_accuracies.append(accuracy)
    all_f1_scores.append(f1)
    all_precisions.append(precision)
    all_recalls.append(recall)
    all_cms.append(cm)

# Aggregate Metrics
accuracy = sum(all_accuracies) / len(all_accuracies)
f1 = sum(all_f1_scores) / len(all_f1_scores)
precision = sum(all_precisions) / len(all_precisions)
recall = sum(all_recalls) / len(all_recalls)
cm = sum(all_cms)

# Store Results
results[model_name] = {
    'Accuracy': accuracy,
    'F1-Score': f1,
    'Precision': precision,
    'Recall': recall,
    'Confusion Matrix': cm,
    'Training Losses': training_losses,
    'Validation Losses': validation_losses
}

print(f"{model_name} training and evaluation completed successfully!")

In [None]:
def plot_confusion_matrix(cm, model_name):
    plt.figure(figsize=(6, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'{model_name} - Confusion Matrix')
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.show()

def compare_models(results):
    # Convert results dictionary to DataFrame for comparison
    results_df = pd.DataFrame.from_dict(results, orient='index')

    # Display overall comparison report
    print("\nComparative Analysis Report:")
    print(results_df[['Accuracy', 'F1-Score', 'Precision', 'Recall']])

    # Plotting Bar Chart for Performance Comparison
    results_df[['Accuracy', 'F1-Score', 'Precision', 'Recall']].plot(kind='bar', figsize=(12, 6))
    plt.title('Model Performance Comparison')
    plt.xlabel('Models')
    plt.ylabel('Scores')
    plt.legend(loc='upper right')
    plt.show()

    # Visualize Confusion Matrices and Loss Curves for Each Model
    for model_name, metrics in results.items():
        # Plot Confusion Matrix
        plot_confusion_matrix(metrics['Confusion Matrix'], model_name)

        # Plot Training & Validation Loss
        plt.figure(figsize=(10, 5))
        plt.plot(metrics['Training Losses'], label='Training Loss')
        plt.plot(metrics['Validation Losses'], label='Validation Loss')
        plt.title(f'{model_name} - Training & Validation Loss Over Epochs')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend()
        plt.show()

    print("Comparison Completed Successfully.")

Evaluation & Comparison Completed Successfully!


In [None]:
results = {}

def add_model_results(model_name, accuracy, f1, precision, recall, roc_auc, time_taken, memory_used, training_losses, validation_losses, cm):
    results[model_name] = {
        'Accuracy': accuracy,
        'F1-Score': f1,
        'Precision': precision,
        'Recall': recall,
        'ROC-AUC': roc_auc,
        'Training Time (s)': time_taken,
        'Memory Usage (MB)': memory_used,
        'Training Losses': training_losses,
        'Validation Losses': validation_losses,
        'Confusion Matrix': cm
    }

print("Result Storage Function Created Successfully!")

# Example of adding model results. Make sure to replace the metric variables for each model.

# For BERT
add_model_results(
    model_name='BERT',
    accuracy=bert_accuracy,
    f1=bert_f1,
    precision=bert_precision,
    recall=bert_recall,
    roc_auc=bert_roc_auc,
    time_taken=bert_time_taken,
    memory_used=bert_memory_used,
    training_losses=bert_training_losses,
    validation_losses=bert_validation_losses,
    cm=bert_cm
)

# For RNN
add_model_results(
    model_name='RNN',
    accuracy=rnn_accuracy,
    f1=rnn_f1,
    precision=rnn_precision,
    recall=rnn_recall,
    roc_auc=rnn_roc_auc,
    time_taken=rnn_time_taken,
    memory_used=rnn_memory_used,
    training_losses=rnn_training_losses,
    validation_losses=rnn_validation_losses,
    cm=rnn_cm
)

# For LSTM
add_model_results(
    model_name='LSTM',
    accuracy=lstm_accuracy,
    f1=lstm_f1,
    precision=lstm_precision,
    recall=lstm_recall,
    roc_auc=lstm_roc_auc,
    time_taken=lstm_time_taken,
    memory_used=lstm_memory_used,
    training_losses=lstm_training_losses,
    validation_losses=lstm_validation_losses,
    cm=lstm_cm
)

# For GRU
add_model_results(
    model_name='GRU',
    accuracy=gru_accuracy,
    f1=gru_f1,
    precision=gru_precision,
    recall=gru_recall,
    roc_auc=gru_roc_auc,
    time_taken=gru_time_taken,
    memory_used=gru_memory_used,
    training_losses=gru_training_losses,
    validation_losses=gru_validation_losses,
    cm=gru_cm
)

print("All model results stored successfully!")

In [None]:
# ===========================
# Recommendation System
# ===========================

def generate_recommendations(results_df):
    # Identify the best model based on F1-Score
    best_model_name = results_df['F1-Score'].idxmax()
    best_model_metrics = results_df.loc[best_model_name]

    print("\nRecommendations:")
    print(f"The best model based on F1-Score is: {best_model_name}")
    print(f"Accuracy: {best_model_metrics['Accuracy']}")
    print(f"F1-Score: {best_model_metrics['F1-Score']}")
    print(f"Precision: {best_model_metrics['Precision']}")
    print(f"Recall: {best_model_metrics['Recall']}")

    # Check if ROC-AUC is present before printing
    if 'ROC-AUC' in best_model_metrics:
        print(f"ROC-AUC: {best_model_metrics['ROC-AUC']}")
    else:
        print("ROC-AUC metric is not available for this model.")

    print(f"Training Time (s): {best_model_metrics['Training Time (s)']}")
    print(f"Memory Usage (MB): {best_model_metrics['Memory Usage (MB)']}")

    # Provide Detailed Analysis
    print("\nFinal Recommendation:")
    if best_model_name == 'BERT':
        print("BERT provides the highest accuracy and generalization capabilities, but requires more computational resources.")
    elif best_model_name in ['LSTM', 'GRU']:
        print("LSTM/GRU models offer a good balance between performance and computational efficiency.")
    elif best_model_name == 'RNN':
        print("RNN is the simplest model and is best used as a baseline.")
    else:
        print(f"{best_model_name} is a custom model. Analyze its performance carefully.")

    # Generate Recommendation Report
    recommendation_report = pd.DataFrame({
        'Model': [best_model_name],
        'Accuracy': [best_model_metrics['Accuracy']],
        'F1-Score': [best_model_metrics['F1-Score']],
        'Precision': [best_model_metrics['Precision']],
        'Recall': [best_model_metrics['Recall']],
        'ROC-AUC': [best_model_metrics.get('ROC-AUC', 'N/A')],  # Handle missing ROC-AUC
        'Training Time (s)': [best_model_metrics['Training Time (s)']],
        'Memory Usage (MB)': [best_model_metrics['Memory Usage (MB)']],
        'Recommendation': ["Best Model Based on Trade-offs Between Performance and Computational Efficiency"]
    })

    print("\nRecommendation System Successfully Generated.")
    return recommendation_report