In [None]:
# Move the uploaded Kaggle API token (kaggle.json) into ~/.kaggle.
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
# Mode 600: the token file is readable/writable by its owner only.
!chmod 600 ~/.kaggle/kaggle.json
# Download the Sentiment140 archive and unpack it into the working directory.
!kaggle datasets download -d kazanova/sentiment140
!unzip sentiment140.zip

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to /content
 98% 79.0M/80.9M [00:04<00:00, 23.4MB/s]
100% 80.9M/80.9M [00:04<00:00, 18.4MB/s]
Archive:  sentiment140.zip
  inflating: training.1600000.processed.noemoticon.csv  


In [None]:
# Final BERT sentiment classifier: fine-tuning with k-fold cross-validation
import torch
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup
from torch import nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import f1_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import random
import nltk
from nltk.tokenize import word_tokenize
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK data ('punkt' is needed by word_tokenize, which
# TwitterDataset.augment_text calls)
nltk.download('punkt')

# Set random seeds for NumPy, PyTorch and the stdlib RNG so that sampling,
# fold splits and augmentation are repeatable between runs
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

# Constants
# Every tweet is padded/truncated to this many tokens by the tokenizer
MAX_LEN = 60
BATCH_SIZE = 32  # Reduced batch size for better generalization
# Upper bound on epochs per fold; early stopping may end a fold sooner
EPOCHS = 10
# Learning rate handed to AdamW
LEARNING_RATE = 2e-5
PATIENCE = 3  # Early stopping patience
N_FOLDS = 5  # Number of folds for cross-validation

class SentimentClassifier(nn.Module):
    """BERT encoder followed by a small feed-forward classification head.

    The head takes BERT's pooled output and applies two stages of
    dropout -> linear -> ReLU -> layer norm (hidden_size -> 512 -> 256),
    then a final linear layer producing ``n_classes`` scores.

    Args:
        n_classes: Number of output classes (size of the final linear layer).
        dropout_p: Dropout probability used before each of the two hidden
            linear layers.
        pretrained_model_name: Name or path of the BERT checkpoint handed to
            ``BertModel.from_pretrained``. Defaults to ``'bert-base-cased'``,
            the checkpoint that was previously hard-coded. It should match
            the checkpoint used for the tokenizer.
    """

    def __init__(self, n_classes, dropout_p=0.3,
                 pretrained_model_name='bert-base-cased'):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_model_name)

        # First head stage: encoder hidden size -> 512
        self.pre_classifier = nn.Linear(self.bert.config.hidden_size, 512)
        self.dropout1 = nn.Dropout(p=dropout_p)
        self.layer_norm1 = nn.LayerNorm(512)

        # Second head stage: 512 -> 256
        self.hidden = nn.Linear(512, 256)
        self.dropout2 = nn.Dropout(p=dropout_p)
        self.layer_norm2 = nn.LayerNorm(256)

        # Output projection: 256 -> n_classes (raw scores, no softmax)
        self.out = nn.Linear(256, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalized class scores for a batch of tokenized inputs.

        Args:
            input_ids: Token ids, forwarded unchanged to the BERT encoder.
            attention_mask: Padding mask, forwarded unchanged to the encoder.

        Returns:
            The output of the final linear layer (logits); no softmax is
            applied here.
        """
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # Stage 1: dropout on the pooled output, project, ReLU, normalize
        x = self.dropout1(outputs["pooler_output"])
        x = self.pre_classifier(x)
        x = torch.relu(x)
        x = self.layer_norm1(x)

        # Stage 2: same pattern at the narrower width
        x = self.dropout2(x)
        x = self.hidden(x)
        x = torch.relu(x)
        x = self.layer_norm2(x)

        return self.out(x)

class TwitterDataset(Dataset):
    """Map-style dataset that tokenizes tweets on access.

    Args:
        texts: Indexable collection of raw tweet texts.
        targets: Indexable collection of integer labels, aligned with
            ``texts``.
        tokenizer: Callable tokenizer; it is invoked as
            ``tokenizer(text, ...)`` and must return a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length every sequence is padded/truncated to.
        augment: When true, a random subset of items is passed through
            ``augment_text`` before tokenization.
    """

    def __init__(self, texts, targets, tokenizer, max_len, augment=False):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.augment = augment

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def augment_text(self, text):
        """Randomly drop 1-3 tokens from texts longer than 10 tokens.

        The drop is applied with probability 0.5; otherwise (or for short
        texts) ``text`` is returned unchanged. When tokens are dropped, the
        result is the remaining ``word_tokenize`` tokens joined by single
        spaces.
        """
        tokens = word_tokenize(text)
        if len(tokens) > 10 and random.random() < 0.5:
            # A set gives O(1) membership tests while filtering the tokens.
            drop_indices = set(
                random.sample(range(len(tokens)), k=random.randint(1, 3))
            )
            text = ' '.join(
                token for i, token in enumerate(tokens)
                if i not in drop_indices
            )
        return text

    def __getitem__(self, item):
        """Return the tokenized example at index ``item``.

        Returns:
            A dict with the (possibly augmented) ``'text'``, flattened
            ``'input_ids'`` and ``'attention_mask'`` tensors, and the label
            as a ``torch.long`` tensor under ``'targets'``.
        """
        text = str(self.texts[item])

        # Augmentation is attempted for roughly 30% of accesses.
        if self.augment and random.random() < 0.3:
            text = self.augment_text(text)

        target = self.targets[item]

        # Call the tokenizer directly: `encode_plus` is deprecated in
        # transformers in favour of `__call__`, which accepts the same
        # keyword arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'text': text,
            # return_tensors='pt' adds a leading batch dim; flatten removes it
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }

class EarlyStopping:
    """Signal when validation loss has stopped improving.

    Call the instance once per epoch with the validation loss. After
    ``patience`` consecutive calls without an improvement, ``early_stop``
    becomes ``True``.

    Args:
        patience: Number of consecutive non-improving calls tolerated.
        min_delta: An epoch only counts as an improvement when the loss is
            lower than the best loss so far by more than this amount.

    Attributes set on the instance: ``counter`` (current run of
    non-improving calls), ``best_loss`` (lowest loss accepted so far, or
    ``None``) and ``early_stop``.
    """

    def __init__(self, patience=7, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = None
        self.early_stop = False

    def __call__(self, val_loss):
        """Record one epoch's validation loss and update the stop flag."""
        # NaN != NaN. A diverged (NaN) loss must never be accepted as the
        # new best, otherwise every later comparison against it is False
        # and early stopping can no longer trigger.
        is_nan = val_loss != val_loss

        improved = not is_nan and (
            self.best_loss is None
            # Strict inequality: a loss that merely equals the best
            # (a plateau) is not an improvement.
            or val_loss < self.best_loss - self.min_delta
        )

        if improved:
            self.best_loss = val_loss
            self.counter = 0
            return

        self.counter += 1
        if self.counter >= self.patience:
            self.early_stop = True

def train_epoch(model, data_loader, loss_fn, optimizer, scheduler, device):
    """Train ``model`` for one pass over ``data_loader``.

    For every batch: forward pass, loss, backward pass, gradient-norm
    clipping at 1.0, then one optimizer step and one scheduler step.

    Returns:
        A ``(mean_loss, f1)`` tuple: the mean of the per-batch losses and
        the binary F1 score of the predictions collected during the epoch.
    """
    model.train()
    batch_losses, predicted, expected = [], [], []

    progress_bar = tqdm(data_loader, desc='Training')
    for batch in progress_bar:
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        labels = batch['targets'].to(device)

        optimizer.zero_grad()
        logits = model(input_ids=ids, attention_mask=mask).float()

        loss = loss_fn(logits, labels)
        # Predicted class = index of the largest score in each row.
        preds = torch.max(logits, dim=1)[1]

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        # The schedule advances once per batch, not once per epoch.
        scheduler.step()

        batch_losses.append(loss.item())
        predicted.extend(preds.cpu().numpy())
        expected.extend(labels.cpu().numpy())

        # Show a running mean over the most recent 100 batches.
        progress_bar.set_postfix({'loss': np.mean(batch_losses[-100:])})

    epoch_loss = np.mean(batch_losses)
    epoch_f1 = f1_score(expected, predicted, average='binary')
    return epoch_loss, epoch_f1

def eval_model(model, data_loader, loss_fn, device):
    """Evaluate ``model`` on ``data_loader`` without computing gradients.

    Returns:
        A ``(mean_loss, f1, report)`` tuple: the mean of the per-batch
        losses, the binary F1 score, and the output of
        ``classification_report`` for the collected predictions.
    """
    model.eval()
    batch_losses, predicted, expected = [], [], []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc='Evaluating'):
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            labels = batch['targets'].to(device)

            logits = model(input_ids=ids, attention_mask=mask)
            batch_loss = loss_fn(logits, labels)
            # Predicted class = index of the largest score in each row.
            preds = torch.max(logits, dim=1)[1]

            batch_losses.append(batch_loss.item())
            predicted.extend(preds.cpu().numpy())
            expected.extend(labels.cpu().numpy())

    mean_loss = np.mean(batch_losses)
    f1 = f1_score(expected, predicted, average='binary')
    report = classification_report(expected, predicted)
    return (mean_loss, f1, report)

def plot_training_metrics(train_metrics, val_metrics, metric_name):
    """Plot training and validation curves for one metric and save a PNG.

    The figure is written to ``'<metric_name lower-cased>_plot.png'`` in the
    current working directory and then closed; nothing is returned.
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    curves = (
        (train_metrics, f'Training {metric_name}'),
        (val_metrics, f'Validation {metric_name}'),
    )
    for values, label in curves:
        ax.plot(values, label=label)

    ax.set_xlabel('Epoch')
    ax.set_ylabel(metric_name)
    ax.set_title(f'{metric_name} vs. Epoch')
    ax.legend()
    ax.grid(True)

    output_path = f'{metric_name.lower()}_plot.png'
    fig.savefig(output_path)
    # Close the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

def main():
    """Run k-fold cross-validated BERT fine-tuning on a Sentiment140 sample.

    Steps:
      1. Load the CSV, map the raw target value 4 to class 1 (everything
         else to class 0) and derive inverse-frequency class weights from
         the full file.
      2. Draw a fixed 50,000-row sample.
      3. For each of ``N_FOLDS`` folds, train a fresh ``SentimentClassifier``
         for up to ``EPOCHS`` epochs with early stopping on validation loss,
         and save loss/F1 plots named after the fold.
      4. Print the validation loss and F1 averaged over folds.

    The metrics recorded for a fold are those of the epoch with the lowest
    validation loss. Early stopping only fires after ``PATIENCE`` epochs
    that were worse than the best one, so the final epoch's metrics would
    understate what the fold achieved.
    """
    # Load and preprocess data
    print("Loading data...")
    DATA_PATH = "training.1600000.processed.noemoticon.csv"
    df = pd.read_csv(DATA_PATH, encoding='ISO-8859-1',
                     names=['target', 'id', 'date', 'flag', 'user', 'text'])

    df['target'] = (df['target'] == 4).astype(int)

    # Inverse class frequency, computed on the full file before sampling
    class_weights = torch.tensor(
        [1.0 / (df['target'] == i).mean() for i in range(2)],
        dtype=torch.float
    )

    # Take a subset for faster training (adjust as needed)
    df = df.sample(n=50000, random_state=RANDOM_SEED)
    print(f"Using {len(df)} samples")

    # Initialize tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

    # The device and the weighted loss do not depend on the fold, so they
    # are set up once instead of on every iteration.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    class_weights = class_weights.to(device)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights).to(device)

    # Prepare for k-fold cross validation
    kfold = KFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_SEED)

    # Best-epoch metrics of each fold
    fold_metrics = []

    for fold, (train_idx, val_idx) in enumerate(kfold.split(df)):
        print(f'\nFold {fold + 1}/{N_FOLDS}')
        print('-' * 20)

        # Split data
        train_data = df.iloc[train_idx]
        val_data = df.iloc[val_idx]

        # Create datasets (augmentation is applied to training data only)
        train_dataset = TwitterDataset(
            texts=train_data.text.values,
            targets=train_data.target.values,
            tokenizer=tokenizer,
            max_len=MAX_LEN,
            augment=True
        )

        val_dataset = TwitterDataset(
            texts=val_data.text.values,
            targets=val_data.target.values,
            tokenizer=tokenizer,
            max_len=MAX_LEN,
            augment=False
        )

        # Create data loaders
        train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

        # Every fold starts from a freshly loaded pretrained model
        model = SentimentClassifier(n_classes=2)
        model = model.to(device)

        # Initialize optimizer and scheduler; the linear decay is sized for
        # the full EPOCHS budget even if early stopping ends the fold sooner
        optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.01)
        total_steps = len(train_loader) * EPOCHS
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps=total_steps
        )

        # Initialize early stopping
        early_stopping = EarlyStopping(patience=PATIENCE)

        # Training loop
        train_losses, train_f1s = [], []
        val_losses, val_f1s = [], []
        best_metrics = None  # metrics of the lowest-val-loss epoch so far

        for epoch in range(EPOCHS):
            print(f'\nEpoch {epoch + 1}/{EPOCHS}')

            train_loss, train_f1 = train_epoch(
                model, train_loader, loss_fn, optimizer, scheduler, device
            )

            val_loss, val_f1, val_report = eval_model(
                model, val_loader, loss_fn, device
            )

            train_losses.append(train_loss)
            train_f1s.append(train_f1)
            val_losses.append(val_loss)
            val_f1s.append(val_f1)

            print(f'Train Loss: {train_loss:.4f}, F1: {train_f1:.4f}')
            print(f'Val Loss: {val_loss:.4f}, F1: {val_f1:.4f}')
            print('\nClassification Report:')
            print(val_report)

            # Remember the best epoch, using the same criterion as early
            # stopping (validation loss)
            if best_metrics is None or val_loss < best_metrics['val_loss']:
                best_metrics = {
                    'epoch': epoch + 1,
                    'val_f1': val_f1,
                    'val_loss': val_loss
                }

            # Early stopping check
            early_stopping(val_loss)
            if early_stopping.early_stop:
                print("Early stopping triggered")
                break

        # Plot metrics for this fold. The fold number is part of the metric
        # name (and therefore of the file name) so that later folds do not
        # overwrite the plots of earlier ones.
        plot_training_metrics(train_losses, val_losses, f'Fold {fold + 1} Loss')
        plot_training_metrics(train_f1s, val_f1s, f'Fold {fold + 1} F1 Score')

        # Store fold metrics (skipped only if no epoch ran at all)
        if best_metrics is not None:
            print(f"Best epoch for fold {fold + 1}: {best_metrics['epoch']} "
                  f"(Val Loss: {best_metrics['val_loss']:.4f}, "
                  f"F1: {best_metrics['val_f1']:.4f})")
            fold_metrics.append(best_metrics)

        # Drop this fold's model before the next one is built so that two
        # full BERT models are never resident on the device at once.
        del model, optimizer, scheduler
        if device.type == 'cuda':
            torch.cuda.empty_cache()

    # Print average metrics across folds
    avg_val_f1 = np.mean([m['val_f1'] for m in fold_metrics])
    avg_val_loss = np.mean([m['val_loss'] for m in fold_metrics])
    print(f'\nAverage across {N_FOLDS} folds:')
    print(f'Validation F1: {avg_val_f1:.4f}')
    print(f'Validation Loss: {avg_val_loss:.4f}')

# Start the experiment only when this code is executed directly, not when
# it is imported as a module.
if __name__ == "__main__":
    main()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Loading data...
Using 50000 samples

Fold 1/5
--------------------

Epoch 1/10


Training: 100%|██████████| 1250/1250 [06:53<00:00,  3.02it/s, loss=0.389]
Evaluating: 100%|██████████| 313/313 [00:33<00:00,  9.40it/s]


Train Loss: 0.4464, F1: 0.7945
Val Loss: 0.3964, F1: 0.8289

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.86      0.84      4977
           1       0.85      0.81      0.83      5023

    accuracy                           0.83     10000
   macro avg       0.83      0.83      0.83     10000
weighted avg       0.83      0.83      0.83     10000


Epoch 2/10


Training: 100%|██████████| 1250/1250 [06:53<00:00,  3.02it/s, loss=0.318]
Evaluating: 100%|██████████| 313/313 [00:33<00:00,  9.44it/s]


Train Loss: 0.3189, F1: 0.8688
Val Loss: 0.4291, F1: 0.8359

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.80      0.82      4977
           1       0.81      0.86      0.84      5023

    accuracy                           0.83     10000
   macro avg       0.83      0.83      0.83     10000
weighted avg       0.83      0.83      0.83     10000


Epoch 3/10


Training: 100%|██████████| 1250/1250 [06:52<00:00,  3.03it/s, loss=0.224]
Evaluating: 100%|██████████| 313/313 [00:33<00:00,  9.40it/s]


Train Loss: 0.2131, F1: 0.9193
Val Loss: 0.5082, F1: 0.8245

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.86      0.83      4977
           1       0.85      0.80      0.82      5023

    accuracy                           0.83     10000
   macro avg       0.83      0.83      0.83     10000
weighted avg       0.83      0.83      0.83     10000


Epoch 4/10


Training: 100%|██████████| 1250/1250 [06:52<00:00,  3.03it/s, loss=0.135]
Evaluating: 100%|██████████| 313/313 [00:33<00:00,  9.46it/s]


Train Loss: 0.1376, F1: 0.9550
Val Loss: 0.7178, F1: 0.8241

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.82      0.82      4977
           1       0.82      0.82      0.82      5023

    accuracy                           0.82     10000
   macro avg       0.82      0.82      0.82     10000
weighted avg       0.82      0.82      0.82     10000

Early stopping triggered

Fold 2/5
--------------------

Epoch 1/10


Training: 100%|██████████| 1250/1250 [06:52<00:00,  3.03it/s, loss=0.408]
Evaluating: 100%|██████████| 313/313 [00:33<00:00,  9.35it/s]


Train Loss: 0.4447, F1: 0.7951
Val Loss: 0.3761, F1: 0.8366

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.82      0.83      4977
           1       0.83      0.85      0.84      5023

    accuracy                           0.83     10000
   macro avg       0.83      0.83      0.83     10000
weighted avg       0.83      0.83      0.83     10000


Epoch 2/10


Training: 100%|██████████| 1250/1250 [06:52<00:00,  3.03it/s, loss=0.327]
Evaluating: 100%|██████████| 313/313 [00:33<00:00,  9.42it/s]


Train Loss: 0.3123, F1: 0.8709
Val Loss: 0.4073, F1: 0.8316

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.86      0.84      4977
           1       0.85      0.81      0.83      5023

    accuracy                           0.83     10000
   macro avg       0.84      0.83      0.83     10000
weighted avg       0.84      0.83      0.83     10000


Epoch 3/10


Training: 100%|██████████| 1250/1250 [06:51<00:00,  3.04it/s, loss=0.208]
Evaluating: 100%|██████████| 313/313 [00:33<00:00,  9.39it/s]


Train Loss: 0.2092, F1: 0.9211
Val Loss: 0.4934, F1: 0.8315

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.83      0.83      4977
           1       0.83      0.83      0.83      5023

    accuracy                           0.83     10000
   macro avg       0.83      0.83      0.83     10000
weighted avg       0.83      0.83      0.83     10000


Epoch 4/10


Training: 100%|██████████| 1250/1250 [06:52<00:00,  3.03it/s, loss=0.148]
Evaluating: 100%|██████████| 313/313 [00:33<00:00,  9.33it/s]


Train Loss: 0.1376, F1: 0.9532
Val Loss: 0.6788, F1: 0.8326

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.76      0.81      4977
           1       0.79      0.88      0.83      5023

    accuracy                           0.82     10000
   macro avg       0.83      0.82      0.82     10000
weighted avg       0.83      0.82      0.82     10000

Early stopping triggered

Fold 3/5
--------------------

Epoch 1/10


Training: 100%|██████████| 1250/1250 [06:52<00:00,  3.03it/s, loss=0.408]
Evaluating: 100%|██████████| 313/313 [00:33<00:00,  9.39it/s]


Train Loss: 0.4438, F1: 0.7943
Val Loss: 0.3978, F1: 0.8297

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.80      0.82      4990
           1       0.81      0.85      0.83      5010

    accuracy                           0.82     10000
   macro avg       0.83      0.82      0.82     10000
weighted avg       0.83      0.82      0.82     10000


Epoch 2/10


Training: 100%|██████████| 1250/1250 [06:52<00:00,  3.03it/s, loss=0.316]
Evaluating: 100%|██████████| 313/313 [00:33<00:00,  9.31it/s]


Train Loss: 0.3129, F1: 0.8710
Val Loss: 0.4064, F1: 0.8313

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.80      0.82      4990
           1       0.81      0.86      0.83      5010

    accuracy                           0.83     10000
   macro avg       0.83      0.83      0.83     10000
weighted avg       0.83      0.83      0.83     10000


Epoch 3/10


Training: 100%|██████████| 1250/1250 [06:52<00:00,  3.03it/s, loss=0.204]
Evaluating: 100%|██████████| 313/313 [00:33<00:00,  9.41it/s]


Train Loss: 0.2048, F1: 0.9232
Val Loss: 0.5163, F1: 0.8267

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.85      0.83      4990
           1       0.84      0.81      0.83      5010

    accuracy                           0.83     10000
   macro avg       0.83      0.83      0.83     10000
weighted avg       0.83      0.83      0.83     10000


Epoch 4/10


Training: 100%|██████████| 1250/1250 [06:52<00:00,  3.03it/s, loss=0.135]
Evaluating: 100%|██████████| 313/313 [00:33<00:00,  9.34it/s]


Train Loss: 0.1313, F1: 0.9564
Val Loss: 0.7057, F1: 0.8198

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.84      0.83      4990
           1       0.84      0.80      0.82      5010

    accuracy                           0.82     10000
   macro avg       0.82      0.82      0.82     10000
weighted avg       0.82      0.82      0.82     10000

Early stopping triggered

Fold 4/5
--------------------

Epoch 1/10


Training: 100%|██████████| 1250/1250 [06:52<00:00,  3.03it/s, loss=0.414]
Evaluating: 100%|██████████| 313/313 [00:33<00:00,  9.44it/s]


Train Loss: 0.4568, F1: 0.7880
Val Loss: 0.3794, F1: 0.8369

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.77      0.82      4970
           1       0.80      0.88      0.84      5030

    accuracy                           0.83     10000
   macro avg       0.83      0.83      0.83     10000
weighted avg       0.83      0.83      0.83     10000


Epoch 2/10


Training:  72%|███████▏  | 899/1250 [04:56<01:56,  3.02it/s, loss=0.32]