In [1]:
import html
import os
import random
import re
import time
from collections import defaultdict

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from datasets import load_dataset
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    AdamW,
    get_linear_schedule_with_warmup,
)

# Load Dataset

#### Loading the TweetEval sentiment dataset from Hugging Face

In [2]:
# Load the 'sentiment' configuration of the tweet_eval dataset.
dataset = load_dataset('tweet_eval', 'sentiment')

#### Inspecting the dataset

In [3]:
# Show the available splits with their columns and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 45615
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 12284
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})


#### Convert to pandas DataFrames for easier handling

In [4]:
# Materialise each split as its own DataFrame; the order of the split names
# matches the order of the target variables on the left.
train_df, val_df, test_df = (
    pd.DataFrame(dataset[split_name])
    for split_name in ('train', 'validation', 'test')
)

#### Map labels to sentiment strings for clarity

In [5]:
# Integer class id -> human-readable sentiment name.
label_mapping = {0: 'negative', 1: 'neutral', 2: 'positive'}

# Add the readable label as an extra column on every split.
for split_df in (train_df, val_df, test_df):
    split_df['label_name'] = split_df['label'].map(label_mapping)

In [6]:
# Preview the first rows; a bare last expression lets Jupyter render the
# DataFrame with its rich table display instead of plain printed text.
train_df.head()

                                                text  label label_name
0  "QT @user In the original draft of the 7th boo...      2   positive
1  "Ben Smith / Smith (concussion) remains out of...      1    neutral
2  Sorry bout the stream last night I crashed out...      1    neutral
3  Chase Headley's RBI double in the 8th inning o...      1    neutral
4  @user Alciato: Bee will invest 150 million in ...      2   positive


In [7]:
# Count training examples per sentiment class to check the class balance.
value_counts = train_df['label_name'].value_counts()
value_counts

label_name
neutral     20673
positive    17849
negative     7093
Name: count, dtype: int64

# Preprocessing Data

#### Defining preprocessing function

In [8]:
def clean_tweet(text):
    """Normalise a raw tweet into plain alphabetic text.

    Steps, in order: drop @mentions, drop http(s) URLs, decode HTML
    entities, drop '#' symbols (the tag word itself is kept), remove every
    character that is not an ASCII letter, apostrophe or whitespace, then
    collapse whitespace runs (including line breaks) and trim both ends.

    Args:
        text: Raw tweet text as a ``str``.

    Returns:
        The cleaned string. It may be empty if nothing survives the filters.
    """
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)     # Remove @mentions
    # Consume the URL up to the next whitespace. A narrower character class
    # stops at '?', '=', '-', '_' or '%', and the leftover fragment would then
    # leak stray letters into the cleaned text.
    text = re.sub(r'https?://\S+', '', text)       # Remove URLs
    # Decode entities only after URL removal so '&' inside a query string is
    # never misread as an entity. Handles &amp; as well as &lt;, &gt;, &#39; ...
    text = html.unescape(text)
    text = text.replace('#', '')                   # Remove hashtag symbol
    text = re.sub(r"[^a-zA-Z'\s]", '', text)       # Remove special characters
    # '\s' also matches '\n', so line breaks are folded into single spaces here.
    text = re.sub(r'\s+', ' ', text).strip()       # Remove extra whitespace
    return text

#### Applying function to train, validation and test splits

In [9]:
# Run the same cleaning step over the text column of every split.
for split_df in (train_df, val_df, test_df):
    split_df['clean_text'] = split_df['text'].apply(clean_tweet)

#### Initialize tokenizer

In [10]:
# Tokenizer for the 'bert-base-uncased' checkpoint, the same checkpoint the
# classification model is loaded from later in the notebook.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Prepare Dataloader

In [11]:
MAX_LEN = 64       # Token length every example is truncated/padded to
BATCH_SIZE = 32    # Examples per DataLoader batch
SEED = 42          # Fixed seed so shuffling and weight initialisation repeat

# Seed the RNGs before any stochastic step (classifier-head initialisation,
# DataLoader shuffling) so a Restart & Run All gives comparable results.
# NOTE(review): GPU kernels and multi-worker loading can still introduce
# small run-to-run differences.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

class SentimentDataset(Dataset):
    """Map-style dataset that tokenises one text per item.

    Args:
        texts: Indexable sequence of input texts.
        labels: Indexable sequence of integer class ids aligned with ``texts``.
        tokenizer: Callable tokenizer (Hugging Face style) that accepts the
            keyword arguments used in ``__getitem__`` and returns a mapping
            with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length every example is truncated or padded to.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Fail early: a mismatch would otherwise surface as an IndexError
        # somewhere in the middle of an epoch.
        if len(texts) != len(labels):
            raise ValueError(
                f'texts and labels must have the same length, '
                f'got {len(texts)} and {len(labels)}'
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenise example ``idx``.

        Returns:
            dict with the raw ``'text'``, 1-D ``'input_ids'`` and
            ``'attention_mask'`` tensors, and ``'labels'`` as a scalar
            ``torch.long`` tensor.
        """
        text = str(self.texts[idx])
        label = self.labels[idx]

        # Call the tokenizer directly; this is the supported replacement for
        # the deprecated ``encode_plus`` and takes the same keyword arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,        # Add '[CLS]' and '[SEP]'
            max_length=self.max_len,
            truncation=True,                # Truncate longer sentences
            padding='max_length',           # Pad shorter sentences
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),        # Convert from [1, max_len] to [max_len]
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

#### Create Datasets

In [12]:
def _build_dataset(frame):
    """Wrap one split's cleaned text and integer labels in a SentimentDataset."""
    return SentimentDataset(
        texts=frame['clean_text'].values,
        labels=frame['label'].values,
        tokenizer=tokenizer,
        max_len=MAX_LEN
    )


train_dataset = _build_dataset(train_df)
val_dataset = _build_dataset(val_df)
test_dataset = _build_dataset(test_df)

#### Create Data Loaders

In [13]:
# Settings shared by all three loaders.
_loader_kwargs = dict(batch_size=BATCH_SIZE, num_workers=4)

# Only the training loader shuffles; validation and test keep dataset order.
train_dataloader = DataLoader(train_dataset, shuffle=True, **_loader_kwargs)
val_dataloader = DataLoader(val_dataset, shuffle=False, **_loader_kwargs)
test_dataloader = DataLoader(test_dataset, shuffle=False, **_loader_kwargs)

# Setup Model

#### Load Pre-Trained BERT Model

In [14]:
# Load the 'bert-base-uncased' checkpoint with a 3-way classification head.
# The recorded output below shows the classifier weights are newly
# initialised, so the model must be fine-tuned before use.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=3,   # Multi-class classification (negative, neutral, positive)
    output_attentions=False,      # Attention weights are not needed downstream
    output_hidden_states=False,   # Only loss/logits are read from the outputs
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Select Device (CUDA if available, otherwise CPU)

In [15]:
# Prefer the GPU when one is visible to PyTorch; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Move the model's weights to the selected device.
model = model.to(device)

In [16]:
# Display the device that was selected above.
device

device(type='cuda')

#### Define optimizer and scheduler

In [17]:
# Set up the optimizer
# Set up the optimizer
# NOTE(review): the AdamW class shipped with transformers is deprecated
# upstream in favour of torch.optim.AdamW. Confirm the installed version still
# provides it. torch's AdamW has no `correct_bias` switch, so switching would
# change the update rule slightly.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

# Calculate total steps: the training function steps the scheduler once per
# batch, so the schedule must span every batch of every epoch.
EPOCHS = 3
total_steps = len(train_dataloader) * EPOCHS

# Create the learning rate scheduler
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,             # No warm-up: decay linearly from the initial LR
    num_training_steps=total_steps
)

# Define the loss function
loss_fn = nn.CrossEntropyLoss().to(device)



# Train the Model

#### Define training function

In [18]:
def train_epoch(
    model,
    data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    n_examples
):
    """Run one training pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        data_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        loss_fn: Unused. The loss is taken from ``outputs.loss``; the
            parameter is kept so existing call sites keep working.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: LR scheduler stepped once per batch, or ``None`` to skip
            scheduling.
        n_examples: Denominator for the accuracy (number of examples seen).

    Returns:
        ``(accuracy, mean_loss)``: accuracy is a 0-dim double tensor on
        ``device``; mean_loss is the mean of the per-batch losses, or NaN
        when the loader yields no batches.
    """
    model = model.train()

    losses = []
    # Start from a tensor rather than the int 0, so ``.double()`` below also
    # works when the loader yields no batches.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    for batch in data_loader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        logits = outputs.logits

        # Predicted class = index of the largest logit along dim 1.
        _, preds = torch.max(logits, dim=1)
        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Prevent exploding gradients
        optimizer.step()
        if scheduler is not None:
            scheduler.step()

    # Avoid numpy's "mean of empty slice" warning for an empty loader.
    mean_loss = np.mean(losses) if losses else float('nan')
    return correct_predictions.double() / n_examples, mean_loss

#### Define validation function

In [19]:
def eval_model(
    model,
    data_loader,
    loss_fn,
    device,
    n_examples
):
    """Evaluate ``model`` on ``data_loader`` without updating weights.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        data_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        loss_fn: Unused. The loss is taken from ``outputs.loss``; the
            parameter is kept so existing call sites keep working.
        device: Device the batch tensors are moved to.
        n_examples: Denominator for the accuracy (number of examples seen).

    Returns:
        ``(accuracy, mean_loss)``: accuracy is a 0-dim double tensor on
        ``device``; mean_loss is the mean of the per-batch losses, or NaN
        when the loader yields no batches.
    """
    model = model.eval()

    losses = []
    # Start from a tensor rather than the int 0, so ``.double()`` below also
    # works when the loader yields no batches.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            logits = outputs.logits

            # Predicted class = index of the largest logit along dim 1.
            _, preds = torch.max(logits, dim=1)
            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

    # Avoid numpy's "mean of empty slice" warning for an empty loader.
    mean_loss = np.mean(losses) if losses else float('nan')
    return correct_predictions.double() / n_examples, mean_loss

#### Training loop

In [None]:
# Per-epoch metrics, stored as plain Python floats so they are easy to plot
# or serialise later.
history = defaultdict(list)
best_accuracy = 0.0

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    start_time = time.time()

    train_acc, train_loss = train_epoch(
        model,
        train_dataloader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        len(train_dataset)
    )

    val_acc, val_loss = eval_model(
        model,
        val_dataloader,
        loss_fn,
        device,
        len(val_dataset)
    )

    end_time = time.time()
    epoch_time = end_time - start_time

    # Convert the 0-dim tensors / numpy scalars to floats once, so neither the
    # history nor ``best_accuracy`` keeps a reference to a (possibly GPU) tensor.
    train_acc, train_loss = float(train_acc), float(train_loss)
    val_acc, val_loss = float(val_acc), float(val_loss)

    print(f'Train loss {train_loss:.4f} accuracy {train_acc:.4f}')
    print(f'Val   loss {val_loss:.4f} accuracy {val_acc:.4f}')
    print(f'Epoch time: {epoch_time // 60:.0f}m {epoch_time % 60:.0f}s')
    print()

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    # Save the best model (highest validation accuracy seen so far)
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), 'best_model_state.bin')
        best_accuracy = val_acc

# Evaluate the model

#### Load the best model

In [None]:
# map_location remaps the saved tensors onto the current device, so a
# checkpoint written on a GPU can also be restored on a CPU-only machine.
model.load_state_dict(torch.load('best_model_state.bin', map_location=device))

#### Define prediction function

In [None]:
def get_predictions(model, data_loader, device=None):
    """Run inference over ``data_loader`` and collect the results on the CPU.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``.logits``.
        data_loader: Iterable of batches; each batch is a mapping with
            ``'text'``, ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
        device: Device the inputs are moved to. Defaults to the device of the
            model's first parameter (CPU if the model has no parameters), so
            the function does not depend on a notebook-level global.

    Returns:
        ``(texts, predictions, prediction_probs, real_values)``: the list of
        input texts, predicted class ids, softmax probabilities over dim 1 of
        the logits, and the true labels. The three tensors live on the CPU and
        are empty when the loader yields no batches.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model = model.eval()

    texts = []
    pred_batches = []
    prob_batches = []
    label_batches = []

    with torch.no_grad():
        for batch in data_loader:
            texts.extend(batch['text'])
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            probs = torch.nn.functional.softmax(outputs.logits, dim=1)
            _, preds = torch.max(probs, dim=1)

            # Keep whole batches and concatenate once at the end, instead of
            # stacking thousands of single-element tensors.
            pred_batches.append(preds.cpu())
            prob_batches.append(probs.cpu())
            # Labels are only collected, never fed to the model, so they do
            # not need a round trip through the device.
            label_batches.append(batch['labels'].cpu())

    if not pred_batches:
        # Nothing to concatenate: return empty tensors instead of raising.
        empty_ids = torch.empty(0, dtype=torch.long)
        return texts, empty_ids, torch.empty((0, 0)), empty_ids.clone()

    predictions = torch.cat(pred_batches)
    prediction_probs = torch.cat(prob_batches)
    real_values = torch.cat(label_batches)

    return texts, predictions, prediction_probs, real_values

#### Generate Classification Report on Validation Set

In [None]:
# Display names for class ids 0, 1, 2 (same order as label_mapping).
label_names = ['negative', 'neutral', 'positive']

texts, y_pred, y_pred_probs, y_true = get_predictions(model, val_dataloader)

# Passing `labels` explicitly keeps the report aligned with `target_names`
# even if a class never occurs in y_true / y_pred; without it sklearn raises
# on the length mismatch.
print(classification_report(
    y_true.numpy(),
    y_pred.numpy(),
    labels=list(range(len(label_names))),
    target_names=label_names,
))

#### Evaluate on Test Set

In [None]:
texts_test, y_pred_test, y_pred_probs_test, y_true_test = get_predictions(model, test_dataloader)

print('Test Set Evaluation:')
# Passing `labels` explicitly keeps the report aligned with `target_names`
# even if a class never occurs in the labels or predictions.
print(classification_report(
    y_true_test.numpy(),
    y_pred_test.numpy(),
    labels=list(range(len(label_names))),
    target_names=label_names,
))