# BERT
Tokenizer: WordPiece

EvaluationMetrics
    ● Accuracy:Overall percentage of correct predictions.
    ● Precision,Recall,F1-Score:Evaluate per class(negative,neutral, positive).
    ● Confusion Matrix:Show performance across all classes.
    ● ROC-AUCScore: Measure the ability of the model to distinguish between classes.

In [None]:
import pickle
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from tqdm import tqdm
# local imports
from metrics import calculate_precision_recall_f1, calculate_accuracy, get_confusion_matrix, plot_confusion_matrix

SEED = 42
np.random.seed(SEED)

# Check the available device
if torch.cuda.is_available():
    device = "cuda"
    torch.cuda.manual_seed_all(SEED)
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

print(f"Using device: {device}")
# Load the datasets 
with open("train.pkl", "rb") as f:
    train = pickle.load(f)
with open("val.pkl", "rb") as f:
    val = pickle.load(f)
with open("test.pkl", "rb") as f:
    test = pickle.load(f)

num_labels = train['label'].nunique()
print("Number of labels: ", num_labels)
train.head(2)

Using device: mps
Number of labels:  2


Unnamed: 0,polarity of tweet,id of the tweet,date of the tweet,query,user,text of the tweet,label
237034,0,2058468667,Sat Jun 06 15:00:18 PDT 2009,NO_QUERY,bestthingaround,my star trek bootleg timed out and when i refr...,0
1387008,0,2068651245,Sun Jun 07 14:27:20 PDT 2009,NO_QUERY,Scriblit,yeah but the really pretty ones only go up to ...,0


In [60]:
from transformers import AutoModelForSequenceClassification, BertTokenizer

MODEL_NAME = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels  
).to(device)

# To check the maximum sequence length accepted by the tokenizer/model
max_seq_length = tokenizer.model_max_length
print(f"Max sequence length accepted by the tokenizer: {max_seq_length}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Max sequence length accepted by the tokenizer: 512


# Training Parameters

In [61]:
BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 2e-5

# Build a PyTorch Dataset

In [62]:
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_seq_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = str(self.data.iloc[idx]['text of the tweet'])
        label = self.data.iloc[idx]['label']

        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_seq_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
            #  return_tensors='pt', these have shape [1, max_length]
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': int(label)
        }


In [None]:

train_dataset = CustomDataset(train, tokenizer=tokenizer, max_seq_length=max_seq_length)
val_dataset = CustomDataset(val, tokenizer=tokenizer, max_seq_length=max_seq_length)
test_dataset = CustomDataset(test, tokenizer=tokenizer, max_seq_length=max_seq_length)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Check the shape of a single item
sample = train_dataset[0]
print(f"input_ids shape: {sample['input_ids'].shape}")  # Without flatten: [1, 512]

# Check the shape after batching
batch = next(iter(train_loader))
print(f"Batched input_ids shape: {batch['input_ids'].shape}")  # Should be [16, 512], not [16, 1, 512]
# BERT expects bs,max_seq_length

# ========
# SET UP
# ========

# Optimizer 
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)


input_ids shape: torch.Size([512])
Batched input_ids shape: torch.Size([16, 512])


In [65]:
labels = [label for i, label in enumerate(train_dataset.data['label'].value_counts().index)]
labels
label_0 = labels[1]
label_4 = labels[0]
label_0,label_4

(0, 4)

In [67]:
# Estimate Training Time
import time

print("Estimating training time by running a few batches...")
print("=" * 60)

# Save model state to restore after timing
model_state = model.state_dict().copy()
optimizer_state = optimizer.state_dict().copy()

# Warm up (first batch is usually slower)
print("Warming up...")
model.train()
warmup_batch = next(iter(train_loader))
input_ids = warmup_batch['input_ids'].to(device)
attention_mask = warmup_batch['attention_mask'].to(device)
labels = warmup_batch['label'].to(device)
_ = model(input_ids, attention_mask=attention_mask, labels=labels)

# Time training batches
num_test_batches = 10
print(f"\nTiming {num_test_batches} training batches...")
start_time = time.time()

for i, batch in enumerate(train_loader):
    if i >= num_test_batches:
        break
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['label'].to(device)
    
    optimizer.zero_grad()
    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss
    loss.backward()
    optimizer.step()

train_time = time.time() - start_time
avg_time_per_batch_train = train_time / num_test_batches

# Time validation batches
print(f"Timing {num_test_batches} validation batches...")
model.eval()
start_time = time.time()

with torch.no_grad():
    for i, batch in enumerate(val_loader):
        if i >= num_test_batches:
            break
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

val_time = time.time() - start_time
avg_time_per_batch_val = val_time / num_test_batches

# Calculate estimates
total_train_batches = len(train_loader)
total_val_batches = len(val_loader)

time_per_epoch_train = avg_time_per_batch_train * total_train_batches
time_per_epoch_val = avg_time_per_batch_val * total_val_batches
time_per_epoch_total = time_per_epoch_train + time_per_epoch_val
total_training_time = time_per_epoch_total * EPOCHS

# Convert to readable format
def format_time(seconds):
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    secs = int(seconds % 60)
    if hours > 0:
        return f"{hours}h {minutes}m {secs}s"
    elif minutes > 0:
        return f"{minutes}m {secs}s"
    else:
        return f"{secs}s"

print("\n" + "=" * 60)
print("TRAINING TIME ESTIMATE")
print("=" * 60)
print(f"Training batches per epoch: {total_train_batches:,}")
print(f"Validation batches per epoch: {total_val_batches:,}")
print(f"Average time per training batch: {avg_time_per_batch_train:.3f}s")
print(f"Average time per validation batch: {avg_time_per_batch_val:.3f}s")
print(f"\nTime per epoch:")
print(f"  Training: {format_time(time_per_epoch_train)}")
print(f"  Validation: {format_time(time_per_epoch_val)}")
print(f"  Total: {format_time(time_per_epoch_total)}")
print(f"\nEstimated total training time for {EPOCHS} epochs: {format_time(total_training_time)}")
print("=" * 60)

# Restore model and optimizer state (reset after timing test)
print("\nResetting model state after timing estimation...")
model.load_state_dict(model_state)
optimizer.load_state_dict(optimizer_state)
print("Ready to start training!")


Estimating training time by running a few batches...
Warming up...

Timing 10 training batches...
Timing 10 validation batches...

TRAINING TIME ESTIMATE
Training batches per epoch: 72,000
Validation batches per epoch: 18,000
Average time per training batch: 2.319s
Average time per validation batch: 0.388s

Time per epoch:
  Training: 46h 22m 55s
  Validation: 1h 56m 32s
  Total: 48h 19m 27s

Estimated total training time for 3 epochs: 144h 58m 23s

Resetting model state after timing estimation...
Ready to start training!


# Training Loop

In [66]:
best_val_loss = float('inf')  # Initialize best_val_loss to a very high value
best_epoch = -1  # Initialize best_epoch to an invalid value to track the epoch of the best validation loss

for epoch in range(EPOCHS):
    model.train()
    total_train_loss = 0
    total_val_loss = 0

    # Training with progress bar
    train_pbar = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{EPOCHS} [Train]', leave=False)
    for batch in train_pbar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()
        
        # Update progress bar with current loss
        train_pbar.set_postfix({'loss': f'{loss.item():.4f}'})

    avg_train_loss = total_train_loss / len(train_loader)

    # Validation with progress bar
    model.eval()
    val_pbar = tqdm(val_loader, desc=f'Epoch {epoch + 1}/{EPOCHS} [Val]', leave=False)
    with torch.no_grad():
        for batch in val_pbar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_val_loss += loss.item()
            
            # Update progress bar with current loss
            val_pbar.set_postfix({'loss': f'{loss.item():.4f}'})

    avg_val_loss = total_val_loss / len(val_loader)

    # Check if the current validation loss is the lowest; if so, save the model
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        best_epoch = epoch
        torch.save(model.state_dict(), 'best_model.pth')  # Save the best model

    print(f"Epoch {epoch + 1}/{EPOCHS}, Training Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")

# Print the best epoch and its validation loss
print(f"The lowest validation loss was {best_val_loss:.4f} at epoch {best_epoch + 1}")

# Load the best model before calculating final metrics
model.load_state_dict(torch.load('best_model.pth'))

train_accuracy = calculate_accuracy(model, train_loader, device)
train_precision = calculate_precision_recall_f1(model, train_loader, device, label_0, label_4)
val_accuracy = calculate_accuracy(model, val_loader, device)
val_precision = calculate_precision_recall_f1(model, val_loader, device, label_0, label_4)
print(f'Best Model Training Accuracy: {train_accuracy:.2f}%')
print(f'Best Model Validation Accuracy: {val_accuracy:.2f}%')

# Print precision, recall, and F1 scores
print(f'\nTraining Metrics:')
print(f"  Label {label_0} - Precision: {train_precision['precision_label_0']:.4f}, Recall: {train_precision['recall_label_0']:.4f}, F1: {train_precision['f1_label_0']:.4f}")
print(f"  Label {label_4} - Precision: {train_precision['precision_label_4']:.4f}, Recall: {train_precision['recall_label_4']:.4f}, F1: {train_precision['f1_label_4']:.4f}")

print(f'\nValidation Metrics:')
print(f"  Label {label_0} - Precision: {val_precision['precision_label_0']:.4f}, Recall: {val_precision['recall_label_0']:.4f}, F1: {val_precision['f1_label_0']:.4f}")
print(f"  Label {label_4} - Precision: {val_precision['precision_label_4']:.4f}, Recall: {val_precision['recall_label_4']:.4f}, F1: {val_precision['f1_label_4']:.4f}")


                                                                                      

KeyboardInterrupt: 

In [None]:
# Generate and plot confusion matrices
# Map labels to class names (0 = Negative, 4 = Positive based on your assignment)
label_to_name = {0: 'Negative', 4: 'Positive'}
unique_labels = sorted(train['label'].unique())
class_names = [label_to_name.get(label, f'Label {label}') for label in unique_labels]

# Confusion Matrix for Validation Set
print("Validation Set Confusion Matrix:")
y_true_val, y_pred_val = get_confusion_matrix(model, val_loader, device)
cm_val = plot_confusion_matrix(y_true_val, y_pred_val, class_names, 
                                title="Validation Set - Confusion Matrix")

# Confusion Matrix for Test Set
print("\nTest Set Confusion Matrix:")
y_true_test, y_pred_test = get_confusion_matrix(model, test_loader, device)
cm_test = plot_confusion_matrix(y_true_test, y_pred_test, class_names, 
                                 title="Test Set - Confusion Matrix")

# Calculate test accuracy
test_accuracy = calculate_accuracy(model, test_loader, device)
print(f'\nTest Set Accuracy: {test_accuracy:.2f}%')
