# 4. Modeling

In this phase, we will fine-tune a pre-trained BERT model for our sentiment analysis task. We will use `vinai/bertweet-base`, a model specifically pre-trained on a large corpus of English tweets, making it highly suitable for this project.

### 4.1 Data Preparation

To make this notebook self-contained, we first need to load and prepare the data. This involves repeating the key steps from the `3_Data_Preparation` notebook: loading the sampled data, cleaning the text, splitting it into sets, tokenizing it for BERT, and creating PyTorch DataLoaders.

In [1]:
import pandas as pd
import re
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader

# Define path
SAMPLED_DATA_PATH = '../data/sentiment140_sampled_200k.csv'

# --- Global Model and Device Setup ---
MODEL_NAME = 'vinai/bertweet-base'

# -- Hardware Check --
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("MPS device found. Using Apple Silicon GPU.")
elif torch.cuda.is_available():
    device = torch.device("cuda")
    print("CUDA device found. Using NVIDIA GPU.")
else:
    device = torch.device("cpu")
    print("No GPU found. Using CPU.")

# Load the sampled dataset
df_sampled = pd.read_csv(SAMPLED_DATA_PATH)

# --- Text Cleaning ---
def clean_tweet(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'(.)\1{2,}', r'\1', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df_sampled['cleaned_text'] = df_sampled['text'].apply(clean_tweet)

# --- Data Splitting ---
X = df_sampled['cleaned_text'].astype(str) # Ensure text is string
y = df_sampled['target'].apply(lambda x: 1 if x == 4 else 0) # Map target to 0 and 1

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# --- Tokenization ---
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_function(texts):
    return tokenizer(texts, padding='max_length', truncation=True, max_length=128)

train_encodings = tokenize_function(X_train.tolist())
val_encodings = tokenize_function(X_val.tolist())
test_encodings = tokenize_function(X_test.tolist())

# --- PyTorch Datasets and DataLoaders ---
class TweetDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TweetDataset(train_encodings, y_train.to_numpy())
val_dataset = TweetDataset(val_encodings, y_val.to_numpy())
test_dataset = TweetDataset(test_encodings, y_test.to_numpy())

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

print("Data preparation complete. DataLoaders are ready.")

  from .autonotebook import tqdm as notebook_tqdm


MPS device found. Using Apple Silicon GPU.


emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


Data preparation complete. DataLoaders are ready.


### 4.2 Training and Evaluation Functions

Here we define the core functions for training and evaluating our PyTorch models. 
- **`train_epoch`**: Handles one full pass over the training data, including the forward pass, loss calculation, backpropagation, and weight updates. It's compatible with mixed-precision training.
- **`eval_model`**: Handles the evaluation on a dataset (e.g., validation or test set), calculating accuracy and F1-score.

In [2]:
from sklearn.metrics import accuracy_score, f1_score
import time

def train_epoch(model, data_loader, optimizer, device, scaler):
    model = model.train()
    total_loss = 0
    
    for batch in data_loader:
        optimizer.zero_grad()
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        with torch.autocast(device_type=device.type, enabled=device.type=='cuda' or device.type=='mps'):
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss
        
        if scaler.is_enabled():
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            loss.backward()
            optimizer.step()
        
        total_loss += loss.item()
        
    return total_loss / len(data_loader)

def eval_model(model, data_loader, device):
    model = model.eval()
    
    predictions = []
    actual_labels = []
    
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            with torch.autocast(device_type=device.type, enabled=device.type=='cuda' or device.type=='mps'):
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask
                )
            
            _, preds = torch.max(outputs.logits, dim=1)
            
            predictions.extend(preds.cpu())
            actual_labels.extend(labels.cpu())
            
    accuracy = accuracy_score(actual_labels, predictions)
    f1 = f1_score(actual_labels, predictions, average='weighted')
    
    return accuracy, f1

print("Training and evaluation functions defined.")

Training and evaluation functions defined.


### 4.3 Classical Baseline Model

Before diving into complex transformer models, it's crucial to establish a performance baseline with a simple, classical machine learning model. This gives us a benchmark to measure our transformer against and provides a point of comparison for explainability.

We will use a **TF-IDF (Term Frequency-Inverse Document Frequency)** vectorizer to convert the text into numerical features, followed by a **Logistic Regression** classifier. This is a strong and highly interpretable baseline for text classification.

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib

# 1. Create and train the TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# 2. Train the Logistic Regression model
baseline_model = LogisticRegression(solver='liblinear', random_state=42)
baseline_model.fit(X_train_tfidf, y_train)

# 3. Evaluate the baseline model
y_pred_baseline = baseline_model.predict(X_test_tfidf)
print("--- Baseline Model Performance ---\n")
print(classification_report(y_test, y_pred_baseline, target_names=['Negative', 'Positive']))

# 4. Save the model and vectorizer for later use in Notebook 5
joblib.dump(baseline_model, '../models/baseline_model.pkl')
joblib.dump(tfidf_vectorizer, '../models/tfidf_vectorizer.pkl')
print("\nBaseline model and TF-IDF vectorizer saved.")

--- Baseline Model Performance ---

              precision    recall  f1-score   support

    Negative       0.80      0.78      0.79     10000
    Positive       0.79      0.81      0.80     10000

    accuracy                           0.79     20000
   macro avg       0.79      0.79      0.79     20000
weighted avg       0.79      0.79      0.79     20000


Baseline model and TF-IDF vectorizer saved.


### 4.4 Transformer Model - Hyperparameter Tuning

Now we turn to our transformer model. Instead of using default hyperparameters, we will conduct a systematic search to find the optimal configuration for our specific dataset. This ensures our final model is performing at its best and makes our results more robust.

**Strategy:**
1.  **Tune on a Subset:** To make this process fast, we will perform the search on a small, stratified subset (20,000 samples) of our training data.
2.  **Randomized Search:** We will test a fixed number of random combinations of learning rates and epochs. This is more efficient than testing every single combination (Grid Search).
3.  **Evaluate:** Each combination will be evaluated based on its F1-score on the full validation set.

In [4]:
import random
import numpy as np
import json # Import json for saving/loading hyperparameters
from transformers import AutoModelForSequenceClassification
from torch.optim import AdamW

# -- 1. Create a smaller subset for tuning --
# We take 20k samples from the training set for faster iteration.
X_train_subset, _, y_train_subset, _ = train_test_split(X_train, y_train, train_size=20000, random_state=42, stratify=y_train)

subset_encodings = tokenize_function(X_train_subset.tolist())
subset_dataset = TweetDataset(subset_encodings, y_train_subset.to_numpy())
subset_loader = DataLoader(subset_dataset, batch_size=16, shuffle=True)

print(f"Created a tuning subset with {len(X_train_subset)} samples.")

# -- 2. Define the search space --
param_grid = {
    'learning_rate': [1e-5, 2e-5, 3e-5, 5e-5],
    'epochs': [2, 3, 4]
}
num_combinations = 5 # Number of random combinations to test

# -- 3. Run the Randomized Search --
best_f1 = -1
best_params = {}
tuning_results = []

print("\nStarting hyperparameter tuning...")
for i in range(num_combinations):
    # Randomly sample parameters
    lr = random.choice(param_grid['learning_rate'])
    epochs = random.choice(param_grid['epochs'])
    
    print(f"\n--- Combination {i+1}/{num_combinations} ---")
    print(f"Testing: Learning Rate={lr}, Epochs={epochs}")
    
    # Instantiate a fresh model for each run
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2).to(device)
    optimizer = AdamW(model.parameters(), lr=lr)
    scaler = torch.cuda.amp.GradScaler()
    
    # Training loop for this combination
    for epoch in range(epochs):
        avg_train_loss = train_epoch(model, subset_loader, optimizer, device, scaler)
    
    # Evaluate on the full validation set
    val_accuracy, val_f1 = eval_model(model, val_loader, device)
    print(f"Result: Val Accuracy={val_accuracy*100:.2f}%, Val F1={val_f1:.4f}")
    
    tuning_results.append({'lr': lr, 'epochs': epochs, 'f1': val_f1})
    
    # Check if this is the best model so far
    if val_f1 > best_f1:
        best_f1 = val_f1
        best_params = {'learning_rate': lr, 'epochs': epochs}

print("\n--- Tuning Complete ---")
print(f"Best F1 Score on Validation Set: {best_f1:.4f}")
print(f"Best Hyperparameters: {best_params}")

# Save best_params to a JSON file
with open('../models/best_params.json', 'w') as f:
    json.dump(best_params, f)
print("Best hyperparameters saved to ../models/best_params.json")

Created a tuning subset with 20000 samples.

Starting hyperparameter tuning...

--- Combination 1/5 ---
Testing: Learning Rate=3e-05, Epochs=4


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = torch.cuda.amp.GradScaler()
  super().__init__(


Result: Val Accuracy=50.00%, Val F1=0.3333

--- Combination 2/5 ---
Testing: Learning Rate=2e-05, Epochs=3


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = torch.cuda.amp.GradScaler()
  super().__init__(


Result: Val Accuracy=85.01%, Val F1=0.8501

--- Combination 3/5 ---
Testing: Learning Rate=1e-05, Epochs=3


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = torch.cuda.amp.GradScaler()
  super().__init__(


Result: Val Accuracy=85.74%, Val F1=0.8574

--- Combination 4/5 ---
Testing: Learning Rate=1e-05, Epochs=4


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = torch.cuda.amp.GradScaler()
  super().__init__(


Result: Val Accuracy=85.31%, Val F1=0.8529

--- Combination 5/5 ---
Testing: Learning Rate=1e-05, Epochs=4


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = torch.cuda.amp.GradScaler()
  super().__init__(


Result: Val Accuracy=83.90%, Val F1=0.8381

--- Tuning Complete ---
Best F1 Score on Validation Set: 0.8574
Best Hyperparameters: {'learning_rate': 1e-05, 'epochs': 3}
Best hyperparameters saved to ../models/best_params.json


### 4.5 Transformer Model - Final Training

Now that we have identified the optimal hyperparameters from our search, we will use them to train our final model on the **full training dataset**. This ensures our model is both well-configured and learns from all available data.

In [5]:
import json # Import json for saving/loading hyperparameters

print("--- Starting Final Model Training ---")

# Load best_params from the JSON file
try:
    with open('../models/best_params.json', 'r') as f:
        best_params = json.load(f)
    print(f"Loaded optimal parameters: {best_params}")
except FileNotFoundError:
    print("Error: best_params.json not found. Please run the Hyperparameter Tuning step (4.3) first.")
    # Fallback to default if file not found, or raise an error
    best_params = {'learning_rate': 2e-5, 'epochs': 3} # Default values
    print(f"Using default parameters: {best_params}")

# 1. Instantiate the final model
final_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2).to(device)

# 2. Set up optimizer and scaler with the best learning rate
optimizer = AdamW(final_model.parameters(), lr=best_params['learning_rate'])
scaler = torch.cuda.amp.GradScaler()

# 3. Train on the full training data for the optimal number of epochs
for epoch in range(best_params['epochs']):
    start_time = time.time()
    
    # Use the full 'train_loader' this time
    avg_train_loss = train_epoch(final_model, train_loader, optimizer, device, scaler)
    val_accuracy, val_f1 = eval_model(final_model, val_loader, device)
    
    end_time = time.time()
    epoch_mins, epoch_secs = divmod(end_time - start_time, 60)
    
    print(f'Epoch: {epoch + 1:02} | Time: {epoch_mins:.0f}m {epoch_secs:.0f}s')
    print(f'\tTrain Loss: {avg_train_loss:.3f}')
    print(f'\tVal. Accuracy: {val_accuracy*100:.2f}%')
    print(f'\tVal. F1 Score: {val_f1:.3f}')

# 4. Save the final, tuned model
final_model.save_pretrained('../models/bertweet_sentiment_finetuned')
tokenizer.save_pretrained('../models/bertweet_sentiment_finetuned')
print("\nFinal fine-tuned model saved to '../models/bertweet_sentiment_finetuned'")

--- Starting Final Model Training ---
Loaded optimal parameters: {'learning_rate': 1e-05, 'epochs': 3}


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = torch.cuda.amp.GradScaler()
  super().__init__(


Epoch: 01 | Time: 51m 56s
	Train Loss: 0.339
	Val. Accuracy: 87.11%
	Val. F1 Score: 0.871
Epoch: 02 | Time: 165m 57s
	Train Loss: 0.276
	Val. Accuracy: 86.94%
	Val. F1 Score: 0.869
Epoch: 03 | Time: 135m 5s
	Train Loss: 0.220
	Val. Accuracy: 86.42%
	Val. F1 Score: 0.864

Final fine-tuned model saved to '../models/bertweet_sentiment_finetuned'
