# Exploring the DistilBERT Sentiment Analysis Model

This notebook allows you to explore the trained DistilBERT model, load it from a configuration file, and test it with your own text input using an interactive widget.

In [1]:
import os
import yaml
import torch
import numpy as np
import pickle
import ipywidgets as widgets
from IPython.display import display, HTML
from src.data import YelpDataProcessor
from src.models import DistilBERTSentimentModel
import logging

# Configure logging to be less verbose in the notebook
logging.basicConfig(level=logging.WARNING)

## 1. Load Configuration and Models

First, let's load the model configuration and trained model.

In [2]:
# Select which configuration to use.
# Each option is a (label shown in the dropdown, YAML path stored as the widget value) pair;
# the tuned config is preselected. Later cells read the choice via `config_selector.value`.
config_selector = widgets.Dropdown(
    options=[
        ('Default DistilBERT Config', 'model_configs/distilbert_default.yaml'),
        ('Tuned DistilBERT Config', 'model_configs/distilbert_tuning_v1.yaml')
    ],
    value='model_configs/distilbert_tuning_v1.yaml',
    description='Config:',
    style={'description_width': 'initial'}
)

# Render the dropdown as this cell's output
display(config_selector)

Dropdown(description='Config:', index=1, options=(('Default DistilBERT Config', 'model_configs/distilbert_defa…

In [3]:
def load_config(config_path):
    """Load a configuration from a YAML file.

    Args:
        config_path: Path to the YAML configuration file.

    Returns:
        The parsed YAML content. An empty file, which ``yaml.safe_load``
        parses as ``None``, yields an empty dict so that callers can use
        ``.get()`` / ``.items()`` without a ``None`` check.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Read explicitly as UTF-8 so parsing does not depend on the platform's locale encoding.
    with open(config_path, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)
    return {} if config is None else config

def load_model_and_processor(config_path):
    """Build the DistilBERT sentiment model and its data processor from a YAML config.

    Reads the ``data``, ``model`` and ``hyperparameter_tuning`` sections of the
    config. A ``dropout`` value under ``hyperparameter_tuning.best_params``
    overrides the one in ``model``. Weights are loaded from
    ``models/<name>/model_best_f1.pt`` when that file exists; otherwise the
    freshly constructed model is returned and a notice is printed.

    Args:
        config_path: Path to the YAML configuration file.

    Returns:
        tuple: ``(model, data_processor)``. The model has been moved to the
        selected device (CUDA when available, else CPU) and put in evaluation mode.
    """
    # Load configuration
    print(f"Loading configuration from {config_path}")
    config = load_config(config_path) or {}

    # A section that is present but empty in YAML (e.g. `model:` with no body) parses
    # as None, so `.get(key, {})` alone would hand back None and the next `.get()`
    # would raise AttributeError. Fall back to an empty dict in that case as well.
    data_config = config.get('data') or {}
    model_config = config.get('model') or {}
    hp_tuning_config = config.get('hyperparameter_tuning') or {}
    best_params = hp_tuning_config.get('best_params') or {}

    # Apply tuned parameters on top of the base model config (only dropout is used here)
    effective_model_config = dict(model_config)
    if 'dropout' in best_params:
        effective_model_config['dropout'] = best_params['dropout']

    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Initialize data processor with the tokenizer matching the pretrained model
    pretrained_model = effective_model_config.get('pretrained_model', 'distilbert-base-cased')
    data_processor = YelpDataProcessor(
        data_path=data_config.get('path'),
        max_length=data_config.get('max_length', 128),
        batch_size=data_config.get('batch_size', 32),
        tokenizer_name=pretrained_model
    )

    # Load label encoder
    label_encoder_path = 'models/label_encoder.pkl'
    if os.path.exists(label_encoder_path):
        # NOTE(security): pickle.load can execute arbitrary code while unpickling.
        # Only load an encoder file that was produced by a trusted training run.
        with open(label_encoder_path, 'rb') as f:
            data_processor.label_encoder = pickle.load(f)
    else:
        print("Label encoder not found. Loading sample data to create one.")
        df = data_processor.load_data()
        data_processor.prepare_data_bert(df)

    # Determine model name/directory (an empty `name:` falls back to the default too)
    model_name = config.get('name') or 'distilbert_sentiment_model'
    model_dir = f"models/{model_name}"

    # Build model with configuration
    model = DistilBERTSentimentModel(
        num_classes=len(data_processor.label_encoder.classes_),
        dropout=effective_model_config.get('dropout', 0.1),
        pretrained_model=pretrained_model
    )

    # Load model weights if available
    best_model_path = os.path.join(model_dir, 'model_best_f1.pt')
    if os.path.exists(best_model_path):
        # weights_only=True restricts unpickling to tensors and plain containers
        model.load_state_dict(torch.load(best_model_path, map_location=device, weights_only=True))
        print(f"Loaded trained model from {best_model_path}")
    else:
        print(f"Trained model not found at {best_model_path}. Using untrained model.")

    model.to(device)
    model.eval()  # Set model to evaluation mode

    return model, data_processor

In [4]:
# Load the model and processor for whichever config is currently selected in the dropdown.
# `model` and `data_processor` are notebook-level globals: later cells read them, and the
# "Change Config" button handler rebinds them.
model, data_processor = load_model_and_processor(config_selector.value)

Loading configuration from model_configs/distilbert_tuning_v1.yaml
Using device: cuda


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded trained model from models/distilbert_sentiment_model/model_best_f1.pt


## 2. Model Architecture and Configuration

Let's examine our model architecture and the configuration that was used.

In [5]:
# Show the module tree of the loaded model
print("DistilBERT Sentiment Analysis Model Architecture:")
print(model)

# Re-read the selected YAML file and list its contents section by section
config = load_config(config_selector.value)
print("\nConfiguration:")
for section, params in config.items():
    print(f"\n{section.upper()}:")
    if not isinstance(params, dict):
        # Scalar top-level entry: print the value itself under the section header
        print(f"  {params}")
        continue
    for param, value in params.items():
        print(f"  {param}: {value}")

DistilBERT Sentiment Analysis Model Architecture:
DistilBERTSentimentModel(
  (bert): DistilBertForSequenceClassification(
    (distilbert): DistilBertModel(
      (embeddings): Embeddings(
        (word_embeddings): Embedding(28996, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (transformer): Transformer(
        (layer): ModuleList(
          (0-5): 6 x TransformerBlock(
            (attention): DistilBertSdpaAttention(
              (dropout): Dropout(p=0.1, inplace=False)
              (q_lin): Linear(in_features=768, out_features=768, bias=True)
              (k_lin): Linear(in_features=768, out_features=768, bias=True)
              (v_lin): Linear(in_features=768, out_features=768, bias=True)
              (out_lin): Linear(in_features=768, out_features=768, bias=True)
            )
            (sa_layer_norm): LayerNo

## 3. Interactive Text Classification

Now let's create a widget to input your own text and see the sentiment prediction.

In [6]:
def predict_sentiment(text, model, data_processor, device='cuda' if torch.cuda.is_available() else 'cpu'):
    """Classify a single piece of text with the sentiment model.

    Args:
        text: Raw input text.
        model: Callable model taking ``input_ids`` and ``attention_mask`` and
            returning per-class scores with the batch on dimension 0.
        data_processor: Object providing ``preprocess_text``, ``bert_tokenizer``,
            ``max_length`` and ``label_encoder.classes_``.
        device: Device the input tensors are moved to before the forward pass.

    Returns:
        tuple: ``(predicted_label, probs_dict, processed_text)`` where
        ``probs_dict`` maps every class name to its softmax probability.
    """
    # Inference only: make sure the model is in evaluation mode
    model.eval()

    # Case is preserved during preprocessing (lower=False)
    cleaned = data_processor.preprocess_text(text, lower=False)

    # Tokenize to a fixed-length, single-row batch of PyTorch tensors
    batch = data_processor.bert_tokenizer(
        cleaned,
        truncation=True,
        padding='max_length',
        max_length=data_processor.max_length,
        return_tensors='pt',
    )
    ids = batch['input_ids'].to(device)
    mask = batch['attention_mask'].to(device)

    # Forward pass without tracking gradients; take the only row of the batch
    with torch.no_grad():
        scores = model(input_ids=ids, attention_mask=mask)
        class_probs = torch.nn.functional.softmax(scores, dim=1)[0]
        best_index = torch.argmax(class_probs).item()

    # Translate class indices back to label names
    class_names = data_processor.label_encoder.classes_
    probs_dict = {}
    for index, class_prob in enumerate(class_probs):
        probs_dict[class_names[index]] = class_prob.item()

    return class_names[best_index], probs_dict, cleaned

In [7]:
# Create UI elements
# Free-text box for the review to classify, pre-filled with an example
text_input = widgets.Textarea(
    value='This restaurant was amazing! The food was delicious and the service was excellent.',
    placeholder='Enter your text here...',
    description='Review:',
    layout=widgets.Layout(width='100%', height='100px')
)

# Button that runs a prediction on the current contents of `text_input`
run_button = widgets.Button(
    description='Analyze Sentiment',
    button_style='primary',
    tooltip='Click to analyze the sentiment of the text'
)

# Button that reloads the model from the config currently selected in `config_selector`
config_change_button = widgets.Button(
    description='Change Config',
    button_style='info',
    tooltip='Click to load model from the selected config'
)

# Output widget that both button handlers write into
output_area = widgets.Output()

# Text styling for output
def style_prediction(label, probabilities):
    """Build an HTML snippet showing the predicted label and a bar per class.

    Args:
        label: Predicted class name; shown upper-cased in the heading.
        probabilities: Mapping of class name to probability in ``[0, 1]``.

    Returns:
        str: HTML with a coloured heading followed by one row per class,
        ordered from most to least probable. Classes other than
        positive/neutral/negative are drawn in blue.
    """
    palette = {
        'positive': 'green',
        'neutral': 'orange',
        'negative': 'red'
    }

    parts = [
        f"<h3>Prediction: <span style='color:{palette.get(label, 'blue')}'>{label.upper()}</span></h3>",
        "<h4>Confidence Scores:</h4>",
    ]

    ranked = sorted(probabilities.items(), key=lambda item: item[1], reverse=True)
    for class_name, prob in ranked:
        percentage = prob * 100
        bar_color = palette.get(class_name, 'blue')
        parts.append("<div style='margin-bottom:5px;'>")
        parts.append(f"<span style='display:inline-block; width:100px;'>{class_name}:</span>")
        # Bar width is the raw percentage; the text next to it is rounded to two decimals
        parts.append(
            f"<div style='display:inline-block; width:{percentage}%; background-color:{bar_color}; height:20px;'></div>"
        )
        parts.append(f"<span style='margin-left:10px;'>{percentage:.2f}%</span>")
        parts.append("</div>")

    return "".join(parts)

# Define button click handlers
def on_run_button_clicked(b):
    """Click handler: classify the text in `text_input` and show the result.

    Everything printed or displayed here is captured by `output_area`, which is
    cleared first so only the latest result is visible.

    Args:
        b: The clicked button (unused; the cell also calls this with ``None``).
    """
    with output_area:
        output_area.clear_output()

        # Guard: nothing to run without a loaded model and processor
        if model is None or data_processor is None:
            print("Error: Model or data processor not loaded properly.")
            return

        # Guard: ignore empty / whitespace-only input
        review_text = text_input.value
        if review_text.strip() == '':
            print("Please enter some text to analyze.")
            return

        label, scores, cleaned = predict_sentiment(review_text, model, data_processor)

        print(f"Original text: {review_text}")
        print(f"Processed text: {cleaned}")
        display(HTML(style_prediction(label, scores)))

def on_config_change_clicked(b):
    """Click handler: reload the model and processor from the selected config.

    Rebinds the notebook-level `model` and `data_processor` so that later
    predictions use the newly loaded pair. Status and error messages are
    written to `output_area`.

    Args:
        b: The clicked button (unused).
    """
    global model, data_processor
    with output_area:
        output_area.clear_output()
        try:
            model, data_processor = load_model_and_processor(config_selector.value)
            print("Model and data processor loaded successfully!")
        except Exception as e:
            # Report the failure in the output widget; the globals keep their
            # previous values because the tuple assignment above did not complete.
            print(f"Error loading model: {e}")

# Attach click handlers
run_button.on_click(on_run_button_clicked)
config_change_button.on_click(on_config_change_clicked)

# Display UI: text box on top, the two buttons side by side, results underneath
display(text_input)
display(widgets.HBox([run_button, config_change_button]))
display(output_area)

# Initialize prediction: run once on the pre-filled example so the output area is not empty
on_run_button_clicked(None)

Textarea(value='This restaurant was amazing! The food was delicious and the service was excellent.', descripti…

HBox(children=(Button(button_style='primary', description='Analyze Sentiment', style=ButtonStyle(), tooltip='C…

Output()

## 4. Exploring Model Predictions on Sample Reviews

Let's look at some sample reviews and how the model predicts them.

In [8]:
# Hand-written reviews to run through the model one by one
sample_reviews = [
    "The food was absolutely terrible. I'll never come back to this restaurant again.",
    "The service was okay, but the food was mediocre. Not worth the price.",
    "It was an average experience. Nothing special but not bad either.",
    "The staff was friendly and the atmosphere was nice, but the food was just decent.",
    "Amazing experience! The chef prepared the best meal I've had in years."
]

for i, review in enumerate(sample_reviews):
    print(f"\nSample {i+1}: {review}")
    predicted_label, probabilities, processed_text = predict_sentiment(review, model, data_processor)
    print(f"Prediction: {predicted_label}")
    print("Probabilities:")
    # List classes from most to least probable
    ranked = sorted(probabilities.items(), key=lambda pair: pair[1], reverse=True)
    for label, prob in ranked:
        print(f"  {label}: {prob:.4f} ({prob*100:.2f}%)")


Sample 1: The food was absolutely terrible. I'll never come back to this restaurant again.
Prediction: positive
Probabilities:
  positive: 1.0000 (100.00%)
  negative: 0.0000 (0.00%)
  neutral: 0.0000 (0.00%)

Sample 2: The service was okay, but the food was mediocre. Not worth the price.
Prediction: positive
Probabilities:
  positive: 1.0000 (100.00%)
  negative: 0.0000 (0.00%)
  neutral: 0.0000 (0.00%)

Sample 3: It was an average experience. Nothing special but not bad either.
Prediction: positive
Probabilities:
  positive: 1.0000 (100.00%)
  negative: 0.0000 (0.00%)
  neutral: 0.0000 (0.00%)

Sample 4: The staff was friendly and the atmosphere was nice, but the food was just decent.
Prediction: positive
Probabilities:
  positive: 1.0000 (100.00%)
  negative: 0.0000 (0.00%)
  neutral: 0.0000 (0.00%)

Sample 5: Amazing experience! The chef prepared the best meal I've had in years.
Prediction: positive
Probabilities:
  positive: 1.0000 (100.00%)
  negative: 0.0000 (0.00%)
  neutral: 

## 5. Exploring Token Importance

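For DistilBERT, we can estimate token importance with a simple gradient-based saliency score: the L2 norm of the gradient of the predicted-class score with respect to each token's embedding. (This is plain gradient saliency, not Integrated Gradients.)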

In [9]:
def get_token_attributions(text, model, data_processor, device='cuda' if torch.cuda.is_available() else 'cpu'):
    """Score each input token by the gradient norm of the predicted-class score.

    The score for a token is the L2 norm of the gradient of the predicted
    class's output with respect to that token's word embedding, taken from the
    same forward pass that produced the prediction.

    Args:
        text: Raw input text.
        model: Model exposing ``bert.distilbert.embeddings.word_embeddings`` and
            callable with ``input_ids`` / ``attention_mask``.
        data_processor: Object providing ``preprocess_text``, ``bert_tokenizer``,
            ``max_length`` and ``label_encoder.classes_``.
        device: Device the input tensors are moved to.

    Returns:
        tuple: ``(tokens, importance, predicted_label)``. ``tokens`` and
        ``importance`` are parallel lists with ``[CLS]``, ``[SEP]``, ``[PAD]``
        and masked-out positions removed. If no gradient could be obtained, a
        warning is printed and every importance is ``1.0``.

    The model is left in evaluation mode, and no gradients are accumulated on
    its parameters.
    """
    # Stay in eval mode: train mode would enable dropout and make both the prediction
    # and the gradients random. Autograd works in eval mode as long as grad is enabled.
    model.eval()

    # Preprocess text
    processed_text = data_processor.preprocess_text(text, lower=False)

    # Tokenize
    encoding = data_processor.bert_tokenizer(
        processed_text,
        truncation=True,
        padding='max_length',
        max_length=data_processor.max_length,
        return_tensors='pt'
    )

    # Get tokens for display
    tokens = data_processor.bert_tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])

    # Move to device
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    embedding_layer = model.bert.distilbert.embeddings.word_embeddings
    captured = []

    def _capture_embeddings(module, inputs, output):
        # The gradient has to be taken w.r.t. the very tensor used in this forward
        # pass; re-running the embedding lookup afterwards yields a new tensor with
        # no gradient attached. If the embedding weights are frozen the output does
        # not track gradients yet, so switch tracking on for this tensor only.
        if not output.requires_grad:
            output.requires_grad_(True)
        captured.append(output)

    token_importance = None
    hook_handle = embedding_layer.register_forward_hook(_capture_embeddings)
    try:
        with torch.enable_grad():
            output = model(input_ids=input_ids, attention_mask=attention_mask)
            predicted_class = torch.argmax(output, dim=1).item()

            if captured and output.requires_grad:
                # autograd.grad returns the gradient directly instead of accumulating
                # into .grad, so the model's parameters are left untouched.
                embedding_grad = torch.autograd.grad(
                    output[0, predicted_class], captured[0], allow_unused=True
                )[0]
                if embedding_grad is not None:
                    # L2 norm over the embedding dimension, first (only) batch row
                    token_importance = torch.norm(embedding_grad, dim=2)[0].detach().cpu().numpy()
    finally:
        # Always detach the hook so later forward passes are unaffected
        hook_handle.remove()

    predicted_label = data_processor.label_encoder.classes_[predicted_class]

    if token_importance is None:
        print("Warning: Could not compute gradients for token importance")
        token_importance = np.ones(len(tokens))

    # Only include tokens with non-zero attention mask
    mask = attention_mask[0].detach().cpu().numpy()

    # Filter special tokens and padding
    special_tokens = ['[CLS]', '[SEP]', '[PAD]']
    filtered_tokens = []
    filtered_importance = []

    for token, imp, m in zip(tokens, token_importance, mask):
        if m > 0 and token not in special_tokens:
            filtered_tokens.append(token)
            filtered_importance.append(imp)

    return filtered_tokens, filtered_importance, predicted_label

In [10]:
# Create a function to visualize token importance
def visualize_token_importance(text):
    """Display the tokens of `text` as HTML, shaded by relative importance.

    Importance values from `get_token_attributions` are divided by their
    maximum, then mapped to a white-to-red background (the most important
    token is pure red). Uses the notebook-level `model` and `data_processor`.

    Args:
        text: Raw input text to analyse.

    Returns:
        None. The visualization is shown with ``display``.
    """
    # Imported locally because the notebook's import cell does not import `html`
    from html import escape

    tokens, importance, predicted_label = get_token_attributions(text, model, data_processor)

    if tokens is None or importance is None:
        print("Couldn't extract token importance.")
        return

    # Normalize importance for visualization; all-zero or empty input maps to zeros
    max_importance = max(importance) if len(importance) > 0 else 0
    if max_importance > 0:
        norm_importance = [imp / max_importance for imp in importance]
    else:
        norm_importance = [0.0] * len(importance)

    # Create HTML visualization. Tokens come from user-entered text, so they are
    # escaped: a raw '<' or '&' would otherwise be parsed as markup and break the output.
    markup = f"<h3>Token Importance for: <span style='color:blue'>{escape(predicted_label.upper())}</span></h3>"
    markup += "<div style='line-height: 2.5; font-family: monospace; font-size: 16px;'>"

    for token, weight in zip(tokens, norm_importance):
        # Map weight to color intensity: weight 1 -> rgb(255, 0, 0), weight 0 -> white
        color_intensity = int(255 * (1 - weight))
        background_color = f"rgb(255, {color_intensity}, {color_intensity})"

        markup += f"<span style='background-color: {background_color}; padding: 3px; margin: 2px; border-radius: 3px;'>{escape(token)}</span>"

    markup += "</div>"
    display(HTML(markup))

# Create UI for token importance visualization
# Free-text box for the sentence to analyse, pre-filled with a mixed-sentiment example
importance_text = widgets.Textarea(
    value='The food was delicious but the service was terrible.',
    placeholder='Enter text to visualize token importance...',
    description='Text:',
    layout=widgets.Layout(width='100%', height='100px')
)

# Button that triggers the visualization for the current contents of `importance_text`
importance_button = widgets.Button(
    description='Visualize Tokens',
    button_style='success',
    tooltip='Click to visualize token importance'
)

# Output widget the visualization is rendered into
importance_output = widgets.Output()

def on_importance_button_clicked(b):
    """Click handler: redraw the token-importance view for the current text.

    Output is captured by `importance_output`, which is cleared first so only
    the latest visualization is shown.

    Args:
        b: The clicked button (unused; the cell also calls this with ``None``).
    """
    with importance_output:
        importance_output.clear_output()
        visualize_token_importance(importance_text.value)

# Attach the click handler to the button
importance_button.on_click(on_importance_button_clicked)

# Display importance UI: heading, text box, button, then the output area
print("\nVisualize which tokens the model finds important:")
display(importance_text)
display(importance_button)
display(importance_output)

# Initialize visualization: run once on the pre-filled example so the output area is not empty
on_importance_button_clicked(None)


Visualize which tokens the model finds important:


Textarea(value='The food was delicious but the service was terrible.', description='Text:', layout=Layout(heig…

Button(button_style='success', description='Visualize Tokens', style=ButtonStyle(), tooltip='Click to visualiz…

Output()

## 6. Comparing DistilBERT with LSTM Results

You can manually compare the predictions from the DistilBERT model with the LSTM model by running similar examples in both notebooks.

In [11]:
# Reviews with ambiguous, mixed or implied sentiment, chosen to expose differences between models
challenging_reviews = [
    "The food wasn't bad, but I wouldn't say it was good either.", # Ambiguous sentiment
    "Great atmosphere, but terrible food and rude service.", # Mixed sentiment
    "Well, this was an interesting experience to say the least.", # Subtle implied sentiment
    "This restaurant is just as good as any other chain restaurant nearby.", # Comparative but neutral
    "I've had better, but I've also had much worse." # Balanced perspective
]

for i, review in enumerate(challenging_reviews):
    print(f"\nChallenging Example {i+1}: {review}")
    predicted_label, probabilities, processed_text = predict_sentiment(review, model, data_processor)
    print(f"DistilBERT Prediction: {predicted_label}")
    print("Probabilities:")
    # List classes from most to least probable
    ranked = sorted(probabilities.items(), key=lambda pair: pair[1], reverse=True)
    for label, prob in ranked:
        print(f"  {label}: {prob:.4f} ({prob*100:.2f}%)")


Challenging Example 1: The food wasn't bad, but I wouldn't say it was good either.
DistilBERT Prediction: positive
Probabilities:
  positive: 1.0000 (100.00%)
  negative: 0.0000 (0.00%)
  neutral: 0.0000 (0.00%)

Challenging Example 2: Great atmosphere, but terrible food and rude service.
DistilBERT Prediction: positive
Probabilities:
  positive: 1.0000 (100.00%)
  negative: 0.0000 (0.00%)
  neutral: 0.0000 (0.00%)

Challenging Example 3: Well, this was an interesting experience to say the least.
DistilBERT Prediction: positive
Probabilities:
  positive: 1.0000 (100.00%)
  negative: 0.0000 (0.00%)
  neutral: 0.0000 (0.00%)

Challenging Example 4: This restaurant is just as good as any other chain restaurant nearby.
DistilBERT Prediction: positive
Probabilities:
  positive: 1.0000 (100.00%)
  negative: 0.0000 (0.00%)
  neutral: 0.0000 (0.00%)

Challenging Example 5: I've had better, but I've also had much worse.
DistilBERT Prediction: positive
Probabilities:
  positive: 1.0000 (100.00%