In [None]:
# imports

import json
import torch
import random
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from torch import nn
from transformers import RobertaTokenizer, RobertaModel
from sklearn.metrics import confusion_matrix, mean_absolute_error, f1_score, classification_report
import os
from tqdm import tqdm

In [None]:
class PoliticalSpeechClassifier(nn.Module):
    """RoBERTa encoder feeding a shared MLP trunk and two classification heads.

    The encoder output is mean-pooled over the attended tokens, passed through
    a shared two-stage feed-forward trunk, and then scored independently by an
    "emotional" head and a "political" head. Each head emits ``num_classes``
    logits.

    Args:
        num_classes (int): number of output classes for each head
        dropout_rate (float): dropout probability used in the trunk and heads
    """

    def __init__(self, num_classes=5, dropout_rate=0.2):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')

        # Mark the last eight encoder layers as trainable.
        # NOTE(review): nothing visible here freezes the encoder first, so this
        # loop looks like a no-op (parameters normally start trainable) - confirm
        # against the training script before relying on partial fine-tuning.
        for weight in self.roberta.encoder.layer[-8:].parameters():
            weight.requires_grad = True

        encoder_width = self.roberta.config.hidden_size

        # Shared trunk: encoder width -> 1024 -> 512
        self.shared_features = nn.Sequential(
            *self._dense_stage(encoder_width, 1024, dropout_rate),
            *self._dense_stage(1024, 512, dropout_rate),
        )

        # One head per task, both reading the 512-wide trunk output
        self.emotional_classifier = self._task_head(num_classes, dropout_rate)
        self.political_classifier = self._task_head(num_classes, dropout_rate)

    @staticmethod
    def _dense_stage(in_features, out_features, dropout_rate):
        """Return the layers of one Linear -> LayerNorm -> ReLU -> Dropout stage."""
        return [
            nn.Linear(in_features, out_features),
            nn.LayerNorm(out_features),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
        ]

    @classmethod
    def _task_head(cls, num_classes, dropout_rate):
        """Build a 512 -> 256 -> num_classes classification head."""
        return nn.Sequential(
            *cls._dense_stage(512, 256, dropout_rate),
            nn.Linear(256, num_classes),
        )

    def forward(self, input_ids, attention_mask):
        """Score a batch of tokenized texts on both tasks.

        Args:
            input_ids: token ids passed straight to the RoBERTa encoder
            attention_mask: mask passed to the encoder and reused as the
                pooling weights (presumably 1 for real tokens, 0 for padding)

        Returns:
            tuple: (emotional_logits, political_logits)
        """
        encoded = self.roberta(input_ids=input_ids, attention_mask=attention_mask)

        # Mean pooling over dim 1, weighted by the attention mask, instead of
        # taking a single token's representation.
        hidden_states = encoded.last_hidden_state
        pooling_mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
        masked_total = (hidden_states * pooling_mask).sum(1)
        # Clamp so a mask of all zeros cannot cause a division by zero
        mask_total = pooling_mask.sum(1).clamp(min=1e-9)
        pooled = masked_total / mask_total

        trunk_output = self.shared_features(pooled)

        return (
            self.emotional_classifier(trunk_output),
            self.political_classifier(trunk_output),
        )

def load_model(model_path, device):
    """
    Rebuild the classifier and restore its trained weights for inference

    Args:
        model_path (str): path to the saved checkpoint; it must hold the
            weights under the 'model_state_dict' key
        device (str): 'cpu' or 'cuda'

    Returns:
        model: the classifier, moved to ``device`` and switched to eval mode
    """
    # NOTE(review): torch.load relies on pickle unless weights_only=True is
    # passed, so only open checkpoints that come from a trusted source.
    checkpoint = torch.load(model_path, map_location=device)

    classifier = PoliticalSpeechClassifier()
    classifier.load_state_dict(checkpoint['model_state_dict'])

    # .to() and .eval() both return the module itself
    return classifier.to(device).eval()

def predict_speech(model, tokenizer, speech_text, device):
    """
    Score one speech on both tasks

    Args:
        model: callable returning (emotional_logits, political_logits) for a
            batch of token ids and its attention mask
        tokenizer: the tokenizer used to preprocess the text
        speech_text (str): the text of the speech
        device (str): 'cpu' or 'cuda'

    Returns:
        dict: predicted 1-based class and its softmax probability for each
            task, under the keys 'emotional_intensity', 'emotional_confidence',
            'political_spectrum' and 'political_confidence'
    """
    def _top_class(logits):
        # Winning class (shifted from 0-based index to 1-based score) and
        # the probability assigned to it.
        probabilities = logits.softmax(dim=1)
        best_index = probabilities.argmax(dim=1).item()
        return best_index + 1, probabilities[0][best_index].item()

    # Truncate/pad every speech to exactly 512 tokens
    batch = tokenizer(
        speech_text,
        truncation=True,
        padding='max_length',
        max_length=512,
        return_tensors='pt',
    )
    token_ids = batch['input_ids'].to(device)
    token_mask = batch['attention_mask'].to(device)

    # Inference only: no gradients needed
    with torch.no_grad():
        emotional_logits, political_logits = model(token_ids, token_mask)
        emotional_score, emotional_prob = _top_class(emotional_logits)
        political_score, political_prob = _top_class(political_logits)

    return {
        'emotional_intensity': emotional_score,
        'emotional_confidence': emotional_prob,
        'political_spectrum': political_score,
        'political_confidence': political_prob,
    }

def calculate_metrics(true_values, predicted_values):
    """
    Summarise how well predicted scores match the true scores

    Args:
        true_values (list): list of true values
        predicted_values (list): list of predicted values

    Returns:
        dict: exact-match accuracy ('accuracy'), mean absolute error ('mae'),
            share of predictions at most 1 away from the truth
            ('off_by_one_pct'), micro/macro/weighted F1 ('f1_micro',
            'f1_macro', 'f1_weighted') and the per-class report for labels
            1 to 5 ('class_report')
    """
    def _f1(averaging):
        return f1_score(true_values, predicted_values, average=averaging)

    sample_count = len(true_values)
    pairs = list(zip(true_values, predicted_values))

    # Share of exact matches
    exact_matches = len([1 for actual, guess in pairs if actual == guess])
    accuracy = exact_matches / sample_count

    # Average distance between prediction and truth
    mae = mean_absolute_error(true_values, predicted_values)

    # Share of predictions that miss by no more than one step
    near_matches = len([1 for actual, guess in pairs if abs(actual - guess) <= 1])
    off_by_one_pct = near_matches / sample_count

    f1_micro = _f1('micro')
    f1_macro = _f1('macro')
    f1_weighted = _f1('weighted')

    # Per-class precision/recall/F1/support as a nested dict
    class_report = classification_report(
        true_values,
        predicted_values,
        labels=[1, 2, 3, 4, 5],
        output_dict=True,
    )

    return {
        'accuracy': accuracy,
        'mae': mae,
        'off_by_one_pct': off_by_one_pct,
        'f1_micro': f1_micro,
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted,
        'class_report': class_report,
    }


def plot_confusion_matrix(true_values, predicted_values, title, save_path, labels=None):
    """
    Plot a confusion matrix as a heatmap and save it to disk

    Args:
        true_values (list): list of true values
        predicted_values (list): list of predicted values
        title (str): figure title
        save_path (str): where the image is written
        labels (list, optional): class values to show, in axis order. When
            omitted, the sorted set of values seen in either input is used,
            which is the same set of rows/columns the matrix had before this
            parameter existed.

    Returns:
        None
    """
    if labels is None:
        labels = sorted(set(true_values) | set(predicted_values))
    else:
        labels = list(labels)

    # An empty label list is passed on as "not specified" so empty inputs
    # behave exactly as they do without the labels argument.
    cm = confusion_matrix(true_values, predicted_values, labels=labels or None)

    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        # Tick the axes with the actual class values; the default ticks are
        # positional indices (0, 1, ...) which misreport 1-based scores.
        sns.heatmap(
            cm,
            annot=True,
            fmt='d',
            cmap='Blues',
            xticklabels=labels or 'auto',
            yticklabels=labels or 'auto',
            ax=ax,
        )
        ax.set_title(title)
        ax.set_xlabel('Predicted')
        ax.set_ylabel('True')
        fig.savefig(save_path)
    finally:
        # Release the figure even if drawing or saving fails
        plt.close(fig)

def plot_prediction_distribution(predictions, title, save_path):
    """
    Draw a bar chart of how often each value was predicted and save it

    Args:
        predictions (list): predicted values to count
        title (str): figure title
        save_path (str): where the image is written
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(x=predictions, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Predicted Value')
    ax.set_ylabel('Count')
    fig.savefig(save_path)
    plt.close(fig)

In [None]:
def _print_task_summary(heading, report_heading, task_metrics):
    """Print the headline metrics and the per-score report for one task."""
    print(heading)
    print(f"Accuracy: {task_metrics['accuracy']:.2%}")
    print(f"Mean Absolute Error: {task_metrics['mae']:.2f}")
    print(f"Predictions within ±1: {task_metrics['off_by_one_pct']:.2%}")
    print(f"F1 Score (micro): {task_metrics['f1_micro']:.2f}")
    print(f"F1 Score (macro): {task_metrics['f1_macro']:.2f}")
    print(f"F1 Score (weighted): {task_metrics['f1_weighted']:.2f}")

    print(report_heading)
    for score in range(1, 6):
        # Report keys are the stringified class labels
        per_class = task_metrics['class_report'][str(score)]
        print(f"\nScore {score}:")
        print(f"  Precision: {per_class['precision']:.2f}")
        print(f"  Recall: {per_class['recall']:.2f}")
        print(f"  F1-score: {per_class['f1-score']:.2f}")
        print(f"  Support: {per_class['support']}")


def evaluate_model(json_path, model_path, n_samples=100, save_dir='evaluation_results'):
    """
    Main evaluation function

    Samples speeches from the labelled JSON file, predicts both scores for
    each one, writes metrics and plots into ``save_dir`` and prints a summary.

    Args:
        json_path (str): path to the JSON file containing the speech data.
            Each entry must provide 'speech', 'emotional_intensity' and
            'political_spectrum'.
        model_path (str): path to the trained model
        n_samples (int): number of speeches to sample; if more are requested
            than the file holds, every speech is used
        save_dir (str): directory to save evaluation results

    Returns:
        tuple(dict, list): a dictionary containing the calculated metrics
            (keys 'emotional_intensity' and 'political_spectrum'), and
            a list of dictionaries containing the predicted values

    Raises:
        ValueError: if ``n_samples`` is not positive or the file has no speeches
    """
    if n_samples <= 0:
        raise ValueError(f"n_samples must be positive, got {n_samples}")

    # Local generator: draws the same sample as seeding the global RNG with
    # 42 would, without disturbing random state used elsewhere in the notebook.
    rng = random.Random(42)

    # Create save directory
    os.makedirs(save_dir, exist_ok=True)

    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Load data
    print("Loading data...")
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    all_speech_ids = list(data.keys())
    if not all_speech_ids:
        raise ValueError(f"No speeches found in {json_path}")

    # Randomly sample n speeches, never asking for more than exist
    sample_size = min(n_samples, len(all_speech_ids))
    if sample_size < n_samples:
        print(f"Requested {n_samples} speeches but only {sample_size} are available; using all of them.")
    speech_ids = rng.sample(all_speech_ids, sample_size)
    sampled_data = {k: data[k] for k in speech_ids}

    # Load model and tokenizer
    print("Loading model...")
    model = load_model(model_path, device)
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

    # Make predictions
    print("Making predictions...")
    predictions = []
    true_emotional = []
    true_political = []
    pred_emotional = []
    pred_political = []

    for speech_data in tqdm(sampled_data.values()):
        # Get true values
        true_emotional.append(speech_data['emotional_intensity'])
        true_political.append(speech_data['political_spectrum'])

        # Get predictions
        pred = predict_speech(model, tokenizer, speech_data['speech'], device)
        predictions.append(pred)
        pred_emotional.append(pred['emotional_intensity'])
        pred_political.append(pred['political_spectrum'])

    # Calculate metrics
    print("\nCalculating metrics...")
    emotional_metrics = calculate_metrics(true_emotional, pred_emotional)
    political_metrics = calculate_metrics(true_political, pred_political)

    # Save metrics. This dict is also the first return value, so nothing
    # below may rebind the name.
    metrics = {
        'emotional_intensity': emotional_metrics,
        'political_spectrum': political_metrics
    }

    with open(os.path.join(save_dir, "evaluation_metrics.json"), 'w', encoding='utf-8') as f:
        json.dump(metrics, f, indent=4)

    # Plot confusion matrices
    plot_confusion_matrix(
        true_emotional,
        pred_emotional,
        "Emotional Intensity Confusion Matrix",
        os.path.join(save_dir, "emotional_confusion_matrix.png")
    )

    plot_confusion_matrix(
        true_political,
        pred_political,
        "Political Spectrum Confusion Matrix",
        os.path.join(save_dir, "political_confusion_matrix.png")
    )

    # Plot prediction distributions
    plot_prediction_distribution(
        pred_emotional,
        "Distribution of Emotional Intensity Predictions",
        os.path.join(save_dir, "emotional_distribution.png")
    )

    plot_prediction_distribution(
        pred_political,
        "Distribution of Political Spectrum Predictions",
        os.path.join(save_dir, "political_distribution.png")
    )

    # Print summary
    print("\nEvaluation Summary:")
    _print_task_summary(
        "\nEmotional Intensity Metrics:",
        "\nDetailed Classification Report:",
        emotional_metrics,
    )
    _print_task_summary(
        "\nPolitical Spectrum Metrics:",
        "\nDetailed Political Classification Report:",
        political_metrics,
    )

    return metrics, predictions

In [None]:
if __name__ == "__main__":
    # Run configuration: labelled speeches, trained checkpoint, sample size
    JSON_PATH = "speeches_111_gpt_axis_labels_copy.json"
    MODEL_PATH = "../large-training-output/model_artifacts_20241202_142615/model.pt"
    N_SAMPLES = 500  # how many speeches to draw for this evaluation run

    # Results are written to the default 'evaluation_results' directory
    metrics, predictions = evaluate_model(JSON_PATH, MODEL_PATH, n_samples=N_SAMPLES)

Using device: cpu
Loading data...
Loading model...


  model_state = torch.load(model_path, map_location=device)
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Making predictions...


100%|██████████| 500/500 [02:02<00:00,  4.09it/s]



Calculating metrics...

Evaluation Summary:

Emotional Intensity Metrics:
Accuracy: 82.80%
Mean Absolute Error: 0.18
Predictions within ±1: 99.60%
F1 Score (micro): 0.83
F1 Score (macro): 0.85
F1 Score (weighted): 0.83

Detailed Classification Report:

Score 1:
  Precision: 0.90
  Recall: 0.95
  F1-score: 0.92
  Support: 119.0

Score 2:
  Precision: 0.73
  Recall: 0.67
  F1-score: 0.70
  Support: 95.0

Score 3:
  Precision: 0.85
  Recall: 0.75
  F1-score: 0.80
  Support: 172.0

Score 4:
  Precision: 0.79
  Recall: 0.95
  F1-score: 0.86
  Support: 101.0

Score 5:
  Precision: 1.00
  Recall: 0.92
  F1-score: 0.96
  Support: 13.0

Political Spectrum Metrics:
Accuracy: 85.40%
Mean Absolute Error: 0.15
Predictions within ±1: 99.20%
F1 Score (micro): 0.85
F1 Score (macro): 0.84
F1 Score (weighted): 0.86

Detailed Political Classification Report:

Score 1:
  Precision: 0.71
  Recall: 0.91
  F1-score: 0.80
  Support: 11.0

Score 2:
  Precision: 0.71
  Recall: 0.82
  F1-score: 0.76
  Support: 