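# Axis classification for political speeches: trains a RoBERTa-based model to
# predict the `emotional_intensity` and `political_spectrum` labels.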

In [15]:
import json
import torch
import numpy as np
from torch import nn
from transformers import RobertaTokenizer, RobertaModel
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [16]:
# Build an unlabeled copy of the GPT-labeled speeches by dropping both axis
# labels, so the model has labeled and unlabeled data to work with.

with open('outputs/speeches_114_trimmed_gpt_axis_labels.json', 'r') as f:
    speeches = json.load(f)

# Strip both axis labels from every entry; entries missing a label are left
# untouched because pop() is given a default.
for entry in speeches.values():
    for axis_key in ('emotional_intensity', 'political_spectrum'):
        entry.pop(axis_key, None)

# Persist the stripped entries as the unlabeled set
with open('outputs/speeches_114_trimmed_unlabeled_axis.json', 'w') as f:
    json.dump(speeches, f, indent=4)

In [12]:
class SpeechDataset(Dataset):
    """Dataset of political speeches tokenized for a RoBERTa model."""

    def __init__(self, speeches, labels=None, tokenizer=None, max_length=512):
        """
        Dataset for political speeches

        Args:
            speeches (dict): Dictionary of speech IDs to speech records; the
                text of each speech is read from the record's 'speech' key
            labels (dict): Dictionary of speech IDs to records holding
                'emotional_intensity' and 'political_spectrum' (optional)
            tokenizer: RoBERTa tokenizer; 'roberta-base' is loaded when omitted
            max_length (int): Maximum sequence length
        """
        self.speeches = speeches
        self.speech_ids = list(speeches.keys())
        self.labels = labels
        # Compare against None explicitly so the choice never depends on the
        # truthiness of the tokenizer object itself.
        if tokenizer is None:
            tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of speeches."""
        return len(self.speech_ids)

    def __getitem__(self, idx):
        """
        Tokenize the speech at position ``idx``.

        Returns:
            dict: 'input_ids', 'attention_mask' and 'speech_id', plus
            0-based 'emotional_intensity' and 'political_spectrum' class
            indices when labels were provided
        """
        speech_id = self.speech_ids[idx]
        speech = self.speeches[speech_id]['speech']

        # Tokenize speech
        encoding = self.tokenizer(
            speech,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Drop only the leading batch axis added by return_tensors='pt'; a
        # bare squeeze() would also collapse a length-1 sequence axis.
        item = {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'speech_id': speech_id
        }

        # Add labels if available
        if self.labels is not None:
            record = self.labels[speech_id]
            for axis in ('emotional_intensity', 'political_spectrum'):
                # Labels are shifted to 0-based indexing. Force an integer
                # tensor so a label stored as 3.0 in the JSON still yields a
                # class index rather than a float target.
                # NOTE(review): assumes labels are whole numbers starting at 1 - confirm
                item[axis] = torch.tensor(int(record[axis]) - 1, dtype=torch.long)

        return item

class PoliticalSpeechClassifier(nn.Module):
    """RoBERTa encoder feeding two separate classification heads.

    One head scores emotional intensity and the other political spectrum;
    both read the same pooled encoder output.
    """

    def __init__(self, num_classes=10, dropout_rate=0.1):
        """
        Args:
            num_classes (int): Number of output classes for each head
            dropout_rate (float): Dropout probability inside each head
        """
        super().__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')

        # Keep the embedding layer fixed (intended to limit overfitting);
        # every other encoder parameter stays trainable.
        self.roberta.embeddings.requires_grad_(False)

        encoder_width = self.roberta.config.hidden_size

        def make_head():
            # Two-layer MLP: encoder width -> 256 -> class logits
            return nn.Sequential(
                nn.Linear(encoder_width, 256),
                nn.ReLU(),
                nn.Dropout(dropout_rate),
                nn.Linear(256, num_classes),
            )

        # Same architecture for both heads, separate weights
        self.emotional_classifier = make_head()
        self.political_classifier = make_head()

    def forward(self, input_ids, attention_mask):
        """Return ``(emotional_logits, political_logits)`` for a batch."""
        encoder_out = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # NOTE(review): the run log recorded in this file reports the pooler
        # dense weights as newly initialized for 'roberta-base', so this
        # pooled vector starts from untrained pooler weights.
        features = encoder_out.pooler_output
        return self.emotional_classifier(features), self.political_classifier(features)

def load_data(labeled_file, unlabeled_file=None):
    """Load labeled and unlabeled data from JSON files

    Args:
        labeled_file: Path of the JSON file holding the labeled data
        unlabeled_file: Path of the JSON file holding the unlabeled data
            (optional)

    Returns:
        tuple: ``(labeled_data, unlabeled_data)``; ``unlabeled_data`` is
        ``None`` when no unlabeled file is given
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which can fail on non-ASCII speech text.
    with open(labeled_file, 'r', encoding='utf-8') as f:
        labeled_data = json.load(f)

    unlabeled_data = None
    if unlabeled_file:
        with open(unlabeled_file, 'r', encoding='utf-8') as f:
            unlabeled_data = json.load(f)

    return labeled_data, unlabeled_data

def train_model(model, train_loader, val_loader, num_epochs=5, device='cuda'):
    """Train the model and return the weights with the lowest validation loss

    Each epoch runs one pass over ``train_loader`` followed by one pass over
    ``val_loader``. The loss is the sum of the cross-entropy losses of the
    emotional and political heads.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning ``(emotional_logits, political_logits)``
        train_loader: Iterable of batches with 'input_ids', 'attention_mask',
            'emotional_intensity' and 'political_spectrum' entries
        val_loader: Iterable of batches with the same entries
        num_epochs (int): Number of training epochs
        device: Device the model and batches are moved to

    Returns:
        dict: State dict snapshot taken at the epoch with the lowest mean
        validation loss. If no epoch improved on the initial value (for
        example an empty validation loader or a NaN loss), the final weights
        are returned instead.
    """
    import copy  # local import: only needed for the best-weights snapshot

    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, num_epochs)

    best_val_loss = float('inf')
    best_model = None

    for epoch in range(num_epochs):
        model.train()
        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}')

        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            emotional_labels = batch['emotional_intensity'].to(device)
            political_labels = batch['political_spectrum'].to(device)

            optimizer.zero_grad()
            emotional_logits, political_logits = model(input_ids, attention_mask)

            loss = criterion(emotional_logits, emotional_labels) + criterion(political_logits, political_labels)
            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

            progress_bar.set_postfix({'loss': loss.item()})

        # Validation
        model.eval()
        val_loss_sum = 0.0
        num_val_batches = 0
        correct_emotional = 0
        correct_political = 0
        total = 0

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                emotional_labels = batch['emotional_intensity'].to(device)
                political_labels = batch['political_spectrum'].to(device)

                emotional_logits, political_logits = model(input_ids, attention_mask)

                loss = criterion(emotional_logits, emotional_labels) + criterion(political_logits, political_labels)
                val_loss_sum += loss.item()
                num_val_batches += 1

                emotional_predicted = emotional_logits.argmax(dim=1)
                political_predicted = political_logits.argmax(dim=1)

                correct_emotional += (emotional_predicted == emotional_labels).sum().item()
                correct_political += (political_predicted == political_labels).sum().item()
                total += emotional_labels.size(0)

        # Report NaN rather than dividing by zero when there is nothing to
        # validate on.
        nan = float('nan')
        val_loss = val_loss_sum / num_val_batches if num_val_batches else nan
        emotional_accuracy = 100 * correct_emotional / total if total else nan
        political_accuracy = 100 * correct_political / total if total else nan

        print(f'Validation Loss: {val_loss:.4f}')
        print(f'Emotional Accuracy: {emotional_accuracy:.2f}%')
        print(f'Political Accuracy: {political_accuracy:.2f}%')

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # state_dict() hands back references to the live parameter
            # tensors, so a shallow copy would keep changing as training
            # continues. Deep-copy to freeze this epoch's weights.
            best_model = copy.deepcopy(model.state_dict())

        scheduler.step()

    if best_model is None:
        # No epoch produced an improvement; fall back to the final weights so
        # callers always receive a loadable state dict.
        best_model = copy.deepcopy(model.state_dict())

    return best_model

def predict(model, dataset, device='cuda', batch_size=16):
    """Generate predictions for unlabeled data

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning ``(emotional_logits, political_logits)``
        dataset: Dataset whose items provide 'input_ids', 'attention_mask'
            and 'speech_id'
        device: Device the model and batches are moved to
        batch_size (int): Number of speeches scored per forward pass

    Returns:
        dict: Maps each speech ID to a dict with 1-based
        'emotional_intensity' and 'political_spectrum' predictions
    """
    # Make sure the model lives on the same device as the batches; this is a
    # no-op when the caller has already moved it there.
    model = model.to(device)
    model.eval()
    predictions = {}

    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    with torch.no_grad():
        for batch in tqdm(loader, desc='Generating predictions'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            speech_ids = batch['speech_id']

            emotional_logits, political_logits = model(input_ids, attention_mask)

            # Highest-scoring class per speech, shifted back to 1-based
            # indexing; tolist() moves the values to plain Python ints.
            emotional_preds = (emotional_logits.argmax(dim=1) + 1).tolist()
            political_preds = (political_logits.argmax(dim=1) + 1).tolist()

            for speech_id, emotional, political in zip(speech_ids, emotional_preds, political_preds):
                predictions[speech_id] = {
                    'emotional_intensity': int(emotional),
                    'political_spectrum': int(political)
                }

    return predictions

In [14]:
def main():
    """Train on the labeled speeches and write predictions for the unlabeled ones."""
    # Prefer the GPU when one is available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Labeled set first, unlabeled set second
    labeled_data, unlabeled_data = load_data(
        'outputs/speeches_113_trimmed_gpt_axis_labels.json',
        'outputs/speeches_114_trimmed_unlabeled_axis.json',
    )

    # 80/20 train/validation split over the speech IDs, with a fixed seed
    all_ids = list(labeled_data.keys())
    train_ids, val_ids = train_test_split(all_ids, test_size=0.2, random_state=42)

    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

    def make_dataset(ids):
        # The same records serve as both the speech source and the label source
        subset = {speech_id: labeled_data[speech_id] for speech_id in ids}
        return SpeechDataset(subset, subset, tokenizer)

    train_loader = DataLoader(make_dataset(train_ids), batch_size=16, shuffle=True)
    val_loader = DataLoader(make_dataset(val_ids), batch_size=16)

    # Train, then restore the weights returned by the training routine
    model = PoliticalSpeechClassifier()
    best_model_state = train_model(model, train_loader, val_loader, device=device)
    model.load_state_dict(best_model_state)

    # Nothing left to do without unlabeled speeches
    if not unlabeled_data:
        return

    predict_dataset = SpeechDataset(unlabeled_data, tokenizer=tokenizer)
    predictions = predict(model, predict_dataset, device)

    # Save predictions
    with open('predictions.json', 'w') as f:
        json.dump(predictions, f, indent=2)


if __name__ == "__main__":
    main()

Using device: cpu


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/5: 100%|██████████| 5/5 [02:55<00:00, 35.09s/it, loss=4.44]


Validation Loss: 4.4675
Emotional Accuracy: 15.00%
Political Accuracy: 45.00%


Epoch 2/5: 100%|██████████| 5/5 [02:38<00:00, 31.77s/it, loss=4.26]


Validation Loss: 4.2086
Emotional Accuracy: 30.00%
Political Accuracy: 45.00%


Epoch 3/5: 100%|██████████| 5/5 [02:26<00:00, 29.29s/it, loss=3.98]


Validation Loss: 4.1003
Emotional Accuracy: 30.00%
Political Accuracy: 45.00%


Epoch 4/5: 100%|██████████| 5/5 [02:13<00:00, 26.73s/it, loss=3.94]


Validation Loss: 4.0470
Emotional Accuracy: 25.00%
Political Accuracy: 45.00%


Epoch 5/5: 100%|██████████| 5/5 [02:22<00:00, 28.49s/it, loss=3.67]


Validation Loss: 4.0293
Emotional Accuracy: 25.00%
Political Accuracy: 45.00%


Generating predictions: 100%|██████████| 6/6 [00:17<00:00,  2.95s/it]
