# File for doing our axis classification

In [17]:
import json
import torch
import numpy as np
from torch import nn
from transformers import RobertaTokenizer, RobertaModel
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [18]:
# create some unlabeled data by removing axis labels from gpt output, so model has labeled and unlabeled data

with open('outputs/speeches_114_trimmed_gpt_axis_labels.json', 'r') as f:
    speeches = json.load(f)

# Remove the specified keys from each speech entry
for speech_key in speeches.keys():
    speeches[speech_key].pop('emotional_intensity', None)
    speeches[speech_key].pop('political_spectrum', None)

# Write the updated data to the output file
with open('outputs/speeches_114_trimmed_unlabeled_axis.json', 'w') as f:
    json.dump(speeches, f, indent=4)

In [20]:
class SpeechDataset(Dataset):
    def __init__(self, speeches, labels=None, tokenizer=None, max_length=512):
        """
        Dataset for political speeches
        
        Args:
            speeches (dict): Dictionary of speech IDs to speech text
            labels (dict): Dictionary of speech IDs to labels (optional)
            tokenizer: RoBERTa tokenizer
            max_length (int): Maximum sequence length
        """
        self.speeches = speeches
        self.speech_ids = list(speeches.keys())
        self.labels = labels
        self.tokenizer = tokenizer or RobertaTokenizer.from_pretrained('roberta-base')
        self.max_length = max_length

    def __len__(self):
        return len(self.speech_ids)

    def __getitem__(self, idx):
        speech_id = self.speech_ids[idx]
        speech = self.speeches[speech_id]['speech']

        # Tokenize speech
        encoding = self.tokenizer(
            speech,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        item = {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'speech_id': speech_id
        }

        # Add labels if available
        if self.labels is not None:
            item['emotional_intensity'] = torch.tensor(self.labels[speech_id]['emotional_intensity'] - 1)  # 0-based indexing
            item['political_spectrum'] = torch.tensor(self.labels[speech_id]['political_spectrum'] - 1)

        return item

class PoliticalSpeechClassifier(nn.Module):
    def __init__(self, num_classes=5, dropout_rate=0.2):  # Increased dropout
        super().__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        
        # Unfreeze more layers
        for param in self.roberta.encoder.layer[-2:].parameters():
            param.requires_grad = True
            
        hidden_size = self.roberta.config.hidden_size
        self.shared_layer = nn.Sequential(
            nn.Linear(hidden_size, 512),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(512, 256)
        )
        
        self.emotional_classifier = nn.Sequential(
            nn.Dropout(dropout_rate),
            nn.Linear(256, num_classes)
        )
        
        self.political_classifier = nn.Sequential(
            nn.Dropout(dropout_rate),
            nn.Linear(256, num_classes)
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        
        shared_features = self.shared_layer(pooled_output)
        emotional_logits = self.emotional_classifier(shared_features)
        political_logits = self.political_classifier(shared_features)
        
        return emotional_logits, political_logits

def load_data(labeled_file, unlabeled_file=None):
    """Load labeled and unlabeled data from JSON files"""
    with open(labeled_file, 'r') as f:
        labeled_data = json.load(f)
    
    unlabeled_data = None
    if unlabeled_file:
        with open(unlabeled_file, 'r') as f:
            unlabeled_data = json.load(f)
    
    return labeled_data, unlabeled_data

def train_model(model, train_loader, val_loader, num_epochs=10, device='cuda'):
    """Train the model"""
    model = model.to(device)
    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)  # Add label smoothing
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, weight_decay=0.1)

    # Warmup + cosine decay
    from transformers import get_cosine_schedule_with_warmup
    num_training_steps = len(train_loader) * num_epochs
    num_warmup_steps = num_training_steps // 10
    scheduler = get_cosine_schedule_with_warmup(
        optimizer, 
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )
    
    best_val_loss = float('inf')
    best_model = None
    
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}')
        
        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            emotional_labels = batch['emotional_intensity'].to(device)
            political_labels = batch['political_spectrum'].to(device)
            
            optimizer.zero_grad()
            emotional_logits, political_logits = model(input_ids, attention_mask)
            
            loss = criterion(emotional_logits, emotional_labels) + criterion(political_logits, political_labels)
            loss.backward()
            
            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            
            optimizer.step()
            total_loss += loss.item()
            
            progress_bar.set_postfix({'loss': loss.item()})
        
        # Validation
        model.eval()
        val_loss = 0
        correct_emotional = 0
        correct_political = 0
        total = 0
        
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                emotional_labels = batch['emotional_intensity'].to(device)
                political_labels = batch['political_spectrum'].to(device)
                
                emotional_logits, political_logits = model(input_ids, attention_mask)
                
                loss = criterion(emotional_logits, emotional_labels) + criterion(political_logits, political_labels)
                val_loss += loss.item()
                
                _, emotional_predicted = torch.max(emotional_logits, 1)
                _, political_predicted = torch.max(political_logits, 1)
                
                correct_emotional += (emotional_predicted == emotional_labels).sum().item()
                correct_political += (political_predicted == political_labels).sum().item()
                total += emotional_labels.size(0)
        
        val_loss = val_loss / len(val_loader)
        emotional_accuracy = 100 * correct_emotional / total
        political_accuracy = 100 * correct_political / total
        
        print(f'Validation Loss: {val_loss:.4f}')
        print(f'Emotional Accuracy: {emotional_accuracy:.2f}%')
        print(f'Political Accuracy: {political_accuracy:.2f}%')
        
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model = model.state_dict().copy()
        
        scheduler.step()
    
    return best_model

def predict(model, dataset, device='cuda'):
    """Generate predictions for unlabeled data"""
    model.eval()
    predictions = {}
    
    loader = DataLoader(dataset, batch_size=16, shuffle=False)
    
    with torch.no_grad():
        for batch in tqdm(loader, desc='Generating predictions'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            speech_ids = batch['speech_id']
            
            emotional_logits, political_logits = model(input_ids, attention_mask)
            
            emotional_preds = torch.max(emotional_logits, 1)[1]
            political_preds = torch.max(political_logits, 1)[1]
            
            # Convert to 1-based indexing and move to CPU
            emotional_preds = emotional_preds.cpu().numpy() + 1
            political_preds = political_preds.cpu().numpy() + 1
            
            for speech_id, emotional, political in zip(speech_ids, emotional_preds, political_preds):
                predictions[speech_id] = {
                    'emotional_intensity': int(emotional),
                    'political_spectrum': int(political)
                }
    
    return predictions

In [21]:
def main():
    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Load data (labeled, unlabeled)
    labeled_data, unlabeled_data = load_data('outputs/speeches_113_trimmed_gpt_axis_labels.json', 'outputs/speeches_114_trimmed_unlabeled_axis.json')
    
    # Split labeled data into train/val
    train_ids, val_ids = train_test_split(list(labeled_data.keys()), test_size=0.2, random_state=42)
    
    train_data = {k: labeled_data[k] for k in train_ids}
    val_data = {k: labeled_data[k] for k in val_ids}
    
    # Create datasets
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    
    train_dataset = SpeechDataset(train_data, train_data, tokenizer)
    val_dataset = SpeechDataset(val_data, val_data, tokenizer)
    
    if unlabeled_data:
        predict_dataset = SpeechDataset(unlabeled_data, tokenizer=tokenizer)
    
    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16)
    
    # Initialize model
    model = PoliticalSpeechClassifier()
    
    # Train model
    best_model_state = train_model(model, train_loader, val_loader, device=device)
    
    # Load best model
    model.load_state_dict(best_model_state)
    
    # Generate predictions for unlabeled data
    if unlabeled_data:
        predictions = predict(model, predict_dataset, device)
        
        # Save predictions
        with open('predictions.json', 'w') as f:
            json.dump(predictions, f, indent=2)

In [22]:
if __name__ == "__main__":
    main()

Using device: cpu


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/10: 100%|██████████| 5/5 [02:18<00:00, 27.76s/it, loss=3.24]


Validation Loss: 3.1975
Emotional Accuracy: 15.00%
Political Accuracy: 55.00%


Epoch 2/10: 100%|██████████| 5/5 [02:09<00:00, 25.85s/it, loss=3.2] 


Validation Loss: 3.1929
Emotional Accuracy: 15.00%
Political Accuracy: 55.00%


Epoch 3/10: 100%|██████████| 5/5 [02:13<00:00, 26.69s/it, loss=3.19]


Validation Loss: 3.1831
Emotional Accuracy: 15.00%
Political Accuracy: 55.00%


Epoch 4/10: 100%|██████████| 5/5 [02:16<00:00, 27.39s/it, loss=3.17]


Validation Loss: 3.1669
Emotional Accuracy: 15.00%
Political Accuracy: 55.00%


Epoch 5/10: 100%|██████████| 5/5 [02:13<00:00, 26.65s/it, loss=3.17]


Validation Loss: 3.1351
Emotional Accuracy: 30.00%
Political Accuracy: 55.00%


Epoch 6/10: 100%|██████████| 5/5 [02:05<00:00, 25.07s/it, loss=3.07]


Validation Loss: 3.0512
Emotional Accuracy: 30.00%
Political Accuracy: 55.00%


Epoch 7/10: 100%|██████████| 5/5 [02:09<00:00, 26.00s/it, loss=2.94]


Validation Loss: 2.9830
Emotional Accuracy: 30.00%
Political Accuracy: 55.00%


Epoch 8/10: 100%|██████████| 5/5 [02:05<00:00, 25.10s/it, loss=2.68]


Validation Loss: 2.9271
Emotional Accuracy: 30.00%
Political Accuracy: 55.00%


Epoch 9/10: 100%|██████████| 5/5 [02:12<00:00, 26.54s/it, loss=2.88]


Validation Loss: 2.8738
Emotional Accuracy: 30.00%
Political Accuracy: 55.00%


Epoch 10/10: 100%|██████████| 5/5 [02:05<00:00, 25.14s/it, loss=2.63]


Validation Loss: 2.8214
Emotional Accuracy: 30.00%
Political Accuracy: 55.00%


Generating predictions: 100%|██████████| 6/6 [00:17<00:00,  2.89s/it]


In [24]:
import pandas as pd
import matplotlib.pyplot as plt

def create_visualizations_from_json(json_file_path: str):
    # Load the JSON data
    with open(json_file_path, 'r') as file:
        data = json.load(file)
    
    # Extract ratings for emotional intensity and political spectrum
    emotional_intensity = [value['emotional_intensity'] for value in data.values()]
    political_spectrum = [value['political_spectrum'] for value in data.values()]
    
    # Compute distribution as percentages
    emotional_df = (
        pd.DataFrame({'Rating': emotional_intensity})
        .value_counts()
        .reset_index(name='Count')
        .rename(columns={0: 'Rating'})
    )
    emotional_df['Percentage'] = (emotional_df['Count'] / emotional_df['Count'].sum()) * 100
    
    political_df = (
        pd.DataFrame({'Rating': political_spectrum})
        .value_counts()
        .reset_index(name='Count')
        .rename(columns={0: 'Rating'})
    )
    political_df['Percentage'] = (political_df['Count'] / political_df['Count'].sum()) * 100
    
    # Create visualizations
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))
    
    # Plot Emotional Intensity
    ax1.bar(emotional_df['Rating'], emotional_df['Percentage'])
    ax1.set_title('Distribution of Emotional Intensity Ratings')
    ax1.set_xlabel('Emotional Intensity Rating (1-5)')
    ax1.set_ylabel('Percentage of Entries')
    ax1.set_xticks(range(1, 6))
    
    # Plot Political Spectrum
    ax2.bar(political_df['Rating'], political_df['Percentage'])
    ax2.set_title('Distribution of Political Spectrum Ratings')
    ax2.set_xlabel('Political Spectrum Rating (1=Far Left, 5=Far Right)')
    ax2.set_ylabel('Percentage of Entries')
    ax2.set_xticks(range(1, 6))
    
    # Adjust layout and save
    plt.tight_layout()
    plt.savefig('predicted_rating_distributions.png')
    plt.close()

# Provide the path to the JSON file
json_file_path = 'predictions.json'
create_visualizations_from_json(json_file_path)


In [25]:
import json
import torch
import numpy as np
from torch import nn
from transformers import RobertaTokenizer, RobertaModel
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm

class SpeechDataset(Dataset):
    def __init__(self, speeches, labels=None, tokenizer=None, max_length=512):
        self.speeches = speeches
        self.speech_ids = list(speeches.keys())
        self.labels = labels
        self.tokenizer = tokenizer or RobertaTokenizer.from_pretrained('roberta-base')
        self.max_length = max_length

    def __len__(self):
        return len(self.speech_ids)

    def __getitem__(self, idx):
        speech_id = self.speech_ids[idx]
        speech = self.speeches[speech_id]['speech']

        encoding = self.tokenizer(
            speech,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        item = {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'speech_id': speech_id
        }

        if self.labels is not None:
            item['emotional_intensity'] = torch.tensor(self.speeches[speech_id]['emotional_intensity'] - 1)
            item['political_spectrum'] = torch.tensor(self.speeches[speech_id]['political_spectrum'] - 1)

        return item

class PoliticalSpeechClassifier(nn.Module):
    def __init__(self, num_classes=5):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        
        # Unfreeze more layers for larger dataset
        for param in self.roberta.encoder.layer[-6:].parameters():
            param.requires_grad = True
        
        hidden_size = self.roberta.config.hidden_size
        
        self.emotional_classifier = nn.Sequential(
            nn.Linear(hidden_size, 512),
            nn.LayerNorm(512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, 256),
            nn.LayerNorm(256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, num_classes)
        )
        
        self.political_classifier = nn.Sequential(
            nn.Linear(hidden_size, 512),
            nn.LayerNorm(512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, 256),
            nn.LayerNorm(256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, num_classes)
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        
        emotional_logits = self.emotional_classifier(pooled_output)
        political_logits = self.political_classifier(pooled_output)
        
        return emotional_logits, political_logits

def train_model(model, train_loader, val_loader, num_epochs=8, device='cuda'):
    model = model.to(device)
    
    # Class weights based on your distribution
    emotional_weights = torch.FloatTensor([
        1/0.245,  # class 1: 24.5%
        1/0.159,  # class 2: 15.9%
        1/0.267,  # class 3: 26.7%
        1/0.290,  # class 4: 29.0%
        1/0.039   # class 5: 3.9%
    ]).to(device)
    
    political_weights = torch.FloatTensor([
        1/0.097,  # class 1: 9.7%
        1/0.222,  # class 2: 22.2%
        1/0.523,  # class 3: 52.3%
        1/0.103,  # class 4: 10.3%
        1/0.056   # class 5: 5.6%
    ]).to(device)
    
    emotional_criterion = nn.CrossEntropyLoss(weight=emotional_weights)
    political_criterion = nn.CrossEntropyLoss(weight=political_weights)
    
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
    
    total_steps = len(train_loader) * num_epochs
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=2e-5,
        total_steps=total_steps
    )
    
    best_val_loss = float('inf')
    best_model = None
    
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        correct_emotional = 0
        correct_political = 0
        total = 0
        
        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}')
        
        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            emotional_labels = batch['emotional_intensity'].to(device)
            political_labels = batch['political_spectrum'].to(device)
            
            optimizer.zero_grad()
            emotional_logits, political_logits = model(input_ids, attention_mask)
            
            loss = (emotional_criterion(emotional_logits, emotional_labels) + 
                   political_criterion(political_logits, political_labels))
            
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step()
            
            # Calculate training accuracy
            _, emotional_predicted = torch.max(emotional_logits, 1)
            _, political_predicted = torch.max(political_logits, 1)
            
            correct_emotional += (emotional_predicted == emotional_labels).sum().item()
            correct_political += (political_predicted == political_labels).sum().item()
            total += emotional_labels.size(0)
            
            total_loss += loss.item()
            progress_bar.set_postfix({
                'loss': loss.item(),
                'emotional_acc': 100 * correct_emotional / total,
                'political_acc': 100 * correct_political / total
            })
        
        # Validation
        model.eval()
        val_loss = 0
        correct_emotional = 0
        correct_political = 0
        total = 0
        
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                emotional_labels = batch['emotional_intensity'].to(device)
                political_labels = batch['political_spectrum'].to(device)
                
                emotional_logits, political_logits = model(input_ids, attention_mask)
                
                loss = (emotional_criterion(emotional_logits, emotional_labels) + 
                       political_criterion(political_logits, political_labels))
                val_loss += loss.item()
                
                _, emotional_predicted = torch.max(emotional_logits, 1)
                _, political_predicted = torch.max(political_logits, 1)
                
                correct_emotional += (emotional_predicted == emotional_labels).sum().item()
                correct_political += (political_predicted == political_labels).sum().item()
                total += emotional_labels.size(0)
        
        val_loss = val_loss / len(val_loader)
        emotional_accuracy = 100 * correct_emotional / total
        political_accuracy = 100 * correct_political / total
        
        print(f'\nValidation Loss: {val_loss:.4f}')
        print(f'Emotional Accuracy: {emotional_accuracy:.2f}%')
        print(f'Political Accuracy: {political_accuracy:.2f}%')
        
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model = model.state_dict().copy()
    
    return best_model

def main():
    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Load the first JSON file
    with open('outputs/speeches_113_trimmed_gpt_axis_labels.json', 'r') as f:
        data = json.load(f)

    # Load the second JSON file
    with open('outputs/speeches_114_trimmed_gpt_axis_labels.json', 'r') as f:
        data2 = json.load(f)

    # Merge the dictionaries
    data.update(data2)  # Combines data2 into data (overwriting any duplicate keys)
    
    # Split data
    train_ids, val_ids = train_test_split(list(data.keys()), test_size=0.15, random_state=42)
    
    train_data = {k: data[k] for k in train_ids}
    val_data = {k: data[k] for k in val_ids}
    
    # Create datasets
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    
    train_dataset = SpeechDataset(train_data, train_data, tokenizer)
    val_dataset = SpeechDataset(val_data, val_data, tokenizer)
    
    # Create data loaders with larger batch size
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32)
    
    # Initialize and train model
    model = PoliticalSpeechClassifier()
    best_model_state = train_model(model, train_loader, val_loader, device=device)
    
    # Save the best model
    torch.save(best_model_state, 'best_model.pt')

if __name__ == "__main__":
    main()

Using device: cpu


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/8:   0%|          | 0/49 [00:35<?, ?it/s]


KeyboardInterrupt: 