In [1]:
# imports

import json
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from typing import Dict, List, Tuple
from collections import defaultdict
import torch
from transformers import RobertaTokenizer, BertTokenizer, BertForSequenceClassification, RobertaModel
from datetime import datetime
from torch import nn

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# load in model for axis classification

class PoliticalSpeechClassifier(nn.Module):
    """RoBERTa encoder followed by a shared MLP trunk and two classification heads.

    One head produces logits for the "emotional" axis and the other for the
    "political" axis; both heads read the same 512-dimensional shared features.

    Args:
        num_classes: Number of output logits produced by each head.
        dropout_rate: Dropout probability used after every hidden ReLU.
    """

    def __init__(self, num_classes=5, dropout_rate=0.2):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')

        # Mark the parameters of the last eight encoder layers as trainable.
        # NOTE(review): nothing in this class freezes parameters beforehand, so this
        # loop probably has no effect unless the encoder arrives frozen — confirm.
        for encoder_layer in self.roberta.encoder.layer[-8:]:
            for weight in encoder_layer.parameters():
                weight.requires_grad = True

        encoder_dim = self.roberta.config.hidden_size

        def dense_block(in_dim, out_dim):
            """Linear -> LayerNorm -> ReLU -> Dropout, returned as a list of layers."""
            return [
                nn.Linear(in_dim, out_dim),
                nn.LayerNorm(out_dim),
                nn.ReLU(),
                nn.Dropout(dropout_rate),
            ]

        # Trunk shared by both tasks: encoder_dim -> 1024 -> 512.
        self.shared_features = nn.Sequential(
            *dense_block(encoder_dim, 1024),
            *dense_block(1024, 512),
        )

        # One head per task: 512 -> 256 -> num_classes.
        self.emotional_classifier = nn.Sequential(
            *dense_block(512, 256),
            nn.Linear(256, num_classes),
        )
        self.political_classifier = nn.Sequential(
            *dense_block(512, 256),
            nn.Linear(256, num_classes),
        )

    def forward(self, input_ids, attention_mask):
        """Encode the batch and return ``(emotional_logits, political_logits)``.

        The token embeddings are averaged with ``attention_mask`` as weights
        (masked mean pooling) instead of using only the first token.
        """
        encoded = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        hidden = encoded.last_hidden_state

        # Broadcast the mask over the hidden dimension so masked positions add zero.
        mask = attention_mask.unsqueeze(-1).expand(hidden.size()).float()
        masked_total = (hidden * mask).sum(1)
        # Clamp the denominator so an all-zero mask cannot divide by zero.
        token_count = mask.sum(1).clamp(min=1e-9)
        pooled = masked_total / token_count

        trunk = self.shared_features(pooled)
        return self.emotional_classifier(trunk), self.political_classifier(trunk)

In [3]:
# Part 1: Core Functions

def read_speech_file(file_path: str) -> Dict[str, str]:
    """Load the speeches of a single congress from a pipe-delimited text file.

    The first line is treated as a header and skipped. Every following line is
    expected to look like ``speech_id|speech text``; only speeches with more
    than 35 and fewer than 400 whitespace-separated words are kept.

    Args:
        file_path: Path of the speech file to read.

    Returns:
        Mapping of speech id to speech text. Empty when the file is empty or
        holds no speech inside the accepted length window.

    Raises:
        OSError: If the file cannot be opened.
    """
    speeches: Dict[str, str] = {}
    with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
        # An empty file has no header line; a bare next() would raise StopIteration.
        if next(file, None) is None:
            return speeches
        for line in file:
            # Split on the first pipe only, so a '|' inside the speech text
            # does not cause the whole row to be discarded.
            speech_id, delimiter, speech = line.strip().partition('|')
            if not delimiter:
                continue  # malformed row without any delimiter
            word_count = len(speech.split())
            if 35 < word_count < 400:
                speeches[speech_id] = speech
    return speeches

def load_congress_data(congress_range: range, base_paths: Dict[str, str]) -> Dict[str, Dict[str, Dict]]:
    """Load speeches and party labels for several congresses.

    For every congress number the function reads ``speeches_NNN.txt`` and
    ``NNN_SpeakerMap.txt`` (``NNN`` is the zero-padded congress number) from
    ``base_paths['bound']`` when the number is <= 111 and from
    ``base_paths['daily']`` otherwise. Progress messages are printed.

    Args:
        congress_range: Congress numbers to load.
        base_paths: Must contain the keys ``'bound'`` and ``'daily'``.

    Returns:
        ``{str(congress): {speech_id: {'speech': text, 'party': 'D' | 'R'}}}``.
        Speeches without a 'D' or 'R' party label are dropped, and congresses
        with no remaining speeches are omitted.
    """
    all_data = {}

    for congress in tqdm(congress_range, desc="Loading congress data"):
        congress_str = str(congress)
        # File names use a 3-digit congress number (79 -> "079").
        congress_str_padded = f"{congress:03d}"

        path = base_paths['bound'] if congress <= 111 else base_paths['daily']

        speech_file = os.path.join(path, f"speeches_{congress_str_padded}.txt")
        if not os.path.exists(speech_file):
            print(f"Could not find speech file: {speech_file}")
            continue

        # Reuse the single-file reader instead of duplicating its parsing rules.
        speeches = {
            speech_id: {"speech": speech}
            for speech_id, speech in read_speech_file(speech_file).items()
        }

        speaker_map_file = os.path.join(path, f"{congress_str_padded}_SpeakerMap.txt")
        if os.path.exists(speaker_map_file):
            with open(speaker_map_file, 'r', encoding='utf-8', errors='replace') as file:
                header = file.readline().strip().split('|')
                if 'speech_id' in header and 'party' in header:
                    speech_id_idx = header.index('speech_id')
                    party_idx = header.index('party')
                    last_needed = max(speech_id_idx, party_idx)

                    for line in file:
                        parts = line.strip().split('|')
                        if len(parts) <= last_needed:
                            continue  # short / malformed row
                        speech_id = parts[speech_id_idx]
                        if speech_id in speeches:
                            speeches[speech_id]['party'] = parts[party_idx]
                else:
                    # A bad header used to raise ValueError and abort the whole load;
                    # report it and treat this congress as having no party data.
                    print(f"Speaker map file is missing speech_id/party columns: {speaker_map_file}")
        else:
            print(f"Could not find speaker map file: {speaker_map_file}")

        # Only keep speeches with a Democrat or Republican label.
        speeches = {
            speech_id: record
            for speech_id, record in speeches.items()
            if record.get('party') in ('D', 'R')
        }

        if speeches:
            all_data[congress_str] = speeches
            print(f"Loaded {len(speeches)} speeches for congress {congress_str_padded}")
        else:
            print(f"No valid speeches found for congress {congress_str_padded}")

    return all_data

def load_party_data(congress_range: range, base_paths: Dict[str, str]) -> Dict[str, str]:
    """Collect party labels from the ``descr_*`` file of each congress.

    Args:
        congress_range: Congress numbers to read.
        base_paths: Must contain ``'bound'`` (used for congress <= 111) and
            ``'daily'`` (used otherwise).

    Returns:
        Mapping from the first column of each row (named ``speech_id`` in the
        code) to ``'D'`` or ``'R'``. Rows with any other value in the last
        column are ignored, as are congresses whose file is missing.

    NOTE(review): the party is taken from the LAST column of the file; confirm
    that the description files really carry the party there.
    """
    party_data = {}

    for congress in congress_range:
        path = base_paths['bound'] if congress <= 111 else base_paths['daily']

        # load_congress_data uses zero-padded names ("079"); try that first and
        # fall back to the unpadded name this function used before.
        candidates = [f"descr_{congress:03d}.txt", f"descr_{congress}.txt"]
        desc_file = next(
            (
                os.path.join(path, name)
                for name in candidates
                if os.path.exists(os.path.join(path, name))
            ),
            None,
        )
        if desc_file is None:
            continue

        with open(desc_file, 'r', encoding='utf-8', errors='replace') as file:
            # Skip the header; an empty file simply contributes nothing.
            if next(file, None) is None:
                continue
            for line in file:
                parts = line.strip().split('|')
                if len(parts) < 2:
                    continue
                speech_id, party = parts[0], parts[-1]
                if party in ('D', 'R'):  # Only keep Democrat and Republican
                    party_data[speech_id] = party

    return party_data

class CongressionalAnalysis:
    def __init__(self, 
                issue_model_path: str,
                axis_model_path: str,
                congress_range: range = range(79, 115)):
        """Initialize the analysis pipeline"""
        self.congress_range = congress_range
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {self.device}")

        self.VALID_ISSUES = {
            'Economy and Jobs',
            'Health and Social Services',
            'Education and Innovation',
            'Environment and Energy',
            'Defense and Security',
            'Immigration and Border Policy',
            'Justice and Civil Rights',
            'Infrastructure and Transportation',
            'Budget and Fiscal Responsibility'
        }

        self.ISSUE_MAP = {
            'LABEL_21': 'Economy and Jobs',
            'LABEL_31': 'Health and Social Services',
            'LABEL_22': 'Education and Innovation',
            'LABEL_26': 'Environment and Energy',
            'LABEL_19': 'Defense and Security',
            'LABEL_43': 'Immigration and Border Policy',
            'LABEL_47': 'Justice and Civil Rights',
            'LABEL_44': 'Infrastructure and Transportation',
            'LABEL_8': 'Budget and Fiscal Responsibility'
        }
        
        # Load models
        print("Loading models...")
        self.issue_model = self.load_issue_model(issue_model_path)
        self.axis_model = self.load_axis_model(axis_model_path)
        
        # Create unique_issues list
        self.unique_issues = list(self.issue_model.config.id2label.values())
        print(f"Loaded {len(self.unique_issues)} unique issues")
        
        # Load tokenizers
        self.bert_tokenizer = BertTokenizer.from_pretrained(issue_model_path)
        self.roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        
        # Initialize data structures
        self.speeches = {}
        self.analysis_results = {}
        
    def load_issue_model(self, model_path: str):
        """Load the issue classification model"""
        return BertForSequenceClassification.from_pretrained(model_path)
    
    def load_axis_model(self, model_path: str):
        """Load the axis prediction model"""
        model_state = torch.load(model_path, map_location=self.device)
        model = PoliticalSpeechClassifier()
        model.load_state_dict(model_state['model_state_dict'])
        return model
    
    def load_data(self, base_paths: Dict[str, str], sample_size: int = 1000):
        """Load and sample speeches from each congress"""
        print("Loading congress data...")
        all_data = load_congress_data(self.congress_range, base_paths)
        
        # Sample speeches from each congress
        print("Sampling speeches...")
        for congress, speeches in all_data.items():
            # Split by party
            dem_speeches = {k: v for k, v in speeches.items() if v['party'] == 'D'}
            rep_speeches = {k: v for k, v in speeches.items() if v['party'] == 'R'}
            
            per_party = sample_size // 2
            sampled_speeches = {}
            
            # Only proceed if we have enough speeches from both parties
            if len(dem_speeches) >= per_party and len(rep_speeches) >= per_party:
                # Convert to list of items for sampling
                dem_items = list(dem_speeches.items())
                rep_items = list(rep_speeches.items())
                
                # Sample equally from each party
                dem_sample = dict(random.sample(dem_items, per_party))
                rep_sample = dict(random.sample(rep_items, per_party))
                
                sampled_speeches.update(dem_sample)
                sampled_speeches.update(rep_sample)
                
                self.speeches[congress] = sampled_speeches
            else:
                print(f"Warning: Not enough speeches from both parties in congress {congress}")
                print(f"Democratic speeches: {len(dem_speeches)}")
                print(f"Republican speeches: {len(rep_speeches)}")

# Part 2: Analysis Functions

    def analyze_speeches(self):
        """Analyze all loaded speeches using both models"""
        print("Analyzing speeches...")
        self.issue_model.to(self.device)
        self.axis_model.to(self.device)
        self.issue_model.eval()
        self.axis_model.eval()

        # Create unique_issues list before we need it
        self.unique_issues = list(self.issue_model.config.id2label.values())

        for congress, speeches in tqdm(self.speeches.items(), desc="Processing congresses"):
            congress_results = []

            # print length of speeches
            print(f"Number of speeches in congress {congress}: {len(speeches)}")
            
            # turn this for loop into one that uses tqdm
            for speech_id, speech_data in tqdm(speeches.items(), desc="Processing speeches"):
            # for speech_id, speech_data in speeches.items():
                try:
                    # Extract data correctly from the speech_data dictionary
                    speech_text = speech_data['speech']
                    party = speech_data['party']
                    
                    if not speech_text or not party:
                        continue

                    # Predict issues
                    issues = self.predict_issues(speech_text)
                    
                    # Predict axis scores
                    axis_scores = self.predict_axis_scores(speech_text)
                    
                    # Store results
                    result = {
                        'congress': int(congress),
                        'speech_id': speech_id,
                        'party': party,  # This should now be preserved
                        'issues': issues,
                        'emotional_intensity': axis_scores['emotional_intensity'],
                        'political_spectrum': axis_scores['political_spectrum'],
                        'emotional_confidence': axis_scores['emotional_confidence'],
                        'political_confidence': axis_scores['political_confidence']
                    }
                    
                    congress_results.append(result)
                    
                except Exception as e:
                    print(f"Error processing speech {speech_id}: {str(e)}")
                    continue
            
            if congress_results:
                # Create DataFrame with explicit column ordering
                self.analysis_results[congress] = pd.DataFrame(congress_results).fillna('')
                
                # Debug print to verify data
                print(f"\nCongress {congress} results:")
                print(f"Number of speeches processed: {len(congress_results)}")
                print("Columns:", self.analysis_results[congress].columns.tolist())
                print("Party distribution:", self.analysis_results[congress]['party'].value_counts())
    
    def predict_issues(self, speech_text: str, threshold: float = 0.5) -> List[str]:
        """Predict issues with mapping to standard names"""
        encoding = self.bert_tokenizer(
            speech_text,
            max_length=512,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        
        with torch.no_grad():
            inputs = {k: v.to(self.device) for k, v in encoding.items()}
            outputs = self.issue_model(**inputs)
            probabilities = torch.sigmoid(outputs.logits).cpu().numpy().flatten()
            
            # Get predictions and map to standard names
            raw_predictions = [
                self.issue_model.config.id2label[i]
                for i, prob in enumerate(probabilities)
                if prob >= threshold
            ]
            
            # Filter to only valid issues
            valid_predictions = [
                issue for issue in raw_predictions
                if issue in self.ISSUE_MAP
            ]
            
            return valid_predictions
    
    def predict_axis_scores(self, speech_text: str) -> Dict:
        """Predict axis scores for a single speech"""
        encoding = self.roberta_tokenizer(
            speech_text,
            max_length=512,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        with torch.no_grad():
            inputs = {k: v.to(self.device) for k, v in encoding.items()}
            emotional_logits, political_logits = self.axis_model(**inputs)
            
            emotional_probs = torch.softmax(emotional_logits, dim=1)
            political_probs = torch.softmax(political_logits, dim=1)
            
            emotional_pred = torch.argmax(emotional_probs, dim=1).item() + 1
            political_pred = torch.argmax(political_probs, dim=1).item() + 1
            
            emotional_conf = emotional_probs[0][emotional_pred-1].item()
            political_conf = political_probs[0][political_pred-1].item()
        
        return {
            'emotional_intensity': emotional_pred,
            'emotional_confidence': emotional_conf,
            'political_spectrum': political_pred,
            'political_confidence': political_conf
        }
    
    def analyze_framing_shifts(self, save_dir='analysis_results'):
        """Analyze framing shifts over time"""
        print("Analyzing framing shifts...")
        
        # Initialize storage for trends
        self.trends = {
            'by_party': defaultdict(list),
            'by_issue': defaultdict(list),
            'by_party_issue': defaultdict(lambda: defaultdict(list))
        }

        # Initialize metrics tracking
        self.metrics = {
            'global': {
                'emotional_volatility': [],
                'political_volatility': [],
                'party_divergence': []
            },
            'by_party': defaultdict(lambda: {
                'emotional_trend': [],
                'political_trend': []
            })
        }

        # Process each congress
        for congress, df in self.analysis_results.items():
            if 'party' not in df.columns:
                print(f"Warning: No party information for congress {congress}")
                continue
            congress_num = int(congress)
            
            # Analyze by party
            for party in ['D', 'R']:
                party_df = df[df['party'] == party]
                if not party_df.empty:
                    self.trends['by_party'][party].append({
                        'congress': congress_num,
                        'emotional_avg': party_df['emotional_intensity'].mean(),
                        'emotional_std': party_df['emotional_intensity'].std(),
                        'political_avg': party_df['political_spectrum'].mean(),
                        'political_std': party_df['political_spectrum'].std(),
                        'count': len(party_df)
                    })
                    
                    # Track party trends (moved inside party loop)
                    self.metrics['by_party'][party]['emotional_trend'].append({
                        'congress': congress_num,
                        'mean': party_df['emotional_intensity'].mean(),
                        'std': party_df['emotional_intensity'].std()
                    })
            
            # Analyze by issue
            for issue in self.unique_issues:
                if issue in self.ISSUE_MAP:
                    issue_mask = df['issues'].apply(lambda x: issue in x)
                    issue_df = df[issue_mask]
                    if not issue_df.empty:
                        self.trends['by_issue'][issue].append({
                            'congress': congress_num,
                            'emotional_avg': issue_df['emotional_intensity'].mean(),
                            'emotional_std': issue_df['emotional_intensity'].std(),
                            'political_avg': issue_df['political_spectrum'].mean(),
                            'political_std': issue_df['political_spectrum'].std(),
                            'count': len(issue_df)
                        })
                        
                        # Analyze by party within issue
                        for party in ['D', 'R']:
                            party_issue_df = issue_df[issue_df['party'] == party]
                            if not party_issue_df.empty:
                                self.trends['by_party_issue'][issue][party].append({
                                    'congress': congress_num,
                                    'emotional_avg': party_issue_df['emotional_intensity'].mean(),
                                    'emotional_std': party_issue_df['emotional_intensity'].std(),
                                    'political_avg': party_issue_df['political_spectrum'].mean(),
                                    'political_std': party_issue_df['political_spectrum'].std(),
                                    'count': len(party_issue_df)
                                })
            
            # Add volatility metrics
            self.metrics['global']['emotional_volatility'].append({
                'congress': congress_num,
                'std': df['emotional_intensity'].std()
            })
            self.metrics['global']['political_volatility'].append({
                'congress': congress_num,
                'std': df['political_spectrum'].std()
            })

        # Convert trend data to DataFrames
        self.trend_dfs = {
            'by_party': {
                party: pd.DataFrame(data)
                for party, data in self.trends['by_party'].items()
            },
            'by_issue': {
                issue: pd.DataFrame(data)
                for issue, data in self.trends['by_issue'].items()
            },
            'by_party_issue': {
                issue: {
                    party: pd.DataFrame(data)
                    for party, data in party_data.items()
                }
                for issue, party_data in self.trends['by_party_issue'].items()
            }
        }

        # Save enhanced metrics
        os.makedirs(save_dir, exist_ok=True)
        with open(os.path.join(save_dir, 'detailed_metrics.json'), 'w') as f:
            json.dump(self.metrics, f, indent=2)

# Part 3: Visualization and Metrics

    def calculate_polarization_metrics(self):
        """Calculate polarization metrics over time"""
        print("Calculating polarization metrics...")
        
        self.polarization_metrics = defaultdict(list)
        
        # Overall polarization
        for congress in sorted(self.analysis_results.keys()):
            df = self.analysis_results[congress]
            dem_df = df[df['party'] == 'D']
            rep_df = df[df['party'] == 'R']
            
            metrics = {
                'congress': int(congress),
                'emotional_gap': (rep_df['emotional_intensity'].mean() - 
                                dem_df['emotional_intensity'].mean()),
                'political_gap': (rep_df['political_spectrum'].mean() - 
                                dem_df['political_spectrum'].mean()),
                'emotional_overlap': self._calculate_distribution_overlap(
                    dem_df['emotional_intensity'], rep_df['emotional_intensity']
                ),
                'political_overlap': self._calculate_distribution_overlap(
                    dem_df['political_spectrum'], rep_df['political_spectrum']
                )
            }
            
            self.polarization_metrics['overall'].append(metrics)
        
        # By issue polarization
        for issue in self.unique_issues:
            if issue in self.ISSUE_MAP:
                for congress in sorted(self.analysis_results.keys()):
                    df = self.analysis_results[congress]
                    issue_mask = df['issues'].apply(lambda x: issue in x)
                    issue_df = df[issue_mask]
                    
                    if len(issue_df) > 10:  # Only calculate if enough samples
                        dem_df = issue_df[issue_df['party'] == 'D']
                        rep_df = issue_df[issue_df['party'] == 'R']
                        
                        if len(dem_df) > 5 and len(rep_df) > 5:
                            metrics = {
                                'congress': int(congress),
                                'emotional_gap': (rep_df['emotional_intensity'].mean() - 
                                                dem_df['emotional_intensity'].mean()),
                                'political_gap': (rep_df['political_spectrum'].mean() - 
                                                dem_df['political_spectrum'].mean()),
                                'emotional_overlap': self._calculate_distribution_overlap(
                                    dem_df['emotional_intensity'], rep_df['emotional_intensity']
                                ),
                                'political_overlap': self._calculate_distribution_overlap(
                                    dem_df['political_spectrum'], rep_df['political_spectrum']
                                )
                            }
                            
                            self.polarization_metrics[issue].append(metrics)
        
        # Convert to DataFrames
        self.polarization_dfs = {
            key: pd.DataFrame(data)
            for key, data in self.polarization_metrics.items()
        }
    
    def _calculate_distribution_overlap(self, dist1, dist2):
        """Calculate overlap between two distributions"""
        hist1, bins = np.histogram(dist1, bins=5, density=True)
        hist2, _ = np.histogram(dist2, bins=bins, density=True)
        return np.minimum(hist1, hist2).sum() * (bins[1] - bins[0])

    def _plot_issue_heatmaps(self, save_dir):
        """Write three issue-by-congress heatmaps into ``<save_dir>/issue_heatmaps``.

        The heatmaps show, per tracked issue and congress, the share of
        speeches tagged with the issue (percent), the mean emotional intensity
        and the mean political spectrum score. Cells without any speech stay 0.
        """
        heatmap_dir = os.path.join(save_dir, 'issue_heatmaps')
        os.makedirs(heatmap_dir, exist_ok=True)

        congresses = sorted(self.analysis_results.keys())
        issues = list(self.ISSUE_MAP.values())

        # Readable issue name -> first model label that maps to it.
        label_for = {}
        for label, name in self.ISSUE_MAP.items():
            label_for.setdefault(name, label)

        grid_shape = (len(issues), len(congresses))
        prevalence_matrix = np.zeros(grid_shape)
        emotional_matrix = np.zeros(grid_shape)
        political_matrix = np.zeros(grid_shape)

        for row, issue in enumerate(issues):
            for col, congress in enumerate(congresses):
                df = self.analysis_results[congress]
                issue_label = label_for[issue]
                tagged = df[df['issues'].apply(lambda labels: issue_label in labels)]
                if tagged.empty:
                    continue
                prevalence_matrix[row, col] = len(tagged) / len(df) * 100
                emotional_matrix[row, col] = tagged['emotional_intensity'].mean()
                political_matrix[row, col] = tagged['political_spectrum'].mean()

        def render(matrix, cmap, fmt, title, filename, **scale):
            """Draw one annotated heatmap and save it under ``heatmap_dir``."""
            plt.figure(figsize=(15, 8))
            sns.heatmap(matrix,
                        xticklabels=congresses,
                        yticklabels=issues,
                        cmap=cmap,
                        annot=True,
                        fmt=fmt,
                        **scale)
            plt.title(title)
            plt.xlabel('Congress')
            plt.ylabel('Issue')
            plt.tight_layout()
            plt.savefig(f"{heatmap_dir}/{filename}")
            plt.close()

        render(prevalence_matrix, 'YlOrRd', '.1f',
               'Issue Prevalence Over Time (%)', 'issue_prevalence.png')
        # The two score heatmaps share the fixed 1-5 colour scale.
        render(emotional_matrix, 'RdBu_r', '.2f',
               'Average Emotional Intensity by Issue Over Time', 'issue_emotional.png',
               vmin=1, vmax=5)
        render(political_matrix, 'RdBu_r', '.2f',
               'Average Political Position by Issue Over Time', 'issue_political.png',
               vmin=1, vmax=5)

    def calculate_issue_dynamics(self):
        """Calculate how issues change over time"""
        dynamics = {}
        
        for issue in self.ISSUE_MAP.values():
            dynamics[issue] = {
                'volatility': {
                    'emotional': [],
                    'political': []
                },
                'trend': {
                    'emotional': [],
                    'political': []
                },
                'party_gap': {
                    'emotional': [],
                    'political': []
                }
            }
            
            # Calculate metrics per congress
            for congress in sorted(self.analysis_results.keys()):
                df = self.analysis_results[congress]
                issue_label = [k for k, v in self.ISSUE_MAP.items() if v == issue][0]
                issue_mask = df['issues'].apply(lambda x: issue_label in x)
                issue_df = df[issue_mask]
                
                if len(issue_df) > 10:  # Only calculate if enough samples
                    # Volatility (standard deviation)
                    dynamics[issue]['volatility']['emotional'].append(
                        issue_df['emotional_intensity'].std()
                    )
                    dynamics[issue]['volatility']['political'].append(
                        issue_df['political_spectrum'].std()
                    )
                    
                    # Party differences
                    dem_df = issue_df[issue_df['party'] == 'D']
                    rep_df = issue_df[issue_df['party'] == 'R']
                    
                    if len(dem_df) > 5 and len(rep_df) > 5:
                        dynamics[issue]['party_gap']['emotional'].append(
                            rep_df['emotional_intensity'].mean() - dem_df['emotional_intensity'].mean()
                        )
                        dynamics[issue]['party_gap']['political'].append(
                            rep_df['political_spectrum'].mean() - dem_df['political_spectrum'].mean()
                        )
        
        return dynamics
    
    def plot_framing_trends(self, save_dir='plots'):
        """Generate plots for framing trends"""
        os.makedirs(save_dir, exist_ok=True)
        
        # Plot overall party trends
        self._plot_party_trends(save_dir)
        
        # Plot issue-specific trends
        self._plot_issue_trends(save_dir)
        
        # Plot polarization trends
        self._plot_polarization_trends(save_dir)

        # New issue analysis plots
        self._plot_issue_heatmaps(save_dir)

        # Calculate and save issue dynamics
        dynamics = self.calculate_issue_dynamics()
        with open(os.path.join(save_dir, 'issue_dynamics.json'), 'w') as f:
            json.dump(dynamics, f, indent=2)
    
    def _plot_party_trends(self, save_dir):
        """Save one line chart per axis comparing both parties across congresses.

        Each chart draws the per-congress average of a party as a line with a
        shaded band of one standard deviation around it, on a fixed 1-5 y-axis.
        Data comes from ``self.trend_dfs['by_party']``.
        """
        party_names = {'D': 'Democratic', 'R': 'Republican'}

        # (column prefix, title, y-axis label, output file name)
        charts = [
            ('emotional',
             'Emotional Intensity by Party Over Time',
             'Average Emotional Intensity',
             'emotional_intensity_by_party.png'),
            ('political',
             'Political Spectrum Position by Party Over Time',
             'Average Political Spectrum Position',
             'political_spectrum_by_party.png'),
        ]

        for prefix, title, y_label, filename in charts:
            avg_col = f"{prefix}_avg"
            std_col = f"{prefix}_std"

            plt.figure(figsize=(12, 6))
            for party in ['D', 'R']:
                trend = self.trend_dfs['by_party'][party]
                plt.plot(trend['congress'], trend[avg_col], label=party_names[party])
                plt.fill_between(trend['congress'],
                                 trend[avg_col] - trend[std_col],
                                 trend[avg_col] + trend[std_col],
                                 alpha=0.2)

            plt.title(title)
            plt.xlabel('Congress')
            plt.ylabel(y_label)
            plt.ylim(1, 5)  # same fixed scale on both charts
            plt.grid(True, alpha=0.3)
            plt.legend()
            plt.savefig(f"{save_dir}/{filename}")
            plt.close()
    
    def _plot_issue_trends(self, save_dir):
        """Plot issue-level trends, one pair of charts per issue.

        For every ``(issue_key, issue_name)`` pair in ``self.ISSUE_MAP`` that
        has an entry in ``self.trend_dfs['by_issue']``, two PNG files are
        written to ``<save_dir>/issues/<issue_name lower-cased, spaces -> _>/``:
        ``emotional_intensity.png`` and ``political_spectrum.png``. Each chart
        draws the per-party average against ``congress`` with a shaded band of
        average minus/plus the matching ``*_std`` column, using the frames in
        ``self.trend_dfs['by_party_issue'][issue_key][party]`` when present.

        Issues without an entry in ``by_issue`` are reported on stdout and
        skipped. Nothing is plotted when ``self.trends['by_issue']`` is empty.

        Args:
            save_dir: Root output directory; the ``issues`` subtree is created
                beneath it.
        """
        # Skip empty or invalid issues
        if not self.trends['by_issue']:
            return

        party_labels = {'D': 'Democratic', 'R': 'Republican'}

        # (average column, std column, title prefix, y-axis label, file name)
        chart_specs = (
            ('emotional_avg', 'emotional_std',
             'Emotional Intensity Over Time',
             'Average Emotional Intensity',
             'emotional_intensity.png'),
            ('political_avg', 'political_std',
             'Political Spectrum Position Over Time',
             'Average Political Spectrum Position',
             'political_spectrum.png'),
        )

        # Iterate key/name pairs directly: no reverse lookup per issue, and two
        # keys that share a display name are each handled under their own key.
        for issue_key, issue in self.ISSUE_MAP.items():
            if issue_key not in self.trend_dfs['by_issue']:
                print(f"Skipping issue: {issue_key}, whose real name is {issue}")
                continue

            # Create directory for issue-specific plots
            issue_dir = os.path.join(save_dir, 'issues', issue.lower().replace(' ', '_'))
            os.makedirs(issue_dir, exist_ok=True)

            # Per-party frames for this issue (may be missing entirely)
            party_issue_dfs = self.trend_dfs['by_party_issue']
            frames_by_party = party_issue_dfs[issue_key] if issue_key in party_issue_dfs else {}

            for avg_col, std_col, title_prefix, y_label, file_name in chart_specs:
                fig, ax = plt.subplots(figsize=(12, 6))
                drew_any = False

                for party_code, party_name in party_labels.items():
                    if party_code not in frames_by_party:
                        continue
                    party_df = frames_by_party[party_code]
                    ax.plot(party_df['congress'], party_df[avg_col],
                            label=f"{party_name}")
                    ax.fill_between(party_df['congress'],
                                    party_df[avg_col] - party_df[std_col],
                                    party_df[avg_col] + party_df[std_col],
                                    alpha=0.2)
                    drew_any = True

                ax.set_title(f'{title_prefix}: {issue}')
                ax.set_xlabel('Congress')
                ax.set_ylabel(y_label)
                ax.set_ylim(1, 5)
                # A legend with no labelled lines only triggers a matplotlib
                # warning, so add it only when something was drawn.
                if drew_any:
                    ax.legend()
                ax.grid(True)
                fig.savefig(os.path.join(issue_dir, file_name))
                plt.close(fig)
    
    def _plot_polarization_trends(self, save_dir):
        """Plot polarization trends with standardized scales.

        Draws the ``emotional_gap`` and ``political_gap`` columns of
        ``self.polarization_dfs['overall']`` against ``congress`` on a single
        chart and saves it as ``overall_polarization.png`` in ``save_dir``.

        Args:
            save_dir: Directory the PNG file is written into.
        """
        fig, ax = plt.subplots(figsize=(12, 6))
        overall = self.polarization_dfs['overall']

        gap_series = (
            ('emotional_gap', 'Emotional Gap'),
            ('political_gap', 'Political Gap'),
        )
        for column, legend_label in gap_series:
            ax.plot(overall['congress'], overall[column], label=legend_label)

        ax.set_title('Party Polarization Over Time')
        ax.set_xlabel('Congress')
        ax.set_ylabel('Party Gap')
        ax.set_ylim(-4, 4)  # Maximum possible gap is ±4 on a 1-5 scale
        ax.grid(True, alpha=0.3)
        ax.legend()
        fig.savefig(f"{save_dir}/overall_polarization.png")
        plt.close(fig)

In [None]:
# Part 4: Main Execution

def _issue_display_name(analyzer, issue):
    """Return the readable name for an issue key, or the key itself if unmapped."""
    return analyzer.ISSUE_MAP[issue] if issue in analyzer.ISSUE_MAP else issue


def _net_change(series):
    """Return last-minus-first of a pandas Series, or NaN when it is empty."""
    if len(series) == 0:
        return float('nan')
    return series.iloc[-1] - series.iloc[0]


def _save_trend_data(analyzer, results_dir):
    """Write the party, issue and polarization trend tables to trend_data.json."""
    trend_data = {
        'by_party': {
            party: df.to_dict('records')
            for party, df in analyzer.trend_dfs['by_party'].items()
        },
        'by_issue': {
            # Same fallback as the polarization export: unmapped keys keep their raw key
            _issue_display_name(analyzer, issue): df.to_dict('records')
            for issue, df in analyzer.trend_dfs['by_issue'].items()
        },
        'polarization': {
            # Keys here include 'overall', which is not an issue key
            _issue_display_name(analyzer, issue): df.to_dict('records')
            for issue, df in analyzer.polarization_dfs.items()
        }
    }

    with open(os.path.join(results_dir, 'trend_data.json'), 'w') as f:
        json.dump(trend_data, f, indent=2)


def _write_summary_report(analyzer, results_dir, congress_label, sample_size):
    """Write the plain-text summary of overall, polarization and per-issue changes."""
    total_speeches = sum(len(df) for df in analyzer.analysis_results.values())

    with open(os.path.join(results_dir, 'summary_report.txt'), 'w') as f:
        f.write("Congressional Speech Analysis Summary\n")
        f.write("===================================\n\n")

        # Report the values actually used for this run, not fixed literals
        f.write("Analysis Parameters:\n")
        f.write(f"- Congress Range: {congress_label}\n")
        f.write(f"- Speeches per Congress: {sample_size}\n")
        f.write(f"- Total Speeches Analyzed: {total_speeches}\n\n")

        f.write("Overall Trends:\n")
        f.write("--------------\n")
        for party in ['D', 'R']:
            party_name = 'Democratic' if party == 'D' else 'Republican'
            df = analyzer.trend_dfs['by_party'][party]

            f.write(f"\n{party_name} Party:\n")
            f.write(f"- Emotional Intensity Change: {_net_change(df['emotional_avg']):.2f}\n")
            f.write(f"- Political Position Change: {_net_change(df['political_avg']):.2f}\n")

        f.write("\nPolarization Analysis:\n")
        f.write("---------------------\n")
        df = analyzer.polarization_dfs['overall']
        if len(df) == 0:
            # Positional indexing would raise IndexError on an empty frame
            f.write("- No polarization data available\n")
        else:
            f.write(f"- Initial Emotional Gap: {df['emotional_gap'].iloc[0]:.2f}\n")
            f.write(f"- Final Emotional Gap: {df['emotional_gap'].iloc[-1]:.2f}\n")
            f.write(f"- Initial Political Gap: {df['political_gap'].iloc[0]:.2f}\n")
            f.write(f"- Final Political Gap: {df['political_gap'].iloc[-1]:.2f}\n")

        f.write("\nIssue-Specific Findings:\n")
        f.write("----------------------\n")
        for issue in analyzer.unique_issues:
            if issue in analyzer.polarization_dfs:
                issue_name = _issue_display_name(analyzer, issue)
                df = analyzer.polarization_dfs[issue]
                f.write(f"\n{issue_name}:\n")
                f.write(f"- Polarization Change: {_net_change(df['political_gap']):.2f}\n")
                f.write(f"- Emotional Intensity Change: {_net_change(df['emotional_gap']):.2f}\n")


def main(sample_size=1000, congress_range=range(79, 115), results_dir='analysis_results'):
    """Run the full congressional speech analysis and save its outputs.

    Builds a ``CongressionalAnalysis``, loads and samples speeches, runs the
    speech analysis, framing-shift analysis and polarization metrics, renders
    the plots, and writes ``trend_data.json`` and ``summary_report.txt``.

    Args:
        sample_size: Forwarded to ``analyzer.load_data`` and echoed in the
            summary report.
        congress_range: Congress numbers forwarded to ``CongressionalAnalysis``.
            Must be re-iterable (e.g. a ``range`` or list), because it is also
            scanned here to label the report.
        results_dir: Directory for the JSON/text outputs; plots go to its
            ``plots`` subdirectory.

    Returns:
        The populated ``CongressionalAnalysis`` instance.
    """
    # Configuration
    base_paths = {
        'bound': "../hein-bound",  # Path to bound speeches (79-111)
        'daily': "../hein-daily"   # Path to daily speeches (112-114)
    }

    # Model paths
    issue_model_path = "../issue_classifier_eval/model/saved_issue_model"  # Path to saved issue classification model
    axis_model_path = "../large-training-output/model_artifacts_20241202_142615/model.pt"  # Path to saved axis prediction model

    plots_dir = os.path.join(results_dir, 'plots')
    congresses = list(congress_range)
    congress_label = f"{min(congresses)}-{max(congresses)}" if congresses else "n/a"

    # Initialize analysis
    print("Initializing analysis pipeline...")
    analyzer = CongressionalAnalysis(
        issue_model_path=issue_model_path,
        axis_model_path=axis_model_path,
        congress_range=congress_range
    )

    # Load and process data
    print("\nLoading congressional data...")
    analyzer.load_data(
        base_paths=base_paths,
        sample_size=sample_size
    )

    # Run analysis
    print("\nAnalyzing speeches...")
    analyzer.analyze_speeches()

    # Analyze framing shifts
    print("\nAnalyzing framing shifts...")
    analyzer.analyze_framing_shifts()

    # Calculate polarization metrics
    print("\nCalculating polarization metrics...")
    analyzer.calculate_polarization_metrics()

    # Generate plots
    print("\nGenerating visualization plots...")
    analyzer.plot_framing_trends(save_dir=plots_dir)

    # Save results
    print("\nSaving analysis results...")
    os.makedirs(results_dir, exist_ok=True)
    _save_trend_data(analyzer, results_dir)

    # Generate summary report
    print("\nGenerating summary report...")
    _write_summary_report(analyzer, results_dir, congress_label, sample_size)

    print(f"\nAnalysis complete! Results saved to '{results_dir}' directory.")
    print("\nGenerated files:")
    print(f"1. {plots_dir}/ - Visualization plots")
    print(f"2. {os.path.join(results_dir, 'trend_data.json')} - Raw trend data")
    print(f"3. {os.path.join(results_dir, 'summary_report.txt')} - Analysis summary")

    return analyzer

# Run the whole pipeline when this cell/module is executed directly. The returned
# analyzer is kept in a module-level name, presumably so later cells (e.g. the
# heatmap re-rendering cell) can reuse its results without re-running the analysis.
if __name__ == "__main__":
    output_analyzer = main()

Initializing analysis pipeline...
Using device: cpu
Loading models...


  model_state = torch.load(model_path, map_location=self.device)
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded 74 unique issues

Loading congressional data...
Loading congress data...


Loading congress data:   3%|▎         | 1/36 [00:01<00:52,  1.51s/it]

Loaded 68687 speeches for congress 079


Loading congress data:   6%|▌         | 2/36 [00:02<00:46,  1.37s/it]

Loaded 55732 speeches for congress 080


Loading congress data:   8%|▊         | 3/36 [00:04<00:46,  1.42s/it]

Loaded 91333 speeches for congress 081


Loading congress data:  11%|█         | 4/36 [00:05<00:41,  1.29s/it]

Loaded 71327 speeches for congress 082


Loading congress data:  14%|█▍        | 5/36 [00:06<00:42,  1.37s/it]

Loaded 71976 speeches for congress 083


Loading congress data:  17%|█▋        | 6/36 [00:08<00:43,  1.44s/it]

Loaded 56304 speeches for congress 084


Loading congress data:  19%|█▉        | 7/36 [00:10<00:47,  1.63s/it]

Loaded 76595 speeches for congress 085


Loading congress data:  22%|██▏       | 8/36 [00:13<00:54,  1.93s/it]

Loaded 77945 speeches for congress 086


Loading congress data:  25%|██▌       | 9/36 [00:15<00:57,  2.11s/it]

Loaded 78954 speeches for congress 087


Loading congress data:  28%|██▊       | 10/36 [00:18<01:00,  2.31s/it]

Loaded 82647 speeches for congress 088


Loading congress data:  31%|███       | 11/36 [00:20<00:56,  2.27s/it]

Loaded 82540 speeches for congress 089


Loading congress data:  33%|███▎      | 12/36 [00:22<00:53,  2.21s/it]

Loaded 89339 speeches for congress 090


Loading congress data:  36%|███▌      | 13/36 [00:24<00:52,  2.28s/it]

Loaded 93230 speeches for congress 091


Loading congress data:  39%|███▉      | 14/36 [00:27<00:49,  2.26s/it]

Loaded 79993 speeches for congress 092


Loading congress data:  42%|████▏     | 15/36 [00:29<00:47,  2.28s/it]

Loaded 90806 speeches for congress 093


Loading congress data:  44%|████▍     | 16/36 [00:31<00:46,  2.32s/it]

Loaded 94878 speeches for congress 094


Loading congress data:  47%|████▋     | 17/36 [00:34<00:44,  2.32s/it]

Loaded 99770 speeches for congress 095


Loading congress data:  50%|█████     | 18/36 [00:36<00:41,  2.30s/it]

Loaded 88242 speeches for congress 096


Loading congress data:  53%|█████▎    | 19/36 [00:38<00:37,  2.21s/it]

Loaded 71751 speeches for congress 097


Loading congress data:  56%|█████▌    | 20/36 [00:40<00:34,  2.15s/it]

Loaded 73694 speeches for congress 098


Loading congress data:  58%|█████▊    | 21/36 [00:42<00:32,  2.17s/it]

Loaded 74726 speeches for congress 099


Loading congress data:  61%|██████    | 22/36 [00:44<00:30,  2.16s/it]

Loaded 68877 speeches for congress 100


Loading congress data:  64%|██████▍   | 23/36 [00:46<00:27,  2.10s/it]

Loaded 61333 speeches for congress 101


Loading congress data:  67%|██████▋   | 24/36 [00:48<00:24,  2.08s/it]

Loaded 59940 speeches for congress 102


Loading congress data:  69%|██████▉   | 25/36 [00:50<00:22,  2.03s/it]

Loaded 60331 speeches for congress 103


Loading congress data:  72%|███████▏  | 26/36 [00:53<00:20,  2.09s/it]

Loaded 72521 speeches for congress 104


Loading congress data:  75%|███████▌  | 27/36 [00:54<00:17,  1.98s/it]

Loaded 50320 speeches for congress 105


Loading congress data:  78%|███████▊  | 28/36 [00:56<00:15,  1.94s/it]

Loaded 50657 speeches for congress 106


Loading congress data:  81%|████████  | 29/36 [00:58<00:12,  1.83s/it]

Loaded 41234 speeches for congress 107


Loading congress data:  83%|████████▎ | 30/36 [00:59<00:10,  1.79s/it]

Loaded 45099 speeches for congress 108


Loading congress data:  86%|████████▌ | 31/36 [01:01<00:08,  1.76s/it]

Loaded 44499 speeches for congress 109


Loading congress data:  89%|████████▉ | 32/36 [01:03<00:07,  1.77s/it]

Loaded 49258 speeches for congress 110


Loading congress data:  92%|█████████▏| 33/36 [01:04<00:05,  1.71s/it]

Loaded 44920 speeches for congress 111


Loading congress data:  94%|█████████▍| 34/36 [01:06<00:03,  1.59s/it]

Loaded 34665 speeches for congress 112


Loading congress data:  97%|█████████▋| 35/36 [01:07<00:01,  1.45s/it]

Loaded 30332 speeches for congress 113


Loading congress data: 100%|██████████| 36/36 [01:08<00:00,  1.90s/it]

Loaded 27075 speeches for congress 114
Sampling speeches...






Analyzing speeches...
Analyzing speeches...


Processing congresses:   0%|          | 0/36 [00:00<?, ?it/s]

Number of speeches in congress 79: 1000


Processing speeches: 100%|██████████| 1000/1000 [08:11<00:00,  2.03it/s]
Processing congresses:   3%|▎         | 1/36 [08:11<4:46:43, 491.53s/it]


Congress 79 results:
Number of speeches processed: 1000
Columns: ['congress', 'speech_id', 'party', 'issues', 'emotional_intensity', 'political_spectrum', 'emotional_confidence', 'political_confidence']
Party distribution: party
D    500
R    500
Name: count, dtype: int64
Number of speeches in congress 80: 1000


Processing speeches: 100%|██████████| 1000/1000 [08:18<00:00,  2.01it/s]
Processing congresses:   6%|▌         | 2/36 [16:30<4:40:53, 495.68s/it]


Congress 80 results:
Number of speeches processed: 1000
Columns: ['congress', 'speech_id', 'party', 'issues', 'emotional_intensity', 'political_spectrum', 'emotional_confidence', 'political_confidence']
Party distribution: party
D    500
R    500
Name: count, dtype: int64
Number of speeches in congress 81: 1000


Processing speeches: 100%|██████████| 1000/1000 [08:27<00:00,  1.97it/s]
Processing congresses:   8%|▊         | 3/36 [24:57<4:35:30, 500.92s/it]


Congress 81 results:
Number of speeches processed: 1000
Columns: ['congress', 'speech_id', 'party', 'issues', 'emotional_intensity', 'political_spectrum', 'emotional_confidence', 'political_confidence']
Party distribution: party
D    500
R    500
Name: count, dtype: int64
Number of speeches in congress 82: 1000


Processing speeches: 100%|██████████| 1000/1000 [08:31<00:00,  1.95it/s]
Processing congresses:  11%|█         | 4/36 [33:28<4:29:24, 505.15s/it]


Congress 82 results:
Number of speeches processed: 1000
Columns: ['congress', 'speech_id', 'party', 'issues', 'emotional_intensity', 'political_spectrum', 'emotional_confidence', 'political_confidence']
Party distribution: party
D    500
R    500
Name: count, dtype: int64
Number of speeches in congress 83: 1000


Processing speeches: 100%|██████████| 1000/1000 [08:24<00:00,  1.98it/s]
Processing congresses:  14%|█▍        | 5/36 [41:53<4:20:49, 504.82s/it]


Congress 83 results:
Number of speeches processed: 1000
Columns: ['congress', 'speech_id', 'party', 'issues', 'emotional_intensity', 'political_spectrum', 'emotional_confidence', 'political_confidence']
Party distribution: party
D    500
R    500
Name: count, dtype: int64
Number of speeches in congress 84: 1000


Processing speeches: 100%|██████████| 1000/1000 [09:19<00:00,  1.79it/s]
Processing congresses:  17%|█▋        | 6/36 [51:12<4:21:40, 523.34s/it]


Congress 84 results:
Number of speeches processed: 1000
Columns: ['congress', 'speech_id', 'party', 'issues', 'emotional_intensity', 'political_spectrum', 'emotional_confidence', 'political_confidence']
Party distribution: party
D    500
R    500
Name: count, dtype: int64
Number of speeches in congress 85: 1000


Processing speeches: 100%|██████████| 1000/1000 [09:23<00:00,  1.78it/s]
Processing congresses:  19%|█▉        | 7/36 [1:00:35<4:19:16, 536.42s/it]


Congress 85 results:
Number of speeches processed: 1000
Columns: ['congress', 'speech_id', 'party', 'issues', 'emotional_intensity', 'political_spectrum', 'emotional_confidence', 'political_confidence']
Party distribution: party
D    500
R    500
Name: count, dtype: int64
Number of speeches in congress 86: 1000


Processing speeches: 100%|██████████| 1000/1000 [09:49<00:00,  1.70it/s]
Processing congresses:  22%|██▏       | 8/36 [1:10:25<4:18:14, 553.38s/it]


Congress 86 results:
Number of speeches processed: 1000
Columns: ['congress', 'speech_id', 'party', 'issues', 'emotional_intensity', 'political_spectrum', 'emotional_confidence', 'political_confidence']
Party distribution: party
D    500
R    500
Name: count, dtype: int64
Number of speeches in congress 87: 1000


Processing speeches: 100%|██████████| 1000/1000 [09:18<00:00,  1.79it/s]
Processing congresses:  25%|██▌       | 9/36 [1:19:43<4:09:41, 554.86s/it]


Congress 87 results:
Number of speeches processed: 1000
Columns: ['congress', 'speech_id', 'party', 'issues', 'emotional_intensity', 'political_spectrum', 'emotional_confidence', 'political_confidence']
Party distribution: party
D    500
R    500
Name: count, dtype: int64
Number of speeches in congress 88: 1000


Processing speeches: 100%|██████████| 1000/1000 [24:15<00:00,  1.46s/it]
Processing congresses:  28%|██▊       | 10/36 [1:43:58<6:00:52, 832.79s/it]


Congress 88 results:
Number of speeches processed: 1000
Columns: ['congress', 'speech_id', 'party', 'issues', 'emotional_intensity', 'political_spectrum', 'emotional_confidence', 'political_confidence']
Party distribution: party
D    500
R    500
Name: count, dtype: int64
Number of speeches in congress 89: 1000


Processing speeches: 100%|██████████| 1000/1000 [21:19<00:00,  1.28s/it]
Processing congresses:  31%|███       | 11/36 [2:05:18<6:43:56, 969.44s/it]


Congress 89 results:
Number of speeches processed: 1000
Columns: ['congress', 'speech_id', 'party', 'issues', 'emotional_intensity', 'political_spectrum', 'emotional_confidence', 'political_confidence']
Party distribution: party
D    500
R    500
Name: count, dtype: int64
Number of speeches in congress 90: 1000


Processing speeches: 100%|██████████| 1000/1000 [08:36<00:00,  1.94it/s]
Processing congresses:  33%|███▎      | 12/36 [2:13:54<5:32:38, 831.62s/it]


Congress 90 results:
Number of speeches processed: 1000
Columns: ['congress', 'speech_id', 'party', 'issues', 'emotional_intensity', 'political_spectrum', 'emotional_confidence', 'political_confidence']
Party distribution: party
D    500
R    500
Name: count, dtype: int64
Number of speeches in congress 91: 1000


Processing speeches: 100%|██████████| 1000/1000 [08:43<00:00,  1.91it/s]
Processing congresses:  36%|███▌      | 13/36 [2:22:37<4:42:58, 738.18s/it]


Congress 91 results:
Number of speeches processed: 1000
Columns: ['congress', 'speech_id', 'party', 'issues', 'emotional_intensity', 'political_spectrum', 'emotional_confidence', 'political_confidence']
Party distribution: party
D    500
R    500
Name: count, dtype: int64
Number of speeches in congress 92: 1000


Processing speeches: 100%|██████████| 1000/1000 [08:54<00:00,  1.87it/s]
Processing congresses:  39%|███▉      | 14/36 [2:31:32<4:08:10, 676.83s/it]


Congress 92 results:
Number of speeches processed: 1000
Columns: ['congress', 'speech_id', 'party', 'issues', 'emotional_intensity', 'political_spectrum', 'emotional_confidence', 'political_confidence']
Party distribution: party
D    500
R    500
Name: count, dtype: int64
Number of speeches in congress 93: 1000


Processing speeches: 100%|██████████| 1000/1000 [09:41<00:00,  1.72it/s]
Processing congresses:  42%|████▏     | 15/36 [2:41:13<3:46:48, 648.03s/it]


Congress 93 results:
Number of speeches processed: 1000
Columns: ['congress', 'speech_id', 'party', 'issues', 'emotional_intensity', 'political_spectrum', 'emotional_confidence', 'political_confidence']
Party distribution: party
D    500
R    500
Name: count, dtype: int64
Number of speeches in congress 94: 1000


Processing speeches: 100%|██████████| 1000/1000 [10:49<00:00,  1.54it/s]
Processing congresses:  44%|████▍     | 16/36 [2:52:03<3:36:09, 648.45s/it]


Congress 94 results:
Number of speeches processed: 1000
Columns: ['congress', 'speech_id', 'party', 'issues', 'emotional_intensity', 'political_spectrum', 'emotional_confidence', 'political_confidence']
Party distribution: party
D    500
R    500
Name: count, dtype: int64
Number of speeches in congress 95: 1000


Processing speeches: 100%|██████████| 1000/1000 [11:02<00:00,  1.51it/s]
Processing congresses:  47%|████▋     | 17/36 [3:03:05<3:26:40, 652.65s/it]


Congress 95 results:
Number of speeches processed: 1000
Columns: ['congress', 'speech_id', 'party', 'issues', 'emotional_intensity', 'political_spectrum', 'emotional_confidence', 'political_confidence']
Party distribution: party
D    500
R    500
Name: count, dtype: int64
Number of speeches in congress 96: 1000


Processing speeches: 100%|██████████| 1000/1000 [10:06<00:00,  1.65it/s]
Processing congresses:  50%|█████     | 18/36 [3:13:12<3:11:38, 638.80s/it]


Congress 96 results:
Number of speeches processed: 1000
Columns: ['congress', 'speech_id', 'party', 'issues', 'emotional_intensity', 'political_spectrum', 'emotional_confidence', 'political_confidence']
Party distribution: party
D    500
R    500
Name: count, dtype: int64
Number of speeches in congress 97: 1000


Processing speeches: 100%|██████████| 1000/1000 [09:27<00:00,  1.76it/s]
Processing congresses:  53%|█████▎    | 19/36 [3:22:39<2:54:53, 617.26s/it]


Congress 97 results:
Number of speeches processed: 1000
Columns: ['congress', 'speech_id', 'party', 'issues', 'emotional_intensity', 'political_spectrum', 'emotional_confidence', 'political_confidence']
Party distribution: party
D    500
R    500
Name: count, dtype: int64
Number of speeches in congress 98: 1000


Processing speeches: 100%|██████████| 1000/1000 [09:54<00:00,  1.68it/s]
Processing congresses:  56%|█████▌    | 20/36 [3:32:34<2:42:47, 610.48s/it]


Congress 98 results:
Number of speeches processed: 1000
Columns: ['congress', 'speech_id', 'party', 'issues', 'emotional_intensity', 'political_spectrum', 'emotional_confidence', 'political_confidence']
Party distribution: party
D    500
R    500
Name: count, dtype: int64
Number of speeches in congress 99: 1000


Processing speeches: 100%|██████████| 1000/1000 [12:28<00:00,  1.34it/s]
Processing congresses:  58%|█████▊    | 21/36 [3:45:02<2:42:57, 651.85s/it]


Congress 99 results:
Number of speeches processed: 1000
Columns: ['congress', 'speech_id', 'party', 'issues', 'emotional_intensity', 'political_spectrum', 'emotional_confidence', 'political_confidence']
Party distribution: party
D    500
R    500
Name: count, dtype: int64
Number of speeches in congress 100: 1000


Processing speeches: 100%|██████████| 1000/1000 [11:57<00:00,  1.39it/s]
Processing congresses:  61%|██████    | 22/36 [3:56:59<2:36:40, 671.44s/it]


Congress 100 results:
Number of speeches processed: 1000
Columns: ['congress', 'speech_id', 'party', 'issues', 'emotional_intensity', 'political_spectrum', 'emotional_confidence', 'political_confidence']
Party distribution: party
D    500
R    500
Name: count, dtype: int64
Number of speeches in congress 101: 1000


Processing speeches: 100%|██████████| 1000/1000 [12:18<00:00,  1.35it/s]
Processing congresses:  64%|██████▍   | 23/36 [4:09:18<2:29:51, 691.66s/it]


Congress 101 results:
Number of speeches processed: 1000
Columns: ['congress', 'speech_id', 'party', 'issues', 'emotional_intensity', 'political_spectrum', 'emotional_confidence', 'political_confidence']
Party distribution: party
D    500
R    500
Name: count, dtype: int64
Number of speeches in congress 102: 1000


Processing speeches: 100%|██████████| 1000/1000 [09:19<00:00,  1.79it/s]
Processing congresses:  67%|██████▋   | 24/36 [4:18:37<2:10:23, 651.96s/it]


Congress 102 results:
Number of speeches processed: 1000
Columns: ['congress', 'speech_id', 'party', 'issues', 'emotional_intensity', 'political_spectrum', 'emotional_confidence', 'political_confidence']
Party distribution: party
D    500
R    500
Name: count, dtype: int64
Number of speeches in congress 103: 1000


Processing speeches: 100%|██████████| 1000/1000 [09:09<00:00,  1.82it/s]
Processing congresses:  69%|██████▉   | 25/36 [4:27:47<1:53:54, 621.28s/it]


Congress 103 results:
Number of speeches processed: 1000
Columns: ['congress', 'speech_id', 'party', 'issues', 'emotional_intensity', 'political_spectrum', 'emotional_confidence', 'political_confidence']
Party distribution: party
D    500
R    500
Name: count, dtype: int64
Number of speeches in congress 104: 1000


Processing speeches: 100%|██████████| 1000/1000 [08:50<00:00,  1.88it/s]
Processing congresses:  72%|███████▏  | 26/36 [4:36:38<1:39:00, 594.08s/it]


Congress 104 results:
Number of speeches processed: 1000
Columns: ['congress', 'speech_id', 'party', 'issues', 'emotional_intensity', 'political_spectrum', 'emotional_confidence', 'political_confidence']
Party distribution: party
D    500
R    500
Name: count, dtype: int64
Number of speeches in congress 105: 1000


Processing speeches: 100%|██████████| 1000/1000 [08:53<00:00,  1.87it/s]
Processing congresses:  75%|███████▌  | 27/36 [4:45:31<1:26:24, 576.01s/it]


Congress 105 results:
Number of speeches processed: 1000
Columns: ['congress', 'speech_id', 'party', 'issues', 'emotional_intensity', 'political_spectrum', 'emotional_confidence', 'political_confidence']
Party distribution: party
D    500
R    500
Name: count, dtype: int64
Number of speeches in congress 106: 1000


Processing speeches: 100%|██████████| 1000/1000 [09:20<00:00,  1.78it/s]
Processing congresses:  78%|███████▊  | 28/36 [4:54:52<1:16:11, 571.44s/it]


Congress 106 results:
Number of speeches processed: 1000
Columns: ['congress', 'speech_id', 'party', 'issues', 'emotional_intensity', 'political_spectrum', 'emotional_confidence', 'political_confidence']
Party distribution: party
D    500
R    500
Name: count, dtype: int64
Number of speeches in congress 107: 1000


Processing speeches: 100%|██████████| 1000/1000 [09:50<00:00,  1.69it/s]
Processing congresses:  81%|████████  | 29/36 [5:04:43<1:07:20, 577.26s/it]


Congress 107 results:
Number of speeches processed: 1000
Columns: ['congress', 'speech_id', 'party', 'issues', 'emotional_intensity', 'political_spectrum', 'emotional_confidence', 'political_confidence']
Party distribution: party
D    500
R    500
Name: count, dtype: int64
Number of speeches in congress 108: 1000


Processing speeches: 100%|██████████| 1000/1000 [09:48<00:00,  1.70it/s]
Processing congresses:  83%|████████▎ | 30/36 [5:14:32<58:04, 580.70s/it]  


Congress 108 results:
Number of speeches processed: 1000
Columns: ['congress', 'speech_id', 'party', 'issues', 'emotional_intensity', 'political_spectrum', 'emotional_confidence', 'political_confidence']
Party distribution: party
D    500
R    500
Name: count, dtype: int64
Number of speeches in congress 109: 1000


Processing speeches: 100%|██████████| 1000/1000 [09:49<00:00,  1.70it/s]
Processing congresses:  86%|████████▌ | 31/36 [5:24:21<48:36, 583.22s/it]


Congress 109 results:
Number of speeches processed: 1000
Columns: ['congress', 'speech_id', 'party', 'issues', 'emotional_intensity', 'political_spectrum', 'emotional_confidence', 'political_confidence']
Party distribution: party
D    500
R    500
Name: count, dtype: int64
Number of speeches in congress 110: 1000


Processing speeches: 100%|██████████| 1000/1000 [09:50<00:00,  1.69it/s]
Processing congresses:  89%|████████▉ | 32/36 [5:34:11<39:01, 585.37s/it]


Congress 110 results:
Number of speeches processed: 1000
Columns: ['congress', 'speech_id', 'party', 'issues', 'emotional_intensity', 'political_spectrum', 'emotional_confidence', 'political_confidence']
Party distribution: party
D    500
R    500
Name: count, dtype: int64
Number of speeches in congress 111: 1000


Processing speeches: 100%|██████████| 1000/1000 [09:50<00:00,  1.69it/s]
Processing congresses:  92%|█████████▏| 33/36 [5:44:02<29:20, 586.99s/it]


Congress 111 results:
Number of speeches processed: 1000
Columns: ['congress', 'speech_id', 'party', 'issues', 'emotional_intensity', 'political_spectrum', 'emotional_confidence', 'political_confidence']
Party distribution: party
D    500
R    500
Name: count, dtype: int64
Number of speeches in congress 112: 1000


Processing speeches: 100%|██████████| 1000/1000 [09:51<00:00,  1.69it/s]
Processing congresses:  94%|█████████▍| 34/36 [5:53:53<19:36, 588.22s/it]


Congress 112 results:
Number of speeches processed: 1000
Columns: ['congress', 'speech_id', 'party', 'issues', 'emotional_intensity', 'political_spectrum', 'emotional_confidence', 'political_confidence']
Party distribution: party
D    500
R    500
Name: count, dtype: int64
Number of speeches in congress 113: 1000


Processing speeches: 100%|██████████| 1000/1000 [09:47<00:00,  1.70it/s]
Processing congresses:  97%|█████████▋| 35/36 [6:03:40<09:47, 587.90s/it]


Congress 113 results:
Number of speeches processed: 1000
Columns: ['congress', 'speech_id', 'party', 'issues', 'emotional_intensity', 'political_spectrum', 'emotional_confidence', 'political_confidence']
Party distribution: party
D    500
R    500
Name: count, dtype: int64
Number of speeches in congress 114: 1000


Processing speeches: 100%|██████████| 1000/1000 [09:57<00:00,  1.67it/s]
Processing congresses: 100%|██████████| 36/36 [6:13:38<00:00, 622.73s/it]



Congress 114 results:
Number of speeches processed: 1000
Columns: ['congress', 'speech_id', 'party', 'issues', 'emotional_intensity', 'political_spectrum', 'emotional_confidence', 'political_confidence']
Party distribution: party
D    500
R    500
Name: count, dtype: int64

Analyzing framing shifts...
Analyzing framing shifts...

Calculating polarization metrics...
Calculating polarization metrics...

Generating visualization plots...

Saving analysis results...

Generating summary report...

Analysis complete! Results saved to 'analysis_results' directory.

Generated files:
1. analysis_results/plots/ - Visualization plots
2. analysis_results/trend_data.json - Raw trend data
3. analysis_results/summary_report.txt - Analysis summary


In [None]:
def _issue_metric_matrices(analyzer, congresses):
    """Build the issue-by-congress matrices that feed the three heatmaps.

    Args:
        analyzer: Object exposing ``ISSUE_MAP`` (issue label -> display name)
            and ``analysis_results`` (congress key -> DataFrame with
            ``issues``, ``emotional_intensity`` and ``political_spectrum``
            columns).
        congresses: Congress keys in the column order to use.

    Returns:
        Tuple ``(issues, prevalence, emotional, political)``: the list of
        display names (row order) and three float arrays of shape
        ``(len(issues), len(congresses))``.
    """
    issue_items = list(analyzer.ISSUE_MAP.items())
    issues = [name for _, name in issue_items]
    shape = (len(issues), len(congresses))

    # 0 % is a genuine prevalence value, so that matrix starts at zero.
    prevalence = np.zeros(shape)
    # A mean over zero speeches is undefined. NaN (rather than 0) keeps
    # "no data" from being drawn as a real score below the 1-5 colour scale.
    emotional = np.full(shape, np.nan)
    political = np.full(shape, np.nan)

    for col, congress in enumerate(congresses):
        df = analyzer.analysis_results[congress]
        for row, (label, _) in enumerate(issue_items):
            # Each 'issues' entry is a container tested with `in`.
            tagged = df['issues'].apply(lambda entry: label in entry)
            issue_df = df[tagged]
            if issue_df.empty:
                continue
            prevalence[row, col] = len(issue_df) / len(df) * 100
            emotional[row, col] = issue_df['emotional_intensity'].mean()
            political[row, col] = issue_df['political_spectrum'].mean()

    return issues, prevalence, emotional, political


def _save_heatmap(matrix, congresses, issues, title, path, cmap, fmt, **heatmap_kwargs):
    """Draw one annotated issue-by-congress heatmap and write it to ``path``."""
    # Size is set on the figure itself so plt.rcParams is left untouched
    # for the rest of the notebook.
    fig, ax = plt.subplots(figsize=(20, 10), dpi=100)
    sns.heatmap(matrix,
                xticklabels=congresses,
                yticklabels=issues,
                cmap=cmap,
                annot=True,
                fmt=fmt,
                ax=ax,
                **heatmap_kwargs)  # seaborn masks NaN cells automatically
    ax.set_title(title)
    ax.set_xlabel('Congress')
    ax.set_ylabel('Issue')
    ax.tick_params(axis='x', labelrotation=45)
    fig.tight_layout()
    fig.savefig(path)
    # Close explicitly so repeated calls do not accumulate open figures.
    plt.close(fig)


def remake_heatmaps(analyzer, save_dir='analysis_results/plots/issue_heatmaps'):
    """Recreate the issue heatmaps using every congress held by ``analyzer``.

    Writes ``issue_prevalence.png``, ``issue_emotional.png`` and
    ``issue_political.png`` into ``save_dir`` (created if missing). Columns
    are congresses sorted numerically; rows follow ``analyzer.ISSUE_MAP``
    order. Issue/congress pairs with no matching speeches show 0 % prevalence
    and are left blank in the emotional and political heatmaps.

    Args:
        analyzer: Object exposing ``ISSUE_MAP`` (issue label -> display name)
            and ``analysis_results`` (congress key -> DataFrame with
            ``issues``, ``emotional_intensity`` and ``political_spectrum``
            columns).
        save_dir: Directory the PNG files are written to.

    Returns:
        None.
    """
    os.makedirs(save_dir, exist_ok=True)

    # key=int forces numeric ordering even when the keys are strings
    # (otherwise '100' would sort before '99').
    congresses = sorted(analyzer.analysis_results.keys(), key=int)
    issues, prevalence, emotional, political = _issue_metric_matrices(analyzer, congresses)

    _save_heatmap(prevalence, congresses, issues,
                  title='Issue Prevalence Over Time (%)',
                  path=os.path.join(save_dir, 'issue_prevalence.png'),
                  cmap='YlOrRd', fmt='.1f')
    # Both score heatmaps share a colour scale pinned to 1-5.
    _save_heatmap(emotional, congresses, issues,
                  title='Average Emotional Intensity by Issue Over Time',
                  path=os.path.join(save_dir, 'issue_emotional.png'),
                  cmap='RdBu_r', fmt='.2f', vmin=1, vmax=5)
    _save_heatmap(political, congresses, issues,
                  title='Average Political Position by Issue Over Time',
                  path=os.path.join(save_dir, 'issue_political.png'),
                  cmap='RdBu_r', fmt='.2f', vmin=1, vmax=5)

# Regenerate the issue heatmaps from an analyzer that already holds results.
# NOTE(review): `output_analyzer` is not defined in this cell; it must already
# exist in the kernel (presumably created by the analysis run above — confirm
# this still holds under Restart & Run All).
remake_heatmaps(output_analyzer)