In [48]:
import os
import json
from typing import Dict, List
import openai
from tqdm import tqdm
from pydantic import BaseModel, Field
from dotenv import load_dotenv
from collections import Counter, defaultdict
import matplotlib.pyplot as plt
import pandas as pd

# Load environment variables
load_dotenv()
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [49]:
class Speech(BaseModel):
    speech_id: str
    emotional_intensity: int = Field(ge=1, le=10)  # Ensure value between 1 and 10
    political_spectrum: int = Field(ge=1, le=10)   # Ensure value between 1 and 10

class Speeches(BaseModel):
    speeches: List[Speech]

In [50]:
def read_speech_file(file_path: str) -> Dict[str, str]:
    """
    Reads a speech file and returns a dictionary mapping speech_ids to speeches.
    Only includes speeches with more than 30 words.
    """
    speeches = {}
    with open(file_path, 'r') as file:
        # Skip header line
        next(file)
        for line in file:
            # Split on pipe character
            parts = line.strip().split('|')
            if len(parts) == 2:
                speech_id, speech = parts
                # Only add speech if it has more than 30 words
                if len(speech.split()) > 30:
                    speeches[speech_id] = speech
    return speeches

In [51]:
def chunk_speeches(speeches: Dict[str, str], max_chunk_size: int = 20000) -> List[Dict[str, str]]:
    """
    Splits speeches into chunks while keeping individual speeches intact.
    """
    chunks = []
    current_chunk = {}
    current_size = 0
    
    for speech_id, speech in speeches.items():
        speech_size = len(speech)
        # If adding this speech would exceed max size and we already have speeches,
        # start a new chunk
        if current_size + speech_size > max_chunk_size and current_chunk:
            chunks.append(current_chunk)
            current_chunk = {}
            current_size = 0
        
        current_chunk[speech_id] = speech
        current_size += speech_size
    
    # Add the last chunk if it's not empty
    if current_chunk:
        chunks.append(current_chunk)
    
    return chunks

In [52]:
def analyze_speeches(speeches: Dict[str, str]):
    """
    Uses OpenAI's API to analyze emotional intensity and political spectrum of speeches.
    """
    # Prepare the speeches for analysis
    speeches_text = "\n\n".join([f"Speech ID: {id}\nContent: {text}" for id, text in speeches.items()])
    
    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": """
                    Analyze each speech and rate it on two scales:

                    1. Emotional Intensity (1-5):
                    - 1: Neutral and technical; purely factual presentation with minimal personal expression
                    - 2: Mild emotional content; professional tone with clear stance and moderate conviction
                    - 3: Moderate emotional engagement; balanced but passionate delivery
                    - 4: Strong emotional content; powerful rhetoric and clear passion
                    - 5: Extremely emotional; intense passion, dramatic language, and strong calls to action

                    2. Political Spectrum (1-5):
                    - 1: Strongly Progressive (major reforms, significant system change, strong left policies)
                    - 2: Moderately Progressive (incremental changes, center-left policies)
                    - 3: Centrist (balance of progressive and traditional views)
                    - 4: Moderately Conservative (traditional values, center-right policies)
                    - 5: Strongly Conservative (emphasis on traditional values, major system preservation)

                    Consider factors like:
                    - Language and rhetoric used
                    - Policy positions expressed
                    - Values emphasized
                    - Economic and social views
                    - Treatment of traditional vs progressive values

                    Output should be in JSON format containing a list of objects, each with:
                    - speech_id
                    - emotional_intensity (integer 1-5)
                    - political_spectrum (integer 1-5)

                    Be objective and consistent in your ratings. Use the full range of the scale when appropriate - don't hesitate to use any number if it best matches the speech's content.
                    """
                },
                {
                    "role": "user",
                    "content": speeches_text
                }
            ],
            response_format={
                "type": "json_schema",
                "json_schema": {
                    "name":"_",
                    "schema": Speeches.model_json_schema()
                }
            },
            temperature=0.3,
            timeout=600  # 10 minute timeout
        )
        return response.choices[0].message.content
    except Exception as e:
        print(f"Error during API call: {str(e)}")
        raise

In [53]:
def process_speech_file(file_path: str, output_dir: str, max_chunks: int=None):
    """
    Process a speech file and save the results.
    """
    print(f"Processing file: {file_path}")
    # Read speeches
    speeches = read_speech_file(file_path)
    print(f"Found {len(speeches)} speeches")
    
    # Split into chunks
    chunks = chunk_speeches(speeches)
    print(f"Split into {len(chunks)} chunks")
    
    # Process each chunk and combine results
    all_results = {}
    for i, chunk in enumerate(tqdm(chunks, desc="Processing chunks")):
        try:
            # Get analysis for speeches in this chunk
            analysis_json = analyze_speeches(chunk)
            analysis_results = json.loads(analysis_json)
            
            # Combine speech text with analysis
            for speech in analysis_results["speeches"]:
                speech_id = speech["speech_id"]
                all_results[speech_id] = {
                    "speech": speeches[speech_id],
                    "emotional_intensity": speech["emotional_intensity"],
                    "political_spectrum": speech["political_spectrum"]
                }
            # print if i mod 10 == 0
            if i % 10 == 0:
                print(f"Successfully processed chunk {i+1}")
            
            # break after max_chunks
            if max_chunks and i == max_chunks:
                break
                
        except Exception as e:
            print(f"Error processing chunk {i+1}: {str(e)}")
            continue
    
    # Save results
    output_file = os.path.join(output_dir, os.path.basename(file_path).replace('.txt', '_gpt_axis_labels.json'))
    os.makedirs(output_dir, exist_ok=True)
    with open(output_file, 'w') as f:
        json.dump(all_results, f, indent=2)
    print(f"\nResults saved to {output_file}")

In [54]:
def process_speech_files(speech_files: List[str], input_dir: str, output_dir: str, max_chunks: int=None):
    """
    Process multiple speech files.
    """
    for file_name in tqdm(speech_files, desc="Processing files"):
        file_path = os.path.join(input_dir, file_name)
        process_speech_file(file_path, output_dir, max_chunks)

In [55]:
def create_visualizations(emotional_df: pd.DataFrame, political_df: pd.DataFrame):
    """
    Create visualizations for the rating distributions.
    """
    # Create a figure with two subplots
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))
    
    # Plot Emotional Intensity
    ax1.bar(emotional_df['Rating'], emotional_df['Percentage'])
    ax1.set_title('Distribution of Emotional Intensity Ratings')
    ax1.set_xlabel('Emotional Intensity Rating (1-5)')
    ax1.set_ylabel('Percentage of Speeches')
    ax1.set_xticks(range(1, 6))
    
    # Plot Political Spectrum
    ax2.bar(political_df['Rating'], political_df['Percentage'])
    ax2.set_title('Distribution of Political Spectrum Ratings')
    ax2.set_xlabel('Political Spectrum Rating (1=Far Left, 5=Far Right)')
    ax2.set_ylabel('Percentage of Speeches')
    ax2.set_xticks(range(1, 6))
    
    # Adjust layout and save
    plt.tight_layout()
    plt.savefig('rating_distributions.png')
    plt.close()

In [56]:
def print_distribution(counts: dict, total: int):
    """
    Print the distribution of ratings in a formatted way.
    """
    for rating in range(1, 6):
        count = counts.get(rating, 0)
        percentage = (count / total) * 100
        print(f"Rating {rating}: {count:4d} speeches ({percentage:5.1f}%)")

In [57]:
def analyze_ratings(input_dir: str):
    """
    Analyze the distribution of emotional intensity and political spectrum ratings
    from all JSON files in the input directory.
    """
    # Initialize counters for both metrics
    emotional_counts = defaultdict(int)
    political_counts = defaultdict(int)
    total_speeches = 0
    
    # Process each JSON file in the directory
    for filename in os.listdir(input_dir):
        if filename.endswith('_gpt_axis_labels.json'):
            file_path = os.path.join(input_dir, filename)
            with open(file_path, 'r') as f:
                data = json.load(f)
                
                # Count frequencies for each rating
                for speech_id, speech_data in data.items():
                    emotional_counts[speech_data['emotional_intensity']] += 1
                    political_counts[speech_data['political_spectrum']] += 1
                    total_speeches += 1
    
    # Convert to pandas DataFrames for easier analysis and visualization
    emotional_df = pd.DataFrame([
        {'Rating': rating, 'Count': count, 'Percentage': (count/total_speeches)*100}
        for rating, count in sorted(emotional_counts.items())
    ])
    
    political_df = pd.DataFrame([
        {'Rating': rating, 'Count': count, 'Percentage': (count/total_speeches)*100}
        for rating, count in sorted(political_counts.items())
    ])
    
    # Print summary statistics
    print(f"Total speeches analyzed: {total_speeches}\n")
    print("Emotional Intensity Distribution:")
    print_distribution(emotional_counts, total_speeches)
    print("\nPolitical Spectrum Distribution:")
    print_distribution(political_counts, total_speeches)
    
    # Create visualizations
    create_visualizations(emotional_df, political_df)
    
    return {
        'emotional_intensity': dict(emotional_counts),
        'political_spectrum': dict(political_counts),
        'total_speeches': total_speeches
    }

In [58]:
def calculate_correlations(input_dir: str):
    """
    Calculate correlation between emotional intensity and political spectrum ratings.
    """
    ratings_pairs = []
    
    for filename in os.listdir(input_dir):
        if filename.endswith('_gpt_axis_labels.json'):
            with open(os.path.join(input_dir, filename), 'r') as f:
                data = json.load(f)
                for speech_data in data.values():
                    ratings_pairs.append({
                        'emotional_intensity': speech_data['emotional_intensity'],
                        'political_spectrum': speech_data['political_spectrum']
                    })
    
    df = pd.DataFrame(ratings_pairs)
    correlation = df['emotional_intensity'].corr(df['political_spectrum'])
    
    print(f"\nCorrelation between Emotional Intensity and Political Spectrum: {correlation:.3f}")
    
    # Create correlation visualization
    plt.figure(figsize=(8, 6))
    plt.scatter(df['political_spectrum'], df['emotional_intensity'], alpha=0.5)
    plt.title('Correlation: Emotional Intensity vs Political Spectrum')
    plt.xlabel('Political Spectrum Rating (1=Far Left, 5=Far Right)')
    plt.ylabel('Emotional Intensity Rating')
    plt.grid(True)
    plt.savefig('rating_correlation.png')
    plt.close()

In [59]:
if __name__ == "__main__":
    input_dir = "../small_speech_data"
    output_dir = "outputs"
    speech_files = [f for f in os.listdir(input_dir) if f.startswith("speeches_") and f.endswith(".txt")]
    process_speech_files(speech_files, input_dir, output_dir, max_chunks=100) # doing 100 max_chunks instead of 10 now

    # axis gpt output analysis:

    input_dir = "outputs"  # Directory containing the analysis JSON files
    
    # Analyze distributions
    results = analyze_ratings(input_dir)
    
    # Calculate correlations
    calculate_correlations(input_dir)

Processing files:   0%|          | 0/2 [00:00<?, ?it/s]

Processing file: ../small_speech_data/speeches_113_trimmed.txt
Found 6563 speeches
Split into 891 chunks




Successfully processed chunk 1




Successfully processed chunk 11




Successfully processed chunk 21




Successfully processed chunk 31




Successfully processed chunk 41




Successfully processed chunk 51




Successfully processed chunk 61




Successfully processed chunk 71




Successfully processed chunk 81




Successfully processed chunk 91


Processing chunks:  11%|█         | 100/891 [06:03<47:53,  3.63s/it]
Processing files:  50%|█████     | 1/2 [06:03<06:03, 363.51s/it]

Successfully processed chunk 101

Results saved to outputs/speeches_113_trimmed_gpt_axis_labels.json
Processing file: ../small_speech_data/speeches_114_trimmed.txt
Found 5456 speeches
Split into 668 chunks




Successfully processed chunk 1




Successfully processed chunk 11




Successfully processed chunk 21




Successfully processed chunk 31




Successfully processed chunk 41




Successfully processed chunk 51




Successfully processed chunk 61




Successfully processed chunk 71




Successfully processed chunk 81




Successfully processed chunk 91


Processing chunks:  15%|█▍        | 100/668 [05:41<32:20,  3.42s/it]
Processing files: 100%|██████████| 2/2 [11:45<00:00, 352.62s/it]

Successfully processed chunk 101

Results saved to outputs/speeches_114_trimmed_gpt_axis_labels.json
Total speeches analyzed: 1812

Emotional Intensity Distribution:
Rating 1:  444 speeches ( 24.5%)
Rating 2:  289 speeches ( 15.9%)
Rating 3:  483 speeches ( 26.7%)
Rating 4:  525 speeches ( 29.0%)
Rating 5:   71 speeches (  3.9%)

Political Spectrum Distribution:
Rating 1:  176 speeches (  9.7%)
Rating 2:  402 speeches ( 22.2%)
Rating 3:  947 speeches ( 52.3%)
Rating 4:  186 speeches ( 10.3%)
Rating 5:  101 speeches (  5.6%)






Correlation between Emotional Intensity and Political Spectrum: -0.220
