In [32]:
import os
import json
from typing import Dict, List
import openai
from tqdm import tqdm
from pydantic import BaseModel
from dotenv import load_dotenv
from collections import Counter

# Load environment variables
# (presumably from a local .env file via python-dotenv — confirm the file location).
load_dotenv()
# Module-level OpenAI client shared by extract_speech_topics.
# os.environ.get returns None when OPENAI_API_KEY is not set, so a missing key
# is passed on to the OpenAI constructor as None rather than failing here.
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [33]:
# Topic labels for one speech, in the shape the model is asked to return.
# NOTE(review): documented with comments instead of a docstring on purpose —
# pydantic is understood to copy class docstrings into the generated JSON
# schema, and that schema is sent to the API by extract_speech_topics.
# Confirm before converting these comments to docstrings.
class Speech(BaseModel):
    # Identifier of the speech; process_speech_file uses it to look the
    # original text back up.
    speech_id: str
    # Topic names assigned to this speech.
    topics: List[str]
    
# Top-level response container: the list of per-speech topic assignments.
# Its JSON schema (Speeches.model_json_schema()) is what extract_speech_topics
# passes as the response_format schema, and process_speech_file reads the
# parsed result through the "speeches" key.
class Speeches(BaseModel):
    # One entry per labelled speech.
    speeches: List[Speech]

In [34]:
def read_speech_file(file_path: str, min_words: int = 35) -> Dict[str, str]:
    """
    Read a pipe-delimited speech file into a ``{speech_id: speech}`` mapping.

    The first line is treated as a header and skipped. Every following line is
    expected to look like ``speech_id|speech text``; lines that contain no pipe
    are ignored. Only speeches with MORE than ``min_words`` whitespace-separated
    words are kept.

    Args:
        file_path: Path to the speech file.
        min_words: Exclusive lower bound on the word count a speech needs in
            order to be kept. Defaults to 35.

    Returns:
        Dictionary mapping speech_id to speech text. If the same id appears on
        several qualifying lines, the last one wins. An empty file yields an
        empty dictionary.
    """
    speeches = {}
    with open(file_path, 'r') as file:
        # Skip the header line. The default keeps a completely empty file from
        # raising StopIteration.
        next(file, None)
        for line in file:
            # Split on the FIRST pipe only: a '|' inside the speech text must
            # not cause the whole speech to be silently discarded.
            speech_id, separator, speech = line.strip().partition('|')
            if not separator:
                # No pipe at all -> not a "speech_id|speech" record.
                continue
            if len(speech.split()) > min_words:
                speeches[speech_id] = speech
    return speeches


In [35]:
def chunk_speeches(speeches: Dict[str, str], max_chunk_size: int = 20000) -> List[Dict[str, str]]:
    """
    Group speeches into consecutive batches bounded by total character count.

    Speeches are taken in the mapping's iteration order and are never split:
    a batch is closed as soon as adding the next speech would push its summed
    character length past ``max_chunk_size``. A single speech longer than the
    limit therefore ends up alone in its own (oversized) batch.

    Args:
        speeches: Mapping of speech_id to speech text.
        max_chunk_size: Maximum summed ``len(text)`` per batch.

    Returns:
        List of ``{speech_id: text}`` dictionaries; empty when ``speeches`` is empty.
    """
    batches = []
    batch = {}
    batch_chars = 0

    for sid, text in speeches.items():
        n_chars = len(text)
        fits = batch_chars + n_chars <= max_chunk_size

        # Only close the batch if it already holds something; otherwise an
        # oversized speech would produce an empty batch before it.
        if batch and not fits:
            batches.append(batch)
            batch, batch_chars = {}, 0

        batch[sid] = text
        batch_chars += n_chars

    # Flush whatever is left over after the final speech.
    if batch:
        batches.append(batch)

    return batches

In [36]:
def extract_speech_topics(speeches: Dict[str, str]):
    """
    Ask the OpenAI chat completions API to label each speech with topics.

    All speeches are concatenated into a single user message (one
    "Speech ID / Content" section per speech) and sent together with a system
    prompt listing the fixed topic vocabulary. The response format is a JSON
    schema derived from the ``Speeches`` model.

    Args:
        speeches: Mapping of speech_id to speech text.

    Returns:
        The message content of the first completion choice, as returned by the
        API (the caller parses it as JSON).

    Raises:
        Exception: Any error raised while building the request or calling the
            API is printed and then re-raised unchanged.
    """
    # Whitespace inside this literal is part of the prompt; keep it as is.
    system_prompt = """
                    Analyze each speech and identify its relevant topics. Topics should be chosen from this fixed list:
                    - Governance and Democracy
                    - Economy and Jobs
                    - Health and Social Services
                    - Education and Innovation
                    - Environment and Energy
                    - Defense and Security
                    - Immigration and Border Policy
                    - Justice and Civil Rights
                    - Infrastructure and Transportation
                    - Budget and Fiscal Responsibility

                    For each speech, assign one or more topics that best match its content.
                    Output should be in JSON format containing a list of objects, each with a speech_id and its corresponding topics list.
                    Be precise and thorough in topic assignment.
                    """

    # One "Speech ID / Content" section per speech, separated by blank lines.
    sections = []
    for sid, body in speeches.items():
        sections.append(f"Speech ID: {sid}\nContent: {body}")
    user_prompt = "\n\n".join(sections)

    try:
        conversation = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]
        # Schema generation stays inside the try so its failures are reported
        # the same way as API failures.
        output_format = {
            "type": "json_schema",
            "json_schema": {"name": "_", "schema": Speeches.model_json_schema()},
        }
        completion = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=conversation,
            response_format=output_format,
            temperature=0.3,
            timeout=600,  # 10 minute timeout
        )
        first_choice = completion.choices[0]
        return first_choice.message.content
    except Exception as e:
        print(f"Error during API call: {str(e)}")
        raise

In [37]:
def process_speech_file(file_path: str, output_dir: str, max_chunks: int=None):
    """
    Label every speech in one file with topics and save the result as JSON.

    The file is read with ``read_speech_file``, split with ``chunk_speeches``,
    and each chunk is sent to ``extract_speech_topics``. The combined result
    maps each speech_id to ``{"speech": <text>, "topics": [...]}`` and is
    written to ``<output_dir>/<input stem>_gpt_topic_labels.json``.

    A chunk that fails (API error, invalid JSON, unexpected response shape) is
    reported and skipped; processing continues with the next chunk.

    Args:
        file_path: Path to the pipe-delimited speech file.
        output_dir: Directory for the output file; created if it does not exist.
        max_chunks: Maximum number of chunks to process. ``None`` (or ``0``)
            means no limit.
    """
    print(f"Processing file: {file_path}")

    # Read speeches
    speeches = read_speech_file(file_path)
    print(f"Found {len(speeches)} speeches")

    # Split into chunks
    chunks = chunk_speeches(speeches)
    print(f"Split into {len(chunks)} chunks")

    # Apply the limit before the loop. Checking it inside the loop after the
    # work was done processed one chunk too many, and the check was skipped
    # altogether whenever the chunk at the limit raised.
    if max_chunks:
        chunks = chunks[:max_chunks]

    # Process each chunk and combine results
    all_results = {}

    for i, chunk in enumerate(tqdm(chunks, desc="Processing chunks")):
        try:
            # Get topics for speeches in this chunk
            topics_json = extract_speech_topics(chunk)
            topics_results = json.loads(topics_json)

            # Combine speech text with topics
            for speech in topics_results["speeches"]:
                speech_id = speech["speech_id"]
                # The model can return an id that was never sent in this
                # request; skip just that entry instead of losing the rest of
                # the chunk to a KeyError.
                if speech_id not in chunk:
                    print(f"Skipping unknown speech_id {speech_id!r} in chunk {i+1}")
                    continue
                all_results[speech_id] = {
                    "speech": chunk[speech_id],
                    "topics": speech["topics"]
                }

            # Progress note every 10th chunk
            if (i+1) % 10 == 0:
                print(f"Successfully processed chunk {i+1}")
        except Exception as e:
            # Best effort: report the failed chunk and move on to the next one.
            print(f"Error processing chunk {i+1}: {str(e)}")

    # Save results. splitext swaps only the real extension, whereas
    # str.replace would rewrite every '.txt' occurrence in the file name.
    stem, _extension = os.path.splitext(os.path.basename(file_path))
    output_file = os.path.join(output_dir, f"{stem}_gpt_topic_labels.json")
    os.makedirs(output_dir, exist_ok=True)

    with open(output_file, 'w') as f:
        json.dump(all_results, f, indent=2)

    print(f"\nResults saved to {output_file}")

In [38]:
def process_speech_files(speech_files: List[str], input_dir: str, output_dir: str, max_chunks: int=None):
    """
    Run ``process_speech_file`` on each named file, in the order given.

    Args:
        speech_files: File names relative to ``input_dir``.
        input_dir: Directory that contains the speech files.
        output_dir: Directory passed through to ``process_speech_file``.
        max_chunks: Per-file chunk limit passed through to ``process_speech_file``.
    """
    progress = tqdm(speech_files, desc="Processing files")
    for name in progress:
        source_path = os.path.join(input_dir, name)
        process_speech_file(source_path, output_dir, max_chunks)

In [49]:
def analyze_topic_frequencies(input_dir: str) -> Dict[str, int]:
    """
    Count how often each topic occurs across all topic-label files in a directory.

    Only files whose name ends with ``_gpt_topic_labels.json`` are read. Each
    file is expected to map speech ids to objects that carry a ``"topics"``
    list; every occurrence of a topic adds one to its count.

    Args:
        input_dir: Directory containing the speech analysis JSON files.

    Returns:
        Dictionary mapping topics to their frequencies, ordered from most to
        least frequent (ties keep first-seen order).
    """
    suffix = '_gpt_topic_labels.json'
    label_files = [name for name in os.listdir(input_dir) if name.endswith(suffix)]

    tally = Counter()
    for name in tqdm(label_files, desc="Processing files"):
        with open(os.path.join(input_dir, name), 'r') as handle:
            labelled = json.load(handle)

        # Speech ids are irrelevant here; only the topic lists are counted.
        for entry in labelled.values():
            tally.update(entry['topics'])

    # most_common() sorts by count, descending, with a stable sort.
    return dict(tally.most_common())

In [50]:
def print_topic_statistics(topic_frequencies: Dict[str, int]):
    """
    Print a left-aligned table of topic counts followed by their total.

    Topics are printed in the mapping's iteration order, padded to the width
    of the longest topic name; counts are right-aligned in a 6-character
    column. An empty mapping prints just the header, the rules and a total
    of 0.

    Args:
        topic_frequencies: Mapping of topic name to occurrence count.
    """
    print("\nTopic Frequencies:")
    print("-" * 40)

    # Width of the name column. default=0 keeps an empty mapping (e.g. no
    # label files were found) from raising ValueError in max().
    max_topic_length = max((len(topic) for topic in topic_frequencies), default=0)

    # Print each topic and its count
    for topic, count in topic_frequencies.items():
        print(f"{topic:<{max_topic_length}} : {count:>6}")

    print("-" * 40)
    print(f"Total topics mentioned: {sum(topic_frequencies.values())}")

In [52]:
if __name__ == "__main__":
    # Label every speeches_*.txt file in input_dir, then summarise the labels.
    input_dir = "../small_speech_data"
    output_dir = "outputs"

    # os.listdir returns names in arbitrary order; sort so files are always
    # processed in the same order from run to run.
    speech_files = sorted(
        f for f in os.listdir(input_dir)
        if f.startswith("speeches_") and f.endswith(".txt")
    )
    # max_chunks caps how many chunks of each file are sent to the API.
    process_speech_files(speech_files, input_dir, output_dir, max_chunks=100)

    # Analyze topic frequencies. The label files are read from the directory
    # they were just written to, so the two paths cannot drift apart.
    input_dir_freq = output_dir
    topic_frequencies = analyze_topic_frequencies(input_dir_freq)
    print_topic_statistics(topic_frequencies)

Processing files: 100%|██████████| 2/2 [00:00<00:00, 95.11it/s]


Topic Frequencies:
----------------------------------------
Economy and Jobs                  :    512
Governance and Democracy          :    464
Health and Social Services        :    438
Justice and Civil Rights          :    359
Budget and Fiscal Responsibility  :    321
Defense and Security              :    223
Environment and Energy            :    180
Infrastructure and Transportation :    146
Education and Innovation          :    118
Immigration and Border Policy     :     51
Trade Policy                      :      4
Culture and Community             :      2
Community and Social Services     :      1
Culture and Arts                  :      1
Technology and Innovation         :      1
Housing and Community Development :      1
----------------------------------------
Total topics mentioned: 2822



