In [None]:
import os
import json
import openai
from tqdm import tqdm
from dotenv import load_dotenv
from pydantic import BaseModel
from typing import Union
load_dotenv()

client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [None]:
class Member(BaseModel):
    member_name: str
    member_role: str
    member_state: str

class Staff(BaseModel):
    staff_name: str
    staff_role: str
    staff_state: str

class Subcommittee(BaseModel):
    subcommittee_name: str
    subcommittee_members: list[Union[Member, Staff]]

class Committee(BaseModel):
    committee_name: str
    subcommittees: list[Subcommittee]

class committees_json_schema(BaseModel):
    committees: list[Committee]

def extract_committee_info(text_chunk):
    """
    Uses OpenAI's API to extract committees, subcommittees, staff, member names, and roles from the given text.
    Returns the information in JSON format as specified by the implementation.

    Args:
        text_chunk: str
            Chunk of text to process
    
    Returns:
        response: str
            Extracted information in JSON format
    """
    print(f"Processing chunk of size: {len(text_chunk)} characters")
    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": """
                    Extract committees, subcommittees, member information, and staff information from this text into JSON format.
                    Be thorough in extracting all relevant information (don't miss any names).
                    For each committee:
                    1. Find the committee name
                    2. IMPORTANT: First process the main committee members and staff (all members and staff listed BEFORE any subcommittee section)
                    - Create a subcommittee with the same name as the committee
                    - Include all members and staff listed at the start of the committee section

                    STAFF PROCESSING INSTRUCTIONS:
                    - Look for major staff sections marked by 'STAFF', 'Majority Staff', 'Minority Staff', or similar headers
                    - Process ALL staff hierarchically - Director level, Deputy level, Professional Staff, Administrative Staff, etc.
                    - Pay special attention to indented staff listings which indicate reporting relationships
                    - Look for staff listings in office-specific sections (e.g., "Clerk's Office:", "Communications:", etc.)
                    - Process ALL contact information sections as they often contain additional staff listings
                    - Watch for staff sections that continue across multiple pages

                    For lines with two-column formats:
                    * Process both the left and right sides of the line
                    * Look for names separated by multiple spaces or tabs
                    * Each side typically ends with a state and period

                    For names that are split across lines with state information:
                    * Check for entries where the state appears indented on the next line
                    * Combine name and state information even when split by line breaks

                    3. Then process any subcommittee section if it exists
                    4. For committees with NO subcommittees, use the committee name as the subcommittee name
                    5. Include everything until the next committee name appears
                    6. After processing main sections, check for:
                    - Additional staff listings at the end of committee sections
                    - Staff listings in footnotes or supplementary sections
                    - Professional staff members listed under special sections


                    For each committee/subcommittee, record:
                    - Members and their roles (Chair, Vice Chair, etc., use 'Member' if no explicit role listed)
                    - States for members (use 'N/A' if no state listed)
                    - Staff (names listed under 'STAFF' sections) and their roles (use 'Staff' if no explicit role listed)
                    - States for staff (use 'N/A' if no state listed)

                    Important details:
                    - Include each name in every committee/subcommittee they appear in
                    - Process BOTH columns when lines are formatted in two columns
                    - Look for multiple names per line (separated by commas, periods, or large spaces)
                    - Check if entries continue on next line
                    - Keep line indentation in mind when grouping information
                    - Remember that the main committee members and staff come BEFORE any subcommittee listings
                    - For two-column layouts, process right column with same care as left column
                    - DON'T FORGET TO INCLUDE THE STAFF, most committees/subcommittees have staff listed under 'STAFF' sections

                    Output the results in the existing JSON structure provided.
                    """,
                },
                {
                    "role": "user",
                    "content": text_chunk
                }
            ],
            response_format={
                'type': 'json_schema',
                'json_schema': 
                    {
                        "name":"_", 
                        "schema": committees_json_schema.model_json_schema()
                    }
            },  
            temperature=0.3,
            # max_tokens=10000,
            timeout=600  # 10 minute timeout per chunk
        )
        return response.choices[0].message.content
    except Exception as e:
        print(f"Error during API call: {str(e)}")
        raise

In [None]:
def process_committee_file(file_name, input_dir, output_dir):
    # Read file content
    file_path = os.path.join(input_dir, file_name)
    with open(file_path, 'r') as file:
        content = file.read()
        print(f"File loaded. Size: {len(content)} characters")
    
    print("Splitting content into chunks...")
    chunks = chunk_text(content)
    print(f"Split into {len(chunks)} chunks")
    
    all_committees = {"committees": []}
    
    # Process each chunk
    for chunk_num, chunk in enumerate(chunks, 1):
        print(f"\nProcessing chunk {chunk_num}/{len(chunks)}")
        try:
            # Extract committee information in JSON format
            committee_info = extract_committee_info(chunk)
            
            try:
                # Parse the JSON response
                chunk_data = json.loads(committee_info)
                
                # Merge committees from this chunk
                if "committees" in chunk_data:
                    all_committees["committees"].extend(chunk_data["committees"])
                
                print(f"Successfully processed chunk {chunk_num}")
                
            except json.JSONDecodeError as e:
                print(f"JSON Decode Error in chunk {chunk_num}: {str(e)}")
                # Save the problematic chunk
                error_file_path = os.path.join(output_dir, f'{file_name}_chunk{chunk_num}_error.txt')
                with open(error_file_path, 'w') as error_file:
                    error_file.write(committee_info)
                print(f"Problematic chunk saved to {error_file_path}")
                continue
            
        except Exception as e:
            print(f"Error processing chunk {chunk_num}: {str(e)}")
            continue
    
    # Save the combined results
    json_file_path = os.path.join(output_dir, f'{file_name}_output.json') # output file name

    os.makedirs(output_dir, exist_ok=True)

    with open(json_file_path, 'w') as json_file:
        json.dump(all_committees, json_file, indent=2)
    print(f"\nCombined results saved to {json_file_path}")

In [None]:
def process_committee_files(congress_number, committee_files):
    """
    Process committee files for a given Congress number and type (Senate or House).
    Extract committee information from each file and save the output in JSON format.

    Args:
        congress_number: int
            Congress number to process
        committee_files: dict
            Dictionary containing committee file names for Senate or House
    """
    # folder to save the output JSON files
    output_dir = f"outputs/{congress_number}"
    os.makedirs(output_dir, exist_ok=True)

    for file_name in tqdm(committee_files[f'congress_{congress_number}']):
        input_dir = f'congressional_directory_files/congress_{congress_number}/txt'
        process_committee_file(file_name, input_dir, output_dir)

In [9]:
import os
import json
from typing import Dict, List
import openai
from tqdm import tqdm
from pydantic import BaseModel
from dotenv import load_dotenv

# Load environment variables
load_dotenv()
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

class Speech(BaseModel):
    speech_id: str
    topics: List[str]
    
class Speeches(BaseModel):
    speeches: List[Speech]

def read_speech_file(file_path: str) -> Dict[str, str]:
    """
    Reads a speech file and returns a dictionary mapping speech_ids to speeches.
    Only includes speeches with more than 30 words.
    """
    speeches = {}
    with open(file_path, 'r') as file:
        # Skip header line
        next(file)
        for line in file:
            # Split on pipe character
            parts = line.strip().split('|')
            if len(parts) == 2:
                speech_id, speech = parts
                # Only add speech if it has more than 30 words
                if len(speech.split()) > 30:
                    speeches[speech_id] = speech
    return speeches

def chunk_speeches(speeches: Dict[str, str], max_chunk_size: int = 15000) -> List[Dict[str, str]]:
    """
    Splits speeches into chunks while keeping individual speeches intact.
    """
    chunks = []
    current_chunk = {}
    current_size = 0
    
    for speech_id, speech in speeches.items():
        speech_size = len(speech)
        
        # If adding this speech would exceed max size and we already have speeches,
        # start a new chunk
        if current_size + speech_size > max_chunk_size and current_chunk:
            chunks.append(current_chunk)
            current_chunk = {}
            current_size = 0
            
        current_chunk[speech_id] = speech
        current_size += speech_size
    
    # Add the last chunk if it's not empty
    if current_chunk:
        chunks.append(current_chunk)
    
    return chunks

def extract_speech_topics(speeches: Dict[str, str]):
    """
    Uses OpenAI's API to extract topics from the given speeches.
    """
    # Prepare the speeches for analysis
    speeches_text = "\n\n".join([f"Speech ID: {id}\nContent: {text}" for id, text in speeches.items()])
    
    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": """
                    Analyze each speech and identify its relevant topics. Topics should be chosen from this fixed list:
                    - Governance and Democracy
                    - Economy and Jobs
                    - Health and Social Services
                    - Education and Innovation
                    - Environment and Energy
                    - Defense and Security
                    - Immigration and Border Policy
                    - Justice and Civil Rights
                    - Infrastructure and Transportation
                    - Budget and Fiscal Responsibility

                    For each speech, assign one or more topics that best match its content.
                    Output should be in JSON format containing a list of objects, each with a speech_id and its corresponding topics list.
                    Be precise and thorough in topic assignment.
                    """
                },
                {
                    "role": "user",
                    "content": speeches_text
                }
            ],
            response_format={
                "type": "json_schema",
                "json_schema": 
                    {
                        "name":"_", 
                        "schema": Speeches.model_json_schema()
                    }
            },
            temperature=0.3,
            timeout=600  # 10 minute timeout
        )
        
        return response.choices[0].message.content
    except Exception as e:
        print(f"Error during API call: {str(e)}")
        raise

def process_speech_file(file_path: str, output_dir: str):
    """
    Process a speech file and save the results.
    """
    print(f"Processing file: {file_path}")
    
    # Read speeches
    speeches = read_speech_file(file_path)
    print(f"Found {len(speeches)} speeches")
    
    # Split into chunks
    chunks = chunk_speeches(speeches)
    print(f"Split into {len(chunks)} chunks")
    
    # Process each chunk and combine results
    all_results = {}
    
    for i, chunk in enumerate(tqdm(chunks, desc="Processing chunks")):
        try:
            # Get topics for speeches in this chunk
            topics_json = extract_speech_topics(chunk)
            topics_results = json.loads(topics_json)
            
            # Combine speech text with topics
            for speech in topics_results["speeches"]:
                speech_id = speech["speech_id"]
                all_results[speech_id] = {
                    "speech": speeches[speech_id],
                    "topics": speech["topics"]
                }
            
            print(f"Successfully processed chunk {i+1}")
            # break after 10 chunks
            if i == 10:
                break
        except Exception as e:
            print(f"Error processing chunk {i+1}: {str(e)}")
            continue
    
    # Save results
    output_file = os.path.join(output_dir, os.path.basename(file_path).replace('.txt', '_analysis.json'))
    os.makedirs(output_dir, exist_ok=True)
    
    with open(output_file, 'w') as f:
        json.dump(all_results, f, indent=2)
    
    print(f"\nResults saved to {output_file}")

def process_speech_files(speech_files: List[str], input_dir: str, output_dir: str):
    """
    Process multiple speech files.
    """
    for file_name in tqdm(speech_files, desc="Processing files"):
        file_path = os.path.join(input_dir, file_name)
        process_speech_file(file_path, output_dir)

# Example usage:
if __name__ == "__main__":
    input_dir = "../small_speech_data"
    output_dir = "outputs"
    speech_files = [f for f in os.listdir(input_dir) if f.startswith("speeches_") and f.endswith(".txt")]
    process_speech_files(speech_files, input_dir, output_dir)

Processing files:   0%|          | 0/2 [00:00<?, ?it/s]

Processing file: ../small_speech_data/speeches_113_trimmed.txt
Found 6563 speeches
Split into 1179 chunks




Successfully processed chunk 1




Successfully processed chunk 2




Successfully processed chunk 3




Successfully processed chunk 4




Successfully processed chunk 5




Successfully processed chunk 6




Successfully processed chunk 7




Successfully processed chunk 8




Successfully processed chunk 9




Successfully processed chunk 10


Processing chunks:   1%|          | 10/1179 [00:29<57:19,  2.94s/it]
Processing files:  50%|█████     | 1/2 [00:29<00:29, 29.55s/it]

Successfully processed chunk 11

Results saved to outputs/speeches_113_trimmed_analysis.json
Processing file: ../small_speech_data/speeches_114_trimmed.txt
Found 5456 speeches
Split into 913 chunks




Successfully processed chunk 1




Successfully processed chunk 2




Successfully processed chunk 3




Successfully processed chunk 4




Successfully processed chunk 5




Successfully processed chunk 6




Successfully processed chunk 7




Successfully processed chunk 8




Successfully processed chunk 9




Successfully processed chunk 10


Processing chunks:   1%|          | 10/913 [00:25<37:59,  2.52s/it]
Processing files: 100%|██████████| 2/2 [00:54<00:00, 27.46s/it]

Successfully processed chunk 11

Results saved to outputs/speeches_114_trimmed_analysis.json





In [10]:
import json
import os
from collections import Counter
from typing import Dict
from tqdm import tqdm

def analyze_topic_frequencies(input_dir: str) -> Dict[str, int]:
    """
    Analyze JSON files in the input directory and count topic frequencies.
    
    Args:
        input_dir: Directory containing the speech analysis JSON files
        
    Returns:
        Dictionary mapping topics to their frequencies
    """
    # Initialize counter for topics
    topic_counter = Counter()
    
    # Get all JSON files in directory
    json_files = [f for f in os.listdir(input_dir) if f.endswith('_analysis.json')]
    
    # Process each file
    for filename in tqdm(json_files, desc="Processing files"):
        file_path = os.path.join(input_dir, filename)
        
        with open(file_path, 'r') as f:
            data = json.load(f)
            
            # Go through each speech
            for speech_id, speech_data in data.items():
                # Add each topic to our counter
                topic_counter.update(speech_data['topics'])
    
    # Convert Counter to regular dictionary and sort by frequency
    topic_frequencies = dict(sorted(topic_counter.items(), key=lambda x: x[1], reverse=True))
    
    return topic_frequencies

def print_topic_statistics(topic_frequencies: Dict[str, int]):
    """
    Print formatted statistics about topic frequencies.
    """
    print("\nTopic Frequencies:")
    print("-" * 40)
    
    # Find the longest topic name for formatting
    max_topic_length = max(len(topic) for topic in topic_frequencies.keys())
    
    # Print each topic and its count
    for topic, count in topic_frequencies.items():
        print(f"{topic:<{max_topic_length}} : {count:>6}")
    
    print("-" * 40)
    print(f"Total topics mentioned: {sum(topic_frequencies.values())}")

if __name__ == "__main__":
    input_dir = "outputs"
    
    # Analyze frequencies
    topic_frequencies = analyze_topic_frequencies(input_dir)
    
    # Print results
    print_topic_statistics(topic_frequencies)

Processing files: 100%|██████████| 2/2 [00:00<00:00, 1014.71it/s]


Topic Frequencies:
----------------------------------------
Governance and Democracy          :    125
Economy and Jobs                  :     43
Justice and Civil Rights          :     36
Budget and Fiscal Responsibility  :     30
Health and Social Services        :     20
Infrastructure and Transportation :      4
Defense and Security              :      2
Environment and Energy            :      1
----------------------------------------
Total topics mentioned: 261



