In [9]:
import pandas as pd
import json
import glob
from datetime import datetime
import re

def load_jsonl_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = [json.loads(line) for line in file]
    return pd.DataFrame(data)

def standardize_names(text):
    if pd.isna(text):
        return ""
    text = str(text)
    text = re.sub(r'\b[Tt]ramp\b', 'Trump', text)
    text = re.sub(r'\b[Jj]oe\s+[Ww]iden\b', 'Joe Biden', text)
    return text

def clean_speech_text(text):
    if pd.isna(text):
        return ""
    text = str(text)
    text = standardize_names(text)
    text = text.lower()
    text = re.sub(r'\btrump\b', 'Trump', text)
    text = re.sub(r'\bjoe biden\b', 'Joe Biden', text, flags=re.IGNORECASE)
    text = re.sub(r'[^\w\s.,!?]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\(.*?\)', '', text)
    text = re.sub(r'applause|cheering|laughter', '', text)
    return text

def process_all_files():
    biden_speeches = pd.DataFrame()
    trump_speeches = pd.DataFrame()
    
    biden_files = glob.glob('*rawtext_JoeBiden.jsonl')
    for file in biden_files:
        print(f"Processing {file}...")
        df = load_jsonl_file(file)
        biden_speeches = pd.concat([biden_speeches, df], ignore_index=True)
    
    trump_files = glob.glob('*rawtext_DonaldTrump.jsonl')
    for file in trump_files:
        print(f"Processing {file}...")
        df = load_jsonl_file(file)
        trump_speeches = pd.concat([trump_speeches, df], ignore_index=True)
    
    print("Cleaning Biden speeches...")
    biden_speeches['cleaned_text'] = biden_speeches['RawText'].apply(clean_speech_text)
    print("Cleaning Trump speeches...")
    trump_speeches['cleaned_text'] = trump_speeches['RawText'].apply(clean_speech_text)
    
    biden_speeches = biden_speeches[biden_speeches['cleaned_text'].str.len() > 0]
    trump_speeches = trump_speeches[trump_speeches['cleaned_text'].str.len() > 0]
    
    biden_speeches['Date'] = pd.to_datetime(biden_speeches['Date'], unit='ms')
    trump_speeches['Date'] = pd.to_datetime(trump_speeches['Date'], unit='ms')
    
    biden_speeches = biden_speeches.sort_values('Date')
    trump_speeches = trump_speeches.sort_values('Date')
    
    biden_speeches = biden_speeches.drop_duplicates(subset=['SpeechID'])
    trump_speeches = trump_speeches.drop_duplicates(subset=['SpeechID'])
    
    return biden_speeches, trump_speeches

def save_processed_speeches(biden_df, trump_df):
    # Convert 'Date' column to ISO 8601 string format for JSON compatibility
    biden_df['Date'] = biden_df['Date'].dt.strftime('%Y-%m-%dT%H:%M:%S')
    trump_df['Date'] = trump_df['Date'].dt.strftime('%Y-%m-%dT%H:%M:%S')
    
    # Save as CSV
    biden_df.to_csv('processed_biden_speeches.csv', index=False)
    trump_df.to_csv('processed_trump_speeches.csv', index=False)
    
    # Save as JSONL
    with open('processed_biden_speeches.jsonl', 'w', encoding='utf-8') as f:
        for _, row in biden_df.iterrows():
            json.dump(row.to_dict(), f)
            f.write('\n')
            
    with open('processed_trump_speeches.jsonl', 'w', encoding='utf-8') as f:
        for _, row in trump_df.iterrows():
            json.dump(row.to_dict(), f)
            f.write('\n')

if __name__ == "__main__":
    print("Starting speech processing...")
    biden_speeches, trump_speeches = process_all_files()
    
    print(f"\nBiden Speeches: {len(biden_speeches)}")
    print(f"Trump Speeches: {len(trump_speeches)}")
    
    print(f"\nBiden speeches with empty text: {biden_speeches['RawText'].isna().sum()}")
    print(f"Trump speeches with empty text: {trump_speeches['RawText'].isna().sum()}")
    
    print("\nSaving processed speeches...")
    save_processed_speeches(biden_speeches, trump_speeches)
    print("Processing complete!")
    
    print(f"\nBiden speeches date range: {biden_speeches['Date'].min()} to {biden_speeches['Date'].max()}")
    print(f"Trump speeches date range: {trump_speeches['Date'].min()} to {trump_speeches['Date'].max()}")

Starting speech processing...
Processing 1rawtext_JoeBiden.jsonl...
Processing 3rawtext_JoeBiden.jsonl...
Processing 2rawtext_JoeBiden.jsonl...
Processing 4rawtext_JoeBiden.jsonl...
Processing 1rawtext_DonaldTrump.jsonl...
Processing 3rawtext_DonaldTrump.jsonl...
Processing 2rawtext_DonaldTrump.jsonl...
Cleaning Biden speeches...
Cleaning Trump speeches...

Biden Speeches: 511
Trump Speeches: 683

Biden speeches with empty text: 0
Trump speeches with empty text: 0

Saving processed speeches...
Processing complete!

Biden speeches date range: 2019-04-18T00:00:00 to 2021-01-29T00:00:00
Trump speeches date range: 2019-01-02T00:00:00 to 2021-01-20T00:00:00


In [7]:
import pandas as pd
import re
import json

def correct_typos(text):
    # Biden corrections
    text = re.sub(r'\bmayor butti gieg\b', 'Mayor Pete Buttigieg', text, flags=re.IGNORECASE)
    text = re.sub(r'\binaudible conversations inaudible\b', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\binaudible\b', '', text, flags=re.IGNORECASE)
    return text

# Correct typos in CSV
for file_name in ['processed_biden_speeches.csv']:
    print(f"Processing CSV file: {file_name}")
    df = pd.read_csv(file_name)
    df['cleaned_text'] = df['cleaned_text'].apply(correct_typos)
    corrected_csv_file = f'corrected_{file_name}'
    df.to_csv(corrected_csv_file, index=False)
    print(f"Corrected CSV saved as: {corrected_csv_file}")

# Correct typos in JSONL
for file_name in ['processed_biden_speeches.jsonl']:
    print(f"Processing JSONL file: {file_name}")
    corrected_jsonl_file = f'corrected_{file_name}'
    
    with open(file_name, 'r', encoding='utf-8') as f_in, open(corrected_jsonl_file, 'w', encoding='utf-8') as f_out:
        for line in f_in:
            record = json.loads(line)
            if 'cleaned_text' in record:
                record['cleaned_text'] = correct_typos(record['cleaned_text'])
            json.dump(record, f_out)
            f_out.write('\n')
    
    print(f"Corrected JSONL saved as: {corrected_jsonl_file}")

Processing CSV file: processed_biden_speeches.csv
Corrected CSV saved as: corrected_processed_biden_speeches.csv
Processing JSONL file: processed_biden_speeches.jsonl
Corrected JSONL saved as: corrected_processed_biden_speeches.jsonl
