File for Creating Labeled Speeches

In [24]:
import pandas as pd

In [48]:
# Inputs: the topic mapping (CSV) and the speech text the notebook labels
topics_file = "../data_exploration/speech_topic_mapping.csv"
speeches_file = "../small_speech_data/speeches_113_trimmed.txt"
# Output: where the merged speech/topic table is written
output_file = "formatted_speeches_with_topics.csv"

In [52]:
# Build the labeled-speech table: join each speech's text to its topic labels
# on speech_id and write the result to `output_file`.

# Parse speech_id as text at read time. Letting pandas infer the type and
# casting afterwards is fragile: if the column is inferred as float (e.g. one
# blank id in the file), astype(str) produces "1130000002.0", which never
# matches the other file, and the inner merge silently comes back empty.
speeches_df = pd.read_csv(
    speeches_file,
    delimiter="|",
    header=0,
    names=["speech_id", "speech_content"],
    dtype={"speech_id": str},
)
topics_df = pd.read_csv(
    topics_file,
    header=0,
    names=["speech_id", "topics"],
    dtype={"speech_id": str},
)

# Drop rows without an id (pandas merge would otherwise pair missing keys with
# each other) and strip stray whitespace so ids compare equal across files.
speeches_df = speeches_df.dropna(subset=["speech_id"]).assign(
    speech_id=lambda df: df["speech_id"].str.strip()
)
topics_df = topics_df.dropna(subset=["speech_id"]).assign(
    speech_id=lambda df: df["speech_id"].str.strip()
)

# Remove any rows where 'speech_id' equals the column name (e.g. a repeated
# header line inside the data)
topics_df = topics_df[topics_df["speech_id"] != "speech_id"]

# Remove rows with NaN values in the 'topics' column (speeches with no label)
topics_df = topics_df.dropna(subset=["topics"])

# Keep only speeches that have topics, matching rows by id
formatted_df = pd.merge(topics_df, speeches_df, on="speech_id", how="inner")

# Reorder the columns for the desired output format
formatted_df = formatted_df[["speech_id", "speech_content", "topics"]]

# Preview the formatted DataFrame
print(formatted_df.head())

# Save to CSV. `output_file` comes from the configuration cell; it is not
# re-assigned here so that the path is defined in exactly one place.
formatted_df.to_csv(output_file, index=False)
print(f"New file created: {output_file}")

    speech_id                                     speech_content  \
0  1130000002  As directed by law. the Clerk of the House has...   
1  1130000004  Credentials form. have been received election ...   
2  1130000005  The Clerk now recognizes the gentleman from Ca...   
3  1130000006  Madam Clerk. this is the peoples House. and ev...   
4  1130000007  The names of the Honorable JOHN A. BOEHNER. a ...   

                                       topics  
0                         federalism, justice  
1                                     foreign  
2                                     justice  
3  elections, federalism, justice, minorities  
4                         federalism, justice  
New file created: formatted_speeches_with_topics.csv


Empty DataFrame
Columns: [speech_id, speech_content, topics]
Index: []
