In [13]:
import pandas as pd
df = pd.read_csv("extracted_keywords.csv", index_col=0)
df["keyphrases"] = df["keyphrases"].apply(lambda x: [phrase[1:-1] for phrase in x[1:-1].split(", ")])
df["keyphrases"] = df["keyphrases"].apply(lambda x: [phrase for phrase in x if "congress" not in phrase])
df = df[df["year"] >= 2013]
df["keyphrases"][0]

['israeli hostage rescue',
 'israeli arab hostage',
 'hamas captivity',
 'eighth hostage',
 'hostage home',
 'qaid farhan alkadi',
 'hamas today',
 'israeli arab',
 'joy sherman statement',
 'hostage held captive',
 'israeli military',
 'rescued alive',
 'gazawashington',
 'rescue',
 'bravery secured']

In [33]:
keyphrase_data = []
for _, row in df.iterrows():
    for keyphrase in row['keyphrases']:
        keyphrase_data.append({'keyphrase': keyphrase})

keyphrase_df = pd.DataFrame(keyphrase_data)

list(dict(keyphrase_df.groupby('keyphrase')['keyphrase'].count().sort_values(ascending=False)[:150]).keys())
# dict(keyphrase_df.groupby('keyphrase')['keyphrase'].count().sort_values(ascending=False)[:150])


['veteran',
 'federal funding',
 'bill',
 'bipartisan bill',
 'funding',
 'national security',
 'legislation',
 'biden administration',
 'representative',
 'bipartisan legislation',
 'american samoa',
 'grant',
 'house committee',
 'affordable care act',
 'epa',
 'health care',
 'federal government',
 'trump administration',
 'president biden',
 'veteran affair',
 'supreme court',
 'statement',
 'medicare',
 'farm bill',
 'federal grant',
 'covid pandemic',
 'senate',
 'president trump',
 'representative passed',
 'amata',
 'federal agency',
 'bipartisan infrastructure law',
 'homeland security',
 'rep johnson',
 'gun violence',
 'federal fund',
 'american rescue plan',
 'district',
 'aumua amata',
 'bill passed',
 'covid',
 'house appropriation subcommittee',
 'california',
 'environmental protection agency',
 'house appropriation committee',
 'va',
 'amendment',
 'bipartisan support',
 'president',
 'repeal',
 'infrastructure investment',
 'climate change',
 'obama administration',
 

In [12]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import imageio
from pathlib import Path
import numpy as np


keyphrase_data = []
for _, row in df.iterrows():
    for keyphrase in row['keyphrases']:
        keyphrase_data.append({'year': row['year'], 'month': row['month'], 'keyphrase': keyphrase})

keyphrase_df = pd.DataFrame(keyphrase_data)

# Add quarter and time columns
keyphrase_df['time'] = pd.to_datetime(keyphrase_df[['year', 'month']].assign(day=1))
keyphrase_df['quarter'] = keyphrase_df['time'].dt.to_period('Q')

# Aggregate counts by quarter
quarterly_counts = (
    keyphrase_df
    .groupby(['quarter', 'keyphrase'])
    .size()
    .reset_index(name='count')
)

# Add year and quarter for sorting
quarterly_counts['year'] = quarterly_counts['quarter'].dt.year
quarterly_counts['quarter_num'] = quarterly_counts['quarter'].dt.quarter

# Sort by time
quarterly_counts = quarterly_counts.sort_values(['year', 'quarter_num'])

# Find top 3 keyphrases for each quarter
top_per_quarter = (
    quarterly_counts.groupby('quarter')
    .apply(lambda x: x.nlargest(10, 'count'))
    .reset_index(drop=True)
)

# Smooth transitions by interpolating counts
unique_quarters = quarterly_counts['quarter'].unique()
frames = []
output_dir = Path("frames")
output_dir.mkdir(exist_ok=True)

for i in range(len(unique_quarters) - 1):
    start_quarter = unique_quarters[i]
    end_quarter = unique_quarters[i + 1]
    
    start_data = top_per_quarter[top_per_quarter['quarter'] == start_quarter]
    end_data = top_per_quarter[top_per_quarter['quarter'] == end_quarter]

    # Merge start and end data to interpolate
    merged_data = pd.merge(
        start_data, 
        end_data, 
        on='keyphrase', 
        suffixes=('_start', '_end'), 
        how='outer'
    ).fillna(0)

    for alpha in np.linspace(0, 1, 10):  # Smooth transition with 10 frames
        merged_data['count_interp'] = (
            (1 - alpha) * merged_data['count_start'] + alpha * merged_data['count_end']
        )
        
        # Plot the interpolated frame
        plt.figure(figsize=(10, 6))
        sns.barplot(
            data=merged_data,
            x='count_interp',
            y='keyphrase',
            order=merged_data.sort_values('count_interp', ascending=False)['keyphrase']
        )
        
        plt.title(f"Top Keyphrases: Transition from {start_quarter} to {end_quarter}")
        plt.xlabel("Interpolated Count")
        plt.ylabel("Keyphrase")
        plt.tight_layout()
        
        # Save the frame
        frame_filename = output_dir / f"frame_{start_quarter}_{end_quarter}_{alpha:.2f}.png"
        plt.savefig(frame_filename)
        frames.append(frame_filename)
        plt.close()

# Create animated GIF
with imageio.get_writer("keyphrases_quarterly.gif", mode="I", duration=1) as writer:
    for frame in frames:
        image = imageio.imread(frame)
        writer.append_data(image)

# Clean up
for frame in frames:
    frame.unlink()

print("Animated GIF saved as 'keyphrases_quarterly.gif'")


  .apply(lambda x: x.nlargest(10, 'count'))
  image = imageio.imread(frame)


Animated GIF saved as 'keyphrases_quarterly.gif'


In [11]:
top_per_quarter["keyphrase"].unique()

array(['house appropriation committee', 'national debt', 'budget',
       'larson statement', 'president obama',
       'lifelong farmer representing california', 'farm bill', 'veteran',
       'affordable care act', 'veteran affair', 'government shutdown',
       'syria', 'bill', 'federal government', 'president',
       'unemployment benefit', 'va', 'pocan statement', 'smith', 'grant',
       'islamic state', 'obama administration',
       'representative gary palmer', 'obamacare', 'supreme court', 'epa',
       'planned parenthood', 'iran', 'nuclear weapon',
       'national security', 'environmental protection agency', 'amata',
       'rep norcross', 'amendment', 'american samoa', 'tehama county',
       'st century cure act', 'mccollum statement',
       'rep donald norcross nj', 'funding', 'president trump', 'repeal',
       'health care', 'trump administration', 'health insurance', 'daca',
       'committee sewell', 'tax cut', 'tax reform', 'tax code',
       'legislation', 'ric

In [39]:
predefined_keyphrases  = ['agriculture usda', 'prescription drug', 'national defense authorization act ndaa', 'infrastructure committee', 
              'law enforcement', 'cuba', 'federal resource', 'farmer', 'supreme court', 'military academy', 'ukraine', 'hamas', 
														'coronavirus', 'supply chain', 'tehama county', 'usda', 'president trump', 'health insurance', 'epa', 'honor', 'impeachment', 
														'daca', 'border', 'national security', 'federal grant', 'connecticut', 'coronavirus pandemic', 'navy', 'infrastructure investment', 
														'medicaid', 'planned parenthood', 'mental health', 'paycheck protection program', 'national debt', 'service member', 'medicare part', 
														'military family', 'trump administration', 'tax code', 'nuclear weapon', 'tax cut', 'coronavirus outbreak', 'abortion', 'small business', 
														'farm bill', 'georgia', 'farm workforce modernization act', 'russia', 'poland ukraine border', 'southern border', 'bipartisan legislation', 
														'national defense', 'taxpayer dollar', 'health care', 'president biden', 'gun violence', 'care act', 'house agriculture committee', 
														'appropriation committee', 'biden harris administration', 'government shutdown', 'medicare', 'appropriation bill', 
														'environmental protection agency', 'iran', 'bipartisan bill', 'american rescue plan', 'student', 'affordable care act', 'agriculture', 
														'biden administration', 'cuban people', 'border security', 'budget', 'coronavirus aid relief', 'climate change', 'infrastructure', 
														'house appropriation subcommittee', 'vaccine', 'gaza', 'forest service', 'obama administration', 'american samoa', 'pandemic', 'washington', 
														'president obama', 'federal government', 'irs', 'iowa', 'israel', 'homeland security', 'water infrastructure', 'firearm', 'water resource', 
														'afghanistan', 'climate crisis', 'covid pandemic', 'syria', 'tax reform', 'yemen', 'service academy', 'covid vaccine', 'rural community', 
														'federal land', 'covid', 
              'bipartisan infrastructure law', 'national defense authorization act', 'house appropriation committee', 'obamacare', 'inflation reduction act']

In [None]:
# Create a multi-hot encoded column
df['keyphrase_vector'] = df['keyphrases'].apply(
    lambda x: [1 if keyphrase in x else 0 for keyphrase in predefined_keyphrases]
)

# Check an example row
print(df['keyphrase_vector'].iloc[0])

# Save the DataFrame
df.to_parquet('keyphrase_vectors.parquet', index=False)  # Save compactly

In [38]:
len(keyphrases)

110