In [1]:
import pandas as pd
df = pd.read_csv("preprocessed_for_keywords.csv", index_col=0)
df.head()

Unnamed: 0,representative_party,year,month,full_text
0,D,2024,8,a rare moment of joy sherman statement on isra...
1,R,2023,3,ag perspective on kfrm am radioit wa great joi...
2,D,2019,12,astonishing moral cowardice sander and khanna ...
3,D,2024,9,finally brought to justice congressman sherman...
4,D,2024,9,i stand with the armenian people sherman meet ...


In [2]:
df.groupby(["year"])["year"].count()

year
2000       4
2001       7
2002      11
2003      42
2004      49
2005      64
2006      54
2007      88
2008      58
2009     173
2010     206
2011     452
2012     405
2013     756
2014     704
2015    1301
2016     967
2017    2321
2018    1679
2019    2180
2020    2164
2021    3097
2022    3015
2023    3384
2024    2463
Name: year, dtype: int64

In [3]:
df = df[df["year"]>=2015]

In [4]:
predefined_keyphrases  = ['agriculture usda', 'prescription drug', 'national defense authorization act ndaa', 'infrastructure committee', 'cuba',
                          'supreme court', 'military academy', 'ukraine', 'hamas', 
														'coronavirus', 'supply chain', 'usda', 'president trump', 'health insurance', 'impeachment', 
														'daca', 'border', 'national security', 'federal grant',  'coronavirus pandemic', 'navy', 'infrastructure investment', 
														'medicaid', 'planned parenthood', 'mental health', 'paycheck protection program', 'national debt', 'medicare part', 
														'military family', 'trump administration', 'tax code', 'nuclear weapon', 'tax cut', 'coronavirus outbreak', 'abortion',
														'farm bill', 'farm workforce modernization act', 'russia', 'poland ukraine border', 'southern border',
														'national defense', 'taxpayer dollar', 'president biden', 'gun violence', 'care act', 'house agriculture committee', 
														'appropriation committee', 'biden harris administration', 'government shutdown', 'medicare', 'appropriation bill', 
														'environmental protection agency', 'iran', 'american rescue plan', 'affordable care act',
														'biden administration', 'cuban people', 'border security', 'coronavirus aid relief', 'climate change', 
														'house appropriation subcommittee', 'vaccine', 'gaza', 'forest service', 'obama administration', 'american samoa', 'pandemic',
														'president obama',  'israel', 'homeland security', 'water infrastructure', 'firearm', 'water resource', 
														'afghanistan', 'climate crisis', 'covid pandemic', 'syria', 'tax reform', 'yemen', 'service academy', 'covid vaccine', 'rural community', 
														 'covid', 'national defense authorization act', 'house appropriation committee', 
														'water crisis','water supply', 'obamacare', 'inflation reduction act', 'zika virus', 'hillary clinton', 'earthquake',
														'terrorism']

In [None]:
df['keyphrase_vector'] = df['full_text'].apply(
    lambda x: [1 if keyphrase in x else 0 for keyphrase in predefined_keyphrases]
)
df


Unnamed: 0,representative_party,year,month,full_text,keyphrase_vector
0,D,2024,8,a rare moment of joy sherman statement on isra...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
1,R,2023,3,ag perspective on kfrm am radioit wa great joi...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,D,2019,12,astonishing moral cowardice sander and khanna ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,D,2024,9,finally brought to justice congressman sherman...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,D,2024,9,i stand with the armenian people sherman meet ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...
25639,D,2020,2,trump s public charge rule is cruel and comple...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
25640,R,2016,6,we must have a tax code built for growth weste...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
25641,D,2022,10,we need to close the aliso canyon facility now...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
25642,R,2021,7,we are not a dumping ground for other county s...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [28]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import imageio
from pathlib import Path
import numpy as np

# Decode Multi-Hot Vector into keyphrases for each row
def decode_keyphrases(vector):
    return [keyphrase for idx, keyphrase in enumerate(predefined_keyphrases) if vector[idx] == 1]

# Assuming df already has a 'keyphrase_vector' column
df['decoded_keyphrases'] = df['keyphrase_vector'].apply(decode_keyphrases)

# Create a flattened DataFrame for keyphrase counts
keyphrase_data = []
for _, row in df.iterrows():
    for keyphrase in row['decoded_keyphrases']:
        keyphrase_data.append({'time': pd.Timestamp(year=row['year'], month=row['month'], day=1), 'keyphrase': keyphrase})

keyphrase_df = pd.DataFrame(keyphrase_data)

# Define a function to calculate cumulative counts for the last 6 months
def cumulative_counts(df, reference_time, period_months=6):
    start_time = reference_time - pd.DateOffset(months=period_months)
    recent_data = df[(df['time'] > start_time) & (df['time'] <= reference_time)]
    cumulative_counts = (
        recent_data.groupby('keyphrase')
        .size()
        .reset_index(name='count')
        .sort_values(by='count', ascending=False)
    )
    return cumulative_counts

# Get all unique times in order for plotting
unique_times = keyphrase_df['time'].sort_values().unique()

# Assign specific colors to keyphrases
keyphrase_palette = sns.color_palette("tab10", len(predefined_keyphrases))
keyphrase_colors = {keyphrase: color for keyphrase, color in zip(predefined_keyphrases, keyphrase_palette)}

# Create frames for the animation with interpolation
frames = []
output_dir = Path("frames")
output_dir.mkdir(exist_ok=True)

# Number of interpolated steps between consecutive time points
n_interpolations = 15  # More interpolations make it smoother
frame_duration = 0.3  # Duration of each frame in seconds

for i in range(len(unique_times) - 1):
    current_time = unique_times[i]
    next_time = unique_times[i + 1]
    
    # Calculate cumulative counts for the current and next time points
    current_data = cumulative_counts(keyphrase_df, reference_time=current_time, period_months=6)
    next_data = cumulative_counts(keyphrase_df, reference_time=next_time, period_months=6)
    
    # Merge data to ensure all keyphrases are represented in both time points
    merged_data = pd.merge(
        current_data, next_data, 
        on='keyphrase', suffixes=('_current', '_next'), how='outer'
    ).fillna(0)
    
    # Interpolate counts for smoother transitions
    for alpha in np.linspace(0, 1, n_interpolations):
        merged_data['count_interp'] = (
            (1 - alpha) * merged_data['count_current'] + alpha * merged_data['count_next']
        )
        # Keep only top 10 keyphrases for each interpolated step
        interpolated_data = merged_data.nlargest(10, 'count_interp')
        
        # Plot the frame
        plt.figure(figsize=(10, 6))
        sns.barplot(
            data=interpolated_data,
            x='count_interp',
            y='keyphrase',
            palette=[keyphrase_colors.get(k, 'gray') for k in interpolated_data['keyphrase']]
        )
        
        for i, iterrow in enumerate(interpolated_data.iterrows()):
            index, row = iterrow
            plt.text(
                x=row['count_interp'] + 1,  # Position slightly to the right of the bar
                y=i,                       # Align with the bar height
                s=f"{int(row['count_interp'])}",  # Format value to 1 decimal place
                va='center',                  # Vertical alignment
                ha='left'                     # Horizontal alignment
            )
            
        plt.title(f"{current_time.strftime('%Y-%m')}")
        plt.xlabel("Cumulative Count (Last 6 Months)")
        plt.ylabel("Keyphrase")
        plt.xlim(0, int(max(interpolated_data[['count_interp']].max())) + 20)  # Consistent x-axis
        plt.yticks(ticks=plt.yticks()[0], labels=['\n'.join(x.split(' ')) for x in interpolated_data['keyphrase']])
        plt.tight_layout()
    
        # Save the frame
        frame_filename = output_dir / f"frame_{current_time.strftime('%Y-%m')}_{next_time.strftime('%Y-%m')}_{alpha:.2f}.png"
        plt.savefig(frame_filename)
        frames.append(frame_filename)
        plt.close()

# # Create animated GIF
# with imageio.get_writer("keyphrases_cumulative_smooth.gif", mode="I", duration=frame_duration) as writer:
#     for frame in frames:
#         image = imageio.imread(frame)
#         writer.append_data(image)

# # Clean up
# for frame in frames:
#     frame.unlink()

# print("Animated GIF saved as 'keyphrases_cumulative_smooth.gif'")



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. A

In [29]:
import cv2
import os

image_folder = 'frames'
video_name = 'keyphrases.mp4'

images = sorted([img for img in os.listdir(image_folder) if img.endswith(".png")])
frame = cv2.imread(os.path.join(image_folder, images[0]))
height, width, layers = frame.shape
print(height, width)
# cv2.VideoWriter_fourcc('H','2','6','4')
# video = cv2.VideoWriter(video_name, 0, 24, (width,height))
video =cv2.VideoWriter(video_name, cv2.VideoWriter_fourcc('m', 'p', '4', 'v'), 15.0, (width,height))

for image in images:
    video.write(cv2.imread(os.path.join(image_folder, image)))

cv2.destroyAllWindows()
video.release()

600 1000
