## Top Keywords

In [5]:
import pandas as pd
import numpy as np
import tqdm
import glob
import os

from collections import Counter
from concurrent.futures import ProcessPoolExecutor, as_completed

In [6]:
from helpers import *

In [7]:
# Number of keywords requested from top_keywords per file and kept in the final ranking.
MOST_COMMON = 15
# Root directory whose entries are each handed to process_folder as one unit of work.
DATA_DIR = '../usc-x-24-us-election'

# Keywords extraction function (must be at top level for multiprocessing)
def process_folder(folder_path, top_common=MOST_COMMON):
    """Accumulate keyword counts over every ``*.csv.gz`` file in one folder.

    Args:
        folder_path: Directory whose direct children matching ``*.csv.gz``
            are read (the glob pattern is not recursive).
        top_common: Forwarded to ``top_keywords`` as its ``top_common``
            argument. Defaults to ``MOST_COMMON``.

    Returns:
        The ``Counter`` obtained by passing a fresh counter through
        ``top_keywords`` once for each file that was read successfully.
        A folder with no matching files yields an empty ``Counter``.
    """
    folder_counter = Counter()
    for csv_path in glob.glob(f'{folder_path}/*.csv.gz'):
        try:
            df = pd.read_csv(csv_path, compression='gzip', low_memory=False)
            # Use the caller-supplied value; previously the global MOST_COMMON
            # was passed here, which silently ignored the parameter.
            folder_counter = top_keywords(df, folder_counter, top_common=top_common)
        except Exception as e:
            # Best effort: report the failing file and continue with the rest.
            print(f"Error processing {csv_path}: {e}")
    return folder_counter

# Launch multiprocessing
# Only real sub-directories are submitted: os.listdir also returns plain files,
# which would each become a no-op task and inflate the progress-bar total.
# Sorting makes the submission order reproducible across runs.
all_folders = sorted(
    path
    for path in (os.path.join(DATA_DIR, entry) for entry in os.listdir(DATA_DIR))
    if os.path.isdir(path)
)

global_counter = Counter()
with ProcessPoolExecutor() as executor:
    # One task per folder; each worker returns that folder's Counter.
    futures = [executor.submit(process_folder, folder) for folder in all_folders]
    # as_completed yields futures in finish order, so the bar advances as work completes.
    for future in tqdm.tqdm(as_completed(futures), total=len(futures),desc='Processing Threads'):
        # future.result() re-raises any exception that escaped the worker.
        result = future.result()
        global_counter += result

# Keep only the MOST_COMMON highest-count keywords overall
top_15 = Counter(dict(global_counter.most_common(MOST_COMMON)))

print(top_15)

Processing Threads: 100%|██████████| 45/45 [39:48<00:00, 53.08s/it]  

Counter({'biden': 15018330, 'trump': 9641858, 'maga': 4761552, 'donald': 3075045, 'harris': 2839659, 'like': 2604392, 'joe': 2588240, 'president': 2574401, 'people': 2547230, 'kamala': 2519796, 'gop': 2225485, 'dont': 1689266, 'amp': 1572238, 'party': 1130653, 'would': 1006517})



