## Top Hashtags

In [4]:
import pandas as pd
import numpy as np
import tqdm
import glob
import os

from collections import Counter
from concurrent.futures import ProcessPoolExecutor, as_completed

In [5]:
from helpers import *

In [6]:
DATA_DIR = '../usc-x-24-us-election'
MOST_COMMON = 15

# Hashtag extraction function (must be at top level for multiprocessing)
def process_folder(folder_path, top_common=MOST_COMMON):
    """Accumulate hashtag counts over every ``*.csv.gz`` file in one folder.

    Parameters
    ----------
    folder_path : str
        Directory searched (non-recursively) for gzip-compressed CSV files.
    top_common : int, optional
        Forwarded unchanged to ``top_hashtags`` as its ``top_common``
        argument. Defaults to ``MOST_COMMON``.

    Returns
    -------
    The accumulator returned by the last successful ``top_hashtags`` call,
    or an empty ``Counter`` when the folder holds no matching file or every
    file failed.

    Notes
    -----
    ``top_hashtags`` is not defined in this notebook; presumably it comes
    from the ``helpers`` star import -- TODO confirm its exact contract.
    """
    folder_counter = Counter()
    for csv_path in glob.glob(f'{folder_path}/*.csv.gz'):
        try:
            df = pd.read_csv(csv_path, compression='gzip', low_memory=False)
            # Forward the caller's value; the global MOST_COMMON used to be
            # passed here, which made the ``top_common`` parameter a no-op.
            folder_counter = top_hashtags(df, folder_counter, top_common=top_common)
        except Exception as e:
            # Deliberate best-effort: report the bad file and keep going so a
            # single unreadable archive does not discard the whole folder.
            print(f"Error processing {csv_path}: {e}")
    return folder_counter

# Launch multiprocessing: one task per sub-directory of DATA_DIR.
# os.listdir returns every entry, so plain files sitting next to the data
# folders are filtered out; submitting them would only create tasks that find
# no CSVs and inflate the progress-bar total. Sorting keeps the submission
# order stable between runs.
all_folders = [
    entry_path
    for entry_path in (os.path.join(DATA_DIR, entry) for entry in sorted(os.listdir(DATA_DIR)))
    if os.path.isdir(entry_path)
]

global_counter = Counter()
with ProcessPoolExecutor() as executor:
    futures = [executor.submit(process_folder, folder) for folder in all_folders]
    # as_completed yields each future as soon as its worker finishes, so the
    # bar advances in completion order rather than submission order.
    for future in tqdm.tqdm(as_completed(futures), total=len(futures),desc='Processing Threads'):
        # result() re-raises any exception that escaped the worker process.
        result = future.result()
        global_counter += result

# Keep only the MOST_COMMON highest-count hashtags overall
top_15 = Counter(dict(global_counter.most_common(MOST_COMMON)))

print(top_15)

Processing Threads: 100%|██████████| 45/45 [24:11<00:00, 32.26s/it]

Counter({'#MAGA': 781194, '#Trump2024': 648646, '#Trump': 243330, '#Biden': 193240, '#BidenHarris2024': 160904, '#maga': 84903, '#DonaldTrump': 47425, '#KamalaHarris': 47223, '#TRUMP2024': 41122, '#GOP': 37104, '#MAHA': 32187, '#Biden2024': 27913, '#trump': 25005, '#USA': 23604, '#TrumpVance2024': 20432})



