## Top URLs

In [4]:
import pandas as pd
import numpy as np
import tqdm
import glob
import os

from collections import Counter
from concurrent.futures import ProcessPoolExecutor, as_completed

In [5]:
from helpers import *

In [6]:
DATA_DIR = '../usc-x-24-us-election'  # root directory whose entries are processed one task each
MOST_COMMON = 15                      # default size of every "top N" ranking in this cell

# URL extraction function (must be at top level for multiprocessing)
def process_folder(folder_path, top_common=MOST_COMMON):
    """Accumulate counts over every ``*.csv.gz`` file directly inside a folder.

    Each file is loaded with pandas and passed to ``top_urls`` (from
    ``helpers``) together with the folder's running counter; whatever
    ``top_urls`` returns becomes the new running counter.

    Args:
        folder_path: Directory searched (non-recursively) for ``*.csv.gz`` files.
        top_common: Number of highest-count entries to keep in the result.
            The same value is forwarded to ``top_urls``.

    Returns:
        A ``Counter`` with at most ``top_common`` entries. It is empty when the
        folder holds no matching files or when every file failed to process.

    Files that raise while being read or counted are reported on stdout and
    skipped, so one bad file does not abort the folder.

    NOTE(review): only the folder's top ``top_common`` entries are returned, so
    a key that ranks just below the cut-off in several folders is lost before
    the caller merges the per-folder results; the merged ranking is therefore
    an approximation.
    NOTE(review): the counts recorded in this notebook's output sit at or near
    powers of two (131072, 65538, 32769, ...), which looks like repeated
    doubling -- verify how ``top_urls`` merges into the counter it is given.
    """
    folder_counter = Counter()
    # Sorted so files are visited in a stable order (glob's order is arbitrary).
    csv_paths = sorted(glob.glob(f'{folder_path}/*.csv.gz'))
    for csv_path in csv_paths:
        try:
            df = pd.read_csv(csv_path, compression='gzip', low_memory=False)
            # Forward the caller's top_common rather than the module constant,
            # so overriding the parameter affects the whole pipeline.
            folder_counter = top_urls(df, folder_counter, top_common=top_common)
        except Exception as e:
            # Best effort: report the failing file and continue with the rest.
            print(f"Error processing {csv_path}: {e}")
    return Counter(dict(folder_counter.most_common(top_common)))

# Launch multiprocessing
# One task per sub-directory of DATA_DIR. os.listdir() also returns plain files,
# which are not folders of CSVs, so they are filtered out instead of being
# submitted as no-op tasks. Sorting keeps the submission order stable between runs.
all_folders = sorted(
    path
    for path in (os.path.join(DATA_DIR, entry) for entry in os.listdir(DATA_DIR))
    if os.path.isdir(path)
)

global_counter = Counter()
with ProcessPoolExecutor() as executor:
    futures = [executor.submit(process_folder, folder) for folder in all_folders]
    # Merge per-folder results as soon as each worker finishes.
    for future in tqdm.tqdm(as_completed(futures), total=len(futures),desc='Processing Threads'):
        # result() re-raises any exception that escaped the worker process.
        result = future.result()
        global_counter += result

# Keep the MOST_COMMON highest-count urls overall.
# NOTE(review): each folder contributes only its own top entries, so this is the
# top of the merged per-folder rankings, not an exact global top-N.
top_15 = Counter(dict(global_counter.most_common(MOST_COMMON)))

print(top_15)

Processing Threads: 100%|██████████| 45/45 [24:07<00:00, 32.16s/it]

Counter({'https://t.co/5ZGk6mKmUI': 131072, 'https://t.co/KfuOwtFw1r': 65538, 'https://t.co/r6jjoU8acW': 32769, 'https://t.co/TK47ZUpBl9': 32769, 'https://t.co/HqafrbZzXH': 32768, 'https://t.co/cbdcv5piQT': 32768, 'https://t.co/kU0xaWQhHQ': 32768, 'https://t.co/O5M40uYNYt': 32768, 'https://t.co/9roTFrdgPN': 16385, 'https://t.co/WjF8Ch6UCR': 16384, 'https://t.co/vi8Ku0jP09': 16384, 'https://t.co/5aNUmJrxy7': 10923, 'https://t.co/vx8dKr17Wl': 1163, 'https://t.co/YSlWqbDkMY': 1030, 'https://t.co/tWyRwI00oL': 928})



