Setup

In [1]:
# Relative directory paths; each one ends with '/' so a file name can be appended directly.
# NOTE(review): none of these are referenced in the cells shown here -- presumably later
# cells read/write pickles and images under them; confirm the directories exist before use.
exps_folder = 'pickles/experiments/'
hash_tables_folder = 'pickles/hash_tables/'
hadamard_matrices_folder = 'pickles/hadamard_matrices/'
priors_folder = 'pickles/priors/'
imgs_folder = 'imgs/'
p_omegas_folder = 'pickles/p_omegas/'

In [2]:
!pip3 install numpy requests scipy
import requests
import numpy as np
from bs4 import BeautifulSoup
from collections import defaultdict
from scipy.stats import zipfian

You should consider upgrading via the '/Users/pun/.pyenv/versions/3.10.2/bin/python3.10 -m pip install --upgrade pip' command.[0m


Helper Functions

In [3]:
def get_subreddits_ranking(subs_wanted, timeout=10):
    """Scrape Reddit's "best communities" pages for subreddit member counts.

    Pages 1-4 of https://www.reddit.com/best/communities/ are fetched in order
    and entries are collected in page order until `subs_wanted` of them have
    been gathered.

    Args:
        subs_wanted: Number of subreddits to collect. 0 returns an empty list
            without touching the network.
        timeout: Seconds passed to `requests.get` so a stalled connection
            cannot hang the notebook forever.

    Returns:
        A list of `(subreddit_name, num_members)` tuples in page order (a list,
        not a dict). The name has its first two characters (the "r/" prefix)
        stripped. The list can be shorter than `subs_wanted` when a page fails
        to load or the pages contain fewer entries.
    """
    ranking = []
    # Nothing requested: avoid four pointless HTTP round-trips.
    if subs_wanted == 0:
        return ranking

    for page in range(1, 5):
        url = f"https://www.reddit.com/best/communities/{page}/"
        response = requests.get(url, timeout=timeout)

        if response.status_code != 200:
            # Best effort: report the failure and carry on with the next page.
            print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
            continue

        soup = BeautifulSoup(response.content, "html.parser")
        parent_div = soup.find("div", class_="community-list")
        if not parent_div:
            continue

        # Each direct child <div> of the community list is one subreddit entry.
        for div in parent_div.find_all("div", recursive=False):
            if len(ranking) == subs_wanted:
                return ranking
            text_div = div.find_all("div")[0]
            sreddit_name = text_div.find_all("a")[0].text.strip()
            # The second <h6> wraps a <faceplate-number> whose "number"
            # attribute is parsed as the member count.
            h6s = text_div.find_all("h6")
            num_members = int(h6s[1].find("faceplate-number").get("number"))
            ranking.append((sreddit_name[2:], num_members))
    return ranking

def get_subreddits_by_category(subs_wanted, target_category, timeout=10):
    """Scrape Reddit's "best communities" pages and group subreddits by category.

    Pages 1-4 of https://www.reddit.com/best/communities/ are fetched in order
    and entries are processed in page order until `subs_wanted` of them have
    been seen.

    Args:
        subs_wanted: Number of subreddits to process. 0 returns an empty
            mapping without touching the network.
        target_category: Collection of category names to keep as their own
            groups; membership is tested with `in`.
        timeout: Seconds passed to `requests.get` so a stalled connection
            cannot hang the notebook forever.

    Returns:
        A `defaultdict(list)` mapping category name -> list of subreddit names
        (first two characters, the "r/" prefix, stripped). Subreddits whose
        category is not in `target_category` are collected under "Others".
        Fewer than `subs_wanted` subreddits may be present when a page fails
        to load or the pages contain fewer entries.
    """
    sreddit_in_cat = defaultdict(list)
    # Nothing requested: avoid four pointless HTTP round-trips.
    if subs_wanted == 0:
        return sreddit_in_cat

    subs_count = 0
    for page in range(1, 5):
        url = f"https://www.reddit.com/best/communities/{page}/"
        response = requests.get(url, timeout=timeout)

        if response.status_code != 200:
            # Best effort: report the failure and carry on with the next page.
            print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
            continue

        soup = BeautifulSoup(response.content, "html.parser")
        parent_div = soup.find("div", class_="community-list")
        if not parent_div:
            continue

        # Each direct child <div> of the community list is one subreddit entry.
        for div in parent_div.find_all("div", recursive=False):
            if subs_count == subs_wanted:
                return sreddit_in_cat
            text_div = div.find_all("div")[0]
            sreddit_name = text_div.find_all("a")[0].text.strip()
            # The first <h6> of an entry is read as its category label.
            category = text_div.find_all("h6")[0].text.strip()
            bucket = category if category in target_category else "Others"
            sreddit_in_cat[bucket].append(sreddit_name[2:])
            subs_count += 1
    return sreddit_in_cat

def top_k_keys(input_dict, k):
    """Return the `k` keys of `input_dict` whose values have the most items.

    Keys are ranked by `len(value)` in descending order; keys with equal
    lengths keep their relative order from `input_dict`. The result is the
    first `k` keys of that ranking, as a list.
    """
    def item_count(key):
        return len(input_dict[key])

    ranked_keys = sorted(input_dict, key=item_count, reverse=True)
    return ranked_keys[:k]


def get_pool_sizes(categories, sreddit_map):
    """Return how many subreddits `sreddit_map` holds for each category.

    The result is a list aligned with `categories`: element i is
    `len(sreddit_map[categories[i]])`.
    """
    return [len(sreddit_map[name]) for name in categories]

def get_prior(rankings, categories):
    """Build a prior over subreddits proportional to the log of their user counts.

    Args:
        rankings: Iterable of `(subreddit, user_count)` pairs.
        categories: Mapping of category -> list of subreddit names. Its
            iteration order fixes the order of the returned prior.

    Returns:
        A dict mapping subreddit -> probability, where each probability is
        `log(count) / sum(log(counts))`. Subreddits listed in `categories`
        but absent from `rankings` are left out.
    """
    members_by_name = dict(rankings)

    # Walk the categories in order, keeping only subreddits with a known count.
    names, counts = [], []
    for members in categories.values():
        for name in members:
            if name in members_by_name:
                names.append(name)
                counts.append(members_by_name[name])

    # Log-scale the counts, then normalise them into probabilities.
    log_counts = np.log(np.array(counts))
    weights = log_counts / log_counts.sum()

    return dict(zip(names, weights))

In [4]:
def zip_pdf(kappa, n):
    """Return SciPy's `zipfian` pmf evaluated on ranks 1..n, rescaled to sum to 1.

    Args:
        kappa: Exponent passed to `zipfian.pmf` as its shape parameter.
        n: Number of ranks; the pmf is evaluated at 1, 2, ..., n.

    Returns:
        A NumPy array of length `n`.
    """
    ranks = np.arange(1, n + 1)
    masses = zipfian.pmf(ranks, kappa, n)
    total = masses.sum()
    return masses / total

Pick the 10 categories with the most subreddits and get their pool sizes

In [6]:
# From preprocessing: these are the categories with the most subreddits.
target_category = [
    'Internet Culture and Memes',
    'Gaming',
    'Technology',
    'Funny/Humor',
    'Art',
    'Animals and Pets',
    'Sports',
    'Place',
    'Food and Drink',
    'Learning and Education',
]

# Map each subreddit to its target category; every other subreddit goes under 'Others'.
sreddit_map = get_subreddits_by_category(1000, target_category)
all_category = [*target_category, 'Others']
pool_sizes = get_pool_sizes(all_category, sreddit_map)
print("Categories: ", all_category)
print("Pool sizes: ", pool_sizes)

Categories:  ['Internet Culture and Memes', 'Gaming', 'Technology', 'Funny/Humor', 'Art', 'Animals and Pets', 'Sports', 'Place', 'Food and Drink', 'Learning and Education', 'Others']
Pool sizes:  [115, 88, 58, 57, 52, 42, 41, 34, 32, 30, 451]


Calculate Prior

In [11]:
# Rebuild the category -> subreddits mapping with its keys in the same order as
# all_category, so that Prior[0] and P_Omega[0] refer to the same subreddit.
ordered_dict = {category: sreddit_map[category] for category in all_category}
for category_name in ordered_dict:
    print(category_name)

# Member counts for the same pages, then the log-scaled prior in category order.
rankings = get_subreddits_ranking(1000)
prior = get_prior(rankings, ordered_dict)

Internet Culture and Memes
Gaming
Technology
Funny/Humor
Art
Animals and Pets
Sports
Place
Food and Drink
Learning and Education
Others


Calculate P_Omega

In [12]:
# NOTE(review): U, m, k and eps are not used in the cells shown here -- presumably
# consumed by later experiment cells; confirm.
U = 1000
m = 1024
k = 65536
eps = 4
kappa = 1.2  # exponent handed to zip_pdf for every category

# One Zipfian pmf per category, concatenated in all_category order
# (the same order that was used to build the prior).
category_pmfs = [zip_pdf(kappa, len(sreddit_map[category])) for category in all_category]
p_omega = np.concatenate(category_pmfs)

# Rescale so the concatenated vector sums to 1.
p_omega = p_omega / p_omega.sum()

In [15]:
# Sanity check: prior (one entry per subreddit) and p_omega should have the same length.
len(prior) == len(p_omega)

True