In [2]:
import pandas as pd

In [3]:
# Load the ratings table from disk and preview its first five rows.
ratings_path = 'data/ml-32m/ratings.csv'
ratings = pd.read_csv(ratings_path)

ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,17,4.0,944249077
1,1,25,1.0,944250228
2,1,29,2.0,943230976
3,1,30,5.0,944249077
4,1,32,5.0,943228858


In [4]:
movie_descs = pd.read_csv('data/movies_with_description.csv')

# Keep only the ratings whose movieId also appears in movie_descs.
has_description = ratings['movieId'].isin(movie_descs['movieId'])

ratings = ratings[has_description]

In [5]:
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed
from itertools import combinations

from tqdm.notebook import tqdm, trange

# Build an edge list from the ratings dataframe.
# Only ratings of 4 or higher contribute edges.
ratings_filtered = ratings[ratings["rating"] >= 4]

# Accumulator keyed by edge; unseen keys start at a count of zero.
edges = defaultdict(int)

def process_group(movie_id, group):
    """Count directed user-to-user edges for one group of ratings.

    Args:
        movie_id: Key of the group being processed. Not used by the
            computation itself; accepted so the caller can pass the
            groupby key straight through.
        group: Table-like object where ``group['userId'].values`` is an
            array of user ids.

    Returns:
        A ``defaultdict`` (default 0) mapping ``(user_a, user_b)`` tuples of
        native Python scalars to counts. Every pair of positions ``i < j``
        adds one to both ``(users[i], users[j])`` and
        ``(users[j], users[i])``, so a group of n distinct users yields
        n * (n - 1) entries.
    """
    local_edges = defaultdict(int)
    # Convert to native Python scalars once up front instead of calling
    # .item() on every element visit inside the quadratic pair loop.
    users = group['userId'].values.tolist()
    # combinations() yields pairs in the same (i < j) order as the nested
    # index loops it replaces.
    for user_a, user_b in combinations(users, 2):
        local_edges[(user_a, user_b)] += 1
        local_edges[(user_b, user_a)] += 1
    return local_edges

# Submit one process_group task per movieId group, then merge each partial
# edge count into the global `edges` mapping as soon as its task finishes.
# NOTE(review): process_group is a pure-Python loop, so these threads
# presumably contend on the GIL rather than run in parallel -- confirm that a
# thread pool is the intended executor here.
with ThreadPoolExecutor() as executor:
    print("Creating tasks")
    futures = {
        executor.submit(process_group, movie_id, group): movie_id
        for movie_id, group in tqdm(ratings_filtered.groupby('movieId'), desc="Creating tasks")
    }
    print("Processing ratings")
    # as_completed() snapshots the futures when iteration starts, so removing
    # entries from `futures` inside the loop does not disturb it.
    for future in tqdm(as_completed(futures), total=len(futures), desc="Processing ratings"):
        local_edges = future.result()
        for key, value in local_edges.items():
            edges[key] += value
        # Drop the bookkeeping reference so the finished future and its
        # per-movie result dict can be garbage-collected instead of all of
        # them staying alive until the whole loop ends.
        del futures[future]


Creating tasks


Creating tasks:   0%|          | 0/33491 [00:00<?, ?it/s]

: 

: 