In [3]:
import networkx as nx
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Notebook-wide plotting and display configuration.
plt.rcParams['figure.figsize'] = (12, 10)

# Show every column, keep wide frames on one line, and do not truncate cell text.
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
# Use the full option key; the bare 'max_colwidth' only resolves through
# pandas' pattern matching of option names.
pd.set_option('display.max_colwidth', None)

sns.set()

In [4]:
# Load the movie table and the rating table from CSV files in the working directory.
movie_df, rating_df = (pd.read_csv(csv_name) for csv_name in ('movies.csv', 'ratings.csv'))

In [5]:
# Restrict the analysis to the first 1000 rows of the movie table.
# .copy() makes the subset an independent frame, so the score columns that later
# cells add are written to this frame itself rather than to a slice of the full table.
movie_df = movie_df.iloc[:1000].copy()

In [7]:
# Movie ids are used directly as list indices below, so allocate one slot
# for every id in 0..max(id).
highest_movie_id = movie_df['id'].max()
movie_count = highest_movie_id + 1
# movie2userset[m] collects the ids of the users that have a rating row for movie m.
movie2userset = [set() for _ in range(movie_count)]

In [27]:
# For every rating whose movie id is below movie_count, record the rating user
# in that movie's user set. Ratings of other movies are ignored.
in_range = rating_df['movie_id'] < movie_count
for movie_id, user_id in zip(rating_df.loc[in_range, 'movie_id'],
                             rating_df.loc[in_range, 'user_id']):
    movie2userset[movie_id].add(user_id)

In [29]:
# Directed graph with one node per movie id in range(movie_count); edges are added by later cells.
# NOTE(review): add_nodes_from appears to store extra keyword arguments as node attributes,
# so this attaches a 'nodetype' attribute holding the `int` type to every node.
# Nothing in the visible cells reads it — confirm whether it is needed.
G = nx.DiGraph()
G.add_nodes_from(range(movie_count), nodetype = int)

In [None]:
# intersection / A
# Directed overlap weight between two movies:
#   edge i -> j : |users(i) & users(j)| / |users(i)|
#   edge j -> i : |users(i) & users(j)| / |users(j)|
# An edge is only added when its weight is strictly greater than `threshold`.
# NOTE(review): the "intersection / min" cell writes edges into the same G;
# decide which weighting should end up in the graph before running both.
# threshold = 0.02
threshold = 0.6
for i in tqdm(range(movie_count)):
    users_i = movie2userset[i]
    for j in range(i + 1, movie_count):
        users_j = movie2userset[j]
        # Both directions share the same intersection, so compute it once per pair.
        shared_count = len(users_i & users_j)
        if len(users_i) > 0:
            w = shared_count / len(users_i)
            if w > threshold:
                G.add_edge(i, j, weight=w)
        if len(users_j) > 0:
            w = shared_count / len(users_j)
            if w > threshold:
                G.add_edge(j, i, weight=w)

In [33]:
# intersection / min
# Symmetric overlap weight: the number of shared users divided by the size of the
# smaller of the two user sets. Pairs where either set is empty are skipped, and
# both directions receive the same weight when it is strictly above `threshold`.
threshold = 0.02
for i in tqdm(range(movie_count)):
    users_i = movie2userset[i]
    for j in range(i + 1, movie_count):
        users_j = movie2userset[j]
        if not users_i or not users_j:
            continue
        smaller_size = min(len(users_i), len(users_j))
        w = len(users_i & users_j) / smaller_size
        if w <= threshold:
            continue
        for src, dst in ((i, j), (j, i)):
            G.add_edge(src, dst, weight=w)

100%|██████████████████████████████████████| 1000/1000 [00:01<00:00, 681.40it/s]


In [34]:
# Report the edge count, the node count, and the share of the
# movie_count * (movie_count - 1) possible directed edges that exist.
edge_count = G.number_of_edges()
possible_edge_count = movie_count * (movie_count - 1)
print(edge_count)
print(movie_count)
print(edge_count / possible_edge_count)

634552
1000
0.6351871871871871


In [35]:
def _edges_per_node(edge_view):
    """Return an array with len(edge_view(node)) for every node id in range(movie_count)."""
    return np.array([len(edge_view(node)) for node in range(movie_count)])


# Number of incoming / outgoing edges per node, indexed by node id.
in_edge_count = _edges_per_node(G.in_edges)
out_edge_count = _edges_per_node(G.out_edges)

In [36]:
# Count the nodes that have no incoming edge and the nodes that have no outgoing edge.
no_incoming = in_edge_count == 0
no_outgoing = out_edge_count == 0
zero_in_edge_node_count = no_incoming.sum()
zero_out_edge_node_count = no_outgoing.sum()
print(f'Nodes with 0 incoming edges : {zero_in_edge_node_count}')
print(f'Nodes with 0 outgoing edges : {zero_out_edge_node_count}')

Nodes with 0 incoming edges : 1
Nodes with 0 outgoing edges : 1


In [37]:
# Normalize weights
# Rescale the outgoing edge weights of every node so that they sum to 1.
# Nodes whose outgoing weights sum to 0 (including nodes without outgoing edges)
# are left untouched.
# NOTE(review): nx.pagerank may already row-normalise edge weights internally,
# which would make this step redundant — confirm against the networkx docs.

for node in tqdm(range(movie_count)):
    # Each entry is (source, target, attribute dict); the dict is the edge's own data.
    out_edges = list(G.out_edges(node, data=True))
    total_w = sum(attrs['weight'] for _, _, attrs in out_edges)
    if total_w > 0:
        for _, _, attrs in out_edges:
            attrs['weight'] = attrs['weight'] / total_w

100%|██████████████████████████████████████| 1000/1000 [00:01<00:00, 949.87it/s]


In [38]:
# Normal Pagerank
# `result` maps node id -> PageRank score for the weighted graph.
result = nx.pagerank(G, weight='weight')
# Attach the score through the movie id. Assigning pd.Series(result) directly would
# align on the DataFrame index, which only matches the node ids while the row labels
# happen to equal the 'id' column. Ids missing from `result` become NaN.
movie_df['pagerank'] = movie_df['id'].map(result)

In [39]:
# Display movie_df, which now carries the 'pagerank' column added in the previous cell.
movie_df

Unnamed: 0,id,movielens_id,title,genres,imdb_id,tmdb_id,pagerank
0,0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,0.002308
1,1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0,0.001531
2,2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602.0,0.001225
3,3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357.0,0.000711
4,4,5,Father of the Bride Part II (1995),Comedy,113041,11862.0,0.001042
...,...,...,...,...,...,...,...
995,995,1297,Real Genius (1985),Comedy,89886,14370.0,0.001047
996,996,1298,Pink Floyd: The Wall (1982),Drama|Musical,84503,12104.0,0.000975
997,997,1299,"Killing Fields, The (1984)",Drama|War,87553,625.0,0.001231
998,998,1300,My Life as a Dog (Mitt liv som hund) (1985),Comedy|Drama,89606,8816.0,0.000953


In [42]:
# Ten movies with the highest value in the 'Comedy' column.
# NOTE: the per-genre columns are created by the "Creating personalization sets" cell,
# which appears further down but was executed first (In [41] vs In [42]).
# Run that cell before this one, otherwise 'Comedy' does not exist yet.
# display.expand_frame_repr is already set to False in the configuration cell at the top.
movie_df.nlargest(10, 'Comedy')

Unnamed: 0,id,movielens_id,title,genres,imdb_id,tmdb_id,pagerank,Animation,Western,Horror,Fantasy,Musical,Romance,Adventure,Thriller,Comedy,Action,Documentary,Mystery,Crime,War,Drama,Sci-Fi,Children
314,314,356,Forrest Gump (1994),Comedy|Drama|Romance|War,109830,13.0,0.002642,0.00178,0.001712,0.001705,0.001727,0.001803,0.003691,0.001771,0.001733,0.002905,0.001703,0.002089,0.00177,0.001808,0.010127,0.002632,0.001686,0.001886
257,257,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,110912,680.0,0.002688,0.001747,0.001814,0.001968,0.001753,0.001725,0.001793,0.001721,0.004045,0.002904,0.001697,0.002404,0.001887,0.005055,0.001866,0.002716,0.001705,0.001814
0,0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,0.002308,0.012391,0.001377,0.001699,0.007317,0.001557,0.001617,0.004713,0.001479,0.002718,0.001442,0.002433,0.001462,0.001438,0.001458,0.001616,0.001463,0.005695
520,520,608,Fargo (1996),Comedy|Crime|Drama|Thriller,116282,275.0,0.002237,0.001508,0.001455,0.001414,0.001465,0.001492,0.001488,0.001363,0.003715,0.002596,0.001375,0.002485,0.001583,0.004746,0.001475,0.002357,0.001449,0.001415
337,337,380,True Lies (1994),Action|Adventure|Comedy|Romance|Thriller,111503,36955.0,0.002161,0.001442,0.001422,0.001737,0.00142,0.001375,0.003314,0.00464,0.003727,0.00255,0.004021,0.001368,0.001391,0.001418,0.001301,0.001428,0.001393,0.001547
969,969,1270,Back to the Future (1985),Adventure|Comedy|Sci-Fi,88763,105.0,0.002059,0.001387,0.001332,0.001368,0.001349,0.001462,0.001357,0.004504,0.001298,0.002494,0.001294,0.001486,0.001401,0.00132,0.001413,0.0014,0.007122,0.00141
964,964,1265,Groundhog Day (1993),Comedy|Fantasy|Romance,107048,137.0,0.00195,0.00134,0.001239,0.001239,0.007066,0.0014,0.003204,0.001234,0.001216,0.002436,0.001204,0.001434,0.001303,0.001334,0.001282,0.001331,0.00124,0.001311
325,325,367,"Mask, The (1994)",Action|Comedy|Crime|Fantasy,110475,854.0,0.001925,0.00147,0.001243,0.001563,0.007183,0.001316,0.001221,0.001391,0.001269,0.00243,0.00385,0.000931,0.00118,0.004533,0.001217,0.001216,0.001283,0.001518
436,436,500,Mrs. Doubtfire (1993),Comedy|Drama,107614,788.0,0.001899,0.001417,0.001245,0.001221,0.0013,0.001359,0.001225,0.001305,0.001166,0.002417,0.001204,0.00098,0.00118,0.001258,0.001207,0.002017,0.001171,0.001481
505,505,587,Ghost (1990),Comedy|Drama|Fantasy|Romance|Thriller,99653,251.0,0.001953,0.001203,0.001222,0.001214,0.007016,0.001293,0.00324,0.001245,0.003477,0.002406,0.001225,0.001242,0.001219,0.001314,0.001211,0.002106,0.001153,0.001332


In [41]:
# Creating personalization sets
# genre.txt is read as one genre name per line. Blank lines are skipped, and
# dict.fromkeys drops duplicates while keeping the file order, so the per-genre
# columns are created in a reproducible order (iterating a set of strings can
# yield a different order in every kernel session).
with open('genre.txt') as file:
    genre_names = list(dict.fromkeys(line.strip() for line in file if line.strip()))

genre_set = set(genre_names)  # same contents as before, kept for any cell that still uses it

# genre -> ids of the movies whose '|'-separated genre string contains that genre.
genre2movies = {genre: [] for genre in genre_names}
for movie_id, genre_str in zip(movie_df['id'], movie_df['genres']):
    for genre in genre_str.split('|'):
        if genre in genre2movies:
            genre2movies[genre].append(movie_id)

# One personalised PageRank per genre: the personalization vector puts equal
# weight on every movie of the genre and none on the others.
for genre in tqdm(genre2movies):
    seed_movie_ids = genre2movies[genre]
    if not seed_movie_ids:
        # networkx requires at least one non-zero personalization value, so a genre
        # without movies cannot be ranked; no column is created for it.
        continue
    personalization_dict = {movie_id: 1 for movie_id in seed_movie_ids}
    # Use a separate name so the global ranking stored in `result` is not overwritten.
    genre_result = nx.pagerank(G, alpha=0.6, weight='weight', personalization=personalization_dict)
    # Attach scores through the movie id rather than through index alignment.
    movie_df[genre] = movie_df['id'].map(genre_result)

100%|███████████████████████████████████████████| 17/17 [00:26<00:00,  1.54s/it]


In [None]:
# LEGACY
# (score, node id) pairs ordered from the highest score to the lowest;
# equal scores are ordered by descending node id.
# NOTE(review): `result` is whichever nx.pagerank output was assigned last before
# this cell runs — check that it is the ranking you intend to list.

pgrank_id_pair_lst = sorted(((score, node) for node, score in result.items()), reverse=True)

In [None]:
# Show only the head of the ranking; echoing the whole list prints one
# (score, node id) pair per graph node.
pgrank_id_pair_lst[:10]

In [None]:
# Rows of movie_df for the 50 highest-ranked node ids, with their score.
# The score is attached through the movie id, so the table stays correct when
# movie_df is not ordered by id or has no row for one of the ranked node ids
# (positional assignment would mis-assign or fail in those cases).
top_pairs = pgrank_id_pair_lst[:50]
top_movie_id_lst = [idx for pagerank, idx in top_pairs]
id2pagerank = {idx: pagerank for pagerank, idx in top_pairs}
top = movie_df.query('id in @top_movie_id_lst').copy()
top['pagerank'] = top['id'].map(id2pagerank)
top

In [None]:
# Distribution of the number of incoming and outgoing edges per node, one figure each.
# Titles and axis labels make the two figures distinguishable when skimming.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# sns.histplot(..., kde=True) once the minimum seaborn version is confirmed.
for counts, label in ((in_edge_count, 'incoming edges'), (out_edge_count, 'outgoing edges')):
    fig, ax = plt.subplots()
    sns.distplot(counts, ax=ax)
    ax.set(title=f'Distribution of {label} per node', xlabel=f'Number of {label}', ylabel='Density')
    plt.show()

In [None]:
# LEGACY CODE
# Distribution of the directed "intersection / A" weights over all movie pairs.
# For every pair i < j two values are collected: the shared-user count divided by
# |users(i)| and divided by |users(j)|; an empty user set contributes a weight of 0.

w_lst = []
for i in tqdm(range(movie_count)):
    users_i = movie2userset[i]
    for j in range(i + 1, movie_count):
        users_j = movie2userset[j]
        # Both directions share the same intersection, so compute it once per pair.
        shared_count = len(users_i & users_j)
        w_lst.append(shared_count / len(users_i) if users_i else 0)
        w_lst.append(shared_count / len(users_j) if users_j else 0)
w_lst = np.array(w_lst)
# Seeded generator so that the sampled histogram is the same on every run.
# Sampling is with replacement, as before.
rng = np.random.default_rng(0)
w_lst_sample = rng.choice(w_lst, size=1_000_000)
sns.distplot(w_lst_sample)