In [6]:
import networkx as nx
import pandas as pd
import numpy as np
from tqdm import tqdm

# Teleport-probability policies for the per-genre personalized PageRank:
#   DIFFERENT_ALPHA_POLICY - each genre gets a teleport probability proportional
#                            to its movie count, so alpha differs per genre.
#   SAME_ALPHA_POLICY      - every genre uses the same teleport probability.
DIFFERENT_ALPHA_POLICY, SAME_ALPHA_POLICY = 0, 1

# Policy used by the cell that builds the per-genre teleport probabilities.
policy = DIFFERENT_ALPHA_POLICY
# Base teleport probability; PageRank's alpha is computed as 1 - teleport probability.
prob_teleport = 0.25

In [7]:
# Movie table; later cells read its 'id' and 'genres' columns.
movie_df = pd.read_csv('movies.csv')
# Number of rows in the movie table, used below as the vertex count.
movie_count = len(movie_df)

In [2]:
# Load the weighted edge list (columns: source, destination, weight).
edge_df = pd.read_csv('edges.csv')
# Mirror every edge (destination -> source, same weight) so the directed graph
# built from this table contains both directions of each listed pair.
df_append = pd.DataFrame({'source':edge_df['destination'], 
                          'destination':edge_df['source'],
                          'weight':edge_df['weight']})
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and keeps the original row labels just as append did.
edge_df = pd.concat([edge_df, df_append])

In [4]:
# Build a directed graph from the edge table; each edge carries its 'weight'
# attribute.
G = nx.from_pandas_edgelist(
    edge_df,
    source='source',
    target='destination',
    edge_attr='weight',
    create_using=nx.DiGraph,
)
# Drop the reference to the edge table; later cells only use the graph.
edge_df = None

In [28]:
# NOTE(review): the vertex count is the row count of movies.csv, not
# G.number_of_nodes() - confirm the two agree.
print(f'Edge count: {G.number_of_edges()}',
      f'Vertex count: {movie_count}',
      sep='\n')

# Directed-graph density: edges present / ordered vertex pairs, n * (n - 1).
density = G.number_of_edges() / (movie_count * (movie_count - 1))

print(f'Density: {density:.4f}')

Edge count: 1339532
Vertex count: 1572
Density: 0.5424


In [10]:
# Per-movie number of incoming / outgoing edges, in movie_df row order.
in_edge_count = np.array([len(G.in_edges(movie_id)) for movie_id in movie_df['id']])
out_edge_count = np.array([len(G.out_edges(movie_id)) for movie_id in movie_df['id']])

# Count movies that have no incoming or no outgoing edges at all.
zero_in_edge_node_count = np.sum(in_edge_count == 0)
zero_out_edge_node_count = np.sum(out_edge_count == 0)

print(f'Nodes with 0 incoming edges : {zero_in_edge_node_count}')
print(f'Nodes with 0 outgoing edges : {zero_out_edge_node_count}')

Nodes with 0 incoming edges : 0
Nodes with 0 outgoing edges : 0


In [15]:
# Normalize weights
#
# Rescale the outgoing edge weights of every node so they sum to 1.
# The loop walks the graph's own nodes rather than movie_df.index: the
# DataFrame row labels are only node ids by coincidence, and the other cells
# address nodes through movie_df['id'].
for node in tqdm(G.nodes):
    # Materialize once so the edge view is not walked twice.
    out_edges = list(G.out_edges(node, data=True))
    total_w = sum(data['weight'] for _, _, data in out_edges)
    # Nodes with no outgoing weight are left untouched (avoids division by 0).
    if total_w > 0:
        for u, v, data in out_edges:
            G[u][v]['weight'] = data['weight'] / total_w

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1572/1572 [00:02<00:00, 642.39it/s]


In [16]:
# Normal Pagerank
# Unpersonalized PageRank over the weighted graph; alpha (the damping factor)
# is 1 - teleport probability.
teleport_prob = 0.15
alpha = 1 - teleport_prob
result = nx.pagerank(G, weight='weight', alpha=alpha)
# `result` is keyed by node id. Look the score up through the 'id' column
# instead of relying on pd.Series(result) being aligned with the DataFrame's
# row labels, which is only correct when row labels happen to equal the ids.
movie_df['pagerank'] = movie_df['id'].map(result)

In [19]:
# Creating personalization sets
# Genres of interest: one name per line of genre.txt, surrounding whitespace removed.
with open('genre.txt') as file:
    genre_set = {line.strip() for line in file}


# Map each genre of interest to the ids of the movies tagged with it.
# A movie's 'genres' field is a '|'-separated list; genres not in genre_set
# are ignored.
genre2movies = {genre: [] for genre in genre_set}
for movie_id, genre_str in zip(movie_df['id'], movie_df['genres']):
    for genre in genre_str.split('|'):
        bucket = genre2movies.get(genre)
        if bucket is not None:
            bucket.append(movie_id)

# Number of movies per genre.
genre2count = dict(zip(genre2movies, map(len, genre2movies.values())))

In [21]:

if policy != DIFFERENT_ALPHA_POLICY:
    # Same alpha value: every genre teleports with probability prob_teleport,
    # split evenly over that genre's movies.
    prob_teleport_each_movie = {genre: prob_teleport / genre2count[genre] for genre in genre2count}
    genre2teleport_prob = dict.fromkeys(genre2count, prob_teleport)
else:
    # Different alpha values: fix one per-movie teleport probability, chosen so
    # that 'Thriller' as a whole teleports with prob_teleport, and scale every
    # genre's teleport probability by its movie count.
    prob_teleport_each_movie = prob_teleport / genre2count['Thriller']
    genre2teleport_prob = {genre: prob_teleport_each_movie * genre2count[genre] for genre in genre2count}
    print(genre2teleport_prob)
    # Re-expose the single per-movie value as a per-genre dict, matching the
    # shape produced by the other policy.
    prob_teleport_each_movie = dict.fromkeys(genre2count, prob_teleport_each_movie)
    print(prob_teleport_each_movie)

{'Horror': 0.06162790697674418, 'War': 0.0377906976744186, 'Crime': 0.1436046511627907, 'Romance': 0.17034883720930233, 'Adventure': 0.22616279069767442, 'Children': 0.08488372093023255, 'Action': 0.2947674418604651, 'Thriller': 0.25, 'Fantasy': 0.10755813953488372, 'Drama': 0.3808139534883721, 'Western': 0.020930232558139535, 'Sci-Fi': 0.1686046511627907, 'Comedy': 0.3627906976744186, 'Musical': 0.03662790697674419, 'Mystery': 0.07383720930232558}
{'Horror': 0.0005813953488372093, 'War': 0.0005813953488372093, 'Crime': 0.0005813953488372093, 'Romance': 0.0005813953488372093, 'Adventure': 0.0005813953488372093, 'Children': 0.0005813953488372093, 'Action': 0.0005813953488372093, 'Thriller': 0.0005813953488372093, 'Fantasy': 0.0005813953488372093, 'Drama': 0.0005813953488372093, 'Western': 0.0005813953488372093, 'Sci-Fi': 0.0005813953488372093, 'Comedy': 0.0005813953488372093, 'Musical': 0.0005813953488372093, 'Mystery': 0.0005813953488372093}


In [23]:
# Personalized PageRank per genre: teleportation is restricted to the genre's
# movies (equal weight each), and alpha is 1 - that genre's teleport probability.
for genre in tqdm(genre2movies):
    personalization_dict = {movie_id:1 for movie_id in genre2movies[genre]}
    alpha = 1 - genre2teleport_prob[genre]
    result = nx.pagerank(G, alpha=alpha, weight='weight', personalization=personalization_dict)
    # `result` is keyed by node id. Look the score up through the 'id' column
    # rather than relying on pd.Series(result) lining up with the DataFrame's
    # row labels, which is only correct when row labels equal the ids.
    movie_df[genre] = movie_df['id'].map(result)

  result = nx.pagerank_numpy(G, alpha=alpha, weight='weight', personalization=personalization_dict)
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [01:01<00:00,  4.11s/it]


In [25]:
# Save the movie table, now including the 'pagerank' column and one score
# column per genre, to movies_pr.csv.
# NOTE(review): to_csv writes the row index as a leading unnamed column by
# default; pass index=False if downstream readers do not expect it.
movie_df.to_csv('movies_pr.csv')