In [None]:
import networkx as nx
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Notebook-wide display configuration.
plt.rcParams.update({'figure.figsize': (6, 6)})  # square figures by default

# Show every column, never wrap wide frames, never truncate cell text.
for unlimited_option in ('display.max_columns', 'max_colwidth'):
    pd.set_option(unlimited_option, None)
pd.set_option('display.expand_frame_repr', False)

# Switch matplotlib over to seaborn's default theme.
sns.set()

In [None]:
# Read the two input tables from CSV files in the working directory.
movie_df, rating_df = (
    pd.read_csv(csv_name) for csv_name in ('movies.csv', 'ratings.csv')
)

In [None]:
# Movie ids are used directly as list positions, so allocate one (initially
# empty) user set for every id from 0 up to the largest id in the table.
highest_movie_id = movie_df['id'].max()
movie_count = highest_movie_id + 1
movie2userset = [set() for _slot in range(movie_count)]

In [None]:
# Record, for every movie id, the set of users that rated it.
for movie_id, user_id in zip(rating_df['movie_id'], rating_df['user_id']):
    movie2userset[movie_id].add(user_id)

# Number of distinct raters per movie id.
movie2view = dict(enumerate(map(len, movie2userset)))

In [None]:
# Attach each movie's rater count by looking it up through the `id` column, so
# the values stay correct even when the frame's index labels are not movie ids.
movie_df['view_count'] = movie_df['id'].map(movie2view)

# Keep only the most-viewed movies. From here on `movie_count` means "number of
# movies kept" (no longer "max id + 1"); it is capped at the rows actually
# available so positional loops over the frame cannot run past the end.
movie_count = min(1000, len(movie_df))
movie_df = movie_df.nlargest(movie_count, 'view_count')

# Drop the reference to the ratings table; the cells visible below only use
# the per-movie user sets built from it.
rating_df = None

In [None]:
# Directed graph with one node per retained movie, keyed by movie id.
# The former `nodetype=int` keyword was dropped: `add_nodes_from` stores keyword
# arguments as node attributes, so it only tagged every node with the `int`
# class object and never converted the node keys.
G = nx.DiGraph()
G.add_nodes_from(movie_df['id'])

In [None]:
# LEGACY
# intersection / A: the edge src -> dst is weighted by the shared users divided
# by the size of src's own audience.
# threshold = 0.02
# NOTE(review): i and j are loop positions that are used directly as movie ids
# and node keys here -- confirm this is intended before re-running the cell,
# since every run adds edges to the shared graph G.
threshold = 0.6
for i in tqdm(range(movie_count)):
    users_i = movie2userset[i]
    for j in range(i + 1, movie_count):
        users_j = movie2userset[j]
        shared = len(users_i & users_j)
        # Same test in both directions; a movie without raters gets no out-edge.
        for src, dst, src_users in ((i, j, users_i), (j, i, users_j)):
            if not src_users:
                continue
            w = shared / len(src_users)
            if w > threshold:
                G.add_edge(src, dst, weight=w)

In [None]:
# LEGACY
# intersection / min: shared users divided by the smaller of the two audiences,
# stored with the same weight in both directions.
# threshold = 0.4
# NOTE(review): i and j are loop positions that are used directly as movie ids
# and node keys here -- confirm this is intended before re-running the cell,
# since every run adds edges to the shared graph G.
threshold = 0.02
for i in tqdm(range(movie_count)):
    users_i = movie2userset[i]
    for j in range(i + 1, movie_count):
        users_j = movie2userset[j]
        if not users_i or not users_j:
            continue
        w = len(users_i & users_j) / min(len(users_i), len(users_j))
        if w > threshold:
            G.add_edges_from([(i, j), (j, i)], weight=w)

In [None]:
# Build the co-viewer graph over the retained movies: two movies are linked in
# both directions when the users they share make up more than `threshold` of
# the smaller of the two audiences.
threshold = 0.4

# Read the ids once. Looking them up with `movie_df.iloc[k]['id']` inside the
# nested loop materialises a full row object for every pair of movies.
kept_ids = movie_df['id'].tolist()
for i, i_id in enumerate(tqdm(kept_ids)):
    users_i = movie2userset[i_id]
    if not users_i:
        continue  # a movie without raters can never produce an edge
    for j_id in kept_ids[i + 1:]:
        users_j = movie2userset[j_id]
        if not users_j:
            continue
        w = len(users_i & users_j) / min(len(users_i), len(users_j))
        if w > threshold:
            G.add_edge(i_id, j_id, weight=w)
            G.add_edge(j_id, i_id, weight=w)

In [None]:
# Edge count, movie count, and the fraction of the
# movie_count * (movie_count - 1) possible directed edges that exist.
edge_count = G.number_of_edges()
possible_edge_count = movie_count * (movie_count - 1)
print(edge_count)
print(movie_count)
print(edge_count / possible_edge_count)

In [None]:
# Per-movie number of incoming and outgoing edges, in movie_df row order.
in_edge_count = np.array(
    [len(G.in_edges(movie_id)) for movie_id in movie_df['id']]
)
out_edge_count = np.array(
    [len(G.out_edges(movie_id)) for movie_id in movie_df['id']]
)

In [None]:
# How many movies have no incoming / no outgoing edge at all.
zero_in_edge_node_count = np.sum(in_edge_count == 0)
print(f'Nodes with 0 incoming edges : {zero_in_edge_node_count}')
zero_out_edge_node_count = np.sum(out_edge_count == 0)
print(f'Nodes with 0 outgoing edges : {zero_out_edge_node_count}')

In [None]:
# Normalize weights
# Rescale every movie's outgoing edge weights so that they sum to 1.
# Nodes are looked up through the `id` column (the graph's node keys) rather
# than through the DataFrame's index labels, matching the degree-count cell.

for movie_id in tqdm(movie_df['id']):
    out_edges = list(G.out_edges(movie_id, data=True))
    total_w = sum(data['weight'] for _, _, data in out_edges)
    if total_w > 0:
        for _, _, data in out_edges:
            # `data` is the graph's own attribute dict, so this updates the edge.
            data['weight'] = data['weight'] / total_w

In [None]:
# Normal Pagerank
teleport_prob = 0.15
alpha = 1 - teleport_prob
result = nx.pagerank(G, weight='weight', alpha=alpha)
# `result` is keyed by node (movie id): look scores up through the `id` column
# instead of relying on the frame's index labels being the movie ids.
movie_df['pagerank'] = movie_df['id'].map(result)

# Remove the uniform teleport share every node receives, then renormalise so
# the adjusted scores sum to 1. The share is spread over all nodes of the
# graph, so divide by the graph's node count.
movie_df['pagerank_adj'] = movie_df['pagerank'] - teleport_prob / G.number_of_nodes()
movie_df['pagerank_adj'] = movie_df['pagerank_adj'] / movie_df['pagerank_adj'].sum()

In [None]:
# Creating personalization sets
with open('genre.txt') as file:
    # One genre per line. Blank lines are skipped so the empty string never
    # becomes a genre (it would have zero movies and break the per-genre
    # divisions further down).
    genre_set = {line.strip() for line in file if line.strip()}


# genre -> ids of the retained movies tagged with it ('|'-separated tags).
genre2movies = {genre: [] for genre in genre_set}
for movie_id, genre_str in zip(movie_df['id'], movie_df['genres']):
    if not isinstance(genre_str, str):
        continue  # a missing value (e.g. NaN) has no .split
    for genre in genre_str.split('|'):
        if genre in genre2movies:
            genre2movies[genre].append(movie_id)

# genre2movies['Animation'].remove(0)

genre2count = {genre: len(movies) for genre, movies in genre2movies.items()}

In [None]:
# Display how many of the movies in movie_df fall under each genre.
genre2count

In [None]:
# Different alpha values
# Every movie gets the same teleport share, chosen so that the 'Thriller'
# movies together receive `prob_teleport`; a genre's total teleport probability
# therefore scales with its movie count.
prob_teleport = 0.25
per_movie_share = prob_teleport / genre2count['Thriller']
genre2teleport_prob = {
    genre: per_movie_share * count for genre, count in genre2count.items()
}
print(genre2teleport_prob)
prob_teleport_each_movie = dict.fromkeys(genre2count, per_movie_share)
print(prob_teleport_each_movie)

In [None]:
# Same alpha value
# Every genre teleports with the same total probability, split evenly across
# that genre's movies.
prob_teleport = 0.25
prob_teleport_each_movie = {
    genre: prob_teleport / n_movies for genre, n_movies in genre2count.items()
}
genre2teleport_prob = dict.fromkeys(genre2count, 0.25)

In [None]:
# NOW LEGACY
# Same alpha value
# NOTE(review): when the notebook is run top to bottom, these assignments
# replace the 0.25 configuration from the preceding cell -- confirm which one
# is meant to feed the personalized PageRank cells.
prob_teleport = 0.15
prob_teleport_each_movie = {
    genre: prob_teleport / n_movies for genre, n_movies in genre2count.items()
}
genre2teleport_prob = dict.fromkeys(genre2count, 0.15)



# ----------

# Inspect node 0: which movies point at it, and with what edge weight.
particular_node_weights = {}
for src, _dst, edge_data in G.in_edges(0, data=True):
    particular_node_weights[src] = edge_data['weight']
particular_node_df = movie_df.query('id in @particular_node_weights').copy()
particular_node_df['out_weight'] = pd.Series(particular_node_weights)
particular_node_df.sort_values('out_weight', ascending=False)

In [None]:
# Personalized PageRank, one run per genre: teleports land only on the movies
# of that genre, and the scores are stored in a column named after the genre.
for genre in tqdm(genre2movies):
    personalization_dict = {movie_id: 1 for movie_id in genre2movies[genre]}
    alpha = 1 - genre2teleport_prob[genre]
    result = nx.pagerank(G, alpha=alpha, weight='weight', personalization=personalization_dict)
    # Scores are keyed by movie id, so look them up through the `id` column
    # instead of relying on the frame's index labels.
    movie_df[genre] = movie_df['id'].map(result)

# Adjusted score: take the per-movie teleport share off the genre's own movies
# (selected by id), then renormalise the column to sum to 1.
for genre in tqdm(genre2movies):
    in_genre = movie_df['id'].isin(genre2movies[genre])
    adjusted = movie_df[genre].where(
        ~in_genre, movie_df[genre] - prob_teleport_each_movie[genre]
    )
    movie_df[f'{genre}_adj'] = adjusted / adjusted.sum()

In [None]:
# NUMPY
# Same per-genre personalized PageRank, computed with the dense eigen-solver
# when this NetworkX still ships it. `nx.pagerank_numpy` was removed in
# NetworkX 3.0; on newer versions fall back to `nx.pagerank` with a tight
# tolerance so the result is converged well beyond display precision.
pagerank_numpy = getattr(nx, 'pagerank_numpy', None)

for genre in tqdm(genre2movies):
    personalization_dict = {movie_id: 1 for movie_id in genre2movies[genre]}
    alpha = 1 - genre2teleport_prob[genre]
    if pagerank_numpy is not None:
        result = pagerank_numpy(G, alpha=alpha, weight='weight', personalization=personalization_dict)
    else:
        result = nx.pagerank(G, alpha=alpha, weight='weight', personalization=personalization_dict,
                             tol=1e-10, max_iter=1000)
    # Scores are keyed by movie id, so look them up through the `id` column
    # instead of relying on the frame's index labels.
    movie_df[genre] = movie_df['id'].map(result)

# Adjusted score: take the per-movie teleport share off the genre's own movies
# (selected by id), then renormalise the column to sum to 1.
for genre in tqdm(genre2movies):
    in_genre = movie_df['id'].isin(genre2movies[genre])
    adjusted = movie_df[genre].where(
        ~in_genre, movie_df[genre] - prob_teleport_each_movie[genre]
    )
    movie_df[f'{genre}_adj'] = adjusted / adjusted.sum()

In [None]:
# Display the movie table, now including the PageRank and per-genre columns
# added by the cells above.
movie_df

In [None]:
# Scatter the Western-to-overall score ratio against the overall adjusted
# PageRank and overlay a least-squares straight line.
x = movie_df['pagerank_adj'].to_numpy()
y = (movie_df['Western_adj'] / movie_df['pagerank_adj']).to_numpy()
plt.scatter(x, y)

# Degree-1 fit; the line is drawn between the smallest and largest x only.
m, b = np.polyfit(x, y, deg=1)
x_fit = np.array((x.min(), x.max()))
y_fit = b + m * x_fit
plt.plot(x_fit, y_fit)

plt.show()

# Ratio with the fitted linear trend divided out.
new_y = y / (m * x + b)

In [None]:
# score_df: movie_df without any PageRank-derived column (raw and adjusted,
# global and per genre); the scoring cells add their own genre columns to it.
genre_column_lst = [
    column
    for genre in genre_set
    for column in (genre, f'{genre}_adj')
]
genre_column_lst += ['pagerank', 'pagerank_adj']
score_df = movie_df.drop(columns=genre_column_lst)

In [None]:
# Genre score, variant: (genre_adj / pagerank_adj) with its linear trend in
# pagerank_adj divided out, minus 1.
# NOTE(review): other cells in this notebook also assign score_df[genre]; the
# one executed last decides the values used downstream.
overall = np.array(movie_df['pagerank_adj'])
for genre in genre_set:
    ratio = np.array(movie_df[f'{genre}_adj'] / movie_df['pagerank_adj'])
    slope, intercept = np.polyfit(overall, ratio, deg=1)
    score_df[genre] = ratio / (slope * overall + intercept) - 1

In [None]:
# Genre score, variant: (genre / pagerank) on the unadjusted scores, with its
# linear trend in pagerank divided out, minus 1.
# NOTE(review): other cells in this notebook also assign score_df[genre]; the
# one executed last decides the values used downstream.
overall = np.array(movie_df['pagerank'])
for genre in genre_set:
    ratio = np.array(movie_df[f'{genre}'] / movie_df['pagerank'])
    slope, intercept = np.polyfit(overall, ratio, deg=1)
    score_df[genre] = ratio / (slope * overall + intercept) - 1

In [None]:
# This is the one that works best
# Genre score: the raw genre PageRank divided by its least-squares linear
# prediction from the global PageRank, minus 1.
# NOTE(review): other cells in this notebook also assign score_df[genre]; the
# one executed last decides the values used downstream.

overall = np.array(movie_df['pagerank'])
for genre in genre_set:
    genre_scores = np.array(movie_df[f'{genre}'])
    slope, intercept = np.polyfit(overall, genre_scores, deg=1)
    score_df[genre] = genre_scores / (slope * overall + intercept) - 1

In [None]:
# Genre score, variant: adjusted genre score relative to the adjusted global
# score, minus 1 (no trend removal).
# NOTE(review): other cells in this notebook also assign score_df[genre]; the
# one executed last decides the values used downstream.
for genre in genre_set:
    adjusted_ratio = movie_df[f'{genre}_adj'].div(movie_df['pagerank_adj'])
    score_df[genre] = adjusted_ratio.sub(1)

In [None]:
# Genre score, variant: raw genre score relative to the raw global score,
# minus 1 (no trend removal).
# NOTE(review): other cells in this notebook also assign score_df[genre]; the
# one executed last decides the values used downstream.
for genre in genre_set:
    raw_ratio = movie_df[f'{genre}'].div(movie_df['pagerank'])
    score_df[genre] = raw_ratio.sub(1)

In [None]:
# Display the per-genre score table.
score_df

In [None]:
# Rows of score_df whose tmdb_id equals 747.
score_df[score_df['tmdb_id'] == 747]

In [None]:
# pd.set_option('display.expand_frame_repr', False)
# score_df.nlargest(10, 'Horror')
# Display the 20 rows of score_df with the largest 'Horror' score.
score_df.nlargest(20, 'Horror')

In [None]:
# Fixed colour per genre, assigned in alphabetical order so a genre keeps the
# same colour in every chart.
colors = sns.color_palette('tab20')
genre_lst = sorted(genre_set)
# Cycle through the palette when there are more genres than palette entries
# instead of failing with an IndexError.
genre2color = {genre: colors[i % len(colors)] for i, genre in enumerate(genre_lst)}

In [None]:
# Hand-picked movies to chart. Each value is used as a row label in
# `score_df.loc[...]` by the pie-chart cells.
# NOTE(review): presumably these are movie ids that coincide with the index
# labels -- confirm.
some_movie_id = [5335, 314, 277, 257, 1939, 659, 4137, 520, 8063, 6062, 7075, 4918, 789, 2547, 6770, 7562, 2439]

# with pd.option_context('display.max_rows', None, 'display.max_columns', None):
#    print(movie_df[['id', 'title']])

In [None]:
# One pie chart per selected movie: the relative size of every genre score
# that is positive for that movie.
for index in some_movie_id:
    if index not in score_df.index:
        # The movie is not a row of score_df (e.g. it was not retained).
        print(f'{index}: no such row in score_df, skipped')
        continue

    movie_row = score_df.loc[index]  # fetch the row once, not once per genre
    title = movie_row['title']
    print(movie_row['genres'])

    # Iterate genres in sorted order so wedge order is the same on every run
    # (iteration order of a set of strings can change between kernels).
    positive_scores = [
        (genre, movie_row[genre])
        for genre in sorted(genre_set)
        if movie_row[genre] > 0
    ]
    if not positive_scores:
        print(f'{title}: no genre with a positive score, nothing to plot')
        continue

    genre_lst = [genre for genre, _ in positive_scores]
    score_lst = [score for _, score in positive_scores]
    color_lst = [genre2color[genre] for genre in genre_lst]

    plt.pie(score_lst, labels=genre_lst, colors=color_lst, autopct='%.1f%%')
    plt.title(title)
    plt.show()

In [None]:
# One pie chart per selected movie: the relative size of every genre score
# that is positive for that movie. (Repeats the chart loop, so it reflects
# whatever score_df holds when this cell is executed.)
for index in some_movie_id:
    if index not in score_df.index:
        # The movie is not a row of score_df (e.g. it was not retained).
        print(f'{index}: no such row in score_df, skipped')
        continue

    movie_row = score_df.loc[index]  # fetch the row once, not once per genre
    title = movie_row['title']
    print(movie_row['genres'])

    # Iterate genres in sorted order so wedge order is the same on every run
    # (iteration order of a set of strings can change between kernels).
    positive_scores = [
        (genre, movie_row[genre])
        for genre in sorted(genre_set)
        if movie_row[genre] > 0
    ]
    if not positive_scores:
        print(f'{title}: no genre with a positive score, nothing to plot')
        continue

    genre_lst = [genre for genre, _ in positive_scores]
    score_lst = [score for _, score in positive_scores]
    color_lst = [genre2color[genre] for genre in genre_lst]

    plt.pie(score_lst, labels=genre_lst, colors=color_lst, autopct='%.1f%%')
    plt.title(title)
    plt.show()

In [None]:
# LEGACY
# (score, node) pairs from `result`, highest score first.
# NOTE(review): `result` is reassigned by every PageRank cell above, so this
# reads whichever run was executed last -- confirm that is the intended one.

pgrank_id_pair_lst = sorted(
    ((score, node) for node, score in result.items()),
    reverse=True,
)

In [None]:
# Display the full list of (score, node) pairs, highest score first.
pgrank_id_pair_lst

In [None]:
# Table of the 50 highest-ranked movies with their score from
# pgrank_id_pair_lst.
top_pairs = pgrank_id_pair_lst[:50]
top_movie_id_lst = [idx for pagerank, idx in top_pairs]
top = movie_df.query('id in @top_movie_id_lst').copy()
# Look every row's score up by its own id. Assigning a list that was sorted by
# id pairs values with rows by position, which puts scores on the wrong movies
# whenever the rows of movie_df are not in ascending id order (for instance
# after `nlargest` reordered them by view_count).
top['pagerank'] = top['id'].map({idx: pagerank for pagerank, idx in top_pairs})
top

In [None]:
# Table of the 50 highest-ranked movies with their score from
# pgrank_id_pair_lst (repeats the table cell, so it reflects the list as it is
# when this cell is executed).
top_pairs = pgrank_id_pair_lst[:50]
top_movie_id_lst = [idx for pagerank, idx in top_pairs]
top = movie_df.query('id in @top_movie_id_lst').copy()
# Look every row's score up by its own id. Assigning a list that was sorted by
# id pairs values with rows by position, which puts scores on the wrong movies
# whenever the rows of movie_df are not in ascending id order (for instance
# after `nlargest` reordered them by view_count).
top['pagerank'] = top['id'].map({idx: pagerank for pagerank, idx in top_pairs})
top

In [None]:
# Distribution of the number of incoming / outgoing edges per movie.
# `sns.distplot` is deprecated; use `histplot` (density-normalised histogram
# plus KDE curve) where this seaborn provides it, and only fall back to
# `distplot` on older releases.
for direction, degree_counts in (('incoming', in_edge_count), ('outgoing', out_edge_count)):
    if hasattr(sns, 'histplot'):
        sns.histplot(degree_counts, kde=True, stat='density')
    else:
        sns.distplot(degree_counts)
    plt.title(f'Number of {direction} edges per movie')
    plt.show()

In [None]:
# LEGACY CODE
# Distribution of the directed overlap ratio (shared users / users of the
# source movie) over all ordered pairs, plotted from a random sample.
# NOTE(review): i and j are loop positions that are used directly as movie ids
# here -- confirm this is intended before relying on the plot.

w_lst = []
for i in tqdm(range(movie_count)):
    users_i = movie2userset[i]
    for j in range(i + 1, movie_count):
        users_j = movie2userset[j]
        shared = len(users_i & users_j)  # computed once for both directions
        # A movie without raters contributes a ratio of 0.
        w_lst.append(shared / len(users_i) if users_i else 0)
        w_lst.append(shared / len(users_j) if users_j else 0)
w_lst = np.array(w_lst)

# Seeded generator so the sample, and therefore the plot, is the same on every
# run without touching NumPy's global random state.
w_lst_sample = np.random.RandomState(0).choice(w_lst, size=1_000_000)

# `sns.distplot` is deprecated; prefer `histplot` where available.
if hasattr(sns, 'histplot'):
    sns.histplot(w_lst_sample, kde=True, stat='density')
else:
    sns.distplot(w_lst_sample)