In [None]:
import mga
import mga.dataset

In [None]:
import mga.pagerank

In [None]:
# Development aid: IPython's trailing-`?` help lookup for mga.pagerank.
# It assigns nothing, so no later cell depends on it.
mga.pagerank?

In [None]:
# Fetch the dataset through the project's mga.dataset helper.
# force_download=True presumably re-fetches even when a local copy already
# exists — TODO confirm against the mga.dataset implementation.
mga.dataset.download_ml_small(force_download=True)

In [None]:
# Project clean-up hook; what it removes is defined inside the mga package,
# not in this notebook.
# NOTE(review): this runs right after the forced download — confirm it does
# not delete files that the following cells still need.
mga.clean()

In [None]:
# Run the project's generator executable (under ./cpp) on ratings.mtx via the shell.
!./cpp/generate ratings.mtx
# Creates edges.csv, which a later cell loads with pd.read_csv.

In [None]:
import pandas as pd
import numpy as np
import sknetwork as skn
from tqdm import tqdm

In [None]:
# Movie table: read once from disk; its row count is reused below as the
# number of vertices when reporting graph statistics.
movie_df = pd.read_csv('movies.csv')
movie_count = len(movie_df)

In [None]:
# Edge list used to build the graph in the next cell; its rows are passed
# on unchanged as a plain array (edge_df.values).
edge_df = pd.read_csv('edges.csv')

In [None]:
# Parse the edge list into a directed graph container.
G = skn.data.convert_edge_list(edge_df.values, directed=True)
print('Graph is imported')

# Symmetrise by adding the transpose: every edge becomes available in both
# directions, and weights of edges that already existed both ways add up.
M_adj = G.adjacency + G.adjacency.T

In [None]:
# --- Graph statistics, computed on the symmetrised adjacency before normalisation ---

# Number of stored entries. M_adj was built as A + A.T, so an edge between
# i and j is stored as both (i, j) and (j, i).
edge_count = M_adj.nnz

print(f'Edge count: {edge_count}')
print(f'Vertex count: {movie_count}')

# Density relative to the number of ordered vertex pairs. With fewer than
# two vertices that number is zero, so report 0.0 instead of dividing by it.
possible_edge_count = movie_count * (movie_count - 1)
density = edge_count / possible_edge_count if possible_edge_count else 0.0

print(f'Density: {density:.4f}')

zero_in_edge_node_count = (M_adj.sum(axis=0) == 0).sum()
print(f'Nodes with 0 incoming edges : {zero_in_edge_node_count}')
zero_out_edge_node_count = (M_adj.sum(axis=1) == 0).sum()
print(f'Nodes with 0 outgoing edges : {zero_out_edge_node_count}')

# Normalize weights

# Work on a float CSR copy:
#   * in-place true division is rejected by NumPy for integer/boolean data,
#   * the copy leaves the matrix object built in the previous cell untouched,
#   * CSR guarantees `indptr` describes how many stored entries each row has.
M_adj = M_adj.tocsr().astype(float)

# ravel() (rather than squeeze()) keeps a 1-D array even for a 1x1 matrix.
row_sum = np.asarray(M_adj.sum(axis=1)).ravel()
# Rows without any weight keep a divisor of 1 so they stay all-zero.
row_sum[row_sum == 0] = 1

# Expand the per-row divisor to one value per stored entry. np.diff(indptr)
# is the stored-entry count of each row, so the result always has exactly
# len(M_adj.data) elements — even if the matrix stores explicit zeros.
M_adj.data /= np.repeat(row_sum, np.diff(M_adj.indptr))

G.adjacency = M_adj

In [None]:
# PARAMETERS

# Identifiers for the two ways of deriving per-genre teleport probabilities.
DIFFERENT_ALPHA_POLICY, SAME_ALPHA_POLICY = 0, 1

# Which of the two policies the teleport-set cell applies.
policy = DIFFERENT_ALPHA_POLICY

# Base teleport probability; the damping factor is derived as 1 - teleport_prob.
teleport_prob = 0.15

# Movie ids to leave out when the per-genre movie lists are built.
unlabeled_movie_set = set()

# Forwarded to tqdm's `disable` argument in the per-genre loop.
disable_progress_bar = False

In [None]:
# Classic (non-personalised) PageRank over the whole graph.
damping_factor = 1 - teleport_prob

# Use the public `skn.ranking.PageRank` entry point — the same one the
# topic-specific cell uses — instead of reaching into the implementation
# submodule `skn.ranking.pagerank`.
pagerank = skn.ranking.PageRank(damping_factor=damping_factor, n_iter=50)
scores = pagerank.fit_transform(G.adjacency)

# pd.Series(scores) carries a 0..n-1 index, so the assignment aligns scores
# with movie_df by index label.
# NOTE(review): assumes graph node i corresponds to movie_df row i — confirm
# against how edges.csv numbers its vertices.
movie_df['pagerank'] = pd.Series(scores)

print('Classic pagerank completed')

In [None]:
# Creating personalization sets

# genre.txt is read as one genre name per line. Blank lines (for example a
# trailing newline at the end of the file) are skipped so that no
# empty-string genre is created.
with open('genre.txt') as file:
    genre_set = {line.strip() for line in file if line.strip()}

genre2movies = {genre: [] for genre in genre_set}
for movie_id, genre_str in zip(movie_df['id'], movie_df['genres']):

    if movie_id in unlabeled_movie_set:
        continue

    # A missing genres field is read by pandas as NaN (a float), which has
    # no .split(); such movies simply contribute to no genre.
    if not isinstance(genre_str, str):
        continue

    for genre in genre_str.split('|'):
        if genre in genre2movies:
            genre2movies[genre].append(movie_id)

# A genre without any labelled movie has no teleport set: it would cause a
# division by zero in the same-alpha branch and a teleport probability of 0
# in the different-alpha branch. Report and drop such genres.
empty_genres = sorted(genre for genre, movies in genre2movies.items() if not movies)
if empty_genres:
    print(f'Skipping genres without labeled movies: {empty_genres}')
    for genre in empty_genres:
        del genre2movies[genre]

genre2count = {genre: len(movies) for genre, movies in genre2movies.items()}

# Topic-specific Pagerank teleport set generation

# Genre whose total teleport probability equals `teleport_prob` under the
# different-alpha policy; all other genres are scaled relative to its size.
REFERENCE_GENRE = 'Thriller'

if policy == DIFFERENT_ALPHA_POLICY:
    # Different alpha values
    if REFERENCE_GENRE not in genre2count:
        raise KeyError(
            f'Reference genre {REFERENCE_GENRE!r} has no labeled movies; '
            'cannot derive per-movie teleport probability'
        )

    # Every movie gets the same teleport probability, so a genre's total is
    # proportional to its size. A genre much larger than the reference can
    # reach a total of 1 or more, which leaves no positive value for
    # 1 - total when the damping factor is derived from it.
    per_movie_prob = teleport_prob / genre2count[REFERENCE_GENRE]
    genre2teleport_prob = {genre: per_movie_prob * count for genre, count in genre2count.items()}
    print(genre2teleport_prob)
    prob_teleport_each_movie = dict.fromkeys(genre2count, per_movie_prob)
    print(prob_teleport_each_movie)
else:
    # Same alpha value
    # Every genre has the same total; it is split evenly over its movies.
    prob_teleport_each_movie = {genre: teleport_prob / count for genre, count in genre2count.items()}
    genre2teleport_prob = dict.fromkeys(genre2count, teleport_prob)

print('Teleport sets are created')

In [None]:
# Topic-specific Pagerank
# One personalised run per genre: the genre's movies are the seeds (each
# with weight 1) and the damping factor is derived from that genre's
# teleport probability. Each run's scores are stored in a movie_df column
# named after the genre.
for genre, seed_movies in tqdm(genre2movies.items(), disable=disable_progress_bar):
    damping_factor = 1 - genre2teleport_prob[genre]
    personalization_dict = dict.fromkeys(seed_movies, 1)

    pagerank = skn.ranking.PageRank(n_iter=50, damping_factor=damping_factor)
    result = pagerank.fit_transform(G.adjacency, seeds=personalization_dict)
    movie_df[genre] = pd.Series(result)