In [12]:
import os 
import pandas as pd 
import numpy as np 
import networkx as nx 
from itertools import permutations
from math import factorial
from sklearn.model_selection import train_test_split
from functools import reduce



ROOT = os.getenv('ROOT_FOLDER')
if ROOT is None:
    # Fail fast with an actionable message instead of the opaque TypeError
    # that os.path.join(None, ...) would raise on the next statement.
    raise EnvironmentError("ROOT_FOLDER environment variable is not set; "
                           "point it at the project root before running this notebook.")

# Read only the four columns that the rest of the notebook uses.
df = pd.read_csv(os.path.join(ROOT, "app/resources/rating.csv"), usecols=['userId', 'movieId', 'timestamp',
                                                                          'rating'])
# Parse timestamps and store movie ids as strings.
# NOTE(review): presumably the string cast keeps movie node labels distinct from
# user ids when both are inserted into the same graph later -- confirm.
df = df.assign(timestamp=pd.to_datetime(df.timestamp),
              movieId=df.movieId.astype(str))

In [2]:
# Shapes of the unique-user and unique-movie id arrays in the full ratings frame.
df.userId.unique().shape, df.movieId.unique().shape

((138493,), (26744,))

###  Train test 

Steps 

1. Randomly sample 10% of users and movies
2. Keep only movies and users that have more than 10 edges (ratings) in the sampled dataset (this helps to stratify between train and test set)
3. Train, test splits with stratification on movieIds (to ensure that there are no new movies in the test set)



In [3]:
def movie_filter(df, col, thresh):
    """Keep the rows of `df` whose `col` value occurs more than `thresh` times.

    Args:
        df: frame to filter.
        col: name of the column whose value frequencies are counted.
        thresh: exclusive lower bound on the number of occurrences.

    Returns:
        The subset of `df` whose `col` value appears strictly more than
        `thresh` times in `df`.
    """
    counts = df[col].value_counts()
    frequent_values = counts[counts > thresh].index
    keep_rows = df[col].isin(frequent_values)
    return df[keep_rows]


def get_subsample(df, col, size=0.10, random_state=None):
    """Keep all rows belonging to a random fraction of the unique keys.

    Args:
        df: frame to subsample.
        col: column holding the keys to sample; a falsy value (e.g. None)
            samples on the frame's index instead.
        size: fraction of the unique keys to keep; the number of keys kept
            is truncated to an integer.
        random_state: optional seed for a private RNG so the draw is
            reproducible. When None (default) NumPy's global RNG is used,
            exactly as before, so `np.random.seed` still governs the result.

    Returns:
        The rows of `df` whose key is among the sampled keys.
    """
    keys = df[col] if col else df.index
    uniques = np.unique(keys)
    n_keep = int(len(uniques) * size)

    if random_state is None:
        choose = np.random.choice
    else:
        choose = np.random.RandomState(random_state).choice

    # Sample without replacement so every kept key is distinct.
    subsample = choose(uniques, size=n_keep, replace=False)
    return df[keys.isin(subsample)]
        

In [4]:
 


# Seed NumPy's global RNG so the random user/movie subsample (and every shape
# reported below) is reproducible on Restart & Run All.
np.random.seed(42)

# Sample 10% of the users, then 10% of the movies those users rated.
df_sample = get_subsample(df, 'userId', 0.10)
df_sample = get_subsample(df_sample, 'movieId', 0.10)
# Keep movies, then users, that still have more than 10 ratings.
df_sample = movie_filter(df_sample, 'movieId', 10)
df_sample = movie_filter(df_sample, 'userId', 10)



In [5]:
# Size of the filtered sample and shapes of its unique user / movie id arrays.
df_sample.shape,df_sample.userId.unique().shape,df_sample.movieId.unique().shape

((144207, 3), (4551,), (840,))

In [6]:
# 80/20 split of the sampled ratings, stratified on movieId, with a fixed seed
# so the split itself is repeatable for a given df_sample.
train, test = train_test_split(df_sample, 
                                 test_size=0.2, 
                                 stratify=df_sample[['movieId']], 
                              random_state=42)

In [7]:
# Row/column counts of the two splits.
train.shape, test.shape

((115365, 3), (28842, 3))

In [8]:
# Shapes of the unique user / movie id arrays in the training split.
train.userId.unique().shape,train.movieId.unique().shape

((4551,), (840,))

In [9]:
# Shapes of the unique user / movie id arrays in the test split.
test.userId.unique().shape,test.movieId.unique().shape

((4442,), (840,))

In [10]:
# Users that occur in test but not in train (an empty set means no cold-start users).
set(test.userId).difference(set(train.userId))

set()

In [11]:
# Movies that occur in test but not in train (an empty set means no cold-start movies).
set(test.movieId).difference(set(train.movieId))

set()

In [None]:
# Number of distinct (user, movie) pairs in test that do not appear in train.
len(set(zip(test.userId, test.movieId)).difference(set(zip(train.userId, train.movieId))))

###  User item Bipartite graph

In [12]:
def user_item_bipartite(df):
    """Build an undirected user-movie graph from a ratings frame.

    Every distinct `userId` becomes a node tagged ``bipartite=0`` and every
    distinct `movieId` a node tagged ``bipartite=1``; each row of `df`
    contributes one (userId, movieId) edge.

    Args:
        df: frame with `userId` and `movieId` columns.

    Returns:
        The populated `nx.Graph`.
    """
    graph = nx.Graph()

    # Register both node sets, tagging each with the side it belongs to.
    for column, side in (('userId', 0), ('movieId', 1)):
        graph.add_nodes_from(df[column].unique(), bipartite=side)

    # One edge per rating row; edges only ever join a user to a movie.
    rating_edges = list(zip(df['userId'], df['movieId']))
    graph.add_edges_from(rating_edges)
    return graph
    

In [13]:
B = user_item_bipartite(train)

### Item Item graph

In [14]:

def nPr(n, r):
    """Number of ordered arrangements of `r` items drawn from `n` (n! / (n-r)!).

    Uses integer floor division so the result stays exact for any size of
    `n`; true division would go through a float, losing precision and
    raising OverflowError once the factorials exceed the float range.

    Raises:
        ValueError: if `r > n` (factorial of a negative number).
    """
    return factorial(n) // factorial(n - r)


def get_edge_weight_bool(graph, item1, item2):
    """Report whether `item1` and `item2` share at least one neighbour in `graph`.

    Args:
        graph: graph containing both items (the user-item graph).
        item1: first item node.
        item2: second item node.

    Returns:
        True if the two items have a common neighbour, otherwise False
        (a message is printed in that case).
    """
    # Iterate instead of calling next(): this works whether common_neighbors
    # hands back a generator or a plain collection, and it does not depend on
    # the truthiness of the neighbour itself (a node labelled 0 or "" still
    # counts as a shared neighbour).
    for _ in nx.common_neighbors(graph, item1, item2):
        return True

    print(f"No edge found between the items {item1} & {item2}")
    return False

def get_edge_weight_float(graph, item1, item2):
    """
    Weight of the directed edge item1 -> item2 as a conditional probability.

    item1 -> item 2 : P(item2|item1) = P(item2 and item1)/P(item1)

    Estimated from neighbour counts as
    (number of neighbours shared by item1 and item2) / (number of neighbours of item1).

    Args:
        graph: graph containing both items (the user-item graph).
        item1: source item (the conditioning item).
        item2: target item.

    Returns:
        0 when the items share no neighbour, otherwise a float in (0, 1].
    """
    n_common = len(list(nx.common_neighbors(graph, item1, item2)))
    if n_common == 0:
        return 0

    # n_common > 0 implies item1 has at least one neighbour, so the
    # denominator cannot be zero here.
    n_item1 = len(list(graph.neighbors(item1)))
    return n_common / n_item1
    


In [15]:
def build_item_item_graph(permutations_generator, user_item_graph, edge_weight_fn):
    """Assemble a directed item-item graph from ordered item pairs.

    Args:
        permutations_generator: iterable of (item1, item2) ordered pairs.
        user_item_graph: graph handed to `edge_weight_fn` for every pair.
        edge_weight_fn: callable ``(graph, item1, item2) -> weight``. A falsy
            weight means "no edge"; exactly ``True`` adds an unweighted edge;
            any other truthy value is stored as the edge's ``weight``.

    Returns:
        The resulting `nx.DiGraph`.
    """
    item_graph = nx.DiGraph()

    for source, target in permutations_generator:
        weight = edge_weight_fn(user_item_graph, source, target)

        # Falsy weight (False, 0, 0.0, None): the pair is not connected.
        if not weight:
            continue

        if weight is True:
            item_graph.add_edge(source, target)
        else:
            item_graph.add_edge(source, target, weight=weight)

    return item_graph



def preference_vector(user, user_item_graph):
    """Uniform preference distribution over the items adjacent to `user`.

    The user-item graph carries no edge weights because the model is assumed
    to be ratings-agnostic, so every neighbouring item gets the same mass
    ``1 / degree(user)``.

    Args:
        user: user node in `user_item_graph`.
        user_item_graph: graph exposing `degree` and `neighbors`.

    Returns:
        Dict mapping each neighbouring item to ``1 / degree(user)``.
    """
    nitems = user_item_graph.degree(user)

    preferences = {}
    for item in user_item_graph.neighbors(user):
        preferences[item] = 1 / nitems
    return preferences

In [17]:
%%time 

# Distinct movie ids present in the training split.
ss = train.movieId.unique()
# Lazily enumerate every ordered pair of distinct movies (both directions).
pairs_generator = permutations(ss, r=2)
# Directed item-item graph whose edge weights come from get_edge_weight_float
# evaluated on the user-item graph B.
I = build_item_item_graph(pairs_generator, B, get_edge_weight_float)

CPU times: user 2min 13s, sys: 618 ms, total: 2min 14s
Wall time: 2min 16s


In [18]:
# nx.info() is deprecated and no longer exists in NetworkX 3.0; str(G) gives
# the same "<type> with N nodes and M edges" summary.
str(I)


  nx.info(I)


'DiGraph with 840 nodes and 569114 edges'

In [70]:
%%time 

# Personalised PageRank scores for the first 100 distinct test users.
# NOTE(review): get_page_rank_scores is defined in a later cell of this
# notebook, so on a fresh kernel this cell raises NameError unless that cell
# has been run first.
scores_100 = get_page_rank_scores(test.userId.unique()[:100], I, B)

CPU times: user 2min 14s, sys: 3.87 s, total: 2min 18s
Wall time: 2min 23s


In [71]:
def get_page_rank_scores(users, item_item_graph, user_item_graph):
    """Personalised PageRank scores of unseen items for each user.

    For every user, PageRank is run on `item_item_graph` with the user's
    preference vector as the personalization, and the items the user is
    already connected to in `user_item_graph` are dropped from the result.

    Args:
        users: iterable of user nodes present in `user_item_graph`.
        item_item_graph: graph on which PageRank is computed.
        user_item_graph: graph used for preferences and for the seen items.

    Returns:
        Dict ``{user: {item: score}}`` covering only items that are not
        neighbours of the user.
    """
    scores = {}
    for user in users:
        # Collect the user's items once, as a set, so the filter below is an
        # O(1) lookup instead of rebuilding the neighbour list per candidate.
        seen_items = set(user_item_graph.neighbors(user))
        ranks = nx.pagerank(item_item_graph,
                            personalization=preference_vector(user, user_item_graph))
        scores[user] = {movie: score for movie, score in ranks.items()
                        if movie not in seen_items}
    return scores
    
    
def helper(args):
    """Unpack one argument tuple and forward it to `get_page_rank_scores`.

    Lets a single-argument mapper (such as `executor.map`) call the
    three-argument scorer with one tuple per task.
    """
    return get_page_rank_scores(*args)


def chunks(lst, n):
    """Lazily split `lst` into consecutive slices of length `n`.

    The final slice is shorter when ``len(lst)`` is not a multiple of `n`.
    """
    starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in starts)
        

def merge_dicts(dicts):
    """Merge an iterable of dicts into one new dict; later entries win on key clashes.

    Unlike a `reduce` without an initial value, this returns ``{}`` for an
    empty iterable instead of raising, and it fills a single accumulator
    rather than copying the growing result at every step. The inputs are
    never mutated.
    """
    merged = {}
    for mapping in dicts:
        merged.update(mapping)
    return merged

### Calculate pagerank for all users in the test set using multiprocessing

In [None]:
import multiprocessing as mp

# force=True keeps this cell re-runnable: without it, a second call to
# set_start_method in the same kernel raises RuntimeError because the start
# method has already been fixed.
mp.set_start_method("fork", force=True)

In [105]:

from concurrent.futures import ProcessPoolExecutor

# Pool of four worker processes used by the batch scoring cell.
# NOTE(review): no cell visible here shuts this executor down; consider
# `executor.shutdown()` (or a `with` block) once the results are collected.
executor=ProcessPoolExecutor(max_workers=4)



In [106]:
%%time 

# One task per batch of 100 distinct test users; every task tuple carries both graphs.
args = [(user_batch, I, B) for user_batch in chunks(test.userId.unique(), 100)]
# Materialise the per-batch result dicts in submission order.
out = list(executor.map(helper, args))

# Collapse the per-batch dicts into a single user -> scores mapping.
out_merged = merge_dicts(out)

CPU times: user 40.6 s, sys: 10.2 s, total: 50.8 s
Wall time: 1h 1min 48s


Process ForkProcess-8:
Process ForkProcess-9:
Process ForkProcess-7:
Process ForkProcess-10:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/siddhanttandon/.pyenv/versions/3.8.0/lib/python3.8/multiprocessing/process.py", line 313, in _bootstrap
    self.run()


### Evaluation

In [2]:
from app.core.evaluation import Ndcg

In [27]:
# NDCG evaluator configured with k=10, reading users from 'userId',
# predictions from 'score' and relevance from 'rating'.
ndcg = Ndcg(k=10, user_col='userId', preds_col='score', relevance_col='rating')

In [8]:
def to_dataframe(test_results):
    """Flatten ``{user: {movie: score}}`` into a long frame.

    Args:
        test_results: mapping from user id to a ``{movieId: score}`` dict.

    Returns:
        DataFrame with columns ``userId``, ``movieId`` and ``score``; the
        index is each user's own 0..n-1 row counter, so it is not unique
        across users.
    """
    per_user_frames = {}
    for user, movie_scores in test_results.items():
        frame = pd.DataFrame.from_dict(movie_scores, orient='index', columns=['score'])
        frame = frame.reset_index()
        per_user_frames[user] = frame.rename(columns={'index': 'movieId'})

    # Concatenating a dict puts its keys (the users) into the outer index level.
    stacked = pd.concat(per_user_frames)
    stacked = stacked.rename_axis(['userId', None])
    return stacked.reset_index(level=0)

In [7]:

# Load previously saved scores from disk; this rebinds `out_merged`, replacing
# whatever the parallel scoring cell produced in this session.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
out_merged = pd.read_pickle(os.path.join(ROOT, "app/resources/test_results.p"))


In [9]:
# Long-format frame of the loaded scores (built by to_dataframe).
user_scores = to_dataframe(out_merged)

In [23]:
# Inner join of predicted scores with the full ratings frame on (userId, movieId):
# only scored pairs that also have a rating in `df` are kept.
x = user_scores.merge(df, on=['userId', 'movieId'])

In [30]:
# Mean NDCG across all users.
# NOTE(review): presumably calculate_metrics returns a {user: ndcg} mapping,
# which from_dict(orient='index') turns into a one-column frame -- confirm
# against app.core.evaluation.
pd.DataFrame.from_dict(ndcg.calculate_metrics(x), orient='index').mean()[0]

0.9161496048116643