In [1]:
import os 
import pandas as pd 
import numpy as np 
import networkx as nx 
from itertools import permutations
from math import factorial
from sklearn.model_selection import train_test_split
from functools import reduce




# Locate the project root from the ROOT_FOLDER environment variable. Fail fast
# with an explicit message: when the variable is unset os.getenv returns None,
# and os.path.join(None, ...) would otherwise raise an opaque TypeError.
ROOT = os.getenv('ROOT_FOLDER')
if ROOT is None:
    raise RuntimeError("ROOT_FOLDER environment variable is not set; "
                       "it must point to the project root")

# Read only the four columns used below, then parse `timestamp` into datetimes
# and cast `movieId` to str.
df = (
    pd.read_csv(os.path.join(ROOT, "app/resources/rating.csv"),
                usecols=['userId', 'movieId', 'timestamp', 'rating'])
    .assign(timestamp=lambda frame: pd.to_datetime(frame.timestamp),
            movieId=lambda frame: frame.movieId.astype(str))
)

### Subsample and train/test split

In [2]:
from app.core.processing import SubSampler, Splitter, ColumnsDropper

# Pre-processing steps: configured here, applied in the following cell.
sampler = SubSampler(
    userCol='userId',
    itemCol='movieId',
    size=0.10,
    frequency_thresh=10,
    seed=42,
)
splitter = Splitter(
    test_size=0.2,
    itemCol='movieId',
)
# Removes the `rating` column from whatever frame it is applied to.
dropper = ColumnsDropper(
    cols_to_drop=['rating'],
)

In [3]:
# Sub-sample the full ratings frame.
df_sample = sampler.apply(df)
# Split the sample into a train frame and a test frame.
train, test = splitter.split(df_sample)
# Training input for the estimator: the train frame passed through the
# dropper (configured with cols_to_drop=['rating']); `train` itself is kept intact.
X_train = dropper.apply(train)

In [4]:
# (rows, columns) of the train and test frames.
train.shape, test.shape

((124316, 4), (31080, 4))

In [5]:
# Number of distinct users and distinct movies in the training frame,
# shown as the shapes of the two unique-value arrays.
train['userId'].unique().shape, train['movieId'].unique().shape

((4941,), (852,))

In [6]:
# Users present in test but absent from train; an empty set means every
# test user also appears in the training frame.
set(test.userId) - set(train.userId)

set()

In [7]:
# Movies present in test but absent from train; an empty set means every
# test movie also appears in the training frame.
set(test.movieId) - set(train.movieId)

set()

In [8]:
# Count the distinct (user, movie) pairs of test that never occur in train.
# When this equals the number of test rows, no test interaction leaked into train.
len(
    set(zip(test.userId, test.movieId))
    - set(zip(train.userId, train.movieId))
)

31080

### Fit & predict

In [9]:
from app.core.estimator import Estimator

In [10]:
%%time 

# Fit on X_train, i.e. the train frame after the `rating` column dropper was applied.
e = Estimator()
# The value returned by `fit` is the object used for prediction further down.
model = e.fit(X_train)

CPU times: user 2min 27s, sys: 912 ms, total: 2min 28s
Wall time: 2min 33s


In [11]:
# Keep all test rows that belong to the first 10 distinct users of the test
# frame (order of appearance), to keep prediction and evaluation quick.
sample_test = test.loc[test.userId.isin(test.userId.unique()[:10])]

In [12]:
# Score the sampled test rows with the fitted model.
preds = model.predict(sample_test)

In [14]:
# Preview the first rows of the prediction frame.
preds.head()

Unnamed: 0,userId,movieId,score,rating,timestamp
0,95501,608,0.000204,4.0,2007-12-09 03:41:00
1,95501,1676,0.000347,4.5,2008-03-06 11:08:07
2,95501,904,0.00033,3.5,2007-12-09 02:46:13
3,95501,1884,0.000526,2.5,2008-07-27 12:48:02
4,95501,8910,0.000674,3.0,2007-12-09 03:00:13


### Evaluate

In [15]:
from app.core.evaluation import Ndcg

In [16]:
# Evaluator configuration: cut-off k=10, with the columns of `preds` that hold
# the user id, the model score and the ground-truth rating.
ndcg = Ndcg(
    k=10,
    user_col='userId',
    preds_col='score',
    relevance_col='rating',
)

In [17]:
# Metric per user: the result is a dict keyed by userId.
ndcg.calculate_metrics(preds)

{19621: 0.8231952626070935,
 22495: 0.9555001783072599,
 22697: 0.9462257242770947,
 31244: 0.6844802528659866,
 36460: 0.942467349171208,
 87678: 0.9239109680111085,
 95501: 0.8006641055145592,
 95581: 0.9492147449622255,
 112309: 0.8829458575408894,
 127167: 0.6888651962806803}

In [18]:
# Average of the per-user scores. calculate_metrics returns a {userId: score}
# dict, so a Series can average it directly; this avoids building a one-column
# DataFrame and the `[0]` label lookup, which raises KeyError on an empty result.
pd.Series(ndcg.calculate_metrics(preds)).mean()

0.8597469639538107