In [2]:
import os
import shutil
import sys

import json

import numpy as np
from scipy import sparse
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sn
sn.set()

import pandas as pd
import implicit

from tqdm.notebook import tqdm

### Load data

In [263]:
# Just choose the name of the dataset directory
dataset  = 'ml-20m'
DATA_DIR = '/Users/tomas/Documents/FEUP/Tese/data/' + dataset
PARSE_DATA_DIR = os.path.join(DATA_DIR, 'processed_70_10_20')

In [264]:
unique_sid = list()
with open(os.path.join(PARSE_DATA_DIR, 'unique_sid.txt'), 'r') as f:
    for line in f:
        unique_sid.append(line.strip())

n_items = len(unique_sid)

In [265]:
def load_csv_data_to_sparse(csv_file):
    tp = pd.read_csv(csv_file)
    n_users = tp['uid'].max() + 1

    rows, cols = tp['uid'], tp['sid']
    data = sparse.csr_matrix((np.ones_like(rows),
                             (rows, cols)), dtype='float64',
                             shape=(n_users, n_items))
    return data, n_users

In [269]:
#TRAIN
#user - items
train_data, n_users = load_csv_data_to_sparse(os.path.join(PARSE_DATA_DIR, 'train.csv'))

In [267]:
train_data.shape

(129757, 11518)

In [283]:
df = pd.read_csv(os.path.join(PARSE_DATA_DIR, 'train.csv'))

In [285]:
df.loc[df['movieId'] == 84137]

Unnamed: 0,uid,sid
0,0,997
1,0,2700
2,0,3619
3,0,144
4,0,1273


In [281]:
#VALIDATION
#user - items
validation_data = pd.read_csv(os.path.join(PARSE_DATA_DIR, 'validation.csv'))

In [226]:
#TEST
#user - items
test_data = pd.read_csv(os.path.join(PARSE_DATA_DIR, 'test.csv'))

In [227]:
t = pd.read_csv(os.path.join(PARSE_DATA_DIR, 'train.csv'))
df = t.groupby('uid').size().reset_index(name='cont')
df = df.drop('uid', 1)
df['colFromIndex'] = df.index
df = df.sort_values(by=['cont', 'colFromIndex'], ascending=True)
df.head()

Unnamed: 0,cont,colFromIndex
8,7,8
19,7,19
33,7,33
35,7,35
43,7,43


In [228]:
t.head()

Unnamed: 0,uid,sid
0,0,997
1,0,2700
2,0,3619
3,0,144
4,0,1273


### Evaluation

In [14]:
def get_score(eval_df, pred, k, algo):
    """Compute the metrics for all the users in pred and compares it with the eval_df. 
    Args:
        eval_df: pandas df with interaction format (uid,sid). It will be used to compare the with the prediction  
        pred: numpy array with the predicions for each user
        k: Number of results to consider
        algo: Name of the algorithm
        
    Returns:
        Discounted cumulative gain
    """
    
    algo = algo + "_ndcg"
    n_users = eval_df['uid'].max() + 1
    grouped = eval_df.groupby('uid')

    pbar = tqdm()
    pbar.reset(total=n_users)
    


    df = pd.DataFrame(columns=['uid', algo]) 

    for i, (_, group) in enumerate(grouped):

        relevance = 1*np.isin(pred[i],group['sid'])
        ndcg = ndcg_at_k(relevance, k)
        df = df.append({'uid': i,algo: ndcg }, ignore_index=True)
        pbar.update()


    pbar.refresh()
    print("--- ", algo, " STATS ---")
    print("mean: ", df[algo].mean())
    print("max: ", df[algo].max())
    print("min: ", df[algo].min())
    return df

In [15]:
def dcg_at_k(r, k, method=0):
    """Score is discounted cumulative gain (dcg)
    Relevance is positive real values.  Can use binary
    as the previous methods.
    Example from
    http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
    >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]
    >>> dcg_at_k(r, 1)
    3.0
    >>> dcg_at_k(r, 1, method=1)
    3.0
    >>> dcg_at_k(r, 2)
    5.0
    >>> dcg_at_k(r, 2, method=1)
    4.2618595071429155
    >>> dcg_at_k(r, 10)
    9.6051177391888114
    >>> dcg_at_k(r, 11)
    9.6051177391888114
    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
        method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]
                If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]
    Returns:
        Discounted cumulative gain
    """
    r = np.asfarray(r)[:k]
    if r.size:
        if method == 0:
            return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
        elif method == 1:
            return np.sum(r / np.log2(np.arange(2, r.size + 2)))
        else:
            raise ValueError('method must be 0 or 1.')
    return 0.
 

In [16]:
def ndcg_at_k(r, k=100, method=0):
    """Score is normalized discounted cumulative gain (ndcg)
    Relevance is positive real values.  Can use binary
    as the previous methods.
    Example from
    http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
    >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]
    >>> ndcg_at_k(r, 1)
    1.0
    >>> r = [2, 1, 2, 0]
    >>> ndcg_at_k(r, 4)
    0.9203032077642922
    >>> ndcg_at_k(r, 4, method=1)
    0.96519546960144276
    >>> ndcg_at_k([0], 1)
    0.0
    >>> ndcg_at_k([1], 2)
    1.0
    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
        method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]
                If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]
    Returns:
        Normalized discounted cumulative gain
    """
    dcg_max = dcg_at_k(sorted(r, reverse=True), k, method)
    if not dcg_max or dcg_max == 0:
        return 0.
    return dcg_at_k(r, k, method) / dcg_max

## Modelling

### ALS

In [229]:
NUM_THREADS = 0
ALS_TOP_N = 20
ALS_Factors = 50

In [230]:
als_model = implicit.als.AlternatingLeastSquares(factors=ALS_Factors,calculate_training_loss=True)

In [231]:
als_model.fit(train_data.T)

HBox(children=(IntProgress(value=0, max=15), HTML(value='')))




In [232]:
als_pred = als_model.recommend_all(user_items=train_data, N=ALS_TOP_N)

HBox(children=(IntProgress(value=0, max=129757), HTML(value='')))




In [233]:
als_results_validation = get_score(validation_data,als_pred,ALS_TOP_N, 'als')

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

---  als_ndcg  STATS ---
mean:  0.2729638592463805
max:  1.0
min:  0.0


In [234]:
als_results_test = get_score(test_data,als_pred,ALS_TOP_N, 'als')

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

---  als_ndcg  STATS ---
mean:  0.21386842966922898
max:  1.0
min:  0.0


## Comparison

| Algorithm | Parameters | NDCG TEST mean |
| --- | --- | --- |
| ALS | TOP_N = 20 Factors = 100 | 0.17689 |


### Bayesian Personalized Ranking

In [235]:
BPR_TOP_N = 20
BPR_Factors = 100

In [236]:
model_bpr = implicit.bpr.BayesianPersonalizedRanking(factors=BPR_Factors)

In [237]:
model_bpr.fit(train_data.T)

HBox(children=(IntProgress(value=0), HTML(value='')))




In [238]:
pred_bpr = model_bpr.recommend_all(user_items=train_data, N=BPR_TOP_N)

HBox(children=(IntProgress(value=0, max=129757), HTML(value='')))




In [239]:
results_validation_bpr = get_score(validation_data, pred_bpr, BPR_TOP_N, 'bpr')

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

---  bpr_ndcg  STATS ---
mean:  0.21130273550454912
max:  1.0
min:  0.0


In [240]:
results_test_bpr = get_score(test_data, pred_bpr, BPR_TOP_N, 'bpr')

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

---  bpr_ndcg  STATS ---
mean:  0.16758552343923036
max:  1.0
min:  0.0


In [241]:
total_results =  pd.merge(als_results_test, results_test_bpr, on='uid')

In [242]:
total_results.head()

Unnamed: 0,uid,als_ndcg,bpr_ndcg
0,0.0,0.0,0.0
1,1.0,0.450584,0.0
2,2.0,0.0,0.231378
3,3.0,0.0,0.0
4,4.0,0.279651,0.0


### Logistic Matrix Factorization

In [243]:
LMF_TOP_N = 20
LMF_FACTORS = 20
#we are going to use the standard factors

In [244]:
model_LMF = implicit.lmf.LogisticMatrixFactorization(factors=LMF_FACTORS)

In [245]:
model_LMF.fit(train_data.T)

HBox(children=(IntProgress(value=0, max=30), HTML(value='')))




In [246]:
pred_LMF = model_LMF.recommend_all(user_items=train_data, N=LMF_TOP_N)

HBox(children=(IntProgress(value=0, max=129757), HTML(value='')))




In [247]:
results_validation_lmf = get_score(validation_data, pred_LMF, LMF_TOP_N, 'lmf')

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

---  lmf_ndcg  STATS ---
mean:  0.1605447040832025
max:  1.0
min:  0.0


In [248]:
results_test_lmf = get_score(test_data, pred_LMF, LMF_TOP_N, 'lmf')

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

---  lmf_ndcg  STATS ---
mean:  0.13301901612493933
max:  1.0
min:  0.0


In [249]:
total_results = pd.merge(total_results, results_test_lmf, on='uid')

In [250]:
total_results.head()

Unnamed: 0,uid,als_ndcg,bpr_ndcg,lmf_ndcg
0,0.0,0.0,0.0,0.0
1,1.0,0.450584,0.0,0.0
2,2.0,0.0,0.231378,0.0
3,3.0,0.0,0.0,0.0
4,4.0,0.279651,0.0,0.0


### Baseline Most Popular

In [251]:
most_popular_TOP_N = 20

In [252]:
def get_most_popular(train_data, k=100):
    """Compute recommendations for all users based on the baseline most popular  
        Args:
            train_data: user_items matrix. it will be used to filter the items already clicked  
            pred: numpy array with the predicions for each user
            k: Number of results to consider

        Returns:
        np.array with the recommendations for all users
    """
    print('-- Start --')
    train_array = np.array(train_data.toarray())
    print('-- converted train_data to np.array --')
    
    #get the item popularity saved in a json file
    with open(os.path.join(PARSE_DATA_DIR, 'map_item_pop.json')) as f:
        map_item_pop = json.load(f)
    print('-- read json with item popularity --')
    
    #parse map_item_pop to dataframe may be useful, get the popularity as a np.array  
    df_pop = pd.DataFrame(list(map_item_pop.items()),columns = ['sid','pop'])
    item_popularity_np = np.array(df_pop['pop'])
    print('-- created np.array with the item popularity --')

    #invert the sparse matrix, in this way we can clean which items did a user already clicked
    not_train_array = 1*np.logical_not(train_array)
    print('-- inverted train_np_array --')

    #Compute a np array per user with the amount of each item was clicked 
    #in this way we can sort and with the indices get the most popular for each user
    clicks_item_per_user = np.multiply(not_train_array,item_popularity_np)
    not_train_array = None
    item_popularity_np = None
    train_array = None    
    print('-- multiplied inverted array * item popularity --')


    #prepare variables
    recommendations_list = []
    n_users = clicks_item_per_user.shape[0]
    
    print('-- gets the recommendations for all users --')
    #start progress bar
    pbar = tqdm()
    pbar.reset(total=n_users)

    for i in range(0,n_users):
        # for each user get the most clicked s_id 
        sorted_indices_array = np.argsort(clicks_item_per_user[i])
        top_k = sorted_indices_array[-k:]
        recommendations_list.append(top_k[::-1])
        pbar.update()


    recommendations_np = np.array(recommendations_list)
    pbar.refresh()
    print('-- Im finished master --')
    return recommendations_np

In [253]:
pred_most_popular = get_most_popular(train_data=train_data, k=most_popular_TOP_N)

-- Start --
-- converted train_data to np.array --
-- read json with item popularity --
-- created np.array with the item popularity --
-- inverted train_np_array --
-- multiplied inverted array * item popularity --
-- gets the recommendations for all users --


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

-- Im finished master --


In [254]:
results_test_most_popular = get_score(test_data, pred_most_popular, most_popular_TOP_N, 'most_popular')

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

---  most_popular_ndcg  STATS ---
mean:  0.1704757587311793
max:  1.0
min:  0.0


In [255]:
total_results = pd.merge(total_results, results_test_most_popular, on='uid')

In [256]:
total_results.head(10)

Unnamed: 0,uid,als_ndcg,bpr_ndcg,lmf_ndcg,most_popular_ndcg
0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.450584,0.0,0.0,0.315465
2,2.0,0.0,0.231378,0.0,0.0
3,3.0,0.0,0.0,0.0,0.0
4,4.0,0.279651,0.0,0.0,0.371962
5,5.0,0.235409,0.311131,0.277639,0.0
6,6.0,0.331027,0.0,0.0,0.242704
7,7.0,0.333043,0.315465,0.30103,0.0
8,8.0,0.289065,0.63093,0.0,0.235409
9,9.0,0.0,0.0,0.0,0.278943


### KNN Item-Item

In [257]:
knn_K = 10
cos = implicit.nearest_neighbours.CosineRecommender(K=knn_K, num_threads=NUM_THREADS)
tfidf = implicit.nearest_neighbours.TFIDFRecommender(K=knn_K, num_threads=NUM_THREADS)
bm25 = implicit.nearest_neighbours.BM25Recommender(K=knn_K, num_threads=NUM_THREADS)

models = [cos, tfidf, bm25]

In [258]:
for model in models:
    model.fit(train_data.T)

HBox(children=(IntProgress(value=0, max=11518), HTML(value='')))




HBox(children=(IntProgress(value=0, max=11518), HTML(value='')))




HBox(children=(IntProgress(value=0, max=11518), HTML(value='')))




In [259]:
def compute_knn_recomendations(train_data, model, TOP_N=100):
    recommendations_list = []
    n_users = train_data.shape[0]

    pbar = tqdm()
    pbar.reset(total=n_users)
    for i in range(0,n_users):
        user_recomendations_tuples = model.recommend(i,train_data, N=TOP_N)
        user_recomendations_sid = [t[0] for t in user_recomendations_tuples]
        recommendations_list.append(user_recomendations_sid)
        pbar.update()

    pbar.refresh()
    recomendations_np = np.array(recommendations_list)
    return recomendations_np

In [260]:
pbar = tqdm()
pbar.reset(total=len(models))

for model, name in zip(models, ['knn_cos', 'knn_tfidf', 'knn_bm25']):
    pred_knn = compute_knn_recomendations(train_data, model)
    results_knn = get_score(test_data, pred_knn, knn_K, name)
    print("model finished")
    pbar.update()
    total_results = pd.merge(total_results, results_knn, on='uid')
    
pbar.refresh()

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

---  knn_cos_ndcg  STATS ---
mean:  0.1066076822756109
max:  1.0
min:  0.0
model finished


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

---  knn_tfidf_ndcg  STATS ---
mean:  0.10280660096982122
max:  1.0
min:  0.0
model finished


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

---  knn_bm25_ndcg  STATS ---
mean:  0.07123673939645861
max:  1.0
min:  0.0
model finished


True

In [261]:
total_results.head()

Unnamed: 0,uid,als_ndcg,bpr_ndcg,lmf_ndcg,most_popular_ndcg,knn_cos_ndcg,knn_tfidf_ndcg,knn_bm25_ndcg
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.157732
1,1.0,0.450584,0.0,0.0,0.315465,0.425859,0.316745,0.0
2,2.0,0.0,0.231378,0.0,0.0,0.0,0.0,0.215338
3,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.0,0.279651,0.0,0.0,0.371962,0.108618,0.0,0.0


In [262]:
#this is a file in each columns has the evaluation ndcg for each algorithm
total_results.to_csv(os.path.join(PARSE_DATA_DIR, 'total_results_k_20.csv'), index=False)