In [1]:
from pymongo  import MongoClient
import pandas as pd
import numpy as np
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from sklearn import decomposition, naive_bayes, preprocessing, model_selection, metrics
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import model_selection
#! pip install scikit-surprise
from surprise import NormalPredictor, BaselineOnly, SVD
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate

# Chargement des données AVIS

In [2]:
client = MongoClient(host="localhost", port=27017)
db = client["PLDAC_01"]
collection = db["avis"]

In [3]:
df_avis = pd.DataFrame(list(collection.find())).loc[:,["author","title","note"]]
df_avis.head(5)

Unnamed: 0,author,title,note
0,Monsieur Guillaume,Mariposas,8.0
1,morlockbob,Mariposas,7.0
2,SwatSh,Mariposas,7.0
3,Timi JeuxATheme,Mariposas,8.0
4,prunelles,Mariposas,9.0


In [4]:
num_users  = df_avis["author"].nunique()
num_items  = df_avis["title"].nunique()

print(f"there are {num_users} users and {num_items} items")

there are 13623 users and 10709 items


## Suppression des jeux qui ont été notés moins de 10 fois

In [113]:
print(f"Taille du df avant {len(df_avis)}")
# Taille du df avant 246524

titles = df_avis['title'].value_counts()
titles = titles[titles >= 50].index.to_list()

df_avis_k = df_avis[df_avis['title'].isin(titles)]
print(f"Taille du df après {len(df_avis_k)}")
# Taille du df après 5925

Taille du df avant 246524
Taille du df après 169394


In [114]:
# on elimine les lignes où un author a note plusieurs fois un jeu
# on fait la note moyenne

df_avis_k_unique = df_avis_k.groupby(['author','title'])['note'].mean().to_dict()
avis_un = []

for at,note in df_avis_k_unique.items():
  author,title = at
  avis_un.append((author,title,note))

df_avis_un = pd.DataFrame(avis_un,columns=df_avis_k.columns)
df_avis_un.head()

Unnamed: 0,author,title,note
0,#yuyu#,Bubblee Pop,8.5
1,#yuyu#,KARMAKA,8.8
2,$hadow,Le Dilemme du Roi,9.0
3,$lebat@$,Deluxe Camping,10.0
4,*FitzChevalerie*,"7 Wonders - Extension ""Cities""",8.0


## Test/Train split

In [115]:

X_train, X_test = model_selection.train_test_split(df_avis_un, test_size=0.2, random_state=0)
df_avis.head()

Unnamed: 0,author,title,note
0,Monsieur Guillaume,Mariposas,8.0
1,morlockbob,Mariposas,7.0
2,SwatSh,Mariposas,7.0
3,Timi JeuxATheme,Mariposas,8.0
4,prunelles,Mariposas,9.0


## Mean Reciprocal Rank 

$$ MRR = \frac{1}{|Q|}\sum^{|Q|}_{i=1}\frac{1}{\text{rank}_i} $$

In [116]:
def rr(list_items):
    relevant_indexes = np.asarray(list_items).nonzero()[0]
    
    if len(relevant_indexes) > 0:
        
        #NOTE:
        # relevant_indexes[0] <= Contains the index of the 1st relevant item ([0,0,1] => 2)
        
        return 1 / (relevant_indexes[0] + 1)
    else:
        return 0

def mrr(list_list_items):
    rr_list = [rr(list_items) for list_items in list_list_items]
    return np.mean(rr_list)

## Discounted Cumulative Gain

$$DCG_p = \sum^p_{i=1}\frac{rel_i}{\log_2{(i+1)}} = rel_1 + \sum^p_{i=2}\frac{rel_i}{\log_2{(i+1)}}$$

In [117]:
def dcg_at_k(r, k):
    """Score is discounted cumulative gain (dcg)
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
        
    """
    r = np.asfarray(r)[:k]
    if r.size:
        return r[0] +  np.sum(r[1:] / np.log2(np.arange(3, r.size + 2)))
        
    return 0.

$$ nDCG_p = \frac{DCG_p}{IDCG_p} $$


$$ IDCG_p = max(DCG_p) $$

In [118]:
def ndcg_at_k(r, k):
    """
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
    """
    dcg_max =  np.max(dcg_at_k(sorted(r)[::-1], k))
    if not dcg_max:
        return 0.
    return dcg_at_k(r, k)/dcg_max

- `already_seen`: Items that were already seen by users. This is for training and not recommending them again
- `ground_truth`: Items that will be seen and liked (rating >= 10) by users. This is our ground truth to evaluate our predictions.

In [119]:
already_seen = (
    X_train
    .groupby("author")["title"]
    .apply(list)
    .to_dict()
    )

ground_truth = (
    X_test[X_test.note >= 10] 
    .groupby("author")["title"]
    .apply(list)
    .to_dict()
    )

### We also need the set of all items that can be recommended

In [120]:
existing_items = set(X_train["title"].unique())
print("The recommender system will have to pick a few items from",len(existing_items),"possible items")

The recommender system will have to pick a few items from 1324 possible items


#  Surprise SVD recommender

In [121]:
from surprise import Dataset

data = Dataset.load_from_df(X_train[['author', 'title', 'note']], Reader(rating_scale=(1, 10)))
model = SVD()
model.fit(data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x13ebb39d0>

In [122]:
def svd_rating_pred(user_item):
    user = user_item["author"]
    item = user_item["title"]
    
    prediction = model.predict(user,item)
    
    return prediction.est

X_test["svd_prediction"] = X_test[["author","title"]].apply(svd_rating_pred,axis=1) 

In [123]:
mse = ((X_test["note"] - X_test["svd_prediction"])**2).mean()
mae = ((X_test["note"] - X_test["svd_prediction"]).abs()).mean()

print(f"MSE: {mse} -- MAE: {mae}")

# MSE: 2.3801853240165443 -- MAE: 1.085783922356087

MSE: 3.38992377623618 -- MAE: 1.3984831743531199


In [124]:
def model_rating_pred(model,user,item):
    prediction = model.predict(user,item)
    return prediction.est

###  the relevance list for our MRR function

In [125]:
list_of_rel = []
    

for user,will_see in ground_truth.items():
    rel_list = []
    will_see = set(will_see)
    if user not in already_seen:
        continue # si l'utilisateur n'apparait pas dans Xtrain
    has_seen = set(already_seen[user])
    can_see = [(mid,model_rating_pred(model,user,mid)) for mid in existing_items - has_seen]
    
    
    for movie,score in reversed(sorted(can_see,key=lambda x:x[1])):
        if movie in will_see:
            rel_list.append(1)
            break
        else:
            rel_list.append(0)        
    rel_list[-1] = 1 # when no relevant item exist
    list_of_rel.append(rel_list)
    

svd_mrr = mrr(list_of_rel)

In [126]:
f"On average, the {int(round(1/svd_mrr,0))}th proposed item is relevant (on {len(existing_items)})"

'On average, the 36th proposed item is relevant (on 1324)'

En éliminant les utilisateurs ayant noter moins de 500 jeux nous avons une mrr de 8 sur 5546. </br>

En éliminant les utilisateurs ayant noter moins de 100 jeux nous avons une mrr de 13 sur 8497. </br>

En éliminant les utilisateurs ayant noter moins de 10 jeux nous avons une mrr de 27 sur 688.

# Implicit baseline: popular items

In [127]:
item_counts = X_train.groupby('title')["note"].count().sort_values(ascending=False)
popular_item_list = item_counts.index.tolist()

In [128]:
print(len(popular_item_list))

1324


In [129]:
print(popular_item_list[:10])

['Shogun', 'Pandemic Legacy Saison 1', 'Cyclades', 'Dixit', 'Le Roi des Nains', 'Skull', 'Snow Tails', 'Die Baumeister: Mittelalter', 'Non Merci', 'Mille Bornes - Fun & Speed - Voyage']


### popular recommendation relevance list per user

In [130]:
list_of_rel = []

for user,will_see in ground_truth.items():
    rel_list = []
    will_see = set(will_see)
    if user not in already_seen:
        continue
    has_seen = set(already_seen[user])
    
    for movie in popular_item_list:
        if movie in has_seen:         # User has already seen movie -> Can filter prediction
            continue
        elif movie in will_see:       # User will see, spot on suggestion !         
            rel_list.append(1) # To Complete
            break
        else:                         # No clue.
            rel_list.append(0) # To Complete
            
    if rel_list[-1] == 1:             # when no relevant item exist, no need to take it into account.
        list_of_rel.append(rel_list)

In [131]:
pop_mrr = mrr(list_of_rel)
f"On average, the {int(round(1/pop_mrr,0))}th proposed item is relevant (on {len(existing_items)})"

'On average, the 40th proposed item is relevant (on 1324)'

En éliminant les utilisateurs ayant noter moins de 100 jeux nous avons une mrr de 13 sur 8497. </br>

En éliminant les utilisateurs ayant noter moins de 10 jeux nous avons une mrr de 49 sur 9786.

# Implicit Collaborative Filtering 

## Interaction train/test dataset within the framework

In [132]:
from lightfm.data import Dataset

# (a) Create a dataset
dataset = Dataset()


# (b) Create an internal mapping for users and items (We need to consider train + test)
dataset.fit((x for x in df_avis_k["author"]),
            (x for x in df_avis_k["title"]))

# (c) Create the interaction matrices
(train_interactions, weights) = dataset.build_interactions(
    ((x.author, x.title) for x in X_train.itertuples() if x.note >= 10) # We only consider 5's as interactions
) 
(test_interactions, weights) = dataset.build_interactions(
    ((x.author, x.title) for x in X_test.itertuples() if x.note >= 10)  # We only consider 5's as interactions
) 

In [133]:
print(f"train interactions: {train_interactions.shape}")
print(f"test interactions : {test_interactions.shape}")

train interactions: (10876, 1324)
test interactions : (10876, 1324)


In [134]:
print(f"train interactions: {sum(X_train['note']>=10)}")
print(f"train interactions: {sum(X_test['note']>=10)}")


train interactions: 19703
train interactions: 4984


## Train the lightFM model

In [135]:
from lightfm import LightFM

model = LightFM(loss='bpr',random_state=50000)
model.fit(train_interactions)

<lightfm.lightfm.LightFM at 0x141455af0>

In [136]:
from lightfm.evaluation import reciprocal_rank
bpr_mrr = reciprocal_rank(model, test_interactions, train_interactions).mean()

In [137]:
f"On average, the {int(round(1/bpr_mrr,0))}th proposed item is relevant (on {len(existing_items)})"

'On average, the 59th proposed item is relevant (on 1324)'

En éliminant les utilisateurs ayant noter moins de 500 jeux nous avons une mrr de 26 sur 5546. </br>

En éliminant les utilisateurs ayant noter moins de 100 jeux nous avons une mrr de 14 sur 8497. </br>

En éliminant les utilisateurs ayant noter moins de 10 jeux nous avons une mrr de 56 sur 9786.

## We consider EVERY rating as one interaction

In [138]:
# Create the interaction matrix
(train_interactions_all, weights) = dataset.build_interactions(
    ((x.author, x.title) for x in X_train.itertuples()) # We only consider 5's as interactions
) 
(test_interactions_all, weights) = dataset.build_interactions(
    ((x.author, x.title) for x in X_test.itertuples())  # We only consider 5's as interactions
) 

from lightfm import LightFM


model_bpr_all = LightFM(loss='bpr',random_state=50000)
model_bpr_all.fit(train_interactions_all)

bpr_mrr_all = reciprocal_rank(model_bpr_all, test_interactions_all, train_interactions_all).mean()

In [139]:
f"On average, the {int(round(1/bpr_mrr_all,0))}th proposed item is relevant (on {len(existing_items)})"

# 'On average, the 34th proposed item is relevant (on 9819)'

'On average, the 47th proposed item is relevant (on 1324)'

En éliminant les utilisateurs ayant noter moins de 100 jeux nous avons une mrr de 3 sur 5546. </br>

En éliminant les utilisateurs ayant noter moins de 100 jeux nous avons une mrr de 6 sur 8497. </br>

En éliminant les utilisateurs ayant noter moins de 10 jeux nous avons une mrr de 34 sur 9786.