In [5]:
from pymongo  import MongoClient
from wordcloud import WordCloud
import pandas as pd
import re
import collections
from pathlib import Path
import matplotlib.pyplot as plt
import nltk
nltk.download('punkt')
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import string
import scipy.stats as st
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from sklearn import decomposition, naive_bayes, preprocessing, model_selection, metrics
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import model_selection
from tqdm.notebook import tqdm
#! pip install scikit-surprise
from surprise import NormalPredictor, BaselineOnly, SVD
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/souleymbaye/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Chargement des données AVIS

In [6]:
client = MongoClient(host="localhost", port=27017)
db = client["PLDAC_01"]
collection = db["avis"]

In [7]:
df_avis = pd.DataFrame(list(collection.find())).loc[:,["author","title","note"]]
df_avis.head(5)

Unnamed: 0,author,title,note
0,Monsieur Guillaume,Mariposas,8.0
1,morlockbob,Mariposas,7.0
2,SwatSh,Mariposas,7.0
3,Timi JeuxATheme,Mariposas,8.0
4,prunelles,Mariposas,9.0


In [9]:
df_avis['author'].value_counts().describe()

count    13623.000000
mean        18.096161
std         62.454982
min          1.000000
25%          2.000000
50%          4.000000
75%         11.000000
max       2194.000000
Name: author, dtype: float64

# Suppression des autheurs qui ont noté moins de 5 jeux

In [10]:
print(f"Taille du df avant {len(df_avis)}")
# Taille du df avant 246524

authors = df_avis['author'].value_counts()
authors = authors[authors >= 10].index.to_list()

df_avis_k = df_avis[df_avis['author'].isin(authors)]
print(f"Taille du df après {len(df_avis_k)}")
# Taille du df après 5925

Taille du df avant 246524
Taille du df après 216410


# Test/Train split


In [11]:
X_train, X_test = model_selection.train_test_split(df_avis_k, test_size=0.2, random_state=0)
df_avis.head()

Unnamed: 0,author,title,note
0,Monsieur Guillaume,Mariposas,8.0
1,morlockbob,Mariposas,7.0
2,SwatSh,Mariposas,7.0
3,Timi JeuxATheme,Mariposas,8.0
4,prunelles,Mariposas,9.0


## Basic Recsys **mean** baselines implementations


In [12]:
MEAN = X_train["note"].mean()
USER_MEANS = X_train.groupby("author")["note"].mean()
ITEM_MEANS = X_train.groupby("title")["note"].mean()


def mean_rating_pred(user_item):
    user = user_item["author"]
    item = user_item["title"]
    
    return MEAN

def user_mean_rating_pred(user_item):
    user = user_item["author"]
    item = user_item["title"]
    
    return USER_MEANS.get(user,default=MEAN)

def item_mean_rating_pred(user_item):
    user = user_item["author"]
    item = user_item["title"]
    
    return ITEM_MEANS.get(item,default=MEAN)


In [13]:
#We create the rating prediction columns
X_test["mean_prediction"] = X_test[["author","title"]].apply(mean_rating_pred,axis=1)
X_test["muser_prediction"] = X_test[["author","title"]].apply(user_mean_rating_pred,axis=1) 
X_test["mitem_prediction"] = X_test[["author","title"]].apply(item_mean_rating_pred,axis=1) 

X_test.head(5)

Unnamed: 0,author,title,note,mean_prediction,muser_prediction,mitem_prediction
142099,TS Léodagan,Sticheln,7.1,7.517173,6.317419,7.769048
83445,adamkostka699,Top Ten,9.0,7.517173,8.375,8.28738
168235,Aerth,Small World : Contes et Légendes,10.0,7.517173,7.994565,8.291667
127469,bobdju,Zombie Dice Deluxe,8.0,7.517173,7.475342,6.279638
245872,tedrak,Tiny Epic Galaxies,8.7,7.517173,8.699383,7.787209


## Evaluation metrics

In [14]:
from math import sqrt

def mae(predictions,truth):
    return ((predictions-truth).abs()).mean()

def mse(predictions,truth):
    return ((predictions - truth)**2).mean()

def rmse(predictions,truth):
    return sqrt(mse(predictions,truth))


def all_metrics(predictions,truth):
    return [f(predictions,truth) for f in [mae,mse,rmse]]

In [15]:
metrics = ["mae","mse","rmse"]
results = pd.DataFrame()

results["metrics"] = metrics
results["mean_prediction"] = all_metrics(X_test["mean_prediction"],X_test["note"])
results["muser_prediction"] = all_metrics(X_test["muser_prediction"],X_test["note"])
results["mitem_prediction"] = all_metrics(X_test["mitem_prediction"],X_test["note"])
results = results.set_index("metrics")

print(results)
print("")
print('---Best Models / Metrics: ---')
results.idxmin(1)

         mean_prediction  muser_prediction  mitem_prediction
metrics                                                     
mae             1.688229          1.498049          1.477974
mse             4.430859          3.844892          3.630531
rmse            2.104961          1.960840          1.905395

---Best Models / Metrics: ---


metrics
mae     mitem_prediction
mse     mitem_prediction
rmse    mitem_prediction
dtype: object

In [16]:
## Uncomment this to install required packages if needed (and restart kernel !)
#! pip install --upgrade scikit-surprise

## The baseline model $$ \hat{r}_{ui} = b_{ui} = \mu + b_u + b_i $$

In [17]:
from surprise import NormalPredictor, BaselineOnly, SVD
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(X_test[['author', 'title', 'note']], Reader(rating_scale=(1, 10)))

In [18]:
BaselineModel = BaselineOnly()
BaselineModel.fit(data.build_full_trainset())

Estimating biases using als...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x1307fa100>

In [19]:
def opt_bl_rating_pred(user_item):
    user = user_item["author"]
    item = user_item["title"]
    
    prediction = BaselineModel.predict(user,item)
    
    return prediction.est

X_test["opt_bl_prediction"] = X_test[["author","title"]].apply(opt_bl_rating_pred,axis=1) 

X_test

Unnamed: 0,author,title,note,mean_prediction,muser_prediction,mitem_prediction,opt_bl_prediction
142099,TS Léodagan,Sticheln,7.1,7.517173,6.317419,7.769048,7.671124
83445,adamkostka699,Top Ten,9.0,7.517173,8.375000,8.287380,8.480429
168235,Aerth,Small World : Contes et Légendes,10.0,7.517173,7.994565,8.291667,7.949434
127469,bobdju,Zombie Dice Deluxe,8.0,7.517173,7.475342,6.279638,6.697768
245872,tedrak,Tiny Epic Galaxies,8.7,7.517173,8.699383,7.787209,8.557447
...,...,...,...,...,...,...,...
98862,Guillaume GN,Quartermaster General,7.0,7.517173,8.377953,7.358621,7.557205
228402,Biff,Freight Train,8.0,7.517173,7.301493,9.333333,7.359988
65180,Jones,Legendary: A Marvel Deck Building Game,7.0,7.517173,6.709594,8.533333,7.436350
17005,Kheldane,Call to Adventure (Édition française),6.0,7.517173,7.428571,6.339623,6.965618


## SVD Algorithm $$\hat{r}_{ui} = \mu + b_u + b_i + q_i^Tp_u$$

In [20]:

SVDmodel = SVD()
SVDmodel.fit(data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x139719730>

In [22]:
def svd_rating_pred(user_item):
    user = user_item["author"]
    item = user_item["title"]
    
    prediction = SVDmodel.predict(user,item)
    
    return prediction.est

X_test["svd_prediction"] = X_test[["author","title"]].apply(svd_rating_pred,axis=1) 
X_test

Unnamed: 0,author,title,note,mean_prediction,muser_prediction,mitem_prediction,opt_bl_prediction,svd_prediction
142099,TS Léodagan,Sticheln,7.1,7.517173,6.317419,7.769048,7.671124,7.284926
83445,adamkostka699,Top Ten,9.0,7.517173,8.375000,8.287380,8.480429,9.049167
168235,Aerth,Small World : Contes et Légendes,10.0,7.517173,7.994565,8.291667,7.949434,8.592220
127469,bobdju,Zombie Dice Deluxe,8.0,7.517173,7.475342,6.279638,6.697768,7.542646
245872,tedrak,Tiny Epic Galaxies,8.7,7.517173,8.699383,7.787209,8.557447,8.739248
...,...,...,...,...,...,...,...,...
98862,Guillaume GN,Quartermaster General,7.0,7.517173,8.377953,7.358621,7.557205,7.644311
228402,Biff,Freight Train,8.0,7.517173,7.301493,9.333333,7.359988,7.828017
65180,Jones,Legendary: A Marvel Deck Building Game,7.0,7.517173,6.709594,8.533333,7.436350,7.400083
17005,Kheldane,Call to Adventure (Édition française),6.0,7.517173,7.428571,6.339623,6.965618,6.457121


## Evaluation

In [23]:
metrics = ["mae","mse","rmse"]
results = pd.DataFrame()

results["metrics"] = metrics
results["mean_prediction"] = all_metrics(X_test["mean_prediction"],X_test["note"])
results["muser_prediction"] = all_metrics(X_test["muser_prediction"],X_test["note"])
results["mitem_prediction"] = all_metrics(X_test["mitem_prediction"],X_test["note"])
results["opt_bl_prediction"] = all_metrics(X_test["opt_bl_prediction"],X_test["note"])
results["svd_prediction"] = all_metrics(X_test["svd_prediction"],X_test["note"])

results = results.set_index("metrics")

print(results)
print("")
print('---Best Models / Metrics: ---')
results.idxmin(1)

         mean_prediction  muser_prediction  mitem_prediction  \
metrics                                                        
mae             1.688229          1.498049          1.477974   
mse             4.430859          3.844892          3.630531   
rmse            2.104961          1.960840          1.905395   

         opt_bl_prediction  svd_prediction  
metrics                                     
mae               1.342583        0.800351  
mse               3.003858        1.118855  
rmse              1.733164        1.057759  

---Best Models / Metrics: ---


metrics
mae     svd_prediction
mse     svd_prediction
rmse    svd_prediction
dtype: object

## Visualizing learnt embeddings

>-

In [24]:


def save_embeddings(embs,dict_label,path="saved_word_vectors"):
    """
    embs is Numpy.array(N,size)
    dict_label is {str(word)->int(idx)} or {int(idx)->str(word)}
    """
    def int_first(k,v):
        if type(k) == int:
            return (k,v)
        else:
            return (v,k)

    np.savetxt(f"{path}_vectors.tsv", embs, delimiter="\t")

    #labels 
    if dict_label:
        sorted_labs = np.array([lab for idx,lab in sorted([int_first(k,v) for k,v in dict_label.items()])])
        print(sorted_labs)
        with open(f"{path}_metadata.tsv","w") as metadata_file:
            for x in sorted_labs: #hack for space
                if len(x.strip()) == 0:
                    x = f"space-{len(x)}"
                    
                metadata_file.write(f"{x}\n")

In [None]:
titleCSV = pd.read_csv("dataset/movies.csv")
titleCSV.head(5)

In [None]:
id2title = titleCSV[["movieId","title"]].set_index("movieId").to_dict()["title"]
list(id2title.items())[:10]

In [None]:
full_data = data.build_full_trainset()
index2movie = {x:id2title[full_data.to_raw_iid(x)] for x in full_data.all_items()}
SVDmodel.qi # Holds product vectors
SVDmodel.pu # Holds user vectors