In [66]:
import random

import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

## Import des données

In [67]:
DATA_PATH = 'data/processed/'

In [68]:
interactions_df = pd.read_csv(DATA_PATH + 'interactions_df.csv', index_col=0)
interactions_df.head()

Unnamed: 0,user_id,article_id,click
0,3,233769,1
1,3,234686,1
2,3,235665,1
3,3,236065,1
4,3,236294,1


In [69]:
articles_min_df = pd.read_csv(DATA_PATH + 'articles_min.csv', index_col=0)
articles_min_df.head()


Unnamed: 0_level_0,category_id,created_at_ts,words_count,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,...,embedding_240,embedding_241,embedding_242,embedding_243,embedding_244,embedding_245,embedding_246,embedding_247,embedding_248,embedding_249
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
319488,433,2017-09-29 15:47:18,157,-0.466458,-0.964679,-0.485105,-0.877867,0.391511,0.483561,0.00548,...,-0.814141,-0.442375,0.039461,0.395648,-0.747964,-0.382282,-0.63284,-0.387871,-0.056827,-0.199358
360463,455,2017-10-01 10:01:03,176,-0.602885,-0.961464,-0.565603,-0.649518,-0.559716,0.524595,-0.3701,...,-0.585967,-0.653178,-0.195667,0.249913,0.331547,0.119053,0.659684,-0.244358,-0.151648,0.401354
360465,455,2017-10-01 13:13:54,114,-0.205855,-0.957574,-0.878344,-0.43189,-0.693044,0.168474,-0.38585,...,-0.557692,-0.653369,-0.120972,-0.227757,0.475501,0.44649,0.455878,-0.210448,-0.116186,0.585806
106520,228,2017-08-11 21:11:33,112,0.505829,-0.931578,-0.277545,0.383692,-0.603157,0.135672,-0.13724,...,0.267274,-0.060965,0.684367,-0.198938,-0.601258,0.362214,0.370468,0.36978,0.006491,0.607185
180261,301,2016-11-19 13:30:13,199,-0.631314,-0.972719,0.554279,0.087726,0.090802,0.394112,0.05757,...,0.300368,-0.433697,-0.092218,-0.694479,0.208018,-0.244026,0.054419,-0.172014,0.355977,-0.357948


## Split

In [70]:
interactions_train_df, interactions_test_df = train_test_split(interactions_df,
                                                               stratify=interactions_df['user_id'], 
                                                               test_size=0.20,
                                                               random_state=42)

print('# interactions on Train set: %d' % len(interactions_train_df))
print('# interactions on Test set: %d' % len(interactions_test_df))

# interactions on Train set: 17121
# interactions on Test set: 4281


### Création de rating

In [71]:
interactions_train_df

Unnamed: 0,user_id,article_id,click
7235,5832,284463,1
13152,11901,59758,1
16441,16610,332114,1
6313,4911,185992,1
9621,8192,207122,1
...,...,...,...
16585,16794,270589,1
6142,4761,207391,1
21172,27110,300299,1
6404,4972,42656,1


In [72]:
interactions_train_df[interactions_train_df[['user_id', 'article_id']].duplicated()]

Unnamed: 0,user_id,article_id,click


In [73]:
ratings_train_df = interactions_train_df.groupby(['user_id', 'article_id'])['article_id'].count().reset_index(name='click_count')

# calculer le nombre total de clics de chaque utilisateur
user_click_counts = ratings_train_df.groupby('user_id')['article_id'].count().reset_index(name='total_click_count')

# joindre les deux tables pour obtenir le nombre de clics de chaque utilisateur sur chaque article ainsi que le nombre total de clics de chaque utilisateur
ratings_train_df = ratings_train_df.merge(user_click_counts, on='user_id')

# calculer le rating de chaque article en utilisant le nombre de clics de l'utilisateur sur l'article normalisé par le nombre total de clics de l'utilisateur
ratings_train_df['rating'] = ratings_train_df['click_count'] / ratings_train_df['total_click_count']
ratings_train_df


Unnamed: 0,user_id,article_id,click_count,total_click_count,rating
0,3,233769,1,4,0.250000
1,3,234686,1,4,0.250000
2,3,235665,1,4,0.250000
3,3,236294,1,4,0.250000
4,5,59929,1,6,0.166667
...,...,...,...,...,...
17116,27876,293425,1,9,0.111111
17117,27903,160474,1,4,0.250000
17118,27903,161178,1,4,0.250000
17119,27903,207122,1,4,0.250000


In [74]:
ratings_train_df['total_click_count'].min()

2

In [75]:
ratings_train_df['rating'].min()

0.020833333333333332

# Popularity Recommander

In [76]:
popularity_df = pd.DataFrame({
    'article_id': interactions_train_df['article_id'].value_counts().index,
    'popularity': interactions_train_df['article_id'].value_counts().values,
})
popularity_df.head()

Unnamed: 0,article_id,popularity
0,284463,871
1,207122,729
2,160474,626
3,96663,617
4,119592,602


In [77]:
class PopularityRecommender:
    
    MODEL_NAME = 'Popularity'
    
    def __init__(self, popularity_df, items_df):
        self.popularity_df = popularity_df
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        # Recommend the more popular items that the user hasn't seen yet.
        recommendations_df = self.popularity_df[~self.popularity_df['article_id'].isin(items_to_ignore)] \
                                                .sort_values('popularity', ascending= False)\
                                                .head(topn)
        return recommendations_df

In [78]:
popularity_model = PopularityRecommender(popularity_df, articles_min_df)

In [79]:
popularity_model.recommend_items(user_id=5890, items_to_ignore=set(interactions_test_df[interactions_test_df.user_id==5890].article_id), topn=5)

Unnamed: 0,article_id,popularity
0,284463,871
1,207122,729
2,160474,626
3,96663,617
5,59758,525


In [80]:
popularity_model.recommend_items(user_id=8192, items_to_ignore=set(interactions_test_df[interactions_test_df.user_id==8192].article_id), topn=5)

Unnamed: 0,article_id,popularity
0,284463,871
1,207122,729
2,160474,626
3,96663,617
4,119592,602


In [81]:
popularity_model.recommend_items(user_id=3, items_to_ignore=set(interactions_test_df[interactions_test_df.user_id==3].article_id), topn=5)

Unnamed: 0,article_id,popularity
0,284463,871
1,207122,729
2,160474,626
3,96663,617
4,119592,602


In [82]:
## Test avec filtre des articles déjà vus

In [83]:
set(interactions_test_df[interactions_test_df.user_id==8].article_id)

set()

In [84]:
popularity_model.recommend_items(user_id=8)

Unnamed: 0,article_id,popularity
0,284463,871
1,207122,729
2,160474,626
3,96663,617
4,119592,602
5,59758,525
6,336430,525
7,220466,326
8,313504,314
9,118180,280


In [85]:
set(interactions_test_df[interactions_test_df.user_id==9261].article_id)

{36690, 96663, 149734, 161585, 195586, 207035, 207391, 233605, 284664, 354701}

Il conseille un article déjà consulté :
 - 96 663 en 4e

In [86]:
popularity_model.recommend_items(user_id=9261, items_to_ignore = set(interactions_test_df[interactions_test_df.user_id==9261].article_id))

Unnamed: 0,article_id,popularity
0,284463,871
1,207122,729
2,160474,626
4,119592,602
5,59758,525
6,336430,525
7,220466,326
8,313504,314
9,118180,280
10,68866,276


# Content Based 

In [87]:
from sklearn.metrics.pairwise import cosine_similarity

In [88]:
class ContentBasedRecommender:
    
    MODEL_NAME = 'Content-Based'
    
    def __init__(self, clicks, items_df):
        self.clicks = clicks
        self.items_df = items_df[['category_id', 'created_at_ts', 'words_count']]
        self.embeddings_df =  items_df.drop(columns=['category_id', 'created_at_ts', 'words_count'])

        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def get_user_profile(self, user_id):
        user_read_articles_id = list(set(self.clicks[self.clicks.user_id == user_id].article_id))
        user_profile = self.embeddings_df.loc[user_read_articles_id]
        return user_profile
    
    def recommend_items(self, user_id, items_to_ignore=[], topn=5, verbose=False):
        user_profile = self.get_user_profile(user_id)
        cosine_similarities = cosine_similarity(user_profile, self.embeddings_df)
        max_similarities = cosine_similarities.max(axis=0)
        articles_similarities = pd.DataFrame({'article_id': self.embeddings_df.index, 'max_similarity': max_similarities})
        # Filtrer les articles déjà lus
        articles_similarities = articles_similarities[~articles_similarities['article_id'].isin(list(user_profile.index))]
        # Trier le DataFrame par ordre décroissant de similarité
        articles_similarities = articles_similarities.sort_values('max_similarity', ascending=False)
        return articles_similarities.head(topn)

In [89]:
content_based_recommender_model = ContentBasedRecommender(interactions_train_df, articles_min_df)

In [90]:
user_id = random.choice(list(set(interactions_train_df.user_id)))

In [91]:
content_based_recommender_model.recommend_items(user_id=user_id)

Unnamed: 0,article_id,max_similarity
770,273464,0.90437
908,118549,0.898521
839,118218,0.893075
1104,119454,0.892844
568,272733,0.885766


In [92]:
content_based_recommender_model.recommend_items(user_id=3)

Unnamed: 0,article_id,max_similarity
1678,236682,0.890063
1159,234427,0.881342
1666,236613,0.862717
1382,235373,0.860739
1244,234798,0.859561


In [93]:
for k in [233769, 235665, 234686, 236294]:
    print(k in articles_min_df.index)

True
True
True
True


In [94]:
articles_min_df.index.dtype

dtype('int64')

### Méthode sur un user

self.clicks sera interactions_train_df

self.items_df :

In [95]:
items_df = articles_min_df[['category_id', 'created_at_ts', 'words_count']]

self.embedding_df :

In [96]:
embedding_min_df = articles_min_df.drop(columns=['category_id', 'created_at_ts', 'words_count'])
embedding_min_df

Unnamed: 0_level_0,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,...,embedding_240,embedding_241,embedding_242,embedding_243,embedding_244,embedding_245,embedding_246,embedding_247,embedding_248,embedding_249
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
319488,-0.466458,-0.964679,-0.485105,-0.877867,0.391511,0.483561,0.005480,-0.211324,-0.020051,-0.174135,...,-0.814141,-0.442375,0.039461,0.395648,-0.747964,-0.382282,-0.632840,-0.387871,-0.056827,-0.199358
360463,-0.602885,-0.961464,-0.565603,-0.649518,-0.559716,0.524595,-0.370100,0.361097,-0.108726,0.181928,...,-0.585967,-0.653178,-0.195667,0.249913,0.331547,0.119053,0.659684,-0.244358,-0.151648,0.401354
360465,-0.205855,-0.957574,-0.878344,-0.431890,-0.693044,0.168474,-0.385850,0.432571,-0.696537,0.027664,...,-0.557692,-0.653369,-0.120972,-0.227757,0.475501,0.446490,0.455878,-0.210448,-0.116186,0.585806
106520,0.505829,-0.931578,-0.277545,0.383692,-0.603157,0.135672,-0.137240,0.333256,0.247378,0.298668,...,0.267274,-0.060965,0.684367,-0.198938,-0.601258,0.362214,0.370468,0.369780,0.006491,0.607185
180261,-0.631314,-0.972719,0.554279,0.087726,0.090802,0.394112,0.057570,-0.612913,0.182290,-0.118985,...,0.300368,-0.433697,-0.092218,-0.694479,0.208018,-0.244026,0.054419,-0.172014,0.355977,-0.357948
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32730,-0.063069,-0.963189,-0.020831,0.328858,0.314361,-0.227227,0.389601,0.099065,0.672383,0.442416,...,0.528567,0.427474,0.461421,-0.461597,-0.333884,0.534221,-0.199299,-0.593895,0.302743,-0.524815
352227,-0.037319,-0.970653,0.235284,-0.792944,-0.652155,-0.429864,0.559105,-0.071258,0.550386,-0.846791,...,-0.535248,-0.867873,-0.768359,0.483744,0.029526,0.570641,0.170771,-0.608991,0.154890,0.234986
311282,-0.320332,-0.972268,0.788268,-0.835453,-0.814294,0.650294,0.580025,-0.756977,0.738576,-0.058501,...,-0.786265,-0.272901,-0.877875,0.703472,-0.748671,0.258157,-0.033027,0.110924,-0.687876,-0.688037
196595,-0.416054,-0.976075,-0.187173,-0.600695,0.446745,0.531058,-0.150488,-0.890386,-0.337496,-0.656646,...,-0.132207,0.384831,-0.311023,-0.531965,0.383805,-0.373510,0.313573,0.344284,0.342517,-0.579160


###  Article consulté par un utilisateur

#### Méthode : get_user_profile

In [97]:
interactions_train_df

Unnamed: 0,user_id,article_id,click
7235,5832,284463,1
13152,11901,59758,1
16441,16610,332114,1
6313,4911,185992,1
9621,8192,207122,1
...,...,...,...
16585,16794,270589,1
6142,4761,207391,1
21172,27110,300299,1
6404,4972,42656,1


In [98]:
user_id = 3

In [99]:
user_read_articles_id = set(interactions_train_df[interactions_train_df.user_id == user_id].article_id)
user_profile = embedding_min_df.loc[user_read_articles_id]
user_profile

  user_profile = embedding_min_df.loc[user_read_articles_id]


Unnamed: 0_level_0,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,...,embedding_240,embedding_241,embedding_242,embedding_243,embedding_244,embedding_245,embedding_246,embedding_247,embedding_248,embedding_249
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
233769,-0.828736,-0.974723,-0.23435,-0.507602,-0.203012,-0.755248,-0.788898,0.013411,-0.33864,-0.556146,...,-0.570091,-0.148384,0.479919,-0.910203,-0.529137,0.142069,-0.476662,0.485083,0.479215,0.014373
235665,-0.51324,-0.963361,-0.602106,-0.206739,-0.412994,-0.115276,-0.732224,-0.266477,0.507747,-0.671837,...,-0.205707,0.124613,0.267336,-0.428892,0.081707,0.299926,0.351382,0.871408,0.290365,0.440695
234686,-0.538448,-0.948071,-0.705048,-0.083332,-0.151408,-0.359961,-0.562148,-0.086911,-0.066602,0.117681,...,0.193455,0.193878,0.146542,-0.761525,0.200309,-0.242208,-0.492294,-0.285201,0.278869,0.238224
236294,-0.791363,-0.967025,-0.247181,-0.462593,0.238766,-0.713218,-0.789062,-0.026739,-0.222805,-0.16398,...,-0.609091,0.230817,0.090354,-0.457567,-0.026513,-0.433131,0.347777,0.511057,0.47007,0.092472


#### Méthode : recommend_items

In [100]:
cosine_similarities = cosine_similarity(user_profile, embedding_min_df)
cosine_similarities.shape

(4, 1842)

In [101]:
max_similarities = cosine_similarities.max(axis=0)
len(max_similarities)

1842

In [102]:
articles_similarities = pd.DataFrame({'article_id': embedding_min_df.index, 'max_similarity': max_similarities})
# Filtrer les articles déjà lus
articles_similarities = articles_similarities[~articles_similarities['article_id'].isin(list(user_profile.index))]
# Trier le DataFrame par ordre décroissant de similarité
articles_similarities = articles_similarities.sort_values('max_similarity', ascending=False)

# Sélectionner les 5 articles avec les valeurs maximales de similarité
top_5_articles = articles_similarities.head(5)

# Afficher les articles recommandés avec leurs valeurs maximales de similarité
print(top_5_articles)

      article_id  max_similarity
1678      236682        0.890063
1159      234427        0.881342
1666      236613        0.862717
1382      235373        0.860739
1244      234798        0.859561


In [103]:
user_clicks_df = interactions_train_df[interactions_train_df['user_id']==user_id]

In [104]:
user_clicks_df['article_id']

0    233769
4    236294
2    235665
1    234686
Name: article_id, dtype: int64

### Divers tests

In [105]:
content_based_recommender_model = ContentBasedRecommender(interactions_train_df, articles_min_df)

In [106]:
content_based_recommender_model.recommend_items(user_id=user_id)

Unnamed: 0,article_id,max_similarity
1678,236682,0.890063
1159,234427,0.881342
1666,236613,0.862717
1382,235373,0.860739
1244,234798,0.859561
