In [66]:
import random

import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

## Import des données

In [67]:
# Directory (relative to the working directory) that holds the preprocessed CSV inputs.
DATA_PATH = 'data/processed/'

In [68]:
# Load the click log; the first CSV column is used as the DataFrame index.
interactions_csv = DATA_PATH + 'interactions_df.csv'
interactions_df = pd.read_csv(interactions_csv, index_col=0)
interactions_df.head()

Unnamed: 0,user_id,article_id,click
0,3,233769,1
1,3,234686,1
2,3,235665,1
3,3,236065,1
4,3,236294,1


In [69]:
# Load the per-article table (category, creation timestamp, word count and
# embedding columns in the preview); the first CSV column becomes the index.
articles_csv = DATA_PATH + 'articles_min.csv'
articles_min_df = pd.read_csv(articles_csv, index_col=0)
articles_min_df.head()


Unnamed: 0_level_0,category_id,created_at_ts,words_count,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,...,embedding_240,embedding_241,embedding_242,embedding_243,embedding_244,embedding_245,embedding_246,embedding_247,embedding_248,embedding_249
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
319488,433,2017-09-29 15:47:18,157,-0.466458,-0.964679,-0.485105,-0.877867,0.391511,0.483561,0.00548,...,-0.814141,-0.442375,0.039461,0.395648,-0.747964,-0.382282,-0.63284,-0.387871,-0.056827,-0.199358
360463,455,2017-10-01 10:01:03,176,-0.602885,-0.961464,-0.565603,-0.649518,-0.559716,0.524595,-0.3701,...,-0.585967,-0.653178,-0.195667,0.249913,0.331547,0.119053,0.659684,-0.244358,-0.151648,0.401354
360465,455,2017-10-01 13:13:54,114,-0.205855,-0.957574,-0.878344,-0.43189,-0.693044,0.168474,-0.38585,...,-0.557692,-0.653369,-0.120972,-0.227757,0.475501,0.44649,0.455878,-0.210448,-0.116186,0.585806
106520,228,2017-08-11 21:11:33,112,0.505829,-0.931578,-0.277545,0.383692,-0.603157,0.135672,-0.13724,...,0.267274,-0.060965,0.684367,-0.198938,-0.601258,0.362214,0.370468,0.36978,0.006491,0.607185
180261,301,2016-11-19 13:30:13,199,-0.631314,-0.972719,0.554279,0.087726,0.090802,0.394112,0.05757,...,0.300368,-0.433697,-0.092218,-0.694479,0.208018,-0.244026,0.054419,-0.172014,0.355977,-0.357948


## Split

In [70]:
# Hold out 20% of the interactions as a test set. Stratifying on user_id keeps
# each user's share of rows roughly equal in both partitions, and the fixed
# random_state makes the split reproducible.
split_kwargs = dict(
    test_size=0.20,
    random_state=42,
    stratify=interactions_df['user_id'],
)
interactions_train_df, interactions_test_df = train_test_split(interactions_df, **split_kwargs)

print('# interactions on Train set: %d' % interactions_train_df.shape[0])
print('# interactions on Test set: %d' % interactions_test_df.shape[0])

# interactions on Train set: 17121
# interactions on Test set: 4281


### Création de rating

In [71]:
# Display the training partition produced by the split.
interactions_train_df

Unnamed: 0,user_id,article_id,click
7235,5832,284463,1
13152,11901,59758,1
16441,16610,332114,1
6313,4911,185992,1
9621,8192,207122,1
...,...,...,...
16585,16794,270589,1
6142,4761,207391,1
21172,27110,300299,1
6404,4972,42656,1


In [72]:
# Show training rows whose (user_id, article_id) pair repeats an earlier row.
# An empty result means every pair occurs exactly once in the training split.
repeated_pair_mask = interactions_train_df.duplicated(subset=['user_id', 'article_id'])
interactions_train_df[repeated_pair_mask]

Unnamed: 0,user_id,article_id,click


In [73]:
# Number of clicks of each user on each article in the training split
# (one output row per distinct (user_id, article_id) pair).
ratings_train_df = (
    interactions_train_df
    .groupby(['user_id', 'article_id'])
    .size()
    .reset_index(name='click_count')
)

# Total number of clicks per user. Sum the per-article click counts instead of
# counting rows: a row count would give the number of *distinct articles*,
# which only equals the click total when no (user, article) pair repeats.
user_click_counts = (
    ratings_train_df
    .groupby('user_id')['click_count']
    .sum()
    .reset_index(name='total_click_count')
)

# Attach each user's total to every one of that user's (user, article) rows.
ratings_train_df = ratings_train_df.merge(user_click_counts, on='user_id')

# Rating = share of the user's clicks that went to this article, so the
# ratings of one user sum to 1.
ratings_train_df['rating'] = ratings_train_df['click_count'] / ratings_train_df['total_click_count']
ratings_train_df


Unnamed: 0,user_id,article_id,click_count,total_click_count,rating
0,3,233769,1,4,0.250000
1,3,234686,1,4,0.250000
2,3,235665,1,4,0.250000
3,3,236294,1,4,0.250000
4,5,59929,1,6,0.166667
...,...,...,...,...,...
17116,27876,293425,1,9,0.111111
17117,27903,160474,1,4,0.250000
17118,27903,161178,1,4,0.250000
17119,27903,207122,1,4,0.250000


In [74]:
# Sanity check: smallest per-user total in the training ratings.
ratings_train_df['total_click_count'].min()

2

In [75]:
# Sanity check: lowest rating assigned to any (user, article) pair in the training split.
ratings_train_df['rating'].min()

0.020833333333333332

# Popularity Recommender

In [76]:
# Popularity of an article = number of training interactions it appears in.
# value_counts() returns the counts sorted from most to least frequent.
article_click_counts = interactions_train_df['article_id'].value_counts()
popularity_df = pd.DataFrame({
    'article_id': article_click_counts.index,
    'popularity': article_click_counts.values,
})
popularity_df.head()

Unnamed: 0,article_id,popularity
0,284463,871
1,207122,729
2,160474,626
3,96663,617
4,119592,602


In [77]:
class PopularityRecommender:
    """Non-personalized baseline that ranks articles by a popularity score.

    Every user receives the same ranking; only the optional ignore list
    differs from one call to the next.
    """

    MODEL_NAME = 'Popularity'

    def __init__(self, popularity_df, items_df):
        """Store the tables used by the recommender.

        Args:
            popularity_df: DataFrame with at least the columns ``article_id``
                and ``popularity``.
            items_df: Article table. It is stored on the instance but not read
                by any method of this class.
        """
        self.popularity_df = popularity_df
        self.items_df = items_df

    def get_model_name(self):
        """Return the display name of this model."""
        return self.MODEL_NAME

    def recommend_items(self, user_id, items_to_ignore=None, topn=10, verbose=False):
        """Return the ``topn`` most popular articles not in ``items_to_ignore``.

        Args:
            user_id: Accepted for interface compatibility; the ranking does not
                depend on it.
            items_to_ignore: List-like of article ids to exclude (for example
                the articles the user has already seen). ``None`` (the default)
                excludes nothing.
            topn: Maximum number of rows to return.
            verbose: Accepted for interface compatibility; currently unused.

        Returns:
            A DataFrame with the columns of ``popularity_df``, sorted by
            descending ``popularity`` and holding at most ``topn`` rows. The
            original index labels of ``popularity_df`` are preserved.
        """
        # A None sentinel replaces the former mutable `[]` default and lets
        # callers pass None explicitly (isin(None) would raise a TypeError).
        if items_to_ignore is None:
            items_to_ignore = []

        keep_mask = ~self.popularity_df['article_id'].isin(items_to_ignore)
        recommendations_df = (
            self.popularity_df[keep_mask]
            .sort_values('popularity', ascending=False)
            .head(topn)
        )
        return recommendations_df

In [78]:
# Build the popularity baseline from the training popularity table and the article table.
popularity_model = PopularityRecommender(popularity_df, articles_min_df)

In [79]:
# Top-5 most popular articles for user 5890, skipping the articles this user
# clicked in the test split.
# NOTE(review): the ignore list is built from the *test* interactions; confirm
# this is intended rather than the user's training interactions.
test_rows_user_5890 = interactions_test_df.user_id == 5890
popularity_model.recommend_items(
    user_id=5890,
    items_to_ignore=set(interactions_test_df[test_rows_user_5890].article_id),
    topn=5,
)

Unnamed: 0,article_id,popularity
0,284463,871
1,207122,729
2,160474,626
3,96663,617
5,59758,525


In [80]:
# Top-5 most popular articles for user 8192, skipping the articles this user
# clicked in the test split.
test_rows_user_8192 = interactions_test_df.user_id == 8192
popularity_model.recommend_items(
    user_id=8192,
    items_to_ignore=set(interactions_test_df[test_rows_user_8192].article_id),
    topn=5,
)

Unnamed: 0,article_id,popularity
0,284463,871
1,207122,729
2,160474,626
3,96663,617
4,119592,602


In [81]:
# Top-5 most popular articles for user 3, skipping the articles this user
# clicked in the test split.
test_rows_user_3 = interactions_test_df.user_id == 3
popularity_model.recommend_items(
    user_id=3,
    items_to_ignore=set(interactions_test_df[test_rows_user_3].article_id),
    topn=5,
)

Unnamed: 0,article_id,popularity
0,284463,871
1,207122,729
2,160474,626
3,96663,617
4,119592,602


In [82]:
## Test avec filtre des articles déjà vus

In [83]:
# Distinct articles clicked by user 8 in the test split.
set(interactions_test_df.loc[interactions_test_df.user_id == 8, 'article_id'])

set()

In [84]:
# Recommend for user 8 with the defaults: no article is filtered out and the
# 10 most popular articles are returned.
popularity_model.recommend_items(user_id=8)

Unnamed: 0,article_id,popularity
0,284463,871
1,207122,729
2,160474,626
3,96663,617
4,119592,602
5,59758,525
6,336430,525
7,220466,326
8,313504,314
9,118180,280


In [85]:
# Distinct articles clicked by user 9261 in the test split.
set(interactions_test_df.loc[interactions_test_df.user_id == 9261, 'article_id'])

{36690, 96663, 149734, 161585, 195586, 207035, 207391, 233605, 284664, 354701}

Sans filtre, le modèle recommande un article déjà consulté par l'utilisateur 9261 :
 - l'article 96663, en 4e position du classement par popularité

In [86]:
# Default top-10 for user 9261, this time excluding the articles found for
# this user in the test split.
# NOTE(review): the ignore list is built from the *test* interactions; confirm
# this is intended rather than the user's training interactions.
test_rows_user_9261 = interactions_test_df.user_id == 9261
popularity_model.recommend_items(
    user_id=9261,
    items_to_ignore=set(interactions_test_df[test_rows_user_9261].article_id),
)

Unnamed: 0,article_id,popularity
0,284463,871
1,207122,729
2,160474,626
4,119592,602
5,59758,525
6,336430,525
7,220466,326
8,313504,314
9,118180,280
10,68866,276
