In [1]:
import random

from pathlib import Path

import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
from surprise import Dataset, NormalPredictor, Reader
from surprise.model_selection import cross_validate

## Import des données

In [3]:
DATA_PATH = Path('data/1_preprocessed/')

In [4]:
interactions_df = pd.read_csv(DATA_PATH / 'interactions_df.csv', index_col=0)
interactions_df.head()

Unnamed: 0,user_id,article_id,click
0,3,233769,1
1,3,234686,1
2,3,235665,1
3,3,236065,1
4,3,236294,1


In [5]:
articles_min_df = pd.read_csv(DATA_PATH / 'articles_min.csv', index_col=0)
articles_min_df.head()


Unnamed: 0_level_0,category_id,created_at_ts,words_count,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,...,embedding_240,embedding_241,embedding_242,embedding_243,embedding_244,embedding_245,embedding_246,embedding_247,embedding_248,embedding_249
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
319488,433,2017-09-29 15:47:18,157,-0.466458,-0.964679,-0.485105,-0.877867,0.391511,0.483561,0.00548,...,-0.814141,-0.442375,0.039461,0.395648,-0.747964,-0.382282,-0.63284,-0.387871,-0.056827,-0.199358
360463,455,2017-10-01 10:01:03,176,-0.602885,-0.961464,-0.565603,-0.649518,-0.559716,0.524595,-0.3701,...,-0.585967,-0.653178,-0.195667,0.249913,0.331547,0.119053,0.659684,-0.244358,-0.151648,0.401354
360465,455,2017-10-01 13:13:54,114,-0.205855,-0.957574,-0.878344,-0.43189,-0.693044,0.168474,-0.38585,...,-0.557692,-0.653369,-0.120972,-0.227757,0.475501,0.44649,0.455878,-0.210448,-0.116186,0.585806
106520,228,2017-08-11 21:11:33,112,0.505829,-0.931578,-0.277545,0.383692,-0.603157,0.135672,-0.13724,...,0.267274,-0.060965,0.684367,-0.198938,-0.601258,0.362214,0.370468,0.36978,0.006491,0.607185
180261,301,2016-11-19 13:30:13,199,-0.631314,-0.972719,0.554279,0.087726,0.090802,0.394112,0.05757,...,0.300368,-0.433697,-0.092218,-0.694479,0.208018,-0.244026,0.054419,-0.172014,0.355977,-0.357948


## Split

In [6]:
# Hold out 20% of the interactions for evaluation. The split is stratified on
# user_id and seeded so that it is reproducible across runs.
interactions_train_df, interactions_test_df = train_test_split(
    interactions_df,
    test_size=0.20,
    stratify=interactions_df['user_id'],
    random_state=42,
)

print('# interactions on Train set: %d' % len(interactions_train_df))
print('# interactions on Test set: %d' % len(interactions_test_df))

# interactions on Train set: 17121
# interactions on Test set: 4281


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


### Création de rating

In [7]:
interactions_train_df

Unnamed: 0,user_id,article_id,click
7235,5832,284463,1
13152,11901,59758,1
16441,16610,332114,1
6313,4911,185992,1
9621,8192,207122,1
...,...,...,...
16585,16794,270589,1
6142,4761,207391,1
21172,27110,300299,1
6404,4972,42656,1


In [8]:
interactions_train_df[interactions_train_df[['user_id', 'article_id']].duplicated()]

Unnamed: 0,user_id,article_id,click


In [9]:
# Number of clicks of each user on each article (one row per user/article pair).
ratings_train_df = (
    interactions_train_df
    .groupby(['user_id', 'article_id'])
    .size()
    .reset_index(name='click_count')
)

# Total number of clicks of each user. This must be the SUM of the per-article
# click counts: counting rows would only give the number of distinct articles,
# which differs from the total as soon as a user clicks the same article twice.
user_click_counts = (
    ratings_train_df
    .groupby('user_id')['click_count']
    .sum()
    .reset_index(name='total_click_count')
)

# Attach each user's total to every one of their user/article rows.
ratings_train_df = ratings_train_df.merge(user_click_counts, on='user_id')

# Rating = share of the user's clicks that went to this article, so the
# ratings of a given user sum to 1.
ratings_train_df['rating'] = ratings_train_df['click_count'] / ratings_train_df['total_click_count']
ratings_train_df


Unnamed: 0,user_id,article_id,click_count,total_click_count,rating
0,3,233769,1,4,0.250000
1,3,234686,1,4,0.250000
2,3,235665,1,4,0.250000
3,3,236294,1,4,0.250000
4,5,59929,1,6,0.166667
...,...,...,...,...,...
17116,27876,293425,1,9,0.111111
17117,27903,160474,1,4,0.250000
17118,27903,161178,1,4,0.250000
17119,27903,207122,1,4,0.250000


In [10]:
ratings_train_df['total_click_count'].min()

2

In [11]:
ratings_train_df['rating'].min()

0.020833333333333332

# Popularity Recommender

In [12]:
# Count the training interactions per article once and reuse the result for
# both columns (value_counts sorts by descending count by default).
article_click_counts = interactions_train_df['article_id'].value_counts()
popularity_df = pd.DataFrame({
    'article_id': article_click_counts.index,
    'popularity': article_click_counts.values,
})
popularity_df.head()

Unnamed: 0,article_id,popularity
0,284463,871
1,207122,729
2,160474,626
3,96663,617
4,119592,602


In [13]:
class PopularityRecommender:
    """Non-personalised baseline: recommends the globally most popular articles.

    Attributes:
        popularity_df: DataFrame with an 'article_id' column and a 'popularity'
            column used for ranking.
        items_df: Article metadata. Stored but not used by this class.
    """
    
    MODEL_NAME = 'Popularity'
    
    def __init__(self, popularity_df, items_df):
        self.popularity_df = popularity_df
        self.items_df = items_df
        
    def get_model_name(self):
        """Return the display name of the model."""
        return self.MODEL_NAME
        
    def recommend_items(self, user_id, items_to_ignore=None, topn=10, verbose=False):
        """Return the `topn` most popular articles not listed in `items_to_ignore`.

        Args:
            user_id: Unused; the ranking is identical for every user. Accepted
                for interface parity with the other recommenders.
            items_to_ignore: Iterable of article ids to exclude (list, set,
                generator, ...). None means "exclude nothing".
            topn: Maximum number of rows returned.
            verbose: Unused.

        Returns:
            The matching rows of `popularity_df`, sorted by decreasing
            'popularity'.
        """
        # None default instead of a shared mutable list; materialising the
        # iterable lets callers pass any iterable, which Series.isin alone
        # would not accept (it rejects None and non list-like inputs).
        excluded_ids = [] if items_to_ignore is None else list(items_to_ignore)
        not_excluded = ~self.popularity_df['article_id'].isin(excluded_ids)
        recommendations_df = self.popularity_df[not_excluded] \
                                 .sort_values('popularity', ascending=False) \
                                 .head(topn)
        return recommendations_df

In [14]:
popularity_model = PopularityRecommender(popularity_df, articles_min_df)

In [15]:
popularity_model.recommend_items(user_id=5890, items_to_ignore=set(interactions_test_df[interactions_test_df.user_id==5890].article_id), topn=5)

Unnamed: 0,article_id,popularity
0,284463,871
1,207122,729
2,160474,626
3,96663,617
5,59758,525


In [16]:
popularity_model.recommend_items(user_id=8192, items_to_ignore=set(interactions_test_df[interactions_test_df.user_id==8192].article_id), topn=5)

Unnamed: 0,article_id,popularity
0,284463,871
1,207122,729
2,160474,626
3,96663,617
4,119592,602


In [17]:
popularity_model.recommend_items(user_id=3, items_to_ignore=set(interactions_test_df[interactions_test_df.user_id==3].article_id), topn=5)

Unnamed: 0,article_id,popularity
0,284463,871
1,207122,729
2,160474,626
3,96663,617
4,119592,602


In [18]:
## Test avec filtre des articles déjà vus

In [19]:
set(interactions_test_df[interactions_test_df.user_id==8].article_id)

set()

In [20]:
popularity_model.recommend_items(user_id=8)

Unnamed: 0,article_id,popularity
0,284463,871
1,207122,729
2,160474,626
3,96663,617
4,119592,602
5,59758,525
6,336430,525
7,220466,326
8,313504,314
9,118180,280


In [21]:
set(interactions_test_df[interactions_test_df.user_id==9261].article_id)

{36690, 96663, 149734, 161585, 195586, 207035, 207391, 233605, 284664, 354701}

Sans filtre, le modèle recommande un article que l'utilisateur 9261 a déjà consulté :
 - l'article 96663, en 4e position

In [22]:
popularity_model.recommend_items(user_id=9261, items_to_ignore = set(interactions_test_df[interactions_test_df.user_id==9261].article_id))

Unnamed: 0,article_id,popularity
0,284463,871
1,207122,729
2,160474,626
4,119592,602
5,59758,525
6,336430,525
7,220466,326
8,313504,314
9,118180,280
10,68866,276


# Content Based 

In [23]:
class ContentBasedRecommender:
    """Recommends articles whose embedding is close to an article the user clicked.

    Each candidate article is scored by its highest cosine similarity to any
    article present in the user's click history.

    Attributes:
        clicks: Interactions DataFrame with 'user_id' and 'article_id' columns.
        items_df: The 'category_id', 'created_at_ts' and 'words_count' columns
            of the articles table.
        embeddings_df: Every remaining column of the articles table, indexed
            like the articles table (the notebook indexes it by article id).
    """
    
    MODEL_NAME = 'Content-Based'
    
    def __init__(self, clicks, items_df):
        self.clicks = clicks
        self.items_df = items_df[['category_id', 'created_at_ts', 'words_count']]
        self.embeddings_df =  items_df.drop(columns=['category_id', 'created_at_ts', 'words_count'])

        
    def get_model_name(self):
        """Return the display name of the model."""
        return self.MODEL_NAME
        
    def get_user_profile(self, user_id):
        """Return the embedding rows of the articles clicked by `user_id`.

        Clicked articles that have no row in `embeddings_df` are skipped
        instead of raising a KeyError; the result is empty when the user has
        no (known) clicked article.
        """
        user_read_articles_id = list(set(self.clicks[self.clicks.user_id == user_id].article_id))
        known_articles_id = [article_id for article_id in user_read_articles_id
                             if article_id in self.embeddings_df.index]
        user_profile = self.embeddings_df.loc[known_articles_id]
        return user_profile
    
    def recommend_items(self, user_id, items_to_ignore=None, topn=5, verbose=False):
        """Return the `topn` unread articles most similar to the user's history.

        Args:
            user_id: Id looked up in the 'user_id' column of `clicks`.
            items_to_ignore: Iterable of article ids to exclude in addition to
                the articles of the user's profile. None means "nothing extra".
            topn: Maximum number of rows returned.
            verbose: Unused; accepted for interface parity with the other
                recommenders.

        Returns:
            DataFrame with columns 'article_id' and 'max_similarity', sorted by
            decreasing similarity. Empty when the user has no profile.
        """
        user_profile = self.get_user_profile(user_id)
        if len(user_profile) == 0:
            # Nothing to compare against; cosine_similarity would raise on an
            # empty profile, so return an empty result with the usual columns.
            return pd.DataFrame({
                'article_id': pd.Series(dtype=self.embeddings_df.index.dtype),
                'max_similarity': pd.Series(dtype='float64'),
            })

        # One row per profile article, one column per candidate article.
        cosine_similarities = cosine_similarity(user_profile, self.embeddings_df)
        # Score of a candidate = similarity to its closest profile article.
        max_similarities = cosine_similarities.max(axis=0)
        articles_similarities = pd.DataFrame({'article_id': self.embeddings_df.index, 'max_similarity': max_similarities})

        # Drop the articles already read as well as the caller-supplied ones.
        excluded_ids = set(user_profile.index)
        if items_to_ignore is not None:
            excluded_ids.update(items_to_ignore)
        articles_similarities = articles_similarities[~articles_similarities['article_id'].isin(list(excluded_ids))]

        # Most similar articles first.
        articles_similarities = articles_similarities.sort_values('max_similarity', ascending=False)
        return articles_similarities.head(topn)

In [24]:
content_based_recommender_model = ContentBasedRecommender(interactions_train_df, articles_min_df)

In [25]:
user_id = random.choice(list(set(interactions_train_df.user_id)))

In [26]:
content_based_recommender_model.recommend_items(user_id=user_id)

Unnamed: 0,article_id,max_similarity
898,118495,0.917994
896,118494,0.906185
908,118549,0.898521
839,118218,0.893075
1104,119454,0.892844


In [27]:
content_based_recommender_model.recommend_items(user_id=3)

Unnamed: 0,article_id,max_similarity
1678,236682,0.890063
1159,234427,0.881342
1666,236613,0.862717
1382,235373,0.860739
1244,234798,0.859561


In [28]:
for k in [233769, 235665, 234686, 236294]:
    print(k in articles_min_df.index)

True
True
True
True


In [29]:
articles_min_df.index.dtype

dtype('int64')

### Méthode sur un user

self.clicks sera interactions_train_df

self.items_df :

In [30]:
items_df = articles_min_df[['category_id', 'created_at_ts', 'words_count']]

self.embedding_df :

In [31]:
embedding_min_df = articles_min_df.drop(columns=['category_id', 'created_at_ts', 'words_count'])
embedding_min_df

Unnamed: 0_level_0,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,...,embedding_240,embedding_241,embedding_242,embedding_243,embedding_244,embedding_245,embedding_246,embedding_247,embedding_248,embedding_249
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
319488,-0.466458,-0.964679,-0.485105,-0.877867,0.391511,0.483561,0.005480,-0.211324,-0.020051,-0.174135,...,-0.814141,-0.442375,0.039461,0.395648,-0.747964,-0.382282,-0.632840,-0.387871,-0.056827,-0.199358
360463,-0.602885,-0.961464,-0.565603,-0.649518,-0.559716,0.524595,-0.370100,0.361097,-0.108726,0.181928,...,-0.585967,-0.653178,-0.195667,0.249913,0.331547,0.119053,0.659684,-0.244358,-0.151648,0.401354
360465,-0.205855,-0.957574,-0.878344,-0.431890,-0.693044,0.168474,-0.385850,0.432571,-0.696537,0.027664,...,-0.557692,-0.653369,-0.120972,-0.227757,0.475501,0.446490,0.455878,-0.210448,-0.116186,0.585806
106520,0.505829,-0.931578,-0.277545,0.383692,-0.603157,0.135672,-0.137240,0.333256,0.247378,0.298668,...,0.267274,-0.060965,0.684367,-0.198938,-0.601258,0.362214,0.370468,0.369780,0.006491,0.607185
180261,-0.631314,-0.972719,0.554279,0.087726,0.090802,0.394112,0.057570,-0.612913,0.182290,-0.118985,...,0.300368,-0.433697,-0.092218,-0.694479,0.208018,-0.244026,0.054419,-0.172014,0.355977,-0.357948
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32730,-0.063069,-0.963189,-0.020831,0.328858,0.314361,-0.227227,0.389601,0.099065,0.672383,0.442416,...,0.528567,0.427474,0.461421,-0.461597,-0.333884,0.534221,-0.199299,-0.593895,0.302743,-0.524815
352227,-0.037319,-0.970653,0.235284,-0.792944,-0.652155,-0.429864,0.559105,-0.071258,0.550386,-0.846791,...,-0.535248,-0.867873,-0.768359,0.483744,0.029526,0.570641,0.170771,-0.608991,0.154890,0.234986
311282,-0.320332,-0.972268,0.788268,-0.835453,-0.814294,0.650294,0.580025,-0.756977,0.738576,-0.058501,...,-0.786265,-0.272901,-0.877875,0.703472,-0.748671,0.258157,-0.033027,0.110924,-0.687876,-0.688037
196595,-0.416054,-0.976075,-0.187173,-0.600695,0.446745,0.531058,-0.150488,-0.890386,-0.337496,-0.656646,...,-0.132207,0.384831,-0.311023,-0.531965,0.383805,-0.373510,0.313573,0.344284,0.342517,-0.579160


###  Article consulté par un utilisateur

#### Méthode : get_user_profile

In [32]:
interactions_train_df

Unnamed: 0,user_id,article_id,click
7235,5832,284463,1
13152,11901,59758,1
16441,16610,332114,1
6313,4911,185992,1
9621,8192,207122,1
...,...,...,...
16585,16794,270589,1
6142,4761,207391,1
21172,27110,300299,1
6404,4972,42656,1


In [33]:
user_id = 3

In [34]:
user_read_articles_id = list(set(interactions_train_df[interactions_train_df.user_id == user_id].article_id))
user_profile = embedding_min_df.loc[user_read_articles_id]
user_profile

Unnamed: 0_level_0,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,...,embedding_240,embedding_241,embedding_242,embedding_243,embedding_244,embedding_245,embedding_246,embedding_247,embedding_248,embedding_249
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
233769,-0.828736,-0.974723,-0.23435,-0.507602,-0.203012,-0.755248,-0.788898,0.013411,-0.33864,-0.556146,...,-0.570091,-0.148384,0.479919,-0.910203,-0.529137,0.142069,-0.476662,0.485083,0.479215,0.014373
235665,-0.51324,-0.963361,-0.602106,-0.206739,-0.412994,-0.115276,-0.732224,-0.266477,0.507747,-0.671837,...,-0.205707,0.124613,0.267336,-0.428892,0.081707,0.299926,0.351382,0.871408,0.290365,0.440695
234686,-0.538448,-0.948071,-0.705048,-0.083332,-0.151408,-0.359961,-0.562148,-0.086911,-0.066602,0.117681,...,0.193455,0.193878,0.146542,-0.761525,0.200309,-0.242208,-0.492294,-0.285201,0.278869,0.238224
236294,-0.791363,-0.967025,-0.247181,-0.462593,0.238766,-0.713218,-0.789062,-0.026739,-0.222805,-0.16398,...,-0.609091,0.230817,0.090354,-0.457567,-0.026513,-0.433131,0.347777,0.511057,0.47007,0.092472


#### Méthode : recommend_items

In [35]:
cosine_similarities = cosine_similarity(user_profile, embedding_min_df)
cosine_similarities.shape

(4, 1842)

In [36]:
max_similarities = cosine_similarities.max(axis=0)
len(max_similarities)

1842

In [37]:
articles_similarities = pd.DataFrame({'article_id': embedding_min_df.index, 'max_similarity': max_similarities})
# Filtrer les articles déjà lus
articles_similarities = articles_similarities[~articles_similarities['article_id'].isin(list(user_profile.index))]
# Trier le DataFrame par ordre décroissant de similarité
articles_similarities = articles_similarities.sort_values('max_similarity', ascending=False)

# Sélectionner les 5 articles avec les valeurs maximales de similarité
top_5_articles = articles_similarities.head(5)

# Afficher les articles recommandés avec leurs valeurs maximales de similarité
print(top_5_articles)

      article_id  max_similarity
1678      236682        0.890063
1159      234427        0.881342
1666      236613        0.862717
1382      235373        0.860739
1244      234798        0.859561


In [38]:
user_clicks_df = interactions_train_df[interactions_train_df['user_id']==user_id]

In [39]:
user_clicks_df['article_id']

0    233769
4    236294
2    235665
1    234686
Name: article_id, dtype: int64

### Divers tests

In [40]:
content_based_recommender_model = ContentBasedRecommender(interactions_train_df, articles_min_df)

In [41]:
content_based_recommender_model.recommend_items(user_id=user_id)

Unnamed: 0,article_id,max_similarity
1678,236682,0.890063
1159,234427,0.881342
1666,236613,0.862717
1382,235373,0.860739
1244,234798,0.859561


# Collaborative Filtering Method

In [42]:
interactions_train_df

Unnamed: 0,user_id,article_id,click
7235,5832,284463,1
13152,11901,59758,1
16441,16610,332114,1
6313,4911,185992,1
9621,8192,207122,1
...,...,...,...
16585,16794,270589,1
6142,4761,207391,1
21172,27110,300299,1
6404,4972,42656,1


In [43]:
# Build a dense user x article pivot table (users in rows, articles in columns).
# User/article pairs without an interaction come out of pivot() as NaN and are
# filled with 0. The conversion to a sparse matrix happens in a later cell.
users_items_pivot_matrix_df = interactions_train_df.pivot(index='user_id', 
                                                          columns='article_id', 
                                                          values='click').fillna(0)

users_items_pivot_matrix_df.head(10)

article_id,1916,1933,2136,2137,2288,2662,3147,3201,3330,3515,...,362928,363018,363033,363127,363291,363330,363633,363925,363976,364001
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
44,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
51,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
users_items_pivot_matrix = users_items_pivot_matrix_df.values
users_items_pivot_matrix[:10]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [45]:
users_ids = list(users_items_pivot_matrix_df.index)
users_ids[:10]

[3, 5, 16, 18, 22, 24, 32, 44, 51, 53]

In [46]:
users_items_pivot_sparse_matrix = csr_matrix(users_items_pivot_matrix)
users_items_pivot_sparse_matrix

<3169x1606 sparse matrix of type '<class 'numpy.float64'>'
	with 17121 stored elements in Compressed Sparse Row format>

In [47]:
# Number of latent factors (singular values/vectors) kept by the truncated SVD.
NUMBER_OF_FACTORS_MF = 15
# Truncated SVD of the sparse user x article matrix: U has one row per user,
# Vt one column per article, and sigma is a 1-D vector of k singular values.
U, sigma, Vt = svds(users_items_pivot_sparse_matrix, k = NUMBER_OF_FACTORS_MF)

In [48]:
U.shape

(3169, 15)

In [49]:
Vt.shape

(15, 1606)

In [50]:
# Turn the 1-D vector of singular values into a diagonal matrix.
# The ndim guard keeps this cell idempotent: np.diag applied to an already
# diagonal 2-D matrix would extract its diagonal back into a vector and break
# the matrix product of the next cell when this cell is re-run.
if sigma.ndim == 1:
    sigma = np.diag(sigma)
sigma.shape

(15, 15)

In [51]:
# Low-rank reconstruction of the user x article matrix from its SVD factors.
all_user_predicted_ratings = (U @ sigma) @ Vt
all_user_predicted_ratings

array([[-1.07137647e-06, -1.26518221e-04,  3.01622940e-04, ...,
         4.66118953e-06,  2.44957682e-04, -1.14766699e-04],
       [ 3.90018265e-06, -7.60817691e-04,  1.24285734e-04, ...,
        -4.16953324e-04,  2.96125890e-03,  3.26052735e-03],
       [ 1.87015346e-06, -1.67197432e-03, -3.33149248e-04, ...,
         5.74113628e-04,  6.59300834e-04,  6.07269345e-04],
       ...,
       [ 5.23789327e-06,  1.38447976e-04, -2.42628411e-06, ...,
        -1.49759020e-04,  6.14390464e-04,  1.03054598e-03],
       [ 3.83166267e-06, -1.88168832e-03,  1.49453823e-03, ...,
         6.88540870e-04,  6.34008024e-03,  8.49227031e-04],
       [ 7.34966857e-06,  6.46750626e-04, -7.02211257e-04, ...,
        -5.55429329e-04, -7.99177658e-04,  3.38833894e-03]])

In [52]:
# Min-max scaling using the global minimum and maximum of the whole matrix
# (not per user), so scores remain comparable across users.
predicted_min = all_user_predicted_ratings.min()
predicted_max = all_user_predicted_ratings.max()
all_user_predicted_ratings_norm = (all_user_predicted_ratings - predicted_min) / (predicted_max - predicted_min)


In [53]:
# Wrap the normalised predictions in a DataFrame, then transpose it so that
# articles are in rows and each column holds the scores of one user.
cf_preds_df = pd.DataFrame(
    all_user_predicted_ratings_norm,
    index=users_ids,
    columns=users_items_pivot_matrix_df.columns,
).T
cf_preds_df.head(10)

Unnamed: 0_level_0,3,5,16,18,22,24,32,44,51,53,...,27747,27750,27756,27765,27766,27792,27813,27849,27876,27903
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1916,0.098585,0.098587,0.098586,0.098586,0.098587,0.098589,0.098586,0.098584,0.098582,0.098582,...,0.098588,0.098586,0.098586,0.098585,0.098589,0.098587,0.098588,0.098587,0.098587,0.098587
1933,0.098556,0.09841,0.0982,0.098561,0.098638,0.098365,0.098489,0.098504,0.098707,0.098291,...,0.098783,0.098586,0.098789,0.098219,0.099113,0.098293,0.098538,0.098618,0.098152,0.098735
2136,0.098655,0.098614,0.098509,0.098591,0.098753,0.098486,0.099321,0.09864,0.099668,0.099104,...,0.098545,0.098586,0.099135,0.098984,0.098857,0.098446,0.098576,0.098585,0.09893,0.098424
2137,0.098572,0.098666,0.098659,0.09857,0.098673,0.098684,0.098602,0.098551,0.09909,0.098357,...,0.098951,0.098586,0.098043,0.09836,0.09814,0.098464,0.098359,0.098468,0.099174,0.098552
2288,0.098595,0.098682,0.098616,0.098597,0.098676,0.098668,0.098584,0.098581,0.098617,0.098627,...,0.09863,0.098586,0.098598,0.098652,0.098659,0.098624,0.098656,0.098636,0.098601,0.098609
2662,0.09856,0.098753,0.099541,0.098557,0.098376,0.099052,0.098209,0.098443,0.098031,0.098324,...,0.098463,0.098586,0.099037,0.099197,0.098253,0.099421,0.09878,0.098708,0.098401,0.098871
3147,0.098764,0.098184,0.098461,0.098607,0.098734,0.097741,0.098335,0.098822,0.100369,0.09985,...,0.098341,0.098586,0.100479,0.100362,0.097733,0.099034,0.097891,0.098213,0.098181,0.097842
3201,0.09856,0.098753,0.099541,0.098557,0.098376,0.099052,0.098209,0.098443,0.098031,0.098324,...,0.098463,0.098586,0.099037,0.099197,0.098253,0.099421,0.09878,0.098708,0.098401,0.098871
3330,0.098586,0.098551,0.098533,0.098585,0.098851,0.098513,0.098556,0.098444,0.098475,0.098375,...,0.098552,0.098586,0.098623,0.098585,0.098561,0.098586,0.098546,0.098551,0.098859,0.098504
3515,0.098586,0.098584,0.098581,0.098585,0.098583,0.098581,0.098582,0.098585,0.098592,0.09859,...,0.098583,0.098586,0.098598,0.098593,0.098586,0.098586,0.098582,0.098585,0.098577,0.098582


In [54]:
len(cf_preds_df.columns)

3169

In [55]:
sorted_user_predictions = cf_preds_df[3].sort_values(ascending=False).reset_index().rename(columns={3: 'recStrength'})

In [56]:
items_to_ignore = []

In [57]:
sorted_user_predictions[~sorted_user_predictions['article_id'].isin(items_to_ignore)] \
                               .sort_values('recStrength', ascending = False) \
                               .head(5)

Unnamed: 0,article_id,recStrength
0,233605,0.114615
1,235210,0.109861
2,284847,0.109818
3,108854,0.103631
4,235840,0.10303


In [58]:
class CFRecommender:
    """Collaborative-filtering recommender backed by a precomputed score table.

    Attributes:
        cf_predictions_df: DataFrame with one column per user id and one row
            per article; its index must be named 'article_id'.
        items_df: Optional article metadata keyed by 'article_id' (column or
            index level). Only needed in verbose mode.
    """
    
    MODEL_NAME = 'Collaborative Filtering'
    
    def __init__(self, cf_predictions_df, items_df=None):
        self.cf_predictions_df = cf_predictions_df
        self.items_df = items_df
        
    def get_model_name(self):
        """Return the display name of the model."""
        return self.MODEL_NAME
        
    def recommend_items(self, user_id, items_to_ignore=None, topn=10, verbose=False):
        """Return the `topn` highest-scored articles for `user_id`.

        Args:
            user_id: Column label of `cf_predictions_df`; an unknown id raises
                KeyError.
            items_to_ignore: Iterable of article ids to exclude. None means
                "exclude nothing".
            topn: Maximum number of rows returned.
            verbose: When True, the metadata columns of `items_df` are appended
                to each recommendation.

        Returns:
            DataFrame sorted by decreasing 'recStrength'. Columns are
            ['article_id', 'recStrength'], or 'recStrength', 'article_id'
            followed by the `items_df` columns in verbose mode.

        Raises:
            Exception: if `verbose` is True and no `items_df` was provided.
        """
        # Get and sort the user's predictions
        sorted_user_predictions = self.cf_predictions_df[user_id].sort_values(ascending=False) \
                                    .reset_index().rename(columns={user_id: 'recStrength'})

        # Keep the highest predicted articles that are not excluded. The frame
        # is already sorted, so filtering followed by head() is enough.
        excluded_ids = [] if items_to_ignore is None else list(items_to_ignore)
        recommendations_df = sorted_user_predictions[~sorted_user_predictions['article_id'].isin(excluded_ids)] \
                               .head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations_df = recommendations_df.merge(self.items_df, how = 'left', 
                                                          left_on = 'article_id', 
                                                          right_on = 'article_id')
            # Keep the merged metadata (selecting only the two leading columns
            # would make the merge pointless), with the score and id first.
            leading_columns = ['recStrength', 'article_id']
            metadata_columns = [column for column in recommendations_df.columns
                                if column not in leading_columns]
            recommendations_df = recommendations_df[leading_columns + metadata_columns]


        return recommendations_df
    


In [59]:
cf_recommender_model = CFRecommender(cf_preds_df, articles_min_df)

In [60]:
cf_recommender_model.recommend_items(user_id=3)

Unnamed: 0,article_id,recStrength
0,233605,0.114615
1,235210,0.109861
2,284847,0.109818
3,108854,0.103631
4,235840,0.10303
5,235263,0.103013
6,235132,0.102428
7,195586,0.101955
8,157132,0.101055
9,96663,0.101053
