# Model recommendation with LightFM

### Import libraries

In [1]:
import os
import sys
import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import scipy
import numpy as np
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, recall_at_k
from lightfm.cross_validation import random_train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from lightfm import cross_validation
import scipy.sparse as sp
from scipy import sparse

### Defining variables

In [2]:
import json

# Load the notebook settings; later cells read the input CSV paths from this mapping.
with open('config.json', 'r') as f:
    config = json.loads(f.read())

In [3]:
# Fraction of the data reserved for testing.
TEST_PERCENTAGE = 0.25
# Learning rate passed to the LightFM model.
LEARNING_RATE = 0.25
# Number of epochs used when fitting the model.
NO_EPOCHS = 20

# Seed for pseudo-random number generation (data split and model initialisation).
SEED = 42

### Retrieve data

In [4]:
# Column dtypes applied when reading the treated training CSV.
dtype_df_train_treated = dict(
    userId='string',
    userType='category',
    historySize='UInt16',
    history='string',
    timestampHistory='string',
    numberOfClicksHistory='UInt32',
    timeOnPageHistory='UInt64',
    scrollPercentageHistory='Float32',
    pageVisitsCountHistory='UInt32',
    timestampHistory_new='string',
)

In [5]:
import pandas as pd

# Row cap for the interactions file; set to None to read the whole CSV.
MAX_TRAIN_ROWS = 500_000

# Read the treated training interactions with the explicit dtypes declared above.
df_ratings = pd.read_csv(
    config["TREATED_TRAIN_CSV"],
    dtype=dtype_df_train_treated,
    nrows=MAX_TRAIN_ROWS,
)
df_ratings

Unnamed: 0.1,Unnamed: 0,userId,history,numberOfClicksHistory,timeOnPageHistory,scrollPercentageHistory,pageVisitsCountHistory,userType
0,0,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,80aa7bb2-adce-4a55-9711-912c407927a1,0,71998,81.580002,1,Non-Logged
1,1,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,d9e5f15d-b441-4d8b-bee4-462b106d3916,0,115232,73.360001,1,Non-Logged
2,2,17f1083e6079b0f28f7820a6803583d1c1b405c0718b11...,19ba89fc-1e06-4c5d-9c57-4a3088dc0511,68,131495,51.740002,1,Non-Logged
3,3,17f1083e6079b0f28f7820a6803583d1c1b405c0718b11...,e273dba4-136c-45fb-bdd6-0cc57b13aaf0,12,43733,35.490002,1,Non-Logged
4,4,528a8d7a2af73101da8d6709c1ec875b449a5a58749a99...,59a61a8a-cc52-453f-b1cd-2bd019e9d574,55,159042,62.189999,1,Non-Logged
...,...,...,...,...,...,...,...,...
499995,499995,ee6c22997bd65f62d16520329a9faadf130be257a96927...,3f2d29ea-e1da-49ed-9793-db95393334d1,0,12001,6.33,1,Non-Logged
499996,499996,ee6c22997bd65f62d16520329a9faadf130be257a96927...,db50bd63-2817-480c-bb0e-a74c59a87f55,0,10000,6.43,1,Non-Logged
499997,499997,ee6c22997bd65f62d16520329a9faadf130be257a96927...,a7e01699-2b24-4db0-92fa-d486d38e2222,3,8232,20.809999,1,Non-Logged
499998,499998,fcfe5f0a27a6fff2b2a6209b4486503f187ad415df30e6...,94b94d0e-74b1-4ad8-8e30-f827cd16c1bc,0,79062,24.4,2,Non-Logged


In [6]:
# Drop the leftover CSV index column. Reassigning (instead of inplace=True) together
# with errors="ignore" keeps this cell safe to re-run: a second execution finds the
# column already gone and does nothing rather than raising KeyError.
df_ratings = df_ratings.drop(columns=["Unnamed: 0"], errors="ignore")
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 7 columns):
 #   Column                   Non-Null Count   Dtype   
---  ------                   --------------   -----   
 0   userId                   500000 non-null  string  
 1   history                  500000 non-null  string  
 2   numberOfClicksHistory    500000 non-null  UInt32  
 3   timeOnPageHistory        500000 non-null  UInt64  
 4   scrollPercentageHistory  500000 non-null  Float32 
 5   pageVisitsCountHistory   500000 non-null  UInt32  
 6   userType                 500000 non-null  category
dtypes: Float32(1), UInt32(2), UInt64(1), category(1), string(2)
memory usage: 19.6 MB


In [7]:
df_ratings.describe()

Unnamed: 0,numberOfClicksHistory,timeOnPageHistory,scrollPercentageHistory,pageVisitsCountHistory
count,500000.0,500000.0,500000.0,500000.0
mean,12.15158,89593.384512,48.63739,1.149904
std,30.664431,110810.268458,1081.962036,1.219241
min,0.0,5000.0,0.72,1.0
25%,0.0,29764.75,27.1,1.0
50%,1.0,60000.0,43.52,1.0
75%,14.0,115901.5,60.540001,1.0
max,3732.0,12975738.0,466698.21875,211.0


In [8]:
# Filter by news item: articles that appear at most THRESHOLD_RARE_NEWS times in the
# interaction history are treated as rare and removed.
THRESHOLD_RARE_NEWS = 3

history_counts = df_ratings["history"].value_counts()
clicks_counts = history_counts.to_frame()
rare_news = clicks_counts[history_counts <= THRESHOLD_RARE_NEWS].index

# Keep only the interactions whose article is not in the rare set.
is_rare_interaction = df_ratings["history"].isin(rare_news)
common_news = df_ratings[~is_rare_interaction]

In [9]:
common_news.head()

Unnamed: 0,userId,history,numberOfClicksHistory,timeOnPageHistory,scrollPercentageHistory,pageVisitsCountHistory,userType
0,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,80aa7bb2-adce-4a55-9711-912c407927a1,0,71998,81.580002,1,Non-Logged
2,17f1083e6079b0f28f7820a6803583d1c1b405c0718b11...,19ba89fc-1e06-4c5d-9c57-4a3088dc0511,68,131495,51.740002,1,Non-Logged
3,17f1083e6079b0f28f7820a6803583d1c1b405c0718b11...,e273dba4-136c-45fb-bdd6-0cc57b13aaf0,12,43733,35.490002,1,Non-Logged
4,528a8d7a2af73101da8d6709c1ec875b449a5a58749a99...,59a61a8a-cc52-453f-b1cd-2bd019e9d574,55,159042,62.189999,1,Non-Logged
6,2dd18b58a634a4e77181a202cf152df6169dfb3e4230ef...,233f8238-2ce0-470f-a9d5-0e0ac530382a,0,193579,31.030001,1,Non-Logged


In [10]:
common_news.info()

<class 'pandas.core.frame.DataFrame'>
Index: 431075 entries, 0 to 499994
Data columns (total 7 columns):
 #   Column                   Non-Null Count   Dtype   
---  ------                   --------------   -----   
 0   userId                   431075 non-null  string  
 1   history                  431075 non-null  string  
 2   numberOfClicksHistory    431075 non-null  UInt32  
 3   timeOnPageHistory        431075 non-null  UInt64  
 4   scrollPercentageHistory  431075 non-null  Float32 
 5   pageVisitsCountHistory   431075 non-null  UInt32  
 6   userType                 431075 non-null  category
dtypes: Float32(1), UInt32(2), UInt64(1), category(1), string(2)
memory usage: 20.1 MB


In [11]:
# Column dtypes applied when reading the news (item) feature CSV.
dtype_df_items = dict(
    page='string',
    url='string',
    issued='string',
    modified='string',
    title='string',
    body='string',
    caption='string',
)

In [12]:
# Load the news (item) feature table and discard the stray CSV index column in one chain.
df_news = (
    pd.read_csv(config["DF_ITEMS_FEATURE"], dtype=dtype_df_items)
    .drop(columns=["Unnamed: 0"])
)

In [13]:
df_news.head(3)

Unnamed: 0,page,url,issued,modified,title,body,caption,data,idade_em_dias,peso,peso_ajustado
0,7371a9b5-5824-4c57-8704-00a74feebe79,http://g1.globo.com/al/alagoas/noticia/2018/09...,2018-09-13 14:52:55+00:00,2018-09-14 16:14:49+00:00,Corpo de motorista da Uber é encontrado em can...,Corpo de motorista de aplicativo desaparecido ...,"Segundo a polícia, Antônio Vitor foi solicitad...",2018-09-13 14:52:55+00:00,1431,0.057154,0.151439
1,7a5ea08f-4583-49e2-ba52-a71999443f7b,http://g1.globo.com/am/amazonas/noticia/detent...,2018-05-20 20:42:40+00:00,2018-05-20 20:42:40+00:00,Detento recapturado após fuga por túnel volta ...,Detento disse que passou nome falso ao dar ent...,Ele tinha registro em presídio com nome falso....,2018-05-20 20:42:40+00:00,1547,0.04532,0.140788
2,6afc8bbb-4f36-43d5-8a44-a2917df5621a,http://g1.globo.com/ap/amapa/noticia/audios-mo...,2017-07-30 00:37:17+00:00,2017-07-30 00:48:42+00:00,Áudios mostram conversa entre bandidos durante...,Áudios mostram possível conversa entre bandido...,Revista realizada na sexta-feira (28) no Iapen...,2017-07-30 00:37:17+00:00,1842,0.025122,0.12261


In [14]:
df_news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 255603 entries, 0 to 255602
Data columns (total 11 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   page           255603 non-null  string 
 1   url            255603 non-null  string 
 2   issued         255603 non-null  string 
 3   modified       255603 non-null  string 
 4   title          255603 non-null  string 
 5   body           255603 non-null  string 
 6   caption        255603 non-null  string 
 7   data           255603 non-null  object 
 8   idade_em_dias  255603 non-null  int64  
 9   peso           255603 non-null  float64
 10  peso_ajustado  255603 non-null  float64
dtypes: float64(2), int64(1), object(1), string(7)
memory usage: 21.5+ MB


In [15]:
# Attach the article columns to every interaction by matching history -> page.
# how='left' keeps all rows of common_news.
df_merged = common_news.merge(
    df_news,
    how='left',
    left_on='history',
    right_on='page',
)

In [16]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 431075 entries, 0 to 431074
Data columns (total 18 columns):
 #   Column                   Non-Null Count   Dtype   
---  ------                   --------------   -----   
 0   userId                   431075 non-null  string  
 1   history                  431075 non-null  string  
 2   numberOfClicksHistory    431075 non-null  UInt32  
 3   timeOnPageHistory        431075 non-null  UInt64  
 4   scrollPercentageHistory  431075 non-null  Float32 
 5   pageVisitsCountHistory   431075 non-null  UInt32  
 6   userType                 431075 non-null  category
 7   page                     431075 non-null  string  
 8   url                      431075 non-null  string  
 9   issued                   431075 non-null  string  
 10  modified                 431075 non-null  string  
 11  title                    431075 non-null  string  
 12  body                     431075 non-null  string  
 13  caption                  431075 non-null  st

In [17]:
df_merged.head()

Unnamed: 0,userId,history,numberOfClicksHistory,timeOnPageHistory,scrollPercentageHistory,pageVisitsCountHistory,userType,page,url,issued,modified,title,body,caption,data,idade_em_dias,peso,peso_ajustado
0,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,80aa7bb2-adce-4a55-9711-912c407927a1,0,71998,81.580002,1,Non-Logged,80aa7bb2-adce-4a55-9711-912c407927a1,http://g1.globo.com/politica/noticia/2022/08/0...,2022-08-03 21:32:51+00:00,2022-09-19 19:58:15+00:00,Câmara aprova projeto que acaba com saída temp...,A Câmara dos Deputados aprovou nesta quarta-fe...,Relator defende que 'saidinhas' causam 'sentim...,2022-08-03 21:32:51+00:00,11,0.97824,0.980416
1,17f1083e6079b0f28f7820a6803583d1c1b405c0718b11...,19ba89fc-1e06-4c5d-9c57-4a3088dc0511,68,131495,51.740002,1,Non-Logged,19ba89fc-1e06-4c5d-9c57-4a3088dc0511,http://g1.globo.com/rn/rio-grande-do-norte/not...,2022-07-09 14:15:29+00:00,2022-07-10 12:12:21+00:00,Jovem de 21 anos é morto e mãe é baleada duran...,"João Victor Queiroz Munai Dantas, de 21 anos, ...",Caso aconteceu na madrugada deste sábado (9) n...,2022-07-09 14:15:29+00:00,36,0.930531,0.937478
2,17f1083e6079b0f28f7820a6803583d1c1b405c0718b11...,e273dba4-136c-45fb-bdd6-0cc57b13aaf0,12,43733,35.490002,1,Non-Logged,e273dba4-136c-45fb-bdd6-0cc57b13aaf0,http://g1.globo.com/mundo/noticia/2022/06/04/j...,2022-06-04 11:12:43+00:00,2022-06-04 11:18:12+00:00,Jubileu da rainha Elizabeth 2ª: como Harry e M...,Meghan Markle e Harry na chega à missa oficial...,O duque e a duquesa de Sussex sentaram do lado...,2022-06-04 11:12:43+00:00,71,0.867621,0.880859
3,528a8d7a2af73101da8d6709c1ec875b449a5a58749a99...,59a61a8a-cc52-453f-b1cd-2bd019e9d574,55,159042,62.189999,1,Non-Logged,59a61a8a-cc52-453f-b1cd-2bd019e9d574,http://g1.globo.com/pr/parana/noticia/2021/10/...,2021-10-20 11:17:14+00:00,2021-10-20 20:27:48+00:00,Taxa para emissão da 2ª via do RG pode ser pag...,Taxa para emissão do RG poderá ser paga por PI...,"Guia de pagamento, de R$ 38,30, é gerada logo ...",2021-10-20 11:17:14+00:00,298,0.551011,0.59591
4,2dd18b58a634a4e77181a202cf152df6169dfb3e4230ef...,233f8238-2ce0-470f-a9d5-0e0ac530382a,0,193579,31.030001,1,Non-Logged,233f8238-2ce0-470f-a9d5-0e0ac530382a,http://g1.globo.com/pop-arte/noticia/jim-carre...,2018-02-02 09:50:34+00:00,2018-02-02 09:50:35+00:00,Jim Carrey não será julgado pela morte da namo...,Jim Carrey carrega caixão da ex-namorada Cathr...,Ex-marido e mãe de Cathriona White haviam denu...,2018-02-02 09:50:34+00:00,1654,0.036589,0.13293


In [18]:
# Keep only the columns used further down and rename peso_ajustado to adjusted_freshness.
# The rename is chained so it returns a new frame instead of mutating a column selection in place.
model_columns = ["userId", "history", "numberOfClicksHistory", "userType", "peso_ajustado"]
df_merged = df_merged[model_columns].rename(columns={"peso_ajustado": "adjusted_freshness"})

In [19]:
# df_merged.head()
df_merged.head()

Unnamed: 0,userId,history,numberOfClicksHistory,userType,adjusted_freshness
0,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,80aa7bb2-adce-4a55-9711-912c407927a1,0,Non-Logged,0.980416
1,17f1083e6079b0f28f7820a6803583d1c1b405c0718b11...,19ba89fc-1e06-4c5d-9c57-4a3088dc0511,68,Non-Logged,0.937478
2,17f1083e6079b0f28f7820a6803583d1c1b405c0718b11...,e273dba4-136c-45fb-bdd6-0cc57b13aaf0,12,Non-Logged,0.880859
3,528a8d7a2af73101da8d6709c1ec875b449a5a58749a99...,59a61a8a-cc52-453f-b1cd-2bd019e9d574,55,Non-Logged,0.59591
4,2dd18b58a634a4e77181a202cf152df6169dfb3e4230ef...,233f8238-2ce0-470f-a9d5-0e0ac530382a,0,Non-Logged,0.13293


In [20]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 431075 entries, 0 to 431074
Data columns (total 5 columns):
 #   Column                 Non-Null Count   Dtype   
---  ------                 --------------   -----   
 0   userId                 431075 non-null  string  
 1   history                431075 non-null  string  
 2   numberOfClicksHistory  431075 non-null  UInt32  
 3   userType               431075 non-null  category
 4   adjusted_freshness     431075 non-null  float64 
dtypes: UInt32(1), category(1), float64(1), string(2)
memory usage: 12.3 MB


### Prepare data

Before fitting the LightFM model, we need to create an instance of `Dataset` which holds the interaction matrix.

In [None]:
# Distinct users, items and user-feature names found in the merged interactions.
unique_users = df_merged["userId"].unique()
unique_items = df_merged["history"].unique()
unique_user_features = df_merged["userType"].unique().tolist()

# Register them with a fresh Dataset; the userType values become the user feature names.
dataset = Dataset()
dataset.fit(users=unique_users, items=unique_items, user_features=unique_user_features)


<bound method Dataset.interactions_shape of <lightfm.data.Dataset object at 0x7e8ca68b5600>>

In [23]:
# Build the (user, item, weight) triples column-wise. Zipping the three columns
# yields the same triples in the same row order as DataFrame.iterrows(), but avoids
# constructing a Series object for every one of the interaction rows.
interaction_triples = zip(
    df_merged["userId"],
    df_merged["history"],
    df_merged["adjusted_freshness"],
)

# interactions marks which (user, item) pairs exist; weights carries adjusted_freshness per pair.
(interactions, weights) = dataset.build_interactions(interaction_triples)


In [24]:
# One (userId, [userType]) entry per distinct user/type pair.
# df_merged has one row per interaction, so iterating it directly would emit the same
# feature once per interaction; repeated entries for a user accumulate in the sparse
# feature matrix, making the userType weight grow with the user's activity.
user_type_pairs = df_merged[["userId", "userType"]].drop_duplicates()

user_features_list = [
    (user_id, [user_type])
    for user_id, user_type in zip(user_type_pairs["userId"], user_type_pairs["userType"])
]

user_features = dataset.build_user_features(user_features_list)


LightFM works slightly differently from other packages: it expects the train and test sets to have the same dimensions, so a conventional train/test split will not work.

The package has included the `cross_validation.random_train_test_split` method to split the interaction data and splits it into two disjoint training and test sets. 

However, note that **it does not validate the interactions in the test set to guarantee all items and users have historical interactions in the training set**. This may therefore result in a partial cold-start problem in the test set.

In [None]:
# Split interactions and weights into train/test using the configured TEST_PERCENTAGE
# (previously a hard-coded 0.2 that ignored the constant from the configuration cell).
# Each call receives its own RandomState seeded with SEED, so both matrices are shuffled
# with the same permutation and train_weights stays aligned entry-for-entry with train.
# This relies on interactions and weights holding their entries in the same order.
train, test = cross_validation.random_train_test_split(
    interactions,
    test_percentage=TEST_PERCENTAGE,
    random_state=np.random.RandomState(SEED),
)
train_weights, test_weights = cross_validation.random_train_test_split(
    weights,
    test_percentage=TEST_PERCENTAGE,
    random_state=np.random.RandomState(SEED),
)


Double check the size of both the train and test sets.

In [None]:
# Both splits are expected to report the same (users, items) shape.
for split_name, split_matrix in (("train", train), ("test", test)):
    print(f"Shape of {split_name} interactions: {split_matrix.shape}")

### Fit the LightFM model

In this notebook, the LightFM model will be using the weighted Approximate-Rank Pairwise (WARP) as the loss. Further explanation on the topic can be found [here](https://making.lyst.com/lightfm/docs/examples/warp_loss.html#learning-to-rank-using-the-warp-loss).


In general, it maximises the rank of positive examples by repeatedly sampling negative examples until a rank violation has been located. This approach is recommended when only positive interactions are present.

The LightFM model can be fitted with the following code:

In [None]:
# Weighted Approximate-Rank Pairwise (WARP) loss, seeded so runs are repeatable.
model = LightFM(
    loss="warp",
    learning_rate=LEARNING_RATE,
    random_state=np.random.RandomState(SEED),
)

# Fit on the training interactions, weighting each one with train_weights and
# describing users through the user feature matrix.
model.fit(
    train,
    user_features=user_features,
    sample_weight=train_weights,
    epochs=NO_EPOCHS,
    num_threads=4,
)


### Evaluate model

In [None]:
# Import the evaluation routine
from lightfm.evaluation import auc_score

# Mean AUC over users for each split, computed with the same user features used in training.
auc_by_split = {
    split_name: auc_score(model, split_matrix, user_features=user_features).mean()
    for split_name, split_matrix in (("test", test), ("train", train))
}
auc_test = auc_by_split["test"]
auc_train = auc_by_split["train"]

# Report test first, then train.
for split_name, split_auc in auc_by_split.items():
    print(f"AUC {split_name} Score: {split_auc:.4f}")


### Save pkls to serve model

In [None]:
# Unpack the four mappings held by the fitted Dataset. user_id_map and user_feature_map
# are used below to translate a user hash / feature name into the index the model expects.
user_id_map, user_feature_map, item_id_map, item_feature_map = dataset.mapping()

In [None]:
# Invert item_id_map so a model-side item index can be translated back to its item id.
item_id_map_reverse = dict(zip(item_id_map.values(), item_id_map.keys()))

In [None]:
import pickle

# Make sure the output directory exists; open(..., 'wb') does not create it.
os.makedirs('artifacts', exist_ok=True)

# Everything the serving side needs: the model plus the lookup tables.
artifacts_to_save = {
    'artifacts/lightfm_model.pkl': model,
    'artifacts/user_id_map.pkl': user_id_map,
    'artifacts/item_id_map_reverse.pkl': item_id_map_reverse,
    'artifacts/user_feature_map.pkl': user_feature_map,
}

# The context manager flushes and closes each file even if pickling fails,
# instead of leaving the handle open until garbage collection.
for artifact_path, artifact in artifacts_to_save.items():
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)

In [None]:
def _load_artifact(path):
    """Unpickle the object stored at ``path``, closing the file handle afterwards.

    NOTE: pickle.load can execute arbitrary code embedded in the file, so only
    load artifacts that were produced by this pipeline.
    """
    with open(path, 'rb') as artifact_file:
        return pickle.load(artifact_file)


# Reload the artifacts exactly as a serving process would.
loaded_model = _load_artifact('artifacts/lightfm_model.pkl')
loaded_user_id_map = _load_artifact('artifacts/user_id_map.pkl')
loaded_item_id_map_reverse = _load_artifact('artifacts/item_id_map_reverse.pkl')
loaded_user_feature_map = _load_artifact('artifacts/user_feature_map.pkl')

### Make predictions for known and unknown users with the same recommendation function, using the pickled artifacts

In [None]:

# interactions has shape (no of users, no of items); only the item count is needed here.
# Index the shape rather than unpacking into `_`, which IPython uses for the last cell output.
n_items = interactions.shape[1]

n_items

In [None]:
def format_newuser_input(user_feature_map, user_feature_list):
    """Build a one-row sparse feature matrix for a user described only by features.

    Args:
        user_feature_map: mapping from feature name to its column index.
        user_feature_list: feature names that apply to the user.

    Returns:
        A ``scipy.sparse.csr_matrix`` of shape ``(1, len(user_feature_map))`` with
        1.0 in the column of every recognised feature and 0 elsewhere. Feature
        names missing from ``user_feature_map`` are reported via ``print`` and skipped.
    """
    feature_weight = 1.0

    known_columns = []
    for feature_name in user_feature_list:
        if feature_name in user_feature_map:
            known_columns.append(user_feature_map[feature_name])
        else:
            print("new user feature encountered '{}'".format(feature_name))

    feature_row = np.zeros(len(user_feature_map))
    # Explicit integer dtype so an empty list is still a valid index array.
    feature_row[np.asarray(known_columns, dtype=np.intp)] = feature_weight
    return sparse.csr_matrix(feature_row)

In [None]:
def sample_recommendation(user_hash,df_news,user_feature_list,item_id_map_reverse,user_feature_map,user_id_map,model):
    """Print the titles of the five highest-scoring news items for a user.

    Args:
        user_hash: external user id; looked up in ``user_id_map``.
        df_news: DataFrame with at least the ``page`` and ``title`` columns.
        user_feature_list: feature names describing the user; only used when
            ``user_hash`` is not present in ``user_id_map``.
        item_id_map_reverse: mapping from model item index to item id (``page``).
        user_feature_map: mapping from feature name to feature index.
        user_id_map: mapping from external user id to model user index.
        model: object exposing ``predict(user_ids, item_ids, user_features=...)``.

    Returns:
        None. The recommendations are printed.

    NOTE(review): known users are scored without ``user_features`` although the
    notebook fits the model with user features — confirm whether the training
    feature matrix should be passed on this path as well.
    """
    # Score every item the mapping knows about, instead of a hard-coded item count
    # that silently goes stale whenever the training data changes.
    all_item_ids = np.arange(len(item_id_map_reverse))

    if user_hash in user_id_map:
        user_x = user_id_map[user_hash]
        scores = model.predict(user_x, all_item_ids)
    else:
        # Unknown user: describe them through their features only. Row 0 of the
        # one-row feature matrix is the user being scored.
        new_user_features = format_newuser_input(user_feature_map, user_feature_list)
        scores = model.predict(0, all_item_ids, user_features=new_user_features)

    top_5_indices = np.argsort(-scores)[:5]  # Sort scores in descending order and take the top 5
    top_5_items = [item_id_map_reverse[i] for i in top_5_indices]

    print("Top 5 recommended items:")

    for x in top_5_items:
        titles = df_news.loc[df_news["page"] == x, "title"]
        # An item missing from df_news should not abort the whole listing.
        title = titles.values[0] if len(titles) else "<title not found for %s>" % x
        print("        %s" % title)


In [None]:
# predict for known user
# Feature names must match the ones registered with the Dataset, which are the raw
# userType values (no 'userType:' prefix); otherwise the lookup in user_feature_map fails.
# Presumably 'Logged' is the second userType value — confirm with df_merged["userType"].unique().
user_feature_list = ['Logged']
user_hash = '5f5e17781fc2ec0ddcfb2e9356e61c5d3d4b0b3c8fabd20917feb9e807463856'
sample_recommendation(user_hash,df_news,user_feature_list,loaded_item_id_map_reverse,loaded_user_feature_map,loaded_user_id_map,loaded_model)

In [None]:
# predict for unknown user
# The feature name must match what was registered with the Dataset: the raw userType
# value 'Non-Logged'. With the previous 'userType:Non-Logged' the lookup in
# user_feature_map failed, leaving the new user with an all-zero feature row.
user_feature_list = ['Non-Logged']
user_hash = ''
sample_recommendation(user_hash,df_news,user_feature_list,loaded_item_id_map_reverse,loaded_user_feature_map,loaded_user_id_map,loaded_model)