# Model recommendation with lighfm

### Import libraries

In [1]:
import os
import sys
import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import scipy
import numpy as np
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, recall_at_k
from lightfm.cross_validation import random_train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from lightfm import cross_validation
import scipy.sparse as sp
from scipy import sparse

### Defining variables

In [2]:
import json

with open('config.json', 'r') as f:
    config = json.load(f)

In [3]:
# percentage of data used for testing
TEST_PERCENTAGE = 0.25
# model learning rate
LEARNING_RATE = 0.25
# no of epochs to fit model
NO_EPOCHS = 20

# seed for pseudonumber generations
SEED = 42

### Retrieve data

In [4]:
import pandas as pd
# path config

df_ratings = pd.read_csv(config["TREATED_TRAIN_CSV"])

In [5]:
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 428361 entries, 0 to 428360
Data columns (total 8 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Unnamed: 0               428361 non-null  int64  
 1   userId                   428361 non-null  object 
 2   history                  428361 non-null  object 
 3   numberOfClicksHistory    428361 non-null  int64  
 4   timeOnPageHistory        428361 non-null  int64  
 5   scrollPercentageHistory  428361 non-null  float64
 6   pageVisitsCountHistory   428361 non-null  int64  
 7   userType                 428361 non-null  object 
dtypes: float64(1), int64(4), object(3)
memory usage: 26.1+ MB


In [6]:
#filtrar por noticias
clicks_counts = pd.DataFrame(df_ratings["history"].value_counts())
rare_news = clicks_counts[df_ratings["history"].value_counts() <= 0].index
common_news = df_ratings[~df_ratings["history"].isin(rare_news)]

In [7]:
common_news.head()

Unnamed: 0.1,Unnamed: 0,userId,history,numberOfClicksHistory,timeOnPageHistory,scrollPercentageHistory,pageVisitsCountHistory,userType
0,0,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,80aa7bb2-adce-4a55-9711-912c407927a1,0,71998,81.58,1,Non-Logged
1,1,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,d9e5f15d-b441-4d8b-bee4-462b106d3916,0,115232,73.36,1,Non-Logged
2,2,17f1083e6079b0f28f7820a6803583d1c1b405c0718b11...,19ba89fc-1e06-4c5d-9c57-4a3088dc0511,68,131495,51.74,1,Non-Logged
3,3,17f1083e6079b0f28f7820a6803583d1c1b405c0718b11...,e273dba4-136c-45fb-bdd6-0cc57b13aaf0,12,43733,35.49,1,Non-Logged
4,4,528a8d7a2af73101da8d6709c1ec875b449a5a58749a99...,59a61a8a-cc52-453f-b1cd-2bd019e9d574,55,159042,62.19,1,Non-Logged


In [8]:
df_news = pd.read_csv(config["DF_ITEMS_CLUSTERED_FEATURES_ADJ"])

In [9]:
df_news.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,page,url,issued,modified,title,body,caption,cleaned_body,cleaned_title,cleaned_caption,combined_text,cluster,word_count
0,0,0,7371a9b5-5824-4c57-8704-00a74feebe79,http://g1.globo.com/al/alagoas/noticia/2018/09...,2018-09-13 14:52:55+00:00,2018-09-14 16:14:49+00:00,Corpo de motorista da Uber é encontrado em can...,Corpo de motorista de aplicativo desaparecido ...,"Segundo a polícia, Antônio Vitor foi solicitad...",corpo motorista aplicativo desaparecido encont...,corpo motorista uber encontrado canavial rotei...,polícia antônio vitor solicitado corridas desa...,corpo motorista uber encontrado canavial rotei...,0,234
1,1,1,7a5ea08f-4583-49e2-ba52-a71999443f7b,http://g1.globo.com/am/amazonas/noticia/detent...,2018-05-20 20:42:40+00:00,2018-05-20 20:42:40+00:00,Detento recapturado após fuga por túnel volta ...,Detento disse que passou nome falso ao dar ent...,Ele tinha registro em presídio com nome falso....,detento disse passou nome falso entrada presíd...,detento recapturado fuga túnel volta cdpm manaus,tinha registro presídio nome falso presos esca...,detento recapturado fuga túnel volta cdpm mana...,0,345
2,2,2,6afc8bbb-4f36-43d5-8a44-a2917df5621a,http://g1.globo.com/ap/amapa/noticia/audios-mo...,2017-07-30 00:37:17+00:00,2017-07-30 00:48:42+00:00,Áudios mostram conversa entre bandidos durante...,Áudios mostram possível conversa entre bandido...,Revista realizada na sexta-feira (28) no Iapen...,áudios mostram conversa bandidos tentativa fug...,áudios mostram conversa bandidos durante tenta...,revista realizada iapen apreendeu celulares dr...,áudios mostram conversa bandidos durante tenta...,0,383
3,3,3,5cc3bd27-80c7-457d-a807-2e8e7fddf031,http://g1.globo.com/ap/amapa/noticia/2020/11/0...,2020-11-06 12:54:00+00:00,2020-11-12 21:22:52+00:00,FOTOS: Apagão no Amapá,"Moradores da capital do Amapá, em Macapá, faze...",Incêndio em subestação de energia deixa 13 dos...,moradores capital amapá macapá protestos maksu...,fotos apagão amapá,incêndio subestação energia deixa municípios e...,fotos apagão amapá incêndio subestação energia...,0,470
4,4,4,d6956177-db96-42f5-9f68-dd0d6e930661,http://g1.globo.com/ap/amapa/noticia/2019/05/2...,2019-05-27 13:43:03+00:00,2019-05-27 18:19:06+00:00,Profissionais da educação no AP paralisam ativ...,Profissionais da educação paralisam atividades...,Ato comprometeu aulas em escolas nesta segunda...,profissionais educação paralisam atividades re...,profissionais educação ap paralisam atividades...,ato comprometeu aulas escolas sinsepeap seed n...,profissionais educação ap paralisam atividades...,0,609


In [10]:
df_merged = pd.merge(common_news,df_news, left_on='history', right_on='page', how='left')

In [11]:
df_merged.head()

Unnamed: 0,Unnamed: 0_x,userId,history,numberOfClicksHistory,timeOnPageHistory,scrollPercentageHistory,pageVisitsCountHistory,userType,Unnamed: 0.1,Unnamed: 0_y,...,modified,title,body,caption,cleaned_body,cleaned_title,cleaned_caption,combined_text,cluster,word_count
0,0,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,80aa7bb2-adce-4a55-9711-912c407927a1,0,71998,81.58,1,Non-Logged,3118.0,3118.0,...,2022-09-19 19:58:15+00:00,Câmara aprova projeto que acaba com saída temp...,A Câmara dos Deputados aprovou nesta quarta-fe...,Relator defende que 'saidinhas' causam 'sentim...,câmara deputados aprovou projeto lei acaba pos...,câmara aprova projeto acaba saída temporária p...,relator defende saidinhas causam sentimento im...,câmara aprova projeto acaba saída temporária p...,0.0,949.0
1,1,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,d9e5f15d-b441-4d8b-bee4-462b106d3916,0,115232,73.36,1,Non-Logged,,,...,,,,,,,,,,
2,2,17f1083e6079b0f28f7820a6803583d1c1b405c0718b11...,19ba89fc-1e06-4c5d-9c57-4a3088dc0511,68,131495,51.74,1,Non-Logged,,,...,,,,,,,,,,
3,3,17f1083e6079b0f28f7820a6803583d1c1b405c0718b11...,e273dba4-136c-45fb-bdd6-0cc57b13aaf0,12,43733,35.49,1,Non-Logged,,,...,,,,,,,,,,
4,4,528a8d7a2af73101da8d6709c1ec875b449a5a58749a99...,59a61a8a-cc52-453f-b1cd-2bd019e9d574,55,159042,62.19,1,Non-Logged,,,...,,,,,,,,,,


In [12]:
df_merged = df_merged[["userId","history","numberOfClicksHistory","userType","peso_ajustado"]]

KeyError: "['peso_ajustado'] not in index"

In [13]:
df_merged.head()

Unnamed: 0,userId,history,numberOfClicksHistory,userType,peso_ajustado
0,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,c8aab885-433d-4e46-8066-479f40ba7fb2,76,Non-Logged,0.106482
1,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,68d2039c-c9aa-456c-ac33-9b2e8677fba7,38,Non-Logged,0.100065
2,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,13e423ce-1d69-4c78-bc18-e8c8f7271964,41,Non-Logged,0.101399
3,2c1080975e257ed630e26679edbe4d5c850c65f3e09f65...,3325b5a1-979a-4cb3-82b6-63905c9edbe8,7,Non-Logged,1.0
4,2c1080975e257ed630e26679edbe4d5c850c65f3e09f65...,fe856057-f97d-419f-ab1c-97c5c3e0719c,80,Non-Logged,1.0


### Prepare data

Before fitting the LightFM model, we need to create an instance of `Dataset` which holds the interaction matrix.

In [15]:
dataset = Dataset()

# Get unique values for users, items, and user features
unique_users = df_merged["userId"].unique()
unique_items = df_merged["history"].unique()
unique_user_features = df_merged["userType"].unique().tolist() 

# Fit dataset with users, items, and user feature names
dataset.fit(
    users=unique_users,
    items=unique_items,
    user_features=unique_user_features  # Register user features
)


In [16]:
(interactions, weights) = dataset.build_interactions([
    (row.userId, row.history, row.peso_ajustado) 
    for _, row in df_merged.iterrows()
])


In [17]:
user_features_list = [
    (row.userId, [row.userType])  
    for _, row in df_merged.iterrows()
]

user_features = dataset.build_user_features(user_features_list)


LightLM works slightly differently compared to other packages as it expects the train and test sets to have same dimension. Therefore the conventional train test split will not work.

The package has included the `cross_validation.random_train_test_split` method to split the interaction data and splits it into two disjoint training and test sets. 

However, note that **it does not validate the interactions in the test set to guarantee all items and users have historical interactions in the training set**. Therefore this may result into a partial cold-start problem in the test set.

In [18]:
# Split train and test sets (80/20 split)
train, test = cross_validation.random_train_test_split(interactions, test_percentage=0.2, random_state=SEED)
train_weights, test_weights  = cross_validation.random_train_test_split(weights, test_percentage=0.2, random_state=SEED)


Double check the size of both the train and test sets.

In [19]:
print(f"Shape of train interactions: {train.shape}")
print(f"Shape of test interactions: {test.shape}")

Shape of train interactions: (100000, 108573)
Shape of test interactions: (100000, 108573)


### Fit the LightFM model

In this notebook, the LightFM model will be using the weighted Approximate-Rank Pairwise (WARP) as the loss. Further explanation on the topic can be found [here](https://making.lyst.com/lightfm/docs/examples/warp_loss.html#learning-to-rank-using-the-warp-loss).


In general, it maximises the rank of positive examples by repeatedly sampling negative examples until a rank violation has been located. This approach is recommended when only positive interactions are present.

The LightFM model can be fitted with the following code:

In [20]:
model = LightFM(loss="warp",learning_rate=LEARNING_RATE,random_state=np.random.RandomState(SEED))  # Weighted Approximate-Rank Pairwise (WARP) loss
model.fit(train, sample_weight=train_weights, epochs=NO_EPOCHS, num_threads=4, user_features=user_features)


<lightfm.lightfm.LightFM at 0x75919f187d30>

### Evaluate model

In [21]:
# Import the evaluation routines
from lightfm.evaluation import auc_score

# Compute evaluation metrics
auc_test = auc_score(model, test, user_features=user_features).mean()
auc_train = auc_score(model, train, user_features=user_features).mean()

# Print evaluation results
print(f"AUC test Score: {auc_test:.4f}")
print(f"AUC train Score: {auc_train:.4f}")


AUC test Score: 0.7865
AUC train Score: 0.9370


### Save pkls to serve model

In [22]:
user_id_map, user_feature_map, item_id_map, item_feature_map = dataset.mapping()

In [23]:
item_id_map_reverse = {v: k for k, v in item_id_map.items()}

In [24]:
import pickle

pickle.dump(model, open('artifacts/lightfm_model.pkl', 'wb'))
pickle.dump(user_id_map, open('artifacts/user_id_map.pkl', 'wb'))
pickle.dump(item_id_map_reverse, open('artifacts/item_id_map_reverse.pkl', 'wb'))
pickle.dump(user_feature_map, open('artifacts/user_feature_map.pkl', 'wb'))

In [25]:
loaded_model = pickle.load(open('artifacts/lightfm_model.pkl', 'rb'))
loaded_user_id_map = pickle.load(open('artifacts/user_id_map.pkl', 'rb'))
loaded_item_id_map_reverse = pickle.load(open('artifacts/item_id_map_reverse.pkl', 'rb'))
loaded_user_feature_map = pickle.load(open('artifacts/user_feature_map.pkl', 'rb'))

### Make predictions to known and unknowm on same recommendation function with pkls

In [26]:

_, n_items = interactions.shape # no of users * no of items

n_items

108573

In [27]:
def format_newuser_input(user_feature_map, user_feature_list):
  normalised_val = 1.0 
  target_indices = []
  for feature in user_feature_list:
    try:
        target_indices.append(user_feature_map[feature])
    except KeyError:
        print("new user feature encountered '{}'".format(feature))
        pass
  #print("target indices: {}".format(target_indices))
  new_user_features = np.zeros(len(user_feature_map.keys()))
  for i in target_indices:
    new_user_features[i] = normalised_val
  new_user_features = sparse.csr_matrix(new_user_features)
  return(new_user_features)

In [28]:
def sample_recommendation(user_hash,df_news,user_feature_list,item_id_map_reverse,user_feature_map,user_id_map,model):
    try:
        user_x = user_id_map[user_hash]
        scores = model.predict(user_x, np.arange(108573)) # means predict for all
    except:
        new_user_features = format_newuser_input(user_feature_map, user_feature_list)
        scores = model.predict(0, np.arange(108573), user_features=new_user_features)
    
    top_5_indices = np.argsort(-scores)[:5]  # Sort scores in descending order and take the top 5
    top_5_items = [item_id_map_reverse[i] for i in top_5_indices]

    print("Top 5 recommended items:")

    for x in top_5_items:
        row = df_news[df_news["page"] == x]
        print("        %s" % row["title"].values[0])


In [29]:
# predict for known user
user_feature_list = ['userType:Logged']
user_hash = '5f5e17781fc2ec0ddcfb2e9356e61c5d3d4b0b3c8fabd20917feb9e807463856'
sample_recommendation(user_hash,df_news,user_feature_list,loaded_item_id_map_reverse,loaded_user_feature_map,loaded_user_id_map,loaded_model)

Top 5 recommended items:
        De Pombagira, Paolla Oliveira investe em look ousado para desfilar como rainha de bateria da Grande Rio
        Auxílio caminhoneiro: não recebeu as duas primeiras parcelas do benefício? Veja como fazer a autodeclaração
        Motorista é preso após ser flagrado com uma tonelada de maconha em rodovia do interior de SP
        Por que os bebês suam tanto na cabeça?
        Novo trecho da Avenida Mozart Pinheiro de Lucena, em Fortaleza, é bloqueado para obras; veja desvios


In [30]:
# predict for unknown user
user_feature_list = ['userType:Non-Logged']
user_hash = ''
sample_recommendation(user_hash,df_news,user_feature_list,loaded_item_id_map_reverse,loaded_user_feature_map,loaded_user_id_map,loaded_model)

new user feature encountered 'userType:Non-Logged'
Top 5 recommended items:
        Jô Soares, ícone do humor e da TV, morre em São Paulo aos 84 anos
        Campeão mundial de jiu-jítsu, Leandro Lo é baleado na cabeça durante show em clube da Zona Sul de SP
        Menina de 10 anos que estava desaparecida após ir a padaria na Grande BH é encontrada morta
        PM preso por matar lutador Leandro Lo em show em SP já foi condenado na Justiça Militar por agredir policiais em boate em 2017
        WhatsApp vai deixar você esconder que está 'online'; veja como fazer
