### 2.1 Import libraries

In [79]:
import os
import sys
import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import scipy
import numpy as np
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, recall_at_k
from lightfm.cross_validation import random_train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from lightfm import cross_validation

### 2.2 Defining variables

In [80]:
# Select MovieLens data size
MOVIELENS_DATA_SIZE = '100k'

# default number of recommendations
K = 10
# percentage of data used for testing
TEST_PERCENTAGE = 0.25
# model learning rate
LEARNING_RATE = 0.25
# no of latent factors
NO_COMPONENTS = 20
# no of epochs to fit model
NO_EPOCHS = 20
# no of threads to fit model
NO_THREADS = 32
# regularisation for both user and item features
ITEM_ALPHA = 1e-6
USER_ALPHA = 1e-6

# seed for pseudonumber generations
SEED = 42

### 2.2 Retrieve data

In [81]:
import pandas as pd
# path config
data_path = '/home/raiane/Documentos/Projetos/tech_challenge_5/data/training_data_v2.csv'
df_ratings = pd.read_csv(
    data_path)

In [82]:
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1426291 entries, 0 to 1426290
Data columns (total 7 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   Unnamed: 0               1426291 non-null  int64  
 1   userId                   1426291 non-null  object 
 2   history                  1426291 non-null  object 
 3   numberOfClicksHistory    1426291 non-null  int64  
 4   timeOnPageHistory        1426291 non-null  int64  
 5   scrollPercentageHistory  1426291 non-null  float64
 6   pageVisitsCountHistory   1426291 non-null  int64  
dtypes: float64(1), int64(4), object(2)
memory usage: 76.2+ MB


In [83]:
#considerando apenas noticias com mais de 30 cliques
clicks_counts = pd.DataFrame(df_ratings["history"].value_counts())
rare_news = clicks_counts[df_ratings["history"].value_counts() <= 500].index
common_news = df_ratings[~df_ratings["history"].isin(rare_news)]

In [84]:
unique_historys = common_news.history.unique()
unique_users = common_news.userId.unique()

In [85]:
import numpy as np

hash_dict = dict(enumerate(unique_historys.flatten(), 1))
user_dict = dict(enumerate(unique_users.flatten(), 1))

In [None]:
user_dict

In [None]:
hash_dict

In [87]:
inv_map = {v: k for k, v in hash_dict.items()}
inv_map_u = {v: k for k, v in user_dict.items()}

In [None]:
inv_map

In [None]:
inv_map_u

In [89]:
common_news["history_number"] = common_news["history"].map(inv_map)
common_news["user_number"] = common_news["userId"].map(inv_map_u)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  common_news["history_number"] = common_news["history"].map(inv_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  common_news["user_number"] = common_news["userId"].map(inv_map_u)


In [90]:
common_news = common_news[["user_number","history_number","numberOfClicksHistory"]]

In [91]:
common_news.head()

Unnamed: 0,user_number,history_number,numberOfClicksHistory
4,1,1,80
33,1,2,16
64,2,3,0
66,2,4,0
71,2,5,2


### 2.3 Prepare data

Before fitting the LightFM model, we need to create an instance of `Dataset` which holds the interaction matrix.

In [92]:
dataset = Dataset()

The `fit` method creates the user/item id mappings.

In [93]:
dataset.fit(users=common_news['user_number'], 
            items=common_news['history_number'])

# quick check to determine the number of unique users and items in the data
num_users, num_topics = dataset.interactions_shape()
print(f'Num users: {num_users}, num_topics: {num_topics}.')

Num users: 34107, num_topics: 536.


Next is to build the interaction matrix. The `build_interactions` method returns 2 COO sparse matrices, namely the `interactions` and `weights` matrices.

In [94]:
(interactions, weights) = dataset.build_interactions(common_news.iloc[:, 0:3].values)

LightLM works slightly differently compared to other packages as it expects the train and test sets to have same dimension. Therefore the conventional train test split will not work.

The package has included the `cross_validation.random_train_test_split` method to split the interaction data and splits it into two disjoint training and test sets. 

However, note that **it does not validate the interactions in the test set to guarantee all items and users have historical interactions in the training set**. Therefore this may result into a partial cold-start problem in the test set.

In [95]:
train_interactions, test_interactions = cross_validation.random_train_test_split(
    interactions, test_percentage=TEST_PERCENTAGE,
    random_state=np.random.RandomState(SEED))

Double check the size of both the train and test sets.

In [96]:
print(f"Shape of train interactions: {train_interactions.shape}")
print(f"Shape of test interactions: {test_interactions.shape}")

Shape of train interactions: (34107, 536)
Shape of test interactions: (34107, 536)


### 2.4 Fit the LightFM model

In this notebook, the LightFM model will be using the weighted Approximate-Rank Pairwise (WARP) as the loss. Further explanation on the topic can be found [here](https://making.lyst.com/lightfm/docs/examples/warp_loss.html#learning-to-rank-using-the-warp-loss).


In general, it maximises the rank of positive examples by repeatedly sampling negative examples until a rank violation has been located. This approach is recommended when only positive interactions are present.

In [97]:
model = LightFM(loss='warp', no_components=NO_COMPONENTS, 
                 learning_rate=LEARNING_RATE,                 
                 random_state=np.random.RandomState(SEED))

The LightFM model can be fitted with the following code:

In [98]:
model.fit(interactions=train_interactions,
          epochs=NO_EPOCHS);

### 2.4 Evaluate model

In [99]:
# Import the evaluation routines
from lightfm.evaluation import auc_score

# Compute and print the AUC score
train_auc = auc_score(model, train_interactions, num_threads=2).mean()
print('Collaborative filtering train AUC: %s' % train_auc)

Collaborative filtering train AUC: 0.8371181


In [100]:
test_auc = auc_score(model, test_interactions, train_interactions=train_interactions, num_threads=2).mean()
print('Collaborative filtering test AUC: %s' % test_auc)

Collaborative filtering test AUC: 0.6553675


In [101]:
# Measure how well it did in the Test period
for metric in [precision_at_k, recall_at_k]:
    # Get the precision and recall for Train and Test
    for data, name in [(train_interactions, "Train"), (test_interactions, "Test ")]:
        print(f"{name} {metric.__name__}: %.2f" % 
              metric(model,
                         data, 
                         k=10).mean())
        
    # # What about for just new-to-user purchases?
    # print(f"Test new {metric.__name__}: %.2f" % 
    #       metric(model,
    #                  test_new_interactions, 
    #                  train_interactions=train_interactions, # supress previously bought prods from being recommended
    #                  k=10).mean())

Train precision_at_k: 0.13
Test  precision_at_k: 0.03
Train recall_at_k: 0.31
Test  recall_at_k: 0.07


### 2.5 Using validation csv

In [104]:
df_validacao = pd.read_csv(
    '/home/raiane/Documentos/Projetos/tech_challenge_5/data/validacao.csv')

In [105]:
df_validacao.shape

(112184, 4)

In [108]:
df_validacao["userid_number"] = df_validacao["userId"].map(inv_map_u)

### 2.6 Make predictions

In [62]:
# Create all user and item matrix to get predictions for it
n_users, n_items = train_interactions.shape

# Force lightFM to create predictions for all users and all items
scoring_user_ids = np.concatenate([np.full((n_items, ), i) for i in range(n_users)]) # repeat user ID for number of prods
scoring_item_ids = np.concatenate([np.arange(n_items) for i in range(n_users)]) # repeat entire range of item IDs x number of user
scores = model.predict(user_ids = scoring_user_ids, 
                                     item_ids = scoring_item_ids)
scores = scores.reshape(-1, n_items) # get 1 row per user
recommendations = pd.DataFrame(scores)
recommendations.shape

# Have a look at the predicted scores for the first 5 users and first 5 items
recommendations.iloc[:5,:5] 

Unnamed: 0,0,1,2,3,4
0,-3.772773,-1.512855,-2.652201,-2.191434,-1.862019
1,-14.489299,-13.110419,-10.685356,-11.419316,-11.088422
2,-7.746505,-6.02825,-7.079825,-6.130264,-7.589519
3,-3.801059,-1.944845,1.787615,1.890942,1.018832
4,-3.299577,-1.76621,-1.637671,-0.936435,-1.7488


In [63]:
df_news = pd.read_csv("../data/noticias.csv")

In [64]:
df_news= df_news.drop(columns=["Unnamed: 0"])

In [112]:
def sample_recommendation(model, train_interactions,df_news, user_ids):
    

    n_users, n_items = train_interactions.shape

    for user_id in user_ids:

        known_positives = np.vectorize(hash_dict.get)(train_interactions.tocsr()[user_id].indices)
        
        scores = model.predict(user_id, np.arange(n_items))
        top_items = np.vectorize(hash_dict.get)(np.argsort(-scores))
        
        print("User %s" % user_id)

        print("     Known positives:")
        
        for x in known_positives[:3]:
            row = df_news[df_news["page"] == x]
            print("        %s" % row["title"].values[0])

        print("     Recommended:")
        
        for x in top_items[:5]:
            row = df_news[df_news["page"] == x]
            print("        %s" % row["title"].values[0])
            print(x)
        
sample_recommendation(model, train_interactions,df_news, [26364]) 

User 26364
     Known positives:
        Delegado descarta participação de outra pessoa em morte de médica achada em banheiro de hospital, em Pirenópolis
        Ex-empresário proibiu Luva de Pedreiro de participar da festa de São João de Quijingue, diz prefeito da cidade 
        Menino morre após passar 1 ano com prego no pulmão na Bahia; família acusa hospital de negligência
     Recommended:
        Casa abandonada em Higienópolis: Entenda o caso da mulher que vive em mansão de SP
5af379e6-1bd1-4cf8-a23c-03266fb77b2c
        Quem é Sabine Boghici, presa por golpe milionário contra a mãe e herdeira de um dos maiores colecionadores de arte do país
29b6b142-4173-4ec4-832f-7d0a32255c10
        Quem é Giovanni Quintella, anestesista preso em flagrante por estuprar grávida no parto; ele atuou em pelo menos 10 hospitais
8d477e04-3bab-4ad9-8fe3-799059238a9c
        Quem é Bolívar Guerrero Silva, médico preso por manter paciente em cárcere privado; ele responde a pelo menos 19 processos
056

In [66]:
# Predict scores for all users
num_users = interactions.shape[0]
item_ids = np.arange(interactions.shape[1])

# Predict scores for all users and average them
average_scores = np.zeros_like(item_ids, dtype=float)
for user_id in range(num_users):
    average_scores += model.predict(user_id, item_ids)

average_scores /= num_users

# Recommend top-N items with the highest average scores
top_n = 5
recommended_items = np.vectorize(hash_dict.get)(np.argsort(-average_scores))[:top_n]
recommended_items_name = []
for i in recommended_items:
    recommended_items_name.append(df_news[df_news["page"] == i]["title"].values[0])
print(f"Top {top_n} items for new users: {recommended_items_name}")


Top 5 items for new users: ['Quem é Giovanni Quintella, anestesista preso em flagrante por estuprar grávida no parto; ele atuou em pelo menos 10 hospitais', 'Casa abandonada em Higienópolis: Entenda o caso da mulher que vive em mansão de SP', 'Quem é Sabine Boghici, presa por golpe milionário contra a mãe e herdeira de um dos maiores colecionadores de arte do país', 'Jovem vai para a UTI após ser agredida e jogada em viela no litoral de SP', "Integrante do 'Sexteto', Derico chora e lamenta morte de Jô Soares: 'Foi uma espécie de pai. Me ensinou tudo'"]


In [None]:
import pickle

with open('lightfm_model.pkl', 'wb') as f:
    pickle.dump(model, f)
print("Model saved to lightfm_model.pkl")

# Step 2: Load the model
with open('lightfm_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)
print("Model loaded successfully")

sample_recommendation(loaded_model, train_interactions, df_news, [4, 25, 450]) 

Model saved to lightfm_model.pkl
Model loaded successfully
User 4
     Known positives:
        Lista de concursos públicos e vagas de emprego - G1 Economia
     Recommended:
        Casa abandonada em Higienópolis: Entenda o caso da mulher que vive em mansão de SP
        Jovem é baleado, perde rim e hospital entrega órgão à família em saco plástico na Bahia
        Jovem vai para a UTI após ser agredida e jogada em viela no litoral de SP
        Quem é Giovanni Quintella, anestesista preso em flagrante por estuprar grávida no parto; ele atuou em pelo menos 10 hospitais
        Quem é Jorge Guaranho, apoiador de Bolsonaro que matou petista em Foz do Iguaçu
User 25
     Known positives:
        Quem é Sabine Boghici, presa por golpe milionário contra a mãe e herdeira de um dos maiores colecionadores de arte do país
        Filha é presa por golpe estimado em R$ 725 milhões contra a mãe; quadros renomados roubados foram recuperados
        Polícia prende cônsul alemão por suspeita na mo