### 2.1 Import libraries

In [1]:
import os
import sys
import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import scipy
import numpy as np
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, recall_at_k
from lightfm.cross_validation import random_train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from lightfm import cross_validation

### 2.2 Defining variables

In [2]:
# Select MovieLens data size
MOVIELENS_DATA_SIZE = '100k'

# default number of recommendations
K = 10
# percentage of data used for testing
TEST_PERCENTAGE = 0.25
# model learning rate
LEARNING_RATE = 0.25
# no of latent factors
NO_COMPONENTS = 20
# no of epochs to fit model
NO_EPOCHS = 20
# no of threads to fit model
NO_THREADS = 32
# regularisation for both user and item features
ITEM_ALPHA = 1e-6
USER_ALPHA = 1e-6

# seed for pseudonumber generations
SEED = 42

### 2.2 Retrieve data

In [3]:
import pandas as pd
# path config
data_path = '/home/raiane/Documentos/Projetos/tech_challenge_5/data/training_data.csv'
df_ratings = pd.read_csv(
    data_path)

In [4]:
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1426291 entries, 0 to 1426290
Data columns (total 7 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   Unnamed: 0               1426291 non-null  int64  
 1   userId                   1426291 non-null  int64  
 2   history                  1426291 non-null  object 
 3   numberOfClicksHistory    1426291 non-null  int64  
 4   timeOnPageHistory        1426291 non-null  int64  
 5   scrollPercentageHistory  1426291 non-null  float64
 6   pageVisitsCountHistory   1426291 non-null  int64  
dtypes: float64(1), int64(5), object(1)
memory usage: 76.2+ MB


In [5]:
#considerando apenas noticias com mais de 30 cliques
clicks_counts = pd.DataFrame(df_ratings["history"].value_counts())
rare_news = clicks_counts[df_ratings["history"].value_counts() <= 500].index
common_news = df_ratings[~df_ratings["history"].isin(rare_news)]

In [6]:
unique_historys = common_news.history.unique()

In [7]:
import numpy as np

hash_dict = dict(enumerate(unique_historys.flatten(), 1))

In [8]:
hash_dict

{1: 'fe856057-f97d-419f-ab1c-97c5c3e0719c',
 2: 'e3cdb277-ad80-4025-b5de-cd98cc67d23d',
 3: '29b6b142-4173-4ec4-832f-7d0a32255c10',
 4: '1f32787b-de2b-49be-8c20-ddaeae34cc22',
 5: 'a36c98b5-f159-48f8-9f5a-1fc6ea9956c8',
 6: '87b7c93a-809a-4709-8e59-c56b02fec7c2',
 7: '5dff8fb2-73e6-4c22-a34f-c367aa2677df',
 8: '7594da99-d606-4338-a373-710a7dec776a',
 9: 'f4ac3eb5-e145-4685-b9e9-b61f3551aa58',
 10: '9771b0d9-b14a-4908-8932-440b6fe33aeb',
 11: 'e5b31e24-1bfe-49d5-a0b2-8af464411fda',
 12: '3ffb80eb-0ae9-457e-883c-d52f57c124c0',
 13: '8c37ed33-976c-4100-b831-5a776c7a6716',
 14: 'e1a89c3a-1649-486f-a964-ecfb33821e77',
 15: '4d89c4b6-6827-4935-9ba1-0502025af270',
 16: '1aaa53f5-750c-4860-a79e-fe7ffac3d71f',
 17: '1fd81f89-cca7-4122-8749-db21b14c25b1',
 18: '8f28e4d1-c4ab-4ce6-8acf-27726b061aa3',
 19: 'f774e860-8103-49bf-b04e-63b316735e4b',
 20: '7fcf82cf-6a19-4e1f-a054-d5356546bf31',
 21: 'b4a5680b-d0e1-4260-bc54-82fbe328d3a0',
 22: '141814bf-48d5-492e-9605-bed1b400d346',
 23: 'a6e7224d-da3e

In [9]:
inv_map = {v: k for k, v in hash_dict.items()}

In [10]:
inv_map

{'fe856057-f97d-419f-ab1c-97c5c3e0719c': 1,
 'e3cdb277-ad80-4025-b5de-cd98cc67d23d': 2,
 '29b6b142-4173-4ec4-832f-7d0a32255c10': 3,
 '1f32787b-de2b-49be-8c20-ddaeae34cc22': 4,
 'a36c98b5-f159-48f8-9f5a-1fc6ea9956c8': 5,
 '87b7c93a-809a-4709-8e59-c56b02fec7c2': 6,
 '5dff8fb2-73e6-4c22-a34f-c367aa2677df': 7,
 '7594da99-d606-4338-a373-710a7dec776a': 8,
 'f4ac3eb5-e145-4685-b9e9-b61f3551aa58': 9,
 '9771b0d9-b14a-4908-8932-440b6fe33aeb': 10,
 'e5b31e24-1bfe-49d5-a0b2-8af464411fda': 11,
 '3ffb80eb-0ae9-457e-883c-d52f57c124c0': 12,
 '8c37ed33-976c-4100-b831-5a776c7a6716': 13,
 'e1a89c3a-1649-486f-a964-ecfb33821e77': 14,
 '4d89c4b6-6827-4935-9ba1-0502025af270': 15,
 '1aaa53f5-750c-4860-a79e-fe7ffac3d71f': 16,
 '1fd81f89-cca7-4122-8749-db21b14c25b1': 17,
 '8f28e4d1-c4ab-4ce6-8acf-27726b061aa3': 18,
 'f774e860-8103-49bf-b04e-63b316735e4b': 19,
 '7fcf82cf-6a19-4e1f-a054-d5356546bf31': 20,
 'b4a5680b-d0e1-4260-bc54-82fbe328d3a0': 21,
 '141814bf-48d5-492e-9605-bed1b400d346': 22,
 'a6e7224d-da3e-468

In [11]:
common_news["history_number"] = common_news["history"].map(inv_map)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  common_news["history_number"] = common_news["history"].map(inv_map)


In [12]:
common_news = common_news[["userId","history_number","numberOfClicksHistory"]]

In [14]:
common_news.head()

Unnamed: 0,userId,history_number,numberOfClicksHistory
4,1,1,80
33,1,2,16
64,2,3,0
66,2,4,0
71,2,5,2


### 2.3 Prepare data

Before fitting the LightFM model, we need to create an instance of `Dataset` which holds the interaction matrix.

In [15]:
dataset = Dataset()

The `fit` method creates the user/item id mappings.

In [16]:
dataset.fit(users=common_news['userId'], 
            items=common_news['history_number'])

# quick check to determine the number of unique users and items in the data
num_users, num_topics = dataset.interactions_shape()
print(f'Num users: {num_users}, num_topics: {num_topics}.')

Num users: 34107, num_topics: 536.


Next is to build the interaction matrix. The `build_interactions` method returns 2 COO sparse matrices, namely the `interactions` and `weights` matrices.

In [17]:
(interactions, weights) = dataset.build_interactions(common_news.iloc[:, 0:3].values)

LightLM works slightly differently compared to other packages as it expects the train and test sets to have same dimension. Therefore the conventional train test split will not work.

The package has included the `cross_validation.random_train_test_split` method to split the interaction data and splits it into two disjoint training and test sets. 

However, note that **it does not validate the interactions in the test set to guarantee all items and users have historical interactions in the training set**. Therefore this may result into a partial cold-start problem in the test set.

In [18]:
train_interactions, test_interactions = cross_validation.random_train_test_split(
    interactions, test_percentage=TEST_PERCENTAGE,
    random_state=np.random.RandomState(SEED))

Double check the size of both the train and test sets.

In [19]:
print(f"Shape of train interactions: {train_interactions.shape}")
print(f"Shape of test interactions: {test_interactions.shape}")

Shape of train interactions: (34107, 536)
Shape of test interactions: (34107, 536)


### 2.4 Fit the LightFM model

In this notebook, the LightFM model will be using the weighted Approximate-Rank Pairwise (WARP) as the loss. Further explanation on the topic can be found [here](https://making.lyst.com/lightfm/docs/examples/warp_loss.html#learning-to-rank-using-the-warp-loss).


In general, it maximises the rank of positive examples by repeatedly sampling negative examples until a rank violation has been located. This approach is recommended when only positive interactions are present.

In [20]:
model = LightFM(loss='warp', no_components=NO_COMPONENTS, 
                 learning_rate=LEARNING_RATE,                 
                 random_state=np.random.RandomState(SEED))

The LightFM model can be fitted with the following code:

In [21]:
model.fit(interactions=train_interactions,
          epochs=NO_EPOCHS);

### 2.4 Evaluate model

In [22]:
# Import the evaluation routines
from lightfm.evaluation import auc_score

# Compute and print the AUC score
train_auc = auc_score(model, train_interactions, num_threads=2).mean()
print('Collaborative filtering train AUC: %s' % train_auc)

Collaborative filtering train AUC: 0.8371181


In [23]:
test_auc = auc_score(model, test_interactions, train_interactions=train_interactions, num_threads=2).mean()
print('Collaborative filtering test AUC: %s' % test_auc)

Collaborative filtering test AUC: 0.6553675


In [24]:
# Measure how well it did in the Test period
for metric in [precision_at_k, recall_at_k]:
    # Get the precision and recall for Train and Test
    for data, name in [(train_interactions, "Train"), (test_interactions, "Test ")]:
        print(f"{name} {metric.__name__}: %.2f" % 
              metric(model,
                         data, 
                         k=10).mean())
        
    # # What about for just new-to-user purchases?
    # print(f"Test new {metric.__name__}: %.2f" % 
    #       metric(model,
    #                  test_new_interactions, 
    #                  train_interactions=train_interactions, # supress previously bought prods from being recommended
    #                  k=10).mean())

Train precision_at_k: 0.13
Test  precision_at_k: 0.03
Train recall_at_k: 0.31
Test  recall_at_k: 0.07


### 2.5 Make predictions

In [25]:
# Create all user and item matrix to get predictions for it
n_users, n_items = train_interactions.shape

# Force lightFM to create predictions for all users and all items
scoring_user_ids = np.concatenate([np.full((n_items, ), i) for i in range(n_users)]) # repeat user ID for number of prods
scoring_item_ids = np.concatenate([np.arange(n_items) for i in range(n_users)]) # repeat entire range of item IDs x number of user
scores = model.predict(user_ids = scoring_user_ids, 
                                     item_ids = scoring_item_ids)
scores = scores.reshape(-1, n_items) # get 1 row per user
recommendations = pd.DataFrame(scores)
recommendations.shape

# Have a look at the predicted scores for the first 5 users and first 5 items
recommendations.iloc[:5,:5] 

Unnamed: 0,0,1,2,3,4
0,-3.772773,-1.512855,-2.652201,-2.191434,-1.862019
1,-14.489299,-13.110419,-10.685356,-11.419316,-11.088422
2,-7.746505,-6.02825,-7.079825,-6.130264,-7.589519
3,-3.801059,-1.944845,1.787615,1.890942,1.018832
4,-3.299577,-1.76621,-1.637671,-0.936435,-1.7488


In [26]:
df_news = pd.read_csv("../data/noticias.csv")

In [27]:
df_news= df_news.drop(columns=["Unnamed: 0"])

In [28]:
def sample_recommendation(model, df_news, user_ids):
    

    n_users, n_items = train_interactions.shape

    for user_id in user_ids:
        scores = model.predict(user_id, np.arange(n_items))
        top_items = np.vectorize(hash_dict.get)(np.argsort(-scores))
        
        print("User %s" % user_id)

        print("     Recommended:")
        
        for x in top_items[:5]:
            row = df_news[df_news["page"] == x]
            print("        %s" % row["title"].values[0])
        
sample_recommendation(model, df_news, [4, 25, 450]) 

User 4
     Recommended:
        Casa abandonada em Higienópolis: Entenda o caso da mulher que vive em mansão de SP
        Jovem é baleado, perde rim e hospital entrega órgão à família em saco plástico na Bahia
        Jovem vai para a UTI após ser agredida e jogada em viela no litoral de SP
        Quem é Giovanni Quintella, anestesista preso em flagrante por estuprar grávida no parto; ele atuou em pelo menos 10 hospitais
        Quem é Jorge Guaranho, apoiador de Bolsonaro que matou petista em Foz do Iguaçu
User 25
     Recommended:
        Adolescente de 12 anos vítima de estupro coletivo na BA saiu da escola após 'virar chacota' entre os suspeitos, diz pai da jovem
        Jornal Nacional entrevistará candidatos à Presidência da República
        
Golpe em idosa: ‘Mata essa velha!’, mandou Rosa para Sabine em ligação, diz delegado
        Mãe descobre paradeiro de filha desaparecida por 36 anos em conversa com vizinho no litoral de SP 
        VÍDEO: Tayara Andreza diz que teve de

In [None]:
# Predict scores for all users
num_users = interactions.shape[0]
item_ids = np.arange(interactions.shape[1])

# Predict scores for all users and average them
average_scores = np.zeros_like(item_ids, dtype=float)
for user_id in range(num_users):
    average_scores += model.predict(user_id, item_ids)

average_scores /= num_users

# Recommend top-N items with the highest average scores
top_n = 5
recommended_items = np.vectorize(hash_dict.get)(np.argsort(-scores))[:top_n]
recommended_items_name = []
for i in recommended_items:
    recommended_items_name.append(df_news[df_news["page"] == i]["title"].values[0])
print(f"Top {top_n} items for new users: {recommended_items_name}")


In [29]:
import pickle

with open('lightfm_model.pkl', 'wb') as f:
    pickle.dump(model, f)
print("Model saved to lightfm_model.pkl")

# Step 2: Load the model
with open('lightfm_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)
print("Model loaded successfully")

# Step 3: Predict for a user
user_id = 0
item_ids = np.arange(interactions.shape[1])  # Item IDs
predictions = loaded_model.predict(user_id, item_ids)
print(f"Predicted scores for user {user_id}: {predictions}")


Model saved to lightfm_model.pkl
Model loaded successfully
Predicted scores for user 0: [-3.7727726 -1.5128552 -2.652201  -2.1914344 -1.8620193 -2.373983
 -3.1374588 -3.6189573 -3.8269384 -4.0774374 -2.913527  -4.8337035
 -3.7068512 -2.9294903 -3.268705  -4.4205832 -4.1129403 -2.8261092
 -3.9401865 -3.9456625 -3.2528465 -4.0910225 -3.242227  -4.789383
 -4.028302  -3.1483438 -1.5214676 -3.9208767 -3.2808917 -4.392265
 -3.3249397 -4.0853934 -4.876736  -5.2587237 -3.8244774 -2.5733504
 -2.9108574 -2.4987292 -4.5217285 -2.1397102 -4.72403   -3.6946132
 -3.5407746 -2.23294   -3.9915812 -3.2498236 -3.6800957 -3.0763235
 -4.443243  -3.9077268 -4.404257  -3.674695  -3.6965735 -3.0637298
 -3.2827933 -2.5712023 -2.4913976 -4.2718205 -3.3006344 -2.8036375
 -3.3562984 -4.721491  -4.666229  -1.9071616 -3.2079017 -3.5724463
 -2.306938  -2.3660274 -4.3312936 -3.45944   -1.5534421 -5.009878
 -2.268209  -4.3333054 -2.17737   -5.124013  -3.0946898 -3.8845615
 -7.2248797 -8.085825  -2.244387  -4.7336783 