### 2.1 Import libraries

In [2]:
import os
import sys
import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import scipy
import numpy as np
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, recall_at_k
from lightfm.cross_validation import random_train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from lightfm import cross_validation

### 2.2 Defining variables

In [3]:
# Select MovieLens data size
MOVIELENS_DATA_SIZE = '100k'

# default number of recommendations
K = 10
# percentage of data used for testing
TEST_PERCENTAGE = 0.25
# model learning rate
LEARNING_RATE = 0.25
# no of latent factors
NO_COMPONENTS = 20
# no of epochs to fit model
NO_EPOCHS = 20
# no of threads to fit model
NO_THREADS = 32
# regularisation for both user and item features
ITEM_ALPHA = 1e-6
USER_ALPHA = 1e-6

# seed for pseudonumber generations
SEED = 42

### 2.2 Retrieve data

In [4]:
import pandas as pd
# path config
data_path = '/home/raiane/Documentos/Projetos/tech_challenge_5/data/training_data.csv'
df_ratings = pd.read_csv(
    data_path)

In [5]:
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1426291 entries, 0 to 1426290
Data columns (total 7 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   Unnamed: 0               1426291 non-null  int64  
 1   userId                   1426291 non-null  int64  
 2   history                  1426291 non-null  object 
 3   numberOfClicksHistory    1426291 non-null  int64  
 4   timeOnPageHistory        1426291 non-null  int64  
 5   scrollPercentageHistory  1426291 non-null  float64
 6   pageVisitsCountHistory   1426291 non-null  int64  
dtypes: float64(1), int64(5), object(1)
memory usage: 76.2+ MB


In [6]:
#considerando apenas noticias com mais de 30 cliques
clicks_counts = pd.DataFrame(df_ratings["history"].value_counts())
rare_news = clicks_counts[df_ratings["history"].value_counts() <= 1000].index
common_news = df_ratings[~df_ratings["history"].isin(rare_news)]

In [7]:
unique_historys = common_news.history.unique()

In [8]:
import numpy as np

hash_dict = dict(enumerate(unique_historys.flatten(), 1))

In [9]:
hash_dict

{1: 'e3cdb277-ad80-4025-b5de-cd98cc67d23d',
 2: '29b6b142-4173-4ec4-832f-7d0a32255c10',
 3: '1f32787b-de2b-49be-8c20-ddaeae34cc22',
 4: 'a36c98b5-f159-48f8-9f5a-1fc6ea9956c8',
 5: '87b7c93a-809a-4709-8e59-c56b02fec7c2',
 6: '5dff8fb2-73e6-4c22-a34f-c367aa2677df',
 7: '7594da99-d606-4338-a373-710a7dec776a',
 8: 'f4ac3eb5-e145-4685-b9e9-b61f3551aa58',
 9: '9771b0d9-b14a-4908-8932-440b6fe33aeb',
 10: 'e1a89c3a-1649-486f-a964-ecfb33821e77',
 11: '4d89c4b6-6827-4935-9ba1-0502025af270',
 12: '8f28e4d1-c4ab-4ce6-8acf-27726b061aa3',
 13: 'b4a5680b-d0e1-4260-bc54-82fbe328d3a0',
 14: 'a6e7224d-da3e-468f-bc51-26331659e06a',
 15: '949a81ac-5df1-4e7b-9f4e-e364b5603e68',
 16: '57615619-0c1d-41fb-8640-ccec5b3add58',
 17: '750ec729-cd61-4227-85fe-a1e0ebbf7d50',
 18: '13f2cc37-f575-44d5-b33f-045d0b0a912b',
 19: '100ff71c-da18-4182-9672-369324a71bb5',
 20: '15281e10-e6bc-48bc-9b1b-94402f83699b',
 21: '83568c85-7264-4e45-8a2e-eeb246118465',
 22: 'cb324527-6a66-491d-b53c-6a6d7ece566f',
 23: '0b5caeed-ff6f

In [10]:
inv_map = {v: k for k, v in hash_dict.items()}

In [11]:
inv_map

{'e3cdb277-ad80-4025-b5de-cd98cc67d23d': 1,
 '29b6b142-4173-4ec4-832f-7d0a32255c10': 2,
 '1f32787b-de2b-49be-8c20-ddaeae34cc22': 3,
 'a36c98b5-f159-48f8-9f5a-1fc6ea9956c8': 4,
 '87b7c93a-809a-4709-8e59-c56b02fec7c2': 5,
 '5dff8fb2-73e6-4c22-a34f-c367aa2677df': 6,
 '7594da99-d606-4338-a373-710a7dec776a': 7,
 'f4ac3eb5-e145-4685-b9e9-b61f3551aa58': 8,
 '9771b0d9-b14a-4908-8932-440b6fe33aeb': 9,
 'e1a89c3a-1649-486f-a964-ecfb33821e77': 10,
 '4d89c4b6-6827-4935-9ba1-0502025af270': 11,
 '8f28e4d1-c4ab-4ce6-8acf-27726b061aa3': 12,
 'b4a5680b-d0e1-4260-bc54-82fbe328d3a0': 13,
 'a6e7224d-da3e-468f-bc51-26331659e06a': 14,
 '949a81ac-5df1-4e7b-9f4e-e364b5603e68': 15,
 '57615619-0c1d-41fb-8640-ccec5b3add58': 16,
 '750ec729-cd61-4227-85fe-a1e0ebbf7d50': 17,
 '13f2cc37-f575-44d5-b33f-045d0b0a912b': 18,
 '100ff71c-da18-4182-9672-369324a71bb5': 19,
 '15281e10-e6bc-48bc-9b1b-94402f83699b': 20,
 '83568c85-7264-4e45-8a2e-eeb246118465': 21,
 'cb324527-6a66-491d-b53c-6a6d7ece566f': 22,
 '0b5caeed-ff6f-4b9

In [12]:
common_news["history_number"] = common_news["history"].map(inv_map)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  common_news["history_number"] = common_news["history"].map(inv_map)


In [13]:
common_news = common_news[["userId","history_number","numberOfClicksHistory"]]

In [14]:
common_news.head()

Unnamed: 0,userId,history_number,numberOfClicksHistory
33,1,1,16
64,2,2,0
66,2,3,0
71,2,4,2
75,2,5,0


### 2.3 Prepare data

Before fitting the LightFM model, we need to create an instance of `Dataset` which holds the interaction matrix.

In [15]:
dataset = Dataset()

The `fit` method creates the user/item id mappings.

In [16]:
dataset.fit(users=common_news['userId'], 
            items=common_news['history_number'])

# quick check to determine the number of unique users and items in the data
num_users, num_topics = dataset.interactions_shape()
print(f'Num users: {num_users}, num_topics: {num_topics}.')

Num users: 29063, num_topics: 176.


Next is to build the interaction matrix. The `build_interactions` method returns 2 COO sparse matrices, namely the `interactions` and `weights` matrices.

In [17]:
(interactions, weights) = dataset.build_interactions(common_news.iloc[:, 0:3].values)

LightLM works slightly differently compared to other packages as it expects the train and test sets to have same dimension. Therefore the conventional train test split will not work.

The package has included the `cross_validation.random_train_test_split` method to split the interaction data and splits it into two disjoint training and test sets. 

However, note that **it does not validate the interactions in the test set to guarantee all items and users have historical interactions in the training set**. Therefore this may result into a partial cold-start problem in the test set.

In [18]:
train_interactions, test_interactions = cross_validation.random_train_test_split(
    interactions, test_percentage=TEST_PERCENTAGE,
    random_state=np.random.RandomState(SEED))

Double check the size of both the train and test sets.

In [19]:
print(f"Shape of train interactions: {train_interactions.shape}")
print(f"Shape of test interactions: {test_interactions.shape}")

Shape of train interactions: (29063, 176)
Shape of test interactions: (29063, 176)


### 2.4 Fit the LightFM model

In this notebook, the LightFM model will be using the weighted Approximate-Rank Pairwise (WARP) as the loss. Further explanation on the topic can be found [here](https://making.lyst.com/lightfm/docs/examples/warp_loss.html#learning-to-rank-using-the-warp-loss).


In general, it maximises the rank of positive examples by repeatedly sampling negative examples until a rank violation has been located. This approach is recommended when only positive interactions are present.

In [20]:
model = LightFM(loss='warp', no_components=NO_COMPONENTS, 
                 learning_rate=LEARNING_RATE,                 
                 random_state=np.random.RandomState(SEED))

The LightFM model can be fitted with the following code:

In [21]:
model.fit(interactions=train_interactions,
          epochs=NO_EPOCHS);

### 2.4 Evaluate model

In [22]:
# Import the evaluation routines
from lightfm.evaluation import auc_score

# Compute and print the AUC score
train_auc = auc_score(model, train_interactions, num_threads=2).mean()
print('Collaborative filtering train AUC: %s' % train_auc)

Collaborative filtering train AUC: 0.94510543


In [23]:
test_auc = auc_score(model, test_interactions, train_interactions=train_interactions, num_threads=2).mean()
print('Collaborative filtering test AUC: %s' % test_auc)

Collaborative filtering test AUC: 0.64607763


In [24]:
# Measure how well it did in the Test period
for metric in [precision_at_k, recall_at_k]:
    # Get the precision and recall for Train and Test
    for data, name in [(train_interactions, "Train"), (test_interactions, "Test ")]:
        print(f"{name} {metric.__name__}: %.2f" % 
              metric(model,
                         data, 
                         k=10).mean())
        
    # # What about for just new-to-user purchases?
    # print(f"Test new {metric.__name__}: %.2f" % 
    #       metric(model,
    #                  test_new_interactions, 
    #                  train_interactions=train_interactions, # supress previously bought prods from being recommended
    #                  k=10).mean())

Train precision_at_k: 0.30
Test  precision_at_k: 0.03
Train recall_at_k: 0.73
Test  recall_at_k: 0.11


### 2.5 Make predictions

In [25]:
# Create all user and item matrix to get predictions for it
n_users, n_items = train_interactions.shape

# Force lightFM to create predictions for all users and all items
scoring_user_ids = np.concatenate([np.full((n_items, ), i) for i in range(n_users)]) # repeat user ID for number of prods
scoring_item_ids = np.concatenate([np.arange(n_items) for i in range(n_users)]) # repeat entire range of item IDs x number of user
scores = model.predict(user_ids = scoring_user_ids, 
                                     item_ids = scoring_item_ids)
scores = scores.reshape(-1, n_items) # get 1 row per user
recommendations = pd.DataFrame(scores)
recommendations.shape

# Have a look at the predicted scores for the first 5 users and first 5 items
recommendations.iloc[:5,:5] 

Unnamed: 0,0,1,2,3,4
0,1.894309,-2.252751,-2.010876,-0.799899,-0.874582
1,-12.991467,-9.249822,-7.52775,-11.129276,-10.997106
2,-7.337574,-4.814159,-2.835614,-5.400973,-6.209405
3,-1.952892,-1.379185,-0.166378,-0.697236,-1.639585
4,-1.399175,-0.521738,-0.030561,-0.221206,-1.033263


In [26]:
df_news = pd.read_csv("../data/noticias.csv")

In [27]:
df_news= df_news.drop(columns=["Unnamed: 0"])

In [28]:
def sample_recommendation(model, df_news, user_ids):
    

    n_users, n_items = train_interactions.shape

    for user_id in user_ids:
        scores = model.predict(user_id, np.arange(n_items))
        top_items = np.vectorize(hash_dict.get)(np.argsort(-scores))
        
        print("User %s" % user_id)

        print("     Recommended:")
        
        for x in top_items[:5]:
            row = df_news[df_news["page"] == x]
            print("        %s" % row["title"].values[0])
        
sample_recommendation(model, df_news, [4, 25, 450]) 

User 4
     Recommended:
        Exclusivo: vídeo mostra salto de aluno de paraquedismo que terminou em morte em Boituva
        Voo de Nancy Pelosi pousa em Taiwan, e China fala de 'infração severa' à sua soberania e anuncia ações militares
        Mulher dança no TikTok para comemorar processo trabalhista e leva multa
        Anestesista preso após estupro é réu por erro de diagnóstico; paciente ficou em coma e perdeu dedão
        Ex-assessor de Gabriel Monteiro revelou que vereador fazia sexo na frente da equipe durante o trabalho
User 25
     Recommended:
        Exclusivo: vídeo mostra salto de aluno de paraquedismo que terminou em morte em Boituva
        Flavia Soares, ex-mulher de Jô Soares, lamenta morte do apresentador: 'Amor eterno'
        Voo de Nancy Pelosi pousa em Taiwan, e China fala de 'infração severa' à sua soberania e anuncia ações militares
        Corpo de João Paulo Diniz é levado para São Paulo para ser velado e sepultado
        Vídeo mostra 'prédio de Neymar

In [None]:
# Predict scores for all users
num_users = interactions.shape[0]
item_ids = np.arange(interactions.shape[1])

# Predict scores for all users and average them
average_scores = np.zeros_like(item_ids, dtype=float)
for user_id in range(num_users):
    average_scores += model.predict(user_id, item_ids)

average_scores /= num_users

# Recommend top-N items with the highest average scores
top_n = 5
recommended_items = np.vectorize(hash_dict.get)(np.argsort(-scores))[:top_n]
recommended_items_name = []
print(recommended_items)
for i in recommended_items:
    recommended_items_name.append(df_news[df_news["page"] == i]["title"].values[0])
print(f"Top {top_n} items for new users: {recommended_items_name}")


In [31]:
import pickle

with open('lightfm_model.pkl', 'wb') as f:
    pickle.dump(model, f)
print("Model saved to lightfm_model.pkl")

# Step 2: Load the model
with open('lightfm_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)
print("Model loaded successfully")

sample_recommendation(loaded_model, df_news, [4, 25, 450]) 

Model saved to lightfm_model.pkl
Model loaded successfully
User 4
     Recommended:
        Exclusivo: vídeo mostra salto de aluno de paraquedismo que terminou em morte em Boituva
        Voo de Nancy Pelosi pousa em Taiwan, e China fala de 'infração severa' à sua soberania e anuncia ações militares
        Mulher dança no TikTok para comemorar processo trabalhista e leva multa
        Anestesista preso após estupro é réu por erro de diagnóstico; paciente ficou em coma e perdeu dedão
        Ex-assessor de Gabriel Monteiro revelou que vereador fazia sexo na frente da equipe durante o trabalho
User 25
     Recommended:
        Exclusivo: vídeo mostra salto de aluno de paraquedismo que terminou em morte em Boituva
        Flavia Soares, ex-mulher de Jô Soares, lamenta morte do apresentador: 'Amor eterno'
        Voo de Nancy Pelosi pousa em Taiwan, e China fala de 'infração severa' à sua soberania e anuncia ações militares
        Corpo de João Paulo Diniz é levado para São Paulo para se