### 2.1 Import libraries

In [2]:
import os
import sys
import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import scipy
import numpy as np
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, recall_at_k
from lightfm.cross_validation import random_train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from lightfm import cross_validation

### 2.2 Defining variables

In [3]:
# Select MovieLens data size
MOVIELENS_DATA_SIZE = '100k'

# default number of recommendations
K = 10
# percentage of data used for testing
TEST_PERCENTAGE = 0.25
# model learning rate
LEARNING_RATE = 0.25
# no of latent factors
NO_COMPONENTS = 20
# no of epochs to fit model
NO_EPOCHS = 20
# no of threads to fit model
NO_THREADS = 32
# regularisation for both user and item features
ITEM_ALPHA = 1e-6
USER_ALPHA = 1e-6

# seed for pseudonumber generations
SEED = 42

### 2.2 Retrieve data

In [4]:
import pandas as pd
# path config
data_path = '/home/raiane/Documentos/Projetos/tech_challenge_5/data/training_data_v2.csv'
df_ratings = pd.read_csv(
    data_path)

In [5]:
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1426291 entries, 0 to 1426290
Data columns (total 7 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   Unnamed: 0               1426291 non-null  int64  
 1   userId                   1426291 non-null  object 
 2   history                  1426291 non-null  object 
 3   numberOfClicksHistory    1426291 non-null  int64  
 4   timeOnPageHistory        1426291 non-null  int64  
 5   scrollPercentageHistory  1426291 non-null  float64
 6   pageVisitsCountHistory   1426291 non-null  int64  
dtypes: float64(1), int64(4), object(2)
memory usage: 76.2+ MB


In [6]:
#considerando apenas noticias com mais de 30 cliques
clicks_counts = pd.DataFrame(df_ratings["history"].value_counts())
rare_news = clicks_counts[df_ratings["history"].value_counts() <= 500].index
common_news = df_ratings[~df_ratings["history"].isin(rare_news)]

In [8]:
common_news.head()

Unnamed: 0.1,Unnamed: 0,userId,history,numberOfClicksHistory,timeOnPageHistory,scrollPercentageHistory,pageVisitsCountHistory
4,1,2c1080975e257ed630e26679edbe4d5c850c65f3e09f65...,fe856057-f97d-419f-ab1c-97c5c3e0719c,80,210489,45.66,1
33,1,2c1080975e257ed630e26679edbe4d5c850c65f3e09f65...,e3cdb277-ad80-4025-b5de-cd98cc67d23d,16,59188,42.53,1
64,2,0adffd7450d3b9840d8c6215f0569ad942e782fb19b805...,29b6b142-4173-4ec4-832f-7d0a32255c10,0,140000,47.22,1
66,2,0adffd7450d3b9840d8c6215f0569ad942e782fb19b805...,1f32787b-de2b-49be-8c20-ddaeae34cc22,0,157018,63.09,1
71,2,0adffd7450d3b9840d8c6215f0569ad942e782fb19b805...,a36c98b5-f159-48f8-9f5a-1fc6ea9956c8,2,166793,44.52,2


In [9]:
df_news = pd.read_csv("../data/noticias_final.csv")

In [10]:
df_news.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,page,url,issued,modified,title,body,caption,cleaned_title,cleaned_caption,combined_text,cleaned_body,cluster,word_count,data,idade_em_dias,peso,peso_ajustado
0,0,0,13db0ab1-eea2-4603-84c4-f40a876c7400,http://g1.globo.com/am/amazonas/noticia/2022/0...,2022-06-18 20:37:45+00:00,2023-04-15 00:02:08+00:00,Caso Bruno e Dom: 3º suspeito tem prisão tempo...,"Após audiência de custódia, a Justiça do Amazo...",Jeferson da Silva Lima foi escoltado por agent...,caso bruno dom suspeito prisão temporária decr...,jeferson silva lima escoltado agentes polícia ...,caso bruno dom suspeito prisão temporária decr...,audiência custódia justiça amazonas decretou s...,2,704,2022-06-18 20:37:45+00:00,57,0.1495686,0.234612
1,1,1,92907b73-5cd3-4184-8d8c-e206aed2bf1c,http://g1.globo.com/pa/santarem-regiao/noticia...,2019-06-20 17:19:52+00:00,2023-06-16 20:19:15+00:00,Linguajar dos santarenos é diferenciado e chei...,Vista aérea de Santarém\nÁdrio Denner/ AD Prod...,As expressões santarenas não significam apenas...,linguajar santarenos diferenciado cheio identi...,expressões santarenas significam significam co...,linguajar santarenos diferenciado cheio identi...,vista aérea santarém ádrio ad produções paraen...,0,728,2019-06-20 17:19:52+00:00,1151,2.175547e-17,0.1
2,2,2,61e07f64-cddf-46f2-b50c-ea0a39c22050,http://g1.globo.com/mundo/noticia/2022/07/08/e...,2022-07-08 08:55:52+00:00,2023-04-15 04:25:39+00:00,Ex-premiê Shinzo Abe morre após ser baleado no...,Novo vídeo mostra que assassino de Shinzo Abe ...,Ex-primeiro-ministro foi atingido por tiros de...,shinzo abe morre baleado japão,atingido tiros espingarda caseira discursava c...,shinzo abe morre baleado japão atingido tiros ...,vídeo mostra assassino shinzo abe atirou costa...,2,1315,2022-07-08 08:55:52+00:00,37,0.2913199,0.362188
3,3,3,30e2e6c5-554a-48ed-a35f-6c6691c8ac9b,http://g1.globo.com/politica/noticia/2021/09/0...,2021-09-09 19:06:46+00:00,2023-06-07 17:44:54+00:00,"Relator no STF, Fachin vota contra marco tempo...","Relator no STF, Fachin vota contra marco tempo...",Ministro defendeu que posse indígena é diferen...,relator stf fachin vota marco temporal demarca...,ministro defendeu posse indígena diferente pos...,relator stf fachin vota marco temporal demarca...,relator stf fachin vota marco temporal demarca...,0,611,2021-09-09 19:06:46+00:00,339,1.237292e-05,0.100011
4,4,4,9dff71eb-b681-40c7-ac8d-68017ac36675,http://g1.globo.com/politica/noticia/2021/09/1...,2021-09-15 19:16:13+00:00,2023-06-07 17:43:39+00:00,"\nApós 2 votos, pedido de vista suspende julga...",Após um pedido de vista (mais tempo para análi...,"Pelo marco temporal, índios só podem reivindic...",votos pedido vista suspende julgamento stf dem...,marco temporal índios reivindicar demarcação t...,votos pedido vista suspende julgamento stf dem...,pedido vista análise processo ministro alexand...,0,506,2021-09-15 19:16:13+00:00,333,1.511232e-05,0.100014


In [11]:
df_merged = pd.merge(common_news,df_news, left_on='history', right_on='page', how='left')

In [12]:
df_merged.head()

Unnamed: 0,Unnamed: 0_x,userId,history,numberOfClicksHistory,timeOnPageHistory,scrollPercentageHistory,pageVisitsCountHistory,Unnamed: 0.1,Unnamed: 0_y,page,...,cleaned_title,cleaned_caption,combined_text,cleaned_body,cluster,word_count,data,idade_em_dias,peso,peso_ajustado
0,1,2c1080975e257ed630e26679edbe4d5c850c65f3e09f65...,fe856057-f97d-419f-ab1c-97c5c3e0719c,80,210489,45.66,1,186368,186368,fe856057-f97d-419f-ab1c-97c5c3e0719c,...,designer sobrancelhas viraliza web pegadinha c...,vídeo publicado geizielle ferreira mendes mora...,designer sobrancelhas viraliza web pegadinha c...,designer sobrancelhas viraliza web pegadinha c...,0,389,2022-08-14 11:39:11+00:00,0,1.0,1.0
1,1,2c1080975e257ed630e26679edbe4d5c850c65f3e09f65...,e3cdb277-ad80-4025-b5de-cd98cc67d23d,16,59188,42.53,1,199434,199434,e3cdb277-ad80-4025-b5de-cd98cc67d23d,...,instrutor academia araras morre baleado pai es...,nelson ré soares anos resistiu ferimentos auto...,instrutor academia araras morre baleado pai es...,instrutor nelson ré soares anos resistiu ferim...,2,528,2022-07-23 16:17:25+00:00,22,0.480305,0.532275
2,2,0adffd7450d3b9840d8c6215f0569ad942e782fb19b805...,29b6b142-4173-4ec4-832f-7d0a32255c10,0,140000,47.22,1,105647,105647,29b6b142-4173-4ec4-832f-7d0a32255c10,...,sabine boghici presa golpe milionário mãe herd...,mulher contratou pessoas passaram videntes mãe...,sabine boghici presa golpe milionário mãe herd...,sabine boghici sabine boghici mulher presa gol...,2,908,2022-08-10 21:21:02+00:00,4,0.875173,0.887656
3,2,0adffd7450d3b9840d8c6215f0569ad942e782fb19b805...,1f32787b-de2b-49be-8c20-ddaeae34cc22,0,157018,63.09,1,60121,60121,1f32787b-de2b-49be-8c20-ddaeae34cc22,...,filha presa golpe estimado milhões mãe quadros...,investigações filha contratou pessoas passaram...,filha presa golpe estimado milhões mãe quadros...,filha presa golpe estimado milhões mãe atriz h...,0,889,2022-08-10 09:55:29+00:00,4,0.875173,0.887656
4,2,0adffd7450d3b9840d8c6215f0569ad942e782fb19b805...,a36c98b5-f159-48f8-9f5a-1fc6ea9956c8,2,166793,44.52,2,116040,116040,a36c98b5-f159-48f8-9f5a-1fc6ea9956c8,...,campeão mundial leandro lo baleado cabeça dura...,advogado família confirmada morte cerebral atl...,campeão mundial leandro lo baleado cabeça dura...,campeão mundial leandro lo baleado cabeça paul...,2,640,2022-08-07 12:37:58+00:00,7,0.79189,0.812701


In [13]:
import numpy as np
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset

# Convert to strings for LightFM compatibility
df_merged["cluster"] = "cluster:" + df_merged["cluster"].astype(str)  # Convert cluster to string feature

# Bin word_count into categories (as per previous request)
df_merged["word_count_bin"] = pd.cut(
    df_merged["word_count"],
    bins=[0, 100, 500, 1000, np.inf],  
    labels=["low", "medium", "high", "very_high"]
).astype(str)  # Convert to string
df_merged["word_count_bin"] = "word_count:" + df_merged["word_count_bin"]


In [77]:
df_merged = df_merged[["userId","history","numberOfClicksHistory","cluster","word_count_bin","peso_ajustado"]]

In [78]:
df_merged.head()

Unnamed: 0,userId,history,numberOfClicksHistory,cluster,word_count_bin,peso_ajustado
0,2c1080975e257ed630e26679edbe4d5c850c65f3e09f65...,fe856057-f97d-419f-ab1c-97c5c3e0719c,80,cluster:0,word_count:medium,1.0
1,2c1080975e257ed630e26679edbe4d5c850c65f3e09f65...,e3cdb277-ad80-4025-b5de-cd98cc67d23d,16,cluster:2,word_count:high,0.532275
2,0adffd7450d3b9840d8c6215f0569ad942e782fb19b805...,29b6b142-4173-4ec4-832f-7d0a32255c10,0,cluster:2,word_count:high,0.887656
3,0adffd7450d3b9840d8c6215f0569ad942e782fb19b805...,1f32787b-de2b-49be-8c20-ddaeae34cc22,0,cluster:0,word_count:high,0.887656
4,0adffd7450d3b9840d8c6215f0569ad942e782fb19b805...,a36c98b5-f159-48f8-9f5a-1fc6ea9956c8,2,cluster:2,word_count:high,0.812701


### 2.3 Prepare data

Before fitting the LightFM model, we need to create an instance of `Dataset` which holds the interaction matrix.

In [14]:
dataset = Dataset()

# Get unique values for users, items, and user features
unique_users = df_merged["userId"].unique()
unique_items = df_merged["history"].unique()
unique_user_features = df_merged["cluster"].unique().tolist() + df_merged["word_count_bin"].unique().tolist()

# Fit dataset with users, items, and user feature names
dataset.fit(
    users=unique_users,
    items=unique_items,
    user_features=unique_user_features  # Register user features (cluster & word_count_bin)
)


In [16]:
(interactions, weights) = dataset.build_interactions([
    (row.userId, row.history, row.peso_ajustado) 
    for _, row in df_merged.iterrows()
])


In [20]:
user_features_list = [
    (row.userId, [row.cluster, row.word_count_bin])  
    for _, row in df_merged.iterrows()
]

user_features = dataset.build_user_features(user_features_list)


LightLM works slightly differently compared to other packages as it expects the train and test sets to have same dimension. Therefore the conventional train test split will not work.

The package has included the `cross_validation.random_train_test_split` method to split the interaction data and splits it into two disjoint training and test sets. 

However, note that **it does not validate the interactions in the test set to guarantee all items and users have historical interactions in the training set**. Therefore this may result into a partial cold-start problem in the test set.

In [21]:
import scipy.sparse as sp
# Split train and test sets (80/20 split)
train, test = random_train_test_split(interactions, test_percentage=0.2, random_state=42)

train_weights, test_weights  = random_train_test_split(weights, test_percentage=0.2, random_state=42)


In [22]:
train_weights.shape

(34107, 536)

Double check the size of both the train and test sets.

In [23]:
print(f"Shape of train interactions: {train.shape}")
print(f"Shape of test interactions: {test.shape}")

Shape of train interactions: (34107, 536)
Shape of test interactions: (34107, 536)


### 2.4 Fit the LightFM model

In this notebook, the LightFM model will be using the weighted Approximate-Rank Pairwise (WARP) as the loss. Further explanation on the topic can be found [here](https://making.lyst.com/lightfm/docs/examples/warp_loss.html#learning-to-rank-using-the-warp-loss).


In general, it maximises the rank of positive examples by repeatedly sampling negative examples until a rank violation has been located. This approach is recommended when only positive interactions are present.

The LightFM model can be fitted with the following code:

In [24]:
# model = LightFM(loss='warp', no_components=NO_COMPONENTS, 
#                  learning_rate=LEARNING_RATE,                 
                 
#                  random_state=np.random.RandomState(SEED))

model = LightFM(loss="warp",learning_rate=LEARNING_RATE)  # Weighted Approximate-Rank Pairwise (WARP) loss
model.fit(train, sample_weight=train_weights, epochs=NO_EPOCHS, num_threads=4, user_features=user_features)


<lightfm.lightfm.LightFM at 0x71f50a383970>

### 2.4 Evaluate model

In [25]:
# Import the evaluation routines
from lightfm.evaluation import auc_score

# Compute evaluation metrics
auc_test = auc_score(model, test, user_features=user_features).mean()
auc_train = auc_score(model, train, user_features=user_features).mean()

# Print evaluation results
print(f"AUC test Score: {auc_test:.4f}")
print(f"AUC train Score: {auc_train:.4f}")


AUC test Score: 0.7170
AUC train Score: 0.8343


### 2.6 Make predictions known users

In [30]:
user_id_map, user_feature_map, item_id_map, item_feature_map = dataset.mapping()

In [47]:
item_id_map_reverse = {v: k for k, v in item_id_map.items()}


In [73]:
def sample_recommendation(user_x,n_items,df_news):
    scores = model.predict(user_x, np.arange(n_items)) # means predict for all
    top_5_indices = np.argsort(-scores)[:5]  # Sort scores in descending order and take the top 5
    top_5_items = [item_id_map_reverse[i] for i in top_5_indices]

    print("Top 5 recommended items:")

    for x in top_5_items:
        row = df_news[df_news["page"] == x]
        print("        %s" % row["title"].values[0])

In [79]:
# predict for existing user
user_x = user_id_map['5f5e17781fc2ec0ddcfb2e9356e61c5d3d4b0b3c8fabd20917feb9e807463856']
_, n_items = interactions.shape # no of users * no of items

sample_recommendation(user_x,n_items,df_news)

Top 5 recommended items:
        Onda atinge grupo que observava surfistas e deixa 8 feridos em Itacoatiara, Niterói
        Google vai distribuir 500 mil bolsas de estudos para cursos em tecnologia; veja como se candidatar
        Celulares com 5G: veja a lista de aparelhos homologados pela Anatel
        Datafolha: Lula tem 47% no primeiro turno, contra 29% de Bolsonaro
        IBGE lança novo concurso com 15 mil vagas para recenseador


### 2.6 Make predictions unknown users

In [38]:
from scipy import sparse

def format_newuser_input(user_feature_map, user_feature_list):
  #user_feature_map = user_feature_map  
  num_features = len(user_feature_list)
  normalised_val = 1.0 
  target_indices = []
  for feature in user_feature_list:
    try:
        target_indices.append(user_feature_map[feature])
    except KeyError:
        print("new user feature encountered '{}'".format(feature))
        pass
  #print("target indices: {}".format(target_indices))
  new_user_features = np.zeros(len(user_feature_map.keys()))
  for i in target_indices:
    new_user_features[i] = normalised_val
  new_user_features = sparse.csr_matrix(new_user_features)
  return(new_user_features)

In [39]:
# predict for new user
user_feature_list = ['cluster:0', 'word_count_bin:high']

In [40]:
new_user_features = format_newuser_input(user_feature_map, user_feature_list)

new user feature encountered 'word_count_bin:high'


In [75]:
new_user_features

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1 stored elements and shape (1, 34113)>

In [61]:

model.predict(0, np.arange(n_items), user_features=new_user_features) # Here 0 means pick the first row of the user_features sparse matrix

array([-1240.7048, -1243.9634, -1240.8347, -1238.1718, -1241.3679,
       -1243.3411, -1245.6151, -1244.4965, -1244.4512, -1243.4095,
       -1240.3734, -1240.0743, -1249.2484, -1240.5262, -1235.3912,
       -1243.042 , -1246.784 , -1246.8486, -1235.9092, -1239.4435,
       -1237.7383, -1242.7742, -1242.5287, -1236.6522, -1237.8728,
       -1245.5396, -1235.3964, -1237.065 , -1247.5812, -1238.6405,
       -1239.5883, -1239.3997, -1243.0555, -1241.3712, -1243.3131,
       -1237.2103, -1237.5797, -1241.3918, -1243.6853, -1239.8214,
       -1243.6676, -1246.6827, -1241.0176, -1242.148 , -1239.2128,
       -1242.9564, -1246.3877, -1237.1011, -1242.9227, -1240.4587,
       -1243.6687, -1241.1655, -1243.1774, -1240.904 , -1240.4133,
       -1242.0392, -1241.7694, -1238.1925, -1238.4968, -1244.7991,
       -1242.5426, -1242.4398, -1238.0922, -1242.7742, -1246.7732,
       -1239.1377, -1243.3821, -1234.4337, -1241.1188, -1235.9014,
       -1236.7063, -1238.6431, -1235.0616, -1237.855 , -1239.9