# Model recommendation with lighfm

### Import libraries

In [1]:
import os
import sys
import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import scipy
import numpy as np
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, recall_at_k
from lightfm.cross_validation import random_train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from lightfm import cross_validation
import scipy.sparse as sp
from scipy import sparse

### Defining variables

In [2]:
import json

with open('config.json', 'r') as f:
    config = json.load(f)

In [3]:
# NUM_ITEMS_TRAIN = 108573
# NUM_ITEMS_TRAIN = 63893
NUM_ITEMS_TRAIN = 231305


### Retrieve data

In [4]:
dtype_df_valid = {
"userId" : 'string',
"userType" : 'category',
"history" : 'string',
"timestampHistory" : 'string'
}

In [5]:
import pandas as pd
# path config

df_valid = pd.read_csv(config["VALID_DF"],dtype=dtype_df_valid)
df_valid.head(3)

Unnamed: 0,userId,userType,history,timestampHistory
0,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,Logged,['be89a7da-d9fa-49d4-9fdc-388c27a15bc8'  '01c5...,[1660533136590 1660672113513]
1,d0afad7ea843d86597d822f0df1d39d31a3fea7c39fdee...,Logged,['77901133-aee7-4f7b-afc0-652231d76fe9'],[1660556860253]
2,755062dd39a48809880cf363b04268c3af2c003088cde0...,Logged,['857aa90f-a7ec-410d-ba82-dfa4f85d4e71'],[1660561649242]


In [6]:
dtype_df_train_score = {
"userId" : 'string',
"userType" : 'category',
"history" : 'string',
"score" : 'Float32'
}

In [7]:
import pandas as pd

# df_ratings = pd.read_csv(config["DF_TRAIN_SCORES"], dtype=dtype_df_train_score)
df_ratings = pd.read_csv(config["DF_TRAIN_SCORES"], dtype=dtype_df_train_score, nrows=1000000)
df_ratings.drop(columns=["Unnamed: 0"],inplace=True)
df_ratings.head(3)

Unnamed: 0,userId,history,userType,score
0,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,80aa7bb2-adce-4a55-9711-912c407927a1,Non-Logged,2.216477
1,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,d9e5f15d-b441-4d8b-bee4-462b106d3916,Non-Logged,2.429501
2,17f1083e6079b0f28f7820a6803583d1c1b405c0718b11...,e273dba4-136c-45fb-bdd6-0cc57b13aaf0,Non-Logged,1.794861


In [8]:
df_news = pd.read_csv(config["DF_ITEMS_FEATURE"])
df_news.drop(columns=["Unnamed: 0"],inplace=True)
df_news.head(3)

Unnamed: 0,page,age_exp_normalized,ageCategories
0,7371a9b5-5824-4c57-8704-00a74feebe79,0.151439,very-old
1,7a5ea08f-4583-49e2-ba52-a71999443f7b,0.140788,very-old
2,6afc8bbb-4f36-43d5-8a44-a2917df5621a,0.12261,very-old


In [9]:
df_valid_new = df_valid.drop(columns="timestampHistory")

from utils.custom_treat_data_funcs import transform_text_to_list, explode_df_columns

# df_valid_new["history"] = df_valid_new["history"].apply(str)
df_valid_new["history"] = df_valid_new["history"].apply(transform_text_to_list)
# df_valid_new.head(3)
df_valid_new = df_valid_new.explode("history", ignore_index=True)
df_valid_new.head(3)

Unnamed: 0,userId,userType,history
0,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,Logged,be89a7da-d9fa-49d4-9fdc-388c27a15bc8
1,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,Logged,01c59ff6-fb82-4258-918f-2910cb2d4c52
2,d0afad7ea843d86597d822f0df1d39d31a3fea7c39fdee...,Logged,77901133-aee7-4f7b-afc0-652231d76fe9


In [10]:
df_valid_new.loc[0,"history"]

'be89a7da-d9fa-49d4-9fdc-388c27a15bc8'

In [11]:
df_valid_exploded = pd.concat([df_ratings, df_valid_new])
df_valid_exploded

Unnamed: 0,userId,history,userType,score
0,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,80aa7bb2-adce-4a55-9711-912c407927a1,Non-Logged,2.216477
1,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,d9e5f15d-b441-4d8b-bee4-462b106d3916,Non-Logged,2.429501
2,17f1083e6079b0f28f7820a6803583d1c1b405c0718b11...,e273dba4-136c-45fb-bdd6-0cc57b13aaf0,Non-Logged,1.794861
3,528a8d7a2af73101da8d6709c1ec875b449a5a58749a99...,a0562805-c7d1-4ffd-b622-87c50ae006f4,Non-Logged,1.68271
4,2dd18b58a634a4e77181a202cf152df6169dfb3e4230ef...,233f8238-2ce0-470f-a9d5-0e0ac530382a,Non-Logged,2.266852
...,...,...,...,...
178863,aacb28d7d2a4ea745e12ceba1f9ffa0c7b92aae9304ce5...,ecb1f348-cd55-47f7-99f9-bb2c84e93f96,Non-Logged,
178864,91b3a960562e34504022dbe552c0e8947af20fd14ce861...,ecb1f348-cd55-47f7-99f9-bb2c84e93f96,Non-Logged,
178865,9de95a8cd681cd21cc9a7e830859e468b83c051c11f0c4...,3d52cd6b-706e-49f0-9215-0340010a9845,Non-Logged,
178866,001455437b3ca991b144afd8cb785f7727eb0c154faa54...,d730c4a6-e8f6-4fde-b73a-afbe148479cd,Non-Logged,


In [12]:
# duplicates = df_valid_exploded[df_valid_exploded.duplicated(subset=['userId','history'], keep=False)]
# duplicates = duplicates[duplicates["score"].isna()]
duplicates = df_valid_exploded[df_valid_exploded.duplicated(subset=['userId','history'], keep='first')]
duplicates
# duplicates = df_valid_exploded[df_valid_exploded.duplicated(subset=['history'], keep=False)]
# duplicates[duplicates["history"]=='253339a1-92b6-44d1-8fa2-59236c5251b1']
# duplicates[duplicates["userId"]=='f0e758359a184c99912e1ad5fc912b92b5e7b63c7a6002']

Unnamed: 0,userId,history,userType,score
75,9107ae20aa08ea8e46e27bb83801c8a77696a7499a2b6e...,c1c1e207-3a8c-4074-8973-56bdfed265b5,Logged,
99,e8f9ad90314928d72e21b7994c200cf01148d7f1779845...,b2a21491-7576-4be0-a520-97ce9470efeb,Logged,
151,86e84a6997cb9b6b72d226cf892d57e83a96216443f7aa...,esid:conteudo_editorial_g1#materia#https://esp...,Logged,
152,86e84a6997cb9b6b72d226cf892d57e83a96216443f7aa...,esid:conteudo_editorial_g1#materia#http://espe...,Logged,
287,fbcb5d638da9f894b86276d6a9957db67505b4a4b8fbd1...,bbdb1500-3b2c-407b-93e5-dff4d7f3d497,Logged,
...,...,...,...,...
178739,50fea75c26f3696a8d568735df3caef1fb7056fe444a48...,esid:conteudo_editorial_g1#materia#https://esp...,Non-Logged,
178765,dbdaa92a45877acbe933fcd5615abaae9094eda22c76ab...,00876641-3e48-4247-9d1c-22384270c1e0,Non-Logged,
178793,3b0a6ec5c08ac7605facf4c69ea36729ef58451ff66b70...,facb1836-1fe2-454f-8537-84d0b6875236,Non-Logged,
178794,dbbb10db744577c8d84d331a5c06d03a01b790f1e764bf...,af3506e9-7b91-4620-9ec8-7daa8a6f4877,Non-Logged,


In [13]:
df_valid_cleaned = df_valid_exploded.drop_duplicates(subset=['userId','history'], keep="first")
df_valid_cleaned

Unnamed: 0,userId,history,userType,score
0,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,80aa7bb2-adce-4a55-9711-912c407927a1,Non-Logged,2.216477
1,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,d9e5f15d-b441-4d8b-bee4-462b106d3916,Non-Logged,2.429501
2,17f1083e6079b0f28f7820a6803583d1c1b405c0718b11...,e273dba4-136c-45fb-bdd6-0cc57b13aaf0,Non-Logged,1.794861
3,528a8d7a2af73101da8d6709c1ec875b449a5a58749a99...,a0562805-c7d1-4ffd-b622-87c50ae006f4,Non-Logged,1.68271
4,2dd18b58a634a4e77181a202cf152df6169dfb3e4230ef...,233f8238-2ce0-470f-a9d5-0e0ac530382a,Non-Logged,2.266852
...,...,...,...,...
178863,aacb28d7d2a4ea745e12ceba1f9ffa0c7b92aae9304ce5...,ecb1f348-cd55-47f7-99f9-bb2c84e93f96,Non-Logged,
178864,91b3a960562e34504022dbe552c0e8947af20fd14ce861...,ecb1f348-cd55-47f7-99f9-bb2c84e93f96,Non-Logged,
178865,9de95a8cd681cd21cc9a7e830859e468b83c051c11f0c4...,3d52cd6b-706e-49f0-9215-0340010a9845,Non-Logged,
178866,001455437b3ca991b144afd8cb785f7727eb0c154faa54...,d730c4a6-e8f6-4fde-b73a-afbe148479cd,Non-Logged,


In [14]:
df_valid_cleaned["score"].fillna(0,inplace=True)
df_valid_cleaned

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_valid_cleaned["score"].fillna(0,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_valid_cleaned["score"].fillna(0,inplace=True)


Unnamed: 0,userId,history,userType,score
0,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,80aa7bb2-adce-4a55-9711-912c407927a1,Non-Logged,2.216477
1,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,d9e5f15d-b441-4d8b-bee4-462b106d3916,Non-Logged,2.429501
2,17f1083e6079b0f28f7820a6803583d1c1b405c0718b11...,e273dba4-136c-45fb-bdd6-0cc57b13aaf0,Non-Logged,1.794861
3,528a8d7a2af73101da8d6709c1ec875b449a5a58749a99...,a0562805-c7d1-4ffd-b622-87c50ae006f4,Non-Logged,1.68271
4,2dd18b58a634a4e77181a202cf152df6169dfb3e4230ef...,233f8238-2ce0-470f-a9d5-0e0ac530382a,Non-Logged,2.266852
...,...,...,...,...
178863,aacb28d7d2a4ea745e12ceba1f9ffa0c7b92aae9304ce5...,ecb1f348-cd55-47f7-99f9-bb2c84e93f96,Non-Logged,0.0
178864,91b3a960562e34504022dbe552c0e8947af20fd14ce861...,ecb1f348-cd55-47f7-99f9-bb2c84e93f96,Non-Logged,0.0
178865,9de95a8cd681cd21cc9a7e830859e468b83c051c11f0c4...,3d52cd6b-706e-49f0-9215-0340010a9845,Non-Logged,0.0
178866,001455437b3ca991b144afd8cb785f7727eb0c154faa54...,d730c4a6-e8f6-4fde-b73a-afbe148479cd,Non-Logged,0.0


In [15]:
dataset = Dataset()

# Get unique values for users, items, and user features
unique_users = df_valid_cleaned["userId"].unique()
unique_items = df_valid_cleaned["history"].unique()
unique_user_features = df_valid_cleaned["userType"].unique().tolist()

# Fit dataset with users, items, and user feature names
dataset.fit(
    users=unique_users,
    items=unique_items,
    user_features=unique_user_features  # Register user features
)

In [16]:
(interactions, weights) = dataset.build_interactions([
    (row.userId, row.history) 
    for _, row in df_valid_cleaned.iterrows()
])

In [17]:
user_features_list = [
    (row.userId, [row.userType])  
    for _, row in df_valid_cleaned.iterrows()
]

user_features = dataset.build_user_features(user_features_list)

In [18]:
import pickle

loaded_model = pickle.load(open('artifacts/lightfm_model.pkl', 'rb'))
# loaded_user_id_map = pickle.load(open('artifacts/user_id_map.pkl', 'rb'))
# loaded_item_id_map_reverse = pickle.load(open('artifacts/item_id_map_reverse.pkl', 'rb'))
# loaded_user_feature_map = pickle.load(open('artifacts/user_feature_map.pkl', 'rb'))

In [19]:
user_id_map, user_feature_map, item_id_map, item_feature_map = dataset.mapping()
item_id_map_reverse = {v: k for k, v in item_id_map.items()}

loaded_user_id_map = user_id_map
loaded_item_id_map_reverse = item_id_map_reverse
loaded_user_feature_map = user_feature_map

In [20]:
_, n_items = interactions.shape # no of users * no of items

n_items

102540

### Make predictions to known and unknowm on same recommendation function with pkls

In [21]:
def format_newuser_input(user_feature_map, user_feature_list):
  normalised_val = 1.0 
  target_indices = []
  for feature in user_feature_list:
    try:
        target_indices.append(user_feature_map[feature])
    except KeyError:
        print("new user feature encountered '{}'".format(feature))
        pass
  #print("target indices: {}".format(target_indices))
  new_user_features = np.zeros(len(user_feature_map.keys()))
  for i in target_indices:
    new_user_features[i] = normalised_val
  new_user_features = sparse.csr_matrix(new_user_features)
  return(new_user_features)

In [22]:
def sample_recommendation_by_title(user_hash,df_news,user_feature_list,item_id_map_reverse,user_feature_map,user_id_map,model):
    try:
        user_x = user_id_map[user_hash]
        scores = model.predict(user_x, np.arange(n_items)) # means predict for all
    except:
        new_user_features = format_newuser_input(user_feature_map, user_feature_list)
        scores = model.predict(0, np.arange(n_items), user_features=new_user_features)
    
    top_5_indices = np.argsort(-scores)[:10]  # Sort scores in descending order and take the top 5
    top_5_items = [item_id_map_reverse[i] for i in top_5_indices]

    print("Top 5 recommended items:")

    for x in top_5_items:
        row = df_news[df_news["page"] == x]
        print("        %s" % row["title"].values[0])


In [23]:
# predict for known user
# df_valid["userId"].iloc[0]
user_feature_list = [df_valid["userType"].iloc[0]]
user_hash = df_valid["userId"].iloc[0]

# sample_recommendation_by_title(user_hash,df_news,user_feature_list,loaded_item_id_map_reverse,loaded_user_feature_map,loaded_user_id_map,loaded_model)

In [24]:
def get_recommended_history_list(user_hash,user_feature_list,item_id_map_reverse,user_feature_map,user_id_map,model):
    """
    This function verifies if the users is known or new, and makes recommendations depending on this verification.
    The top 5 recommendations from the list are returned.
    """
    try:
        user_x = user_id_map[user_hash]
        scores = model.predict(user_x, np.arange(n_items)) # means predict for all
    except:
        new_user_features = format_newuser_input(user_feature_map, user_feature_list)
        scores = model.predict(0, np.arange(n_items), user_features=new_user_features)
    
    top_5_indices = np.argsort(-scores)[:5]  # Sort scores in descending order and take the top 5
    top_5_items = [item_id_map_reverse[i] for i in top_5_indices]

    return top_5_items

In [25]:
from utils.custom_treat_data_funcs import transform_text_to_list

# Transform the single string with histories/items into a list of strings

df_valid["history"] = df_valid["history"].apply(transform_text_to_list)
df_valid

Unnamed: 0,userId,userType,history,timestampHistory
0,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,Logged,"[be89a7da-d9fa-49d4-9fdc-388c27a15bc8, 01c59ff...",[1660533136590 1660672113513]
1,d0afad7ea843d86597d822f0df1d39d31a3fea7c39fdee...,Logged,[77901133-aee7-4f7b-afc0-652231d76fe9],[1660556860253]
2,755062dd39a48809880cf363b04268c3af2c003088cde0...,Logged,[857aa90f-a7ec-410d-ba82-dfa4f85d4e71],[1660561649242]
3,ec1639851d99586c7f4da928deb49187303aec6e3b8d66...,Logged,"[b7b90e18-7613-4ca0-a8fc-fd69addfcd85, 835fdd8...",[1660533830245 1660540831707 1660542659111 166...
4,a120515626fe5d12b22b7d5a7c5008912cc69284aa26cc...,Logged,"[9c764c3a-f9f8-4fb2-b2c4-6331eaeb3dd6, b8eba39...",[1660548813953 1660572329731 1660594848200]
...,...,...,...,...
112179,aacb28d7d2a4ea745e12ceba1f9ffa0c7b92aae9304ce5...,Non-Logged,[ecb1f348-cd55-47f7-99f9-bb2c84e93f96],[1660546612592]
112180,91b3a960562e34504022dbe552c0e8947af20fd14ce861...,Non-Logged,[ecb1f348-cd55-47f7-99f9-bb2c84e93f96],[1660597026440]
112181,9de95a8cd681cd21cc9a7e830859e468b83c051c11f0c4...,Non-Logged,[3d52cd6b-706e-49f0-9215-0340010a9845],[1660678862844]
112182,001455437b3ca991b144afd8cb785f7727eb0c154faa54...,Non-Logged,[d730c4a6-e8f6-4fde-b73a-afbe148479cd],[1660584228926]


In [26]:
# Testing for just one user

user_feature_list = [df_valid["userType"].iloc[0]]
user_hash = df_valid["userId"].iloc[0]
validation_history_hashes = df_valid["history"].iloc[0]

recommeded_histories = get_recommended_history_list(user_hash,user_feature_list,loaded_item_id_map_reverse,loaded_user_feature_map,loaded_user_id_map,loaded_model)
print(recommeded_histories)
print(validation_history_hashes)

['bb83f87b-f663-459a-bbf4-2a61ec342daa', 'a9e186f0-b5fb-4427-888c-8452c5b18197', '710be344-2bd0-41e7-b7cf-fe33fa348145', '15672858-2de0-4166-a262-a00eec119ba4', 'b25d07a5-2bc7-4d9c-b7a1-564b3131eaae']
['be89a7da-d9fa-49d4-9fdc-388c27a15bc8', '01c59ff6-fb82-4258-918f-2910cb2d4c52']


In [27]:
def count_valid_recommendations(validation_history_hashes, recommeded_histories):
    """
    This function receives 
    * The `validation_history_hashes` (which is a list of histories contained in the "validacao.csv")
    * The `recommeded_histories` (which are the recommended histories/items by the model)
    Then, it verifies how many recommended items match with the validation histories.
    """
    count_valids = 0
    for valid_hist in validation_history_hashes:
        if valid_hist in recommeded_histories:
            count_valids = count_valids+1
    return count_valids

In [28]:
df_valid["recommended_hists"] = "" # creates a column to receive the list of recommended histories
df_valid["matched_recommendations"] = 0 # creates a column to receive the quantity of matches between recommendations and validation items
df_valid["historySize"] = df_valid["history"].apply(lambda x : len(x)) # calculate the quantity of histories for each user

In [29]:
df_valid.head(2)

Unnamed: 0,userId,userType,history,timestampHistory,recommended_hists,matched_recommendations,historySize
0,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,Logged,"[be89a7da-d9fa-49d4-9fdc-388c27a15bc8, 01c59ff...",[1660533136590 1660672113513],,0,2
1,d0afad7ea843d86597d822f0df1d39d31a3fea7c39fdee...,Logged,[77901133-aee7-4f7b-afc0-652231d76fe9],[1660556860253],,0,1


In [30]:
for index, row in df_valid.iterrows():
    """
    For each user, get the top 5 recommendations from the model, and also count how many of them are contained within the validation set.
    Stores the results on the columns `recommended_hists` `matched_recommendations`.
    """
    user_feature_list = [row["userType"]]
    user_hash = row["userId"]
    validation_history_hashes = row["history"]

    recommended_hist = get_recommended_history_list(user_hash,user_feature_list,loaded_item_id_map_reverse,loaded_user_feature_map,loaded_user_id_map,loaded_model)
    num_valid_recommendations = count_valid_recommendations(validation_history_hashes, recommended_hist)

    row["recommended_hists"] = recommended_hist
    row["matched_recommendations"] = num_valid_recommendations

In [31]:
df_valid.head(2)

Unnamed: 0,userId,userType,history,timestampHistory,recommended_hists,matched_recommendations,historySize
0,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,Logged,"[be89a7da-d9fa-49d4-9fdc-388c27a15bc8, 01c59ff...",[1660533136590 1660672113513],,0,2
1,d0afad7ea843d86597d822f0df1d39d31a3fea7c39fdee...,Logged,[77901133-aee7-4f7b-afc0-652231d76fe9],[1660556860253],,0,1


In [32]:
df_valid.describe()

Unnamed: 0,matched_recommendations,historySize
count,112184.0,112184.0
mean,0.0,1.594416
std,0.0,1.055817
min,0.0,1.0
25%,0.0,1.0
50%,0.0,1.0
75%,0.0,2.0
max,0.0,5.0
