# Model recommendation with lighfm

### Import libraries

In [1]:
import os
import sys
import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import scipy
import numpy as np
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, recall_at_k
from lightfm.cross_validation import random_train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from lightfm import cross_validation
import scipy.sparse as sp
from scipy import sparse

### Defining variables

In [2]:
import json

with open('config.json', 'r') as f:
    config = json.load(f)

In [3]:
# percentage of data used for testing
TEST_PERCENTAGE = 0.25
# model learning rate
LEARNING_RATE = 0.25
# no of epochs to fit model
NO_EPOCHS = 20

# seed for pseudonumber generations
SEED = 42

### Retrieve data

In [4]:
import pandas as pd
# path config

df_valid = pd.read_csv(config["VALID_DF"])
df_valid

Unnamed: 0,userId,userType,history,timestampHistory
0,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,Logged,['be89a7da-d9fa-49d4-9fdc-388c27a15bc8'\n '01c...,[1660533136590 1660672113513]
1,d0afad7ea843d86597d822f0df1d39d31a3fea7c39fdee...,Logged,['77901133-aee7-4f7b-afc0-652231d76fe9'],[1660556860253]
2,755062dd39a48809880cf363b04268c3af2c003088cde0...,Logged,['857aa90f-a7ec-410d-ba82-dfa4f85d4e71'],[1660561649242]
3,ec1639851d99586c7f4da928deb49187303aec6e3b8d66...,Logged,['b7b90e18-7613-4ca0-a8fc-fd69addfcd85'\n '835...,[1660533830245 1660540831707 1660542659111 166...
4,a120515626fe5d12b22b7d5a7c5008912cc69284aa26cc...,Logged,['9c764c3a-f9f8-4fb2-b2c4-6331eaeb3dd6'\n 'b8e...,[1660548813953 1660572329731 1660594848200]
...,...,...,...,...
112179,aacb28d7d2a4ea745e12ceba1f9ffa0c7b92aae9304ce5...,Non-Logged,['ecb1f348-cd55-47f7-99f9-bb2c84e93f96'],[1660546612592]
112180,91b3a960562e34504022dbe552c0e8947af20fd14ce861...,Non-Logged,['ecb1f348-cd55-47f7-99f9-bb2c84e93f96'],[1660597026440]
112181,9de95a8cd681cd21cc9a7e830859e468b83c051c11f0c4...,Non-Logged,['3d52cd6b-706e-49f0-9215-0340010a9845'],[1660678862844]
112182,001455437b3ca991b144afd8cb785f7727eb0c154faa54...,Non-Logged,['d730c4a6-e8f6-4fde-b73a-afbe148479cd'],[1660584228926]


In [10]:
df_news = pd.read_csv(config["DF_ITEMS_CLUSTERED_FEATURES_ADJ"])
df_news.drop(columns=["Unnamed: 0"],inplace=True)
df_news.head(3)

Unnamed: 0,page,url,issued,modified,title,body,caption,cleaned_body,cleaned_title,cleaned_caption,combined_text,cluster,word_count,data,idade_em_dias,peso,peso_ajustado
0,7371a9b5-5824-4c57-8704-00a74feebe79,http://g1.globo.com/al/alagoas/noticia/2018/09...,2018-09-13 14:52:55+00:00,2018-09-14 16:14:49+00:00,Corpo de motorista da Uber é encontrado em can...,Corpo de motorista de aplicativo desaparecido ...,"Segundo a polícia, Antônio Vitor foi solicitad...",corpo motorista aplicativo desaparecido encont...,corpo motorista uber encontrado canavial rotei...,polícia antônio vitor solicitado corridas desa...,corpo motorista uber encontrado canavial rotei...,0,234,2018-09-13 14:52:55+00:00,1431,0.053698,0.148328
1,7a5ea08f-4583-49e2-ba52-a71999443f7b,http://g1.globo.com/am/amazonas/noticia/detent...,2018-05-20 20:42:40+00:00,2018-05-20 20:42:40+00:00,Detento recapturado após fuga por túnel volta ...,Detento disse que passou nome falso ao dar ent...,Ele tinha registro em presídio com nome falso....,detento disse passou nome falso entrada presíd...,detento recapturado fuga túnel volta cdpm manaus,tinha registro presídio nome falso presos esca...,detento recapturado fuga túnel volta cdpm mana...,0,345,2018-05-20 20:42:40+00:00,1547,0.042365,0.138128
2,6afc8bbb-4f36-43d5-8a44-a2917df5621a,http://g1.globo.com/ap/amapa/noticia/audios-mo...,2017-07-30 00:37:17+00:00,2017-07-30 00:48:42+00:00,Áudios mostram conversa entre bandidos durante...,Áudios mostram possível conversa entre bandido...,Revista realizada na sexta-feira (28) no Iapen...,áudios mostram conversa bandidos tentativa fug...,áudios mostram conversa bandidos durante tenta...,revista realizada iapen apreendeu celulares dr...,áudios mostram conversa bandidos durante tenta...,0,383,2017-07-30 00:37:17+00:00,1842,0.023184,0.120865


In [9]:
df_valid["userType"].iloc[0]

'Logged'

In [5]:
import pickle

loaded_model = pickle.load(open('artifacts/lightfm_model.pkl', 'rb'))
loaded_user_id_map = pickle.load(open('artifacts/user_id_map.pkl', 'rb'))
loaded_item_id_map_reverse = pickle.load(open('artifacts/item_id_map_reverse.pkl', 'rb'))
loaded_user_feature_map = pickle.load(open('artifacts/user_feature_map.pkl', 'rb'))

### Make predictions to known and unknowm on same recommendation function with pkls

In [6]:
def format_newuser_input(user_feature_map, user_feature_list):
  normalised_val = 1.0 
  target_indices = []
  for feature in user_feature_list:
    try:
        target_indices.append(user_feature_map[feature])
    except KeyError:
        print("new user feature encountered '{}'".format(feature))
        pass
  #print("target indices: {}".format(target_indices))
  new_user_features = np.zeros(len(user_feature_map.keys()))
  for i in target_indices:
    new_user_features[i] = normalised_val
  new_user_features = sparse.csr_matrix(new_user_features)
  return(new_user_features)

In [18]:
def sample_recommendation_by_title(user_hash,df_news,user_feature_list,item_id_map_reverse,user_feature_map,user_id_map,model):
    try:
        user_x = user_id_map[user_hash]
        scores = model.predict(user_x, np.arange(108573)) # means predict for all
    except:
        new_user_features = format_newuser_input(user_feature_map, user_feature_list)
        scores = model.predict(0, np.arange(108573), user_features=new_user_features)
    
    top_5_indices = np.argsort(-scores)[:5]  # Sort scores in descending order and take the top 5
    top_5_items = [item_id_map_reverse[i] for i in top_5_indices]

    print("Top 5 recommended items:")

    for x in top_5_items:
        row = df_news[df_news["page"] == x]
        print("        %s" % row["title"].values[0])


In [37]:
# predict for known user
# df_valid["userId"].iloc[0]
user_feature_list = [df_valid["userType"].iloc[0]]
user_hash = df_valid["userId"].iloc[0]

sample_recommendation_by_title(user_hash,df_news,user_feature_list,loaded_item_id_map_reverse,loaded_user_feature_map,loaded_user_id_map,loaded_model)

Top 5 recommended items:
        Filha é presa por golpe estimado em R$ 725 milhões contra a mãe; quadros renomados roubados foram recuperados
        Jô Soares, ícone do humor e da TV, morre em São Paulo aos 84 anos
        Campeão mundial de jiu-jítsu, Leandro Lo é baleado na cabeça durante show em clube da Zona Sul de SP
        Quem é Sabine Boghici, presa por golpe milionário contra a mãe e herdeira de um dos maiores colecionadores de arte do país
        Caso Bárbara: suspeito de envolvimento no assassinato de criança é encontrado morto em BH


In [42]:
def get_recommended_history_list(user_hash,user_feature_list,item_id_map_reverse,user_feature_map,user_id_map,model):
    try:
        user_x = user_id_map[user_hash]
        scores = model.predict(user_x, np.arange(108573)) # means predict for all
    except:
        new_user_features = format_newuser_input(user_feature_map, user_feature_list)
        scores = model.predict(0, np.arange(108573), user_features=new_user_features)
    
    top_5_indices = np.argsort(-scores)[:5]  # Sort scores in descending order and take the top 5
    top_5_items = [item_id_map_reverse[i] for i in top_5_indices]

    return top_5_items

In [40]:
from utils.custom_treat_data_funcs import transform_text_to_list

df_valid["history"] = df_valid["history"].apply(transform_text_to_list)
df_valid

Unnamed: 0,userId,userType,history,timestampHistory
0,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,Logged,"[be89a7da-d9fa-49d4-9fdc-388c27a15bc8, 01c59ff...",[1660533136590 1660672113513]
1,d0afad7ea843d86597d822f0df1d39d31a3fea7c39fdee...,Logged,[77901133-aee7-4f7b-afc0-652231d76fe9],[1660556860253]
2,755062dd39a48809880cf363b04268c3af2c003088cde0...,Logged,[857aa90f-a7ec-410d-ba82-dfa4f85d4e71],[1660561649242]
3,ec1639851d99586c7f4da928deb49187303aec6e3b8d66...,Logged,"[b7b90e18-7613-4ca0-a8fc-fd69addfcd85, 835fdd8...",[1660533830245 1660540831707 1660542659111 166...
4,a120515626fe5d12b22b7d5a7c5008912cc69284aa26cc...,Logged,"[9c764c3a-f9f8-4fb2-b2c4-6331eaeb3dd6, b8eba39...",[1660548813953 1660572329731 1660594848200]
...,...,...,...,...
112179,aacb28d7d2a4ea745e12ceba1f9ffa0c7b92aae9304ce5...,Non-Logged,[ecb1f348-cd55-47f7-99f9-bb2c84e93f96],[1660546612592]
112180,91b3a960562e34504022dbe552c0e8947af20fd14ce861...,Non-Logged,[ecb1f348-cd55-47f7-99f9-bb2c84e93f96],[1660597026440]
112181,9de95a8cd681cd21cc9a7e830859e468b83c051c11f0c4...,Non-Logged,[3d52cd6b-706e-49f0-9215-0340010a9845],[1660678862844]
112182,001455437b3ca991b144afd8cb785f7727eb0c154faa54...,Non-Logged,[d730c4a6-e8f6-4fde-b73a-afbe148479cd],[1660584228926]


In [44]:
# predict for known user
user_feature_list = [df_valid["userType"].iloc[0]]
user_hash = df_valid["userId"].iloc[0]
validation_history_hashes = df_valid["history"].iloc[0]

recommeded_histories = get_recommended_history_list(user_hash,user_feature_list,loaded_item_id_map_reverse,loaded_user_feature_map,loaded_user_id_map,loaded_model)
print(recommeded_histories)
print(validation_history_hashes)

['1f32787b-de2b-49be-8c20-ddaeae34cc22', 'bf257382-74fb-4392-ad6a-143240e39f81', 'a36c98b5-f159-48f8-9f5a-1fc6ea9956c8', '29b6b142-4173-4ec4-832f-7d0a32255c10', 'f0a78e58-ec7e-494c-9462-fbd6446a9a89']
['be89a7da-d9fa-49d4-9fdc-388c27a15bc8', '01c59ff6-fb82-4258-918f-2910cb2d4c52']


In [60]:
def count_valid_recommendations(validation_history_hashes, recommeded_histories):
    count_valids = 0
    for valid_hist in validation_history_hashes:
        if valid_hist in recommeded_histories:
            count_valids = count_valids+1
    return count_valids

In [61]:
df_valid["recommended_hists"] = ""
df_valid["matched_recommendations"] = 0
df_valid["historySize"] = df_valid["history"].apply(lambda x : len(x))

In [62]:
df_valid.head(2)

Unnamed: 0,userId,userType,history,timestampHistory,recommended_hists,matched_recommendations,historySize
0,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,Logged,"[be89a7da-d9fa-49d4-9fdc-388c27a15bc8, 01c59ff...",[1660533136590 1660672113513],,0,3
1,d0afad7ea843d86597d822f0df1d39d31a3fea7c39fdee...,Logged,[77901133-aee7-4f7b-afc0-652231d76fe9],[1660556860253],,0,1


In [63]:
for index, row in df_valid.iterrows():
    user_feature_list = [row["userType"]]
    user_hash = row["userId"]
    validation_history_hashes = row["history"]

    recommended_hist = get_recommended_history_list(user_hash,user_feature_list,loaded_item_id_map_reverse,loaded_user_feature_map,loaded_user_id_map,loaded_model)
    num_valid_recommendations = count_valid_recommendations(validation_history_hashes, recommended_hist)

    row["recommended_hists"] = recommended_hist
    row["matched_recommendations"] = num_valid_recommendations

In [64]:
df_valid["percent_matches"] = df_valid["matched_recommendations"]/df_valid["historySize"]

In [65]:
df_valid

Unnamed: 0,userId,userType,history,timestampHistory,recommended_hists,matched_recommendations,historySize,percent_matches
0,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,Logged,"[be89a7da-d9fa-49d4-9fdc-388c27a15bc8, 01c59ff...",[1660533136590 1660672113513],,0,3,0.0
1,d0afad7ea843d86597d822f0df1d39d31a3fea7c39fdee...,Logged,[77901133-aee7-4f7b-afc0-652231d76fe9],[1660556860253],,0,1,0.0
2,755062dd39a48809880cf363b04268c3af2c003088cde0...,Logged,[857aa90f-a7ec-410d-ba82-dfa4f85d4e71],[1660561649242],,0,1,0.0
3,ec1639851d99586c7f4da928deb49187303aec6e3b8d66...,Logged,"[b7b90e18-7613-4ca0-a8fc-fd69addfcd85, 835fdd8...",[1660533830245 1660540831707 1660542659111 166...,,0,5,0.0
4,a120515626fe5d12b22b7d5a7c5008912cc69284aa26cc...,Logged,"[9c764c3a-f9f8-4fb2-b2c4-6331eaeb3dd6, b8eba39...",[1660548813953 1660572329731 1660594848200],,0,3,0.0
...,...,...,...,...,...,...,...,...
112179,aacb28d7d2a4ea745e12ceba1f9ffa0c7b92aae9304ce5...,Non-Logged,[ecb1f348-cd55-47f7-99f9-bb2c84e93f96],[1660546612592],,0,1,0.0
112180,91b3a960562e34504022dbe552c0e8947af20fd14ce861...,Non-Logged,[ecb1f348-cd55-47f7-99f9-bb2c84e93f96],[1660597026440],,0,1,0.0
112181,9de95a8cd681cd21cc9a7e830859e468b83c051c11f0c4...,Non-Logged,[3d52cd6b-706e-49f0-9215-0340010a9845],[1660678862844],,0,1,0.0
112182,001455437b3ca991b144afd8cb785f7727eb0c154faa54...,Non-Logged,[d730c4a6-e8f6-4fde-b73a-afbe148479cd],[1660584228926],,0,1,0.0


In [66]:
df_valid.describe()

Unnamed: 0,matched_recommendations,historySize,percent_matches
count,112184.0,112184.0,112184.0
mean,0.0,1.594425,0.0
std,0.0,1.055825,0.0
min,0.0,1.0,0.0
25%,0.0,1.0,0.0
50%,0.0,1.0,0.0
75%,0.0,2.0,0.0
max,0.0,5.0,0.0
