# Model recommendation with lighfm

### Import libraries

In [20]:
import os
import sys
import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import scipy
import numpy as np
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, recall_at_k
from lightfm.cross_validation import random_train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from lightfm import cross_validation
import scipy.sparse as sp
from scipy import sparse

In [21]:
from lightfm import LightFM
from sklearn.base import clone


class LightFMResizable(LightFM):
    """A LightFM that resizes the model to accomodate new users,
    items, and features"""

    def fit_partial(
        self,
        interactions,
        user_features=None,
        item_features=None,
        sample_weight=None,
        epochs=1,
        num_threads=1,
        verbose=False,
    ):
        try:
            self._check_initialized()
            self._resize(interactions, user_features, item_features)
        except ValueError:
            # This is the first call so just fit without resizing
            pass

        super().fit_partial(
            interactions,
            user_features,
            item_features,
            sample_weight,
            epochs,
            num_threads,
            verbose,
        )

        return self

    def _resize(self, interactions, user_features=None, item_features=None):
        """Resizes the model to accommodate new users/items/features"""

        no_components = self.no_components
        no_user_features, no_item_features = interactions.shape  # default

        if hasattr(user_features, "shape"):
            no_user_features = user_features.shape[-1]
        if hasattr(item_features, "shape"):
            no_item_features = item_features.shape[-1]

        if (
            no_user_features == self.user_embeddings.shape[0]
            and no_item_features == self.item_embeddings.shape[0]
        ):
            return self

        new_model = clone(self)
        new_model._initialize(no_components, no_item_features, no_user_features)

        # update all attributes from self._check_initialized
        for attr in (
            "item_embeddings",
            "item_embedding_gradients",
            "item_embedding_momentum",
            "item_biases",
            "item_bias_gradients",
            "item_bias_momentum",
            "user_embeddings",
            "user_embedding_gradients",
            "user_embedding_momentum",
            "user_biases",
            "user_bias_gradients",
            "user_bias_momentum",
        ):
            # extend attribute matrices with new rows/cols from
            # freshly initialized model with right shape
            old_array = getattr(self, attr)
            old_slice = [slice(None, i) for i in old_array.shape]
            new_array = getattr(new_model, attr)
            new_array[tuple(old_slice)] = old_array
            setattr(self, attr, new_array)

        return self

### Defining variables

In [22]:
import json

with open('config.json', 'r') as f:
    config = json.load(f)

In [23]:
# NUM_ITEMS_TRAIN = 108573
NUM_ITEMS_TRAIN = 63893
# NUM_ITEMS_TRAIN = 231305

NO_EPOCHS = 80

NUM_COMPONENTS = 30

NUM_THREADS = 4

### Retrieve data

In [24]:
dtype_df_valid = {
"userId" : 'string',
"userType" : 'category',
"history" : 'string',
"timestampHistory" : 'string'
}

In [25]:
import pandas as pd
# path config

df_valid = pd.read_csv(config["VALID_DF"],dtype=dtype_df_valid,nrows=50000)
df_valid.head(3)

Unnamed: 0,userId,userType,history,timestampHistory
0,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,Logged,['be89a7da-d9fa-49d4-9fdc-388c27a15bc8'  '01c5...,[1660533136590 1660672113513]
1,d0afad7ea843d86597d822f0df1d39d31a3fea7c39fdee...,Logged,['77901133-aee7-4f7b-afc0-652231d76fe9'],[1660556860253]
2,755062dd39a48809880cf363b04268c3af2c003088cde0...,Logged,['857aa90f-a7ec-410d-ba82-dfa4f85d4e71'],[1660561649242]


In [26]:
dtype_df_train_score = {
"userId" : 'string',
"userType" : 'category',
"history" : 'string',
"score" : 'Float32'
}

In [27]:
import pandas as pd

# df_ratings = pd.read_csv(config["DF_TRAIN_SCORES"], dtype=dtype_df_train_score)
df_ratings = pd.read_csv(config["DF_TRAIN_SCORES"], dtype=dtype_df_train_score, nrows=1000000)
df_ratings.drop(columns=["Unnamed: 0"],inplace=True)
df_ratings.head(3)

Unnamed: 0,userId,history,userType,score
0,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,80aa7bb2-adce-4a55-9711-912c407927a1,Non-Logged,2.216477
1,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,d9e5f15d-b441-4d8b-bee4-462b106d3916,Non-Logged,2.429501
2,17f1083e6079b0f28f7820a6803583d1c1b405c0718b11...,e273dba4-136c-45fb-bdd6-0cc57b13aaf0,Non-Logged,1.794861


In [28]:
df_news = pd.read_csv(config["DF_ITEMS_FEATURE"])
df_news.drop(columns=["Unnamed: 0"],inplace=True)
df_news.head(3)

Unnamed: 0,page,age_exp_normalized,ageCategories
0,7371a9b5-5824-4c57-8704-00a74feebe79,0.151439,very-old
1,7a5ea08f-4583-49e2-ba52-a71999443f7b,0.140788,very-old
2,6afc8bbb-4f36-43d5-8a44-a2917df5621a,0.12261,very-old


In [29]:
df_valid_new = df_valid.drop(columns="timestampHistory")

from utils.custom_treat_data_funcs import transform_text_to_list, explode_df_columns

# df_valid_new["history"] = df_valid_new["history"].apply(str)
df_valid_new["history"] = df_valid_new["history"].apply(transform_text_to_list)
# df_valid_new.head(3)
df_valid_new = df_valid_new.explode("history", ignore_index=True)
df_valid_new.head(3)

Unnamed: 0,userId,userType,history
0,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,Logged,be89a7da-d9fa-49d4-9fdc-388c27a15bc8
1,e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4...,Logged,01c59ff6-fb82-4258-918f-2910cb2d4c52
2,d0afad7ea843d86597d822f0df1d39d31a3fea7c39fdee...,Logged,77901133-aee7-4f7b-afc0-652231d76fe9


In [30]:
df_valid_new.loc[0,"history"]

'be89a7da-d9fa-49d4-9fdc-388c27a15bc8'

In [31]:
df_valid_exploded = pd.concat([df_ratings, df_valid_new])
df_valid_exploded

Unnamed: 0,userId,history,userType,score
0,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,80aa7bb2-adce-4a55-9711-912c407927a1,Non-Logged,2.216477
1,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,d9e5f15d-b441-4d8b-bee4-462b106d3916,Non-Logged,2.429501
2,17f1083e6079b0f28f7820a6803583d1c1b405c0718b11...,e273dba4-136c-45fb-bdd6-0cc57b13aaf0,Non-Logged,1.794861
3,528a8d7a2af73101da8d6709c1ec875b449a5a58749a99...,a0562805-c7d1-4ffd-b622-87c50ae006f4,Non-Logged,1.68271
4,2dd18b58a634a4e77181a202cf152df6169dfb3e4230ef...,233f8238-2ce0-470f-a9d5-0e0ac530382a,Non-Logged,2.266852
...,...,...,...,...
83777,c5ce6c84ea7444d9e023f2da417320d1d66adb5af502d6...,0a36d0c3-9ca7-4bd8-a597-ed5bb00922d2,Non-Logged,
83778,c5ce6c84ea7444d9e023f2da417320d1d66adb5af502d6...,7d8220b9-a52b-42c1-848d-ae55a8927fcc,Non-Logged,
83779,abec708d272d798b9cd904de99b62fa38a3393c8340c00...,48b7b61f-1113-44ee-b2be-62c004edffda,Non-Logged,
83780,d7e5eeb1bba9c4f202a5e2a2e57156450aaf34a47dac71...,48eddf29-6e0a-4433-a71d-451993f4b2ec,Non-Logged,


In [32]:
# duplicates = df_valid_exploded[df_valid_exploded.duplicated(subset=['userId','history'], keep=False)]
# duplicates = duplicates[duplicates["score"].isna()]
duplicates = df_valid_exploded[df_valid_exploded.duplicated(subset=['userId','history'], keep='first')]
duplicates
# duplicates = df_valid_exploded[df_valid_exploded.duplicated(subset=['history'], keep=False)]
# duplicates[duplicates["history"]=='253339a1-92b6-44d1-8fa2-59236c5251b1']
# duplicates[duplicates["userId"]=='f0e758359a184c99912e1ad5fc912b92b5e7b63c7a6002']

Unnamed: 0,userId,history,userType,score
75,9107ae20aa08ea8e46e27bb83801c8a77696a7499a2b6e...,c1c1e207-3a8c-4074-8973-56bdfed265b5,Logged,
99,e8f9ad90314928d72e21b7994c200cf01148d7f1779845...,b2a21491-7576-4be0-a520-97ce9470efeb,Logged,
151,86e84a6997cb9b6b72d226cf892d57e83a96216443f7aa...,esid:conteudo_editorial_g1#materia#https://esp...,Logged,
152,86e84a6997cb9b6b72d226cf892d57e83a96216443f7aa...,esid:conteudo_editorial_g1#materia#http://espe...,Logged,
287,fbcb5d638da9f894b86276d6a9957db67505b4a4b8fbd1...,bbdb1500-3b2c-407b-93e5-dff4d7f3d497,Logged,
...,...,...,...,...
83353,134d101fd3053b72d17e07b5de7ab4e8252746588cbff4...,480d7011-d1b6-4154-92c2-8835d2dc5f72,Non-Logged,
83559,be54df8ee374c08dc4339004a52050bb03338d11203849...,b4820c6d-f251-4d41-94c9-29ea3dd9c491,Non-Logged,
83745,72f042e4a64ed86262dfa39fcb1b71fe87495fb73fb8ea...,cc54e489-8de8-498b-a3ba-d9c296751ff6,Non-Logged,
83747,1f05439a4338abadbd2c014eb9727abb42f58aed740662...,571996d3-d583-48ba-a131-1c95f8006044,Non-Logged,


In [33]:
df_valid_cleaned = df_valid_exploded.drop_duplicates(subset=['userId','history'], keep="first")
df_valid_cleaned

Unnamed: 0,userId,history,userType,score
0,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,80aa7bb2-adce-4a55-9711-912c407927a1,Non-Logged,2.216477
1,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,d9e5f15d-b441-4d8b-bee4-462b106d3916,Non-Logged,2.429501
2,17f1083e6079b0f28f7820a6803583d1c1b405c0718b11...,e273dba4-136c-45fb-bdd6-0cc57b13aaf0,Non-Logged,1.794861
3,528a8d7a2af73101da8d6709c1ec875b449a5a58749a99...,a0562805-c7d1-4ffd-b622-87c50ae006f4,Non-Logged,1.68271
4,2dd18b58a634a4e77181a202cf152df6169dfb3e4230ef...,233f8238-2ce0-470f-a9d5-0e0ac530382a,Non-Logged,2.266852
...,...,...,...,...
83777,c5ce6c84ea7444d9e023f2da417320d1d66adb5af502d6...,0a36d0c3-9ca7-4bd8-a597-ed5bb00922d2,Non-Logged,
83778,c5ce6c84ea7444d9e023f2da417320d1d66adb5af502d6...,7d8220b9-a52b-42c1-848d-ae55a8927fcc,Non-Logged,
83779,abec708d272d798b9cd904de99b62fa38a3393c8340c00...,48b7b61f-1113-44ee-b2be-62c004edffda,Non-Logged,
83780,d7e5eeb1bba9c4f202a5e2a2e57156450aaf34a47dac71...,48eddf29-6e0a-4433-a71d-451993f4b2ec,Non-Logged,


In [34]:
df_valid_cleaned["score"].fillna(0,inplace=True)
df_valid_cleaned

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_valid_cleaned["score"].fillna(0,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_valid_cleaned["score"].fillna(0,inplace=True)


Unnamed: 0,userId,history,userType,score
0,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,80aa7bb2-adce-4a55-9711-912c407927a1,Non-Logged,2.216477
1,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,d9e5f15d-b441-4d8b-bee4-462b106d3916,Non-Logged,2.429501
2,17f1083e6079b0f28f7820a6803583d1c1b405c0718b11...,e273dba4-136c-45fb-bdd6-0cc57b13aaf0,Non-Logged,1.794861
3,528a8d7a2af73101da8d6709c1ec875b449a5a58749a99...,a0562805-c7d1-4ffd-b622-87c50ae006f4,Non-Logged,1.68271
4,2dd18b58a634a4e77181a202cf152df6169dfb3e4230ef...,233f8238-2ce0-470f-a9d5-0e0ac530382a,Non-Logged,2.266852
...,...,...,...,...
83777,c5ce6c84ea7444d9e023f2da417320d1d66adb5af502d6...,0a36d0c3-9ca7-4bd8-a597-ed5bb00922d2,Non-Logged,0.0
83778,c5ce6c84ea7444d9e023f2da417320d1d66adb5af502d6...,7d8220b9-a52b-42c1-848d-ae55a8927fcc,Non-Logged,0.0
83779,abec708d272d798b9cd904de99b62fa38a3393c8340c00...,48b7b61f-1113-44ee-b2be-62c004edffda,Non-Logged,0.0
83780,d7e5eeb1bba9c4f202a5e2a2e57156450aaf34a47dac71...,48eddf29-6e0a-4433-a71d-451993f4b2ec,Non-Logged,0.0


In [35]:
dataset_valid = Dataset()

# Get unique values for users, items, and user features
unique_users_valid = df_valid["userId"].unique()
unique_items_valid = df_valid["history"].unique()
unique_user_features_valid = df_valid["userType"].unique().tolist()

# Fit dataset with users, items, and user feature names
dataset_valid.fit(
    users=unique_users_valid,
    items=unique_items_valid,
    # user_features=unique_user_features_valid  # Register user features
)

In [36]:
(interactions_valid, weights_valid) = dataset_valid.build_interactions([
    (row.userId, row.history) 
    for _, row in df_valid.iterrows()
])

In [37]:
interactions_valid.shape

(50000, 26453)

In [38]:
user_features_list_valid = [
    (row.userId, [row.userType])  
    for _, row in df_valid.iterrows()
]

# user_features_valid = dataset_valid.build_user_features(user_features_list_valid)

In [39]:
import pickle

loaded_model = pickle.load(open('artifacts/lightfm_model.pkl', 'rb'))
loaded_user_id_map = pickle.load(open('artifacts/user_id_map.pkl', 'rb'))
loaded_item_id_map_reverse = pickle.load(open('artifacts/item_id_map_reverse.pkl', 'rb'))
loaded_user_feature_map = pickle.load(open('artifacts/user_feature_map.pkl', 'rb'))

In [40]:
loaded_model.fit_partial(interactions=interactions_valid, sample_weight=weights_valid, 
                         epochs=NO_EPOCHS, num_threads=NUM_THREADS)

ValueError: The user feature matrix specifies more features than there are estimated feature embeddings: 41873 vs 50000.

In [None]:
# user_id_map, user_feature_map, item_id_map, item_feature_map = dataset.mapping()
# item_id_map_reverse = {v: k for k, v in item_id_map.items()}

# loaded_user_id_map = user_id_map
# loaded_item_id_map_reverse = item_id_map_reverse
# loaded_user_feature_map = user_feature_map

In [None]:
# _, n_items = interactions.shape # no of users * no of items
_, n_items_valid = interactions_valid.shape # no of users * no of items

# n_items = NUM_ITEMS_TRAIN
n_items = NUM_ITEMS_TRAIN + n_items_valid

### Make predictions to known and unknowm on same recommendation function with pkls

In [None]:
def format_newuser_input(user_feature_map, user_feature_list):
  normalised_val = 1.0 
  target_indices = []
  for feature in user_feature_list:
    try:
        target_indices.append(user_feature_map[feature])
    except KeyError:
        print("new user feature encountered '{}'".format(feature))
        pass
  #print("target indices: {}".format(target_indices))
  new_user_features = np.zeros(len(user_feature_map.keys()))
  for i in target_indices:
    new_user_features[i] = normalised_val
  new_user_features = sparse.csr_matrix(new_user_features)
  return(new_user_features)

In [None]:
def sample_recommendation_by_title(user_hash,df_news,user_feature_list,item_id_map_reverse,user_feature_map,user_id_map,model):
    try:
        user_x = user_id_map[user_hash]
        scores = model.predict(user_x, np.arange(n_items)) # means predict for all
    except:
        new_user_features = format_newuser_input(user_feature_map, user_feature_list)
        scores = model.predict(0, np.arange(n_items), user_features=new_user_features)
    
    top_5_indices = np.argsort(-scores)[:10]  # Sort scores in descending order and take the top 5
    top_5_items = [item_id_map_reverse[i] for i in top_5_indices]

    print("Top 5 recommended items:")

    for x in top_5_items:
        row = df_news[df_news["page"] == x]
        print("        %s" % row["title"].values[0])


In [None]:
# predict for known user
# df_valid["userId"].iloc[0]
user_feature_list = [df_valid["userType"].iloc[0]]
user_hash = df_valid["userId"].iloc[0]

# sample_recommendation_by_title(user_hash,df_news,user_feature_list,loaded_item_id_map_reverse,loaded_user_feature_map,loaded_user_id_map,loaded_model)

In [None]:
def get_recommended_history_list(user_hash,user_feature_list,item_id_map_reverse,user_feature_map,user_id_map,model):
    """
    This function verifies if the users is known or new, and makes recommendations depending on this verification.
    The top 5 recommendations from the list are returned.
    """
    try:
        user_x = user_id_map[user_hash]
        scores = model.predict(user_x, np.arange(n_items)) # means predict for all
    except:
        new_user_features = format_newuser_input(user_feature_map, user_feature_list)
        scores = model.predict(0, np.arange(n_items), user_features=new_user_features)
    
    top_5_indices = np.argsort(-scores)[:5]  # Sort scores in descending order and take the top 5
    top_5_items = [item_id_map_reverse[i] for i in top_5_indices]

    return top_5_items

In [None]:
from utils.custom_treat_data_funcs import transform_text_to_list

# Transform the single string with histories/items into a list of strings

df_valid["history"] = df_valid["history"].apply(transform_text_to_list)
df_valid

In [None]:
# Testing for just one user

user_feature_list = [df_valid["userType"].iloc[0]]
user_hash = df_valid["userId"].iloc[0]
validation_history_hashes = df_valid["history"].iloc[0]

recommeded_histories = get_recommended_history_list(user_hash,user_feature_list,loaded_item_id_map_reverse,loaded_user_feature_map,loaded_user_id_map,loaded_model)
print(recommeded_histories)
print(validation_history_hashes)

In [None]:
def count_valid_recommendations(validation_history_hashes, recommeded_histories):
    """
    This function receives 
    * The `validation_history_hashes` (which is a list of histories contained in the "validacao.csv")
    * The `recommeded_histories` (which are the recommended histories/items by the model)
    Then, it verifies how many recommended items match with the validation histories.
    """
    count_valids = 0
    for valid_hist in validation_history_hashes:
        if valid_hist in recommeded_histories:
            count_valids = count_valids+1
    return count_valids

In [None]:
df_valid["recommended_hists"] = "" # creates a column to receive the list of recommended histories
df_valid["matched_recommendations"] = 0 # creates a column to receive the quantity of matches between recommendations and validation items
df_valid["historySize"] = df_valid["history"].apply(lambda x : len(x)) # calculate the quantity of histories for each user

In [None]:
df_valid.head(2)

In [None]:
import multiprocessing

def parallelize_dataframe(df, func):
    num_cores = multiprocessing.cpu_count()-1  #leave one free to not freeze machine
    num_partitions = num_cores #number of partitions to split dataframe
    df_split = np.array_split(df, num_partitions)
    pool = multiprocessing.Pool(num_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

In [None]:
def validate_recommendations(df_valid):
    for _, row in df_valid.iterrows():
        """
        For each user, get the top 5 recommendations from the model, and also count how many of them are contained within the validation set.
        Stores the results on the columns `recommended_hists` `matched_recommendations`.
        """
        user_feature_list = [row["userType"]]
        user_hash = row["userId"]
        validation_history_hashes = row["history"]

        recommended_hist = get_recommended_history_list(user_hash,user_feature_list,loaded_item_id_map_reverse,loaded_user_feature_map,loaded_user_id_map,loaded_model)
        num_valid_recommendations = count_valid_recommendations(validation_history_hashes, recommended_hist)

        row["recommended_hists"] = recommended_hist
        row["matched_recommendations"] = num_valid_recommendations

In [None]:
parallelize_dataframe(df_valid, validate_recommendations)

In [None]:
df_valid.head(2)

In [None]:
df_valid.describe()