*Inspired by* and repurposed from code by [Lingjun (Ivan) Chen and Nuo (Nora) Xu](https://sites.northwestern.edu/msia/2019/04/24/personalized-restaurant-recommender-system-using-hybrid-approach/)




## Importing modules

In [None]:
# importing required libraries
import pandas as pd
from collections import Counter 
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from lightfm import LightFM
from lightfm.evaluation import precision_at_k,auc_score,reciprocal_rank
import scipy
import time
import math
from lightfm.data import Dataset

In [None]:
# Mount Google Drive at /content/drive (requires the Google Colab runtime);
# the CSV files read below are loaded from drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read the business table from the mounted drive
business = pd.read_csv(
    'drive/MyDrive/business_final.csv',
)

In [None]:
# Read the restaurant review table from the mounted drive
rest_review = pd.read_csv(
    'drive/MyDrive/rest_review.csv',
)

## Fetching the categories of business

In [None]:
# Fetching the categories of business
# Each `categories` value is a ';'-separated string; split every row once.
split_categories = [line.split(";") for line in business['categories']]
# Flat list with one entry per (business, category) pair; its value counts
# give the category frequencies used to pick the feature categories.
category_frequency = [content for tags in split_categories for content in tags]
# Number of tags attached to each business (count of tags, not the
# character length of each tag name).
num_tags = [len(tags) for tags in split_categories]
# Total number of tag assignments across all businesses (cell output).
sum(num_tags)

## Taking the top 60 categories as our features

In [None]:
# Use Counter to tally how often each category occurs, then keep the 60
# most common ones as (category, frequency) pairs
category_counts = Counter(category_frequency)
new_feature = category_counts.most_common(60)
print(new_feature)

## One hot encoding with chosen features and merging them with dataset

In [None]:
# Top 60 most frequent categories as (category, frequency) pairs.
new_feature = Counter(category_frequency).most_common(60)

# Score every category except the single most frequent one ([1:]).
# NOTE(review): the top category is presumably skipped because it applies to
# nearly every business and carries no signal - confirm.
# Score = log(1 + N / freq), an IDF-style weight where N is the number of
# businesses, so rarer categories get larger scores.
n_businesses = len(business.business_id)
# Build the frame in one go instead of enlarging an empty DataFrame row by
# row with .loc, which is slow and leaves the column dtypes to chance.
feature = pd.DataFrame(
    [
        (category, math.log1p(n_businesses / freq))
        for category, freq in new_feature[1:]
    ],
    columns=['Feature', 'Category_Score'],
)

In [None]:
# Work on a copy: plain assignment would only alias `business`, so every
# engineered column below would silently be added to `business` as well.
business2 = business.copy()

# Split each business's ';'-separated category string once, rather than
# re-splitting the whole column for every feature.
category_lists = business2['categories'].str.split(';')

for row in feature.itertuples(index=False):
    # Feature (category) name and its IDF-style score
    f, idf = row.Feature, row.Category_Score
    # One new column per feature: idf / (number of categories of the business)
    # when the business has that category, otherwise 0. Columns are appended
    # in `feature` order, which later positional slicing relies on.
    business2[f] = [1 / len(tags) * idf if f in tags else 0
                    for tags in category_lists]

In [None]:
# Display the column dtypes of the review table (cell output only).
rest_review.dtypes

In [None]:
# Display the column dtypes of the business table, including the engineered
# category columns (cell output only).
business2.dtypes

In [None]:
# Attach the business columns to every review; reviews whose business_id has
# no match in business2 are dropped (inner join). Column names present in
# both tables get pandas' default _x (review) / _y (business) suffixes.
rest_review = pd.merge(
    rest_review,
    business2,
    how='inner',
    on='business_id',
)


In [None]:
# Remove the 'Unnamed: 0_x' column from the merged table
# (presumably a saved CSV index coming from the review table - confirm)
rest_review = rest_review.drop(columns='Unnamed: 0_x')

## Loading user dataset and extracting features

In [None]:
# Load the Yelp user table and keep only the id plus the two counters
user = pd.read_csv('drive/MyDrive/yelp_user.csv')
user = user.loc[:, ['user_id', 'review_count', 'useful']]
# Give the counters user-specific names so they cannot be confused with the
# `review_count` / `useful` columns already present in rest_review
user = user.rename(
    columns={
        'review_count': 'user_rc',
        'useful': 'user_useful',
    }
)
# Keep only reviews written by a user present in the user table
rest_review = pd.merge(rest_review, user, how='inner', on='user_id')
rest_review.columns

In [None]:
# Display the merged column names; the positional slice rest_review.columns[29:]
# used later depends on this column order.
rest_review.columns

## Normalizing columns 

In [None]:
# Square-root transform of the review_count and useful columns.
# np.sqrt works on the column in place of a Python-level loop and does not
# rely on a freshly built Series lining up with rest_review's index.
# NOTE(review): after the merges, `review_count` and `useful` are presumably
# the business review count and the per-review useful votes (the user-level
# counters were renamed to user_rc / user_useful) - confirm these are the
# columns that were meant to be transformed.
rest_review['review_count'] = np.sqrt(rest_review['review_count'])
rest_review['useful'] = np.sqrt(rest_review['useful'])

In [None]:
# Keep a 1% random sample of the reviews. The sample is seeded (same value
# as the seed used for the train/test split and the models) so that every
# run works on the same rows and reported metrics are reproducible.
rest_review = rest_review.sample(frac=.01, random_state=69)
# Restrict the business table to businesses that still have a sampled review
business_id = rest_review.business_id.unique()
business2 = business2[business2['business_id'].isin(business_id)]

## Creating the Dataset object and fitting it with user_id and business_id

In [None]:
# Create the LightFM Dataset and register every user id and business id
data_set = Dataset()
data_set.fit(
    users=rest_review.user_id,
    items=business2.business_id,
)


## Fitting other features such as stars, review_count

In [None]:

# Register 'stars' and 'review_count' as item feature names
numeric_item_features = ['stars', 'review_count']
data_set.fit_partial(
    items=business2.business_id,
    item_features=numeric_item_features,
)


In [None]:
# Item feature names: every business2 column from position 21 onwards
# (assumes the first 21 columns are the original business columns and the
# rest are the engineered category columns - TODO confirm), registered
# with the dataset as item features.
item_cols = list(business2.columns[21:])
data_set.fit_partial(
    items=business2.business_id,
    item_features=item_cols,
)

In [None]:
# Display the item feature column names (cell output only).
item_cols

In [None]:
# Display the review table's column names (cell output only).
rest_review.columns

In [None]:
# User feature names: every rest_review column from position 29 onwards
# (the slice is positional, so it depends on the merged column order -
# check the displayed list below)
user_1 = list(rest_review.columns[29:])
user_1

In [None]:
# Register the user ids together with the user feature names
data_set.fit_partial(
    users=rest_review.user_id,
    user_features=user_1,
)


## Creating the interaction sparse matrix from the review table

In [None]:
# creating interactions
# One (user_id, business_id, weight) triple per review. The weight is the
# `stars_x` value of the row, passed as a plain float scalar: indexing with
# x[['stars_x']] would hand LightFM a one-element Series instead of a number.
# Zipping the columns also avoids building a Series per row via iterrows().
interaction_triples = zip(
    rest_review['user_id'],
    rest_review['business_id'],
    rest_review['stars_x'].astype(float),
)
(interactions, weights) = data_set.build_interactions(interaction_triples)
print(repr(interactions))

In [None]:
def compute_rmse(X_test, X_pred):
    """Return the root-mean-square error between two equal-length sequences.

    Ref: https://github.com/ncu-dart/rdf/blob/master/rdf/utils.py

    Elements are paired by position (not by label), so a pandas Series with a
    non-default index is treated the same way as a list or an array.

    Args:
        X_test: Sized iterable of observed values.
        X_pred: Sized iterable of predicted values, same length as X_test.

    Returns:
        The square root of the mean squared difference of the paired values.

    Raises:
        ValueError: If the inputs differ in length or are empty.
    """
    n = len(X_test)
    if n != len(X_pred):
        raise ValueError(
            f"X_test and X_pred must have the same length, got {n} and {len(X_pred)}"
        )
    if n == 0:
        # Explicit error instead of an opaque ZeroDivisionError below.
        raise ValueError("cannot compute RMSE of empty input")

    sse = 0.
    for actual, predicted in zip(X_test, X_pred):
        sse += (actual - predicted) ** 2

    return (sse / n) ** .5

Create functions that build the dictionaries used for user and item feature mapping; the feature values are normalized to preserve the relative importance of the features.

In [None]:
from tqdm import tqdm
# Build item features function
def item(df,item_cols,values):
    """Return {column: share} for the given columns of one item record.

    Each value read from `df` is divided by the sum of all selected values,
    so the returned weights add up to 1. When the sum is 0 the raw values
    are returned unchanged.

    Args:
        df: Mapping-like record (e.g. a DataFrame row) indexable by column name.
        item_cols: Names of the columns to read from `df`.
        values: Not used by this function; kept in the signature for callers.

    Returns:
        dict mapping each column name to its normalized (or raw) value.
    """
    raw = {name: df[name] for name in item_cols}
    total = sum(float(weight) for weight in raw.values())  # sum of all the tfidf values

    if total == 0:
        # nothing to scale by
        return raw
    # normalize so the relative importance of the features is preserved
    return {name: weight / total for name, weight in raw.items()}

# Build user features function
def user_dict(df,item_cols,values):
    """Return {column: share} for the given columns of one user record.

    Each value read from `df` is divided by the sum of all selected values,
    so the returned weights add up to 1. When the sum is 0 the raw values
    are returned unchanged.

    Args:
        df: Mapping-like record (e.g. a DataFrame row) indexable by column name.
        item_cols: Names of the columns to read from `df`.
        values: Not used by this function; kept in the signature for callers.

    Returns:
        dict mapping each column name to its normalized (or raw) value.
    """
    raw = {name: df[name] for name in item_cols}
    total = sum(raw.values())  # sum of all the tfidf values

    if total == 0:
        # nothing to scale by
        return raw
    # normalize so the relative importance of the features is preserved
    return {name: weight / total for name, weight in raw.items()}

# get max of each column to regularize value to [0,1]
# Each maximum must come from the same column it is later used to scale:
# star_max / max_item divide the business `stars` / `review_count`, and
# max_u_rc / max_useful divide the user-level `user_rc` / `user_useful`.
# (Taking the user maxima from `review_count` / `useful` would scale the
# user counters by an unrelated column and not bound them to [0,1].)
star_max = business2.stars.max()
max_item = business2.review_count.max()
max_u_rc = rest_review.user_rc.max()
max_useful = rest_review.user_useful.max()

# build item features
# One (business_id, {feature name: weight}) entry per business row
item_features_data = []
for _, biz in business2.iterrows():
    # scaled numeric attributes: 80% weight on stars, 20% on review count
    numeric_weights = {
        'stars': 0.8 * biz['stars'] / star_max,
        'review_count': 0.2 * biz['review_count'] / max_item,
    }
    # shares of the category columns; the list argument is passed as `values`
    category_weights = item(
        biz,
        item_cols,
        [0.5 * biz['stars'] / star_max, 0.5 * biz['review_count'] / max_item],
    )
    # category entries win if a key appears in both dictionaries
    item_features_data.append((biz['business_id'], {**numeric_weights, **category_weights}))
item_features = data_set.build_item_features(tqdm(item_features_data, desc="Building item features"))

# build user features
# One (user_id, {feature name: weight}) entry per review row
user_features_data = []
for _, review in rest_review.iterrows():
    # scaled user counters: 70% weight on review count, 30% on useful votes
    counter_weights = {
        'user_rc': 0.7 * review['user_rc'] / max_u_rc,
        'user_useful': 0.3 * review['user_useful'] / max_useful,
    }
    # shares of the user feature columns; the list argument is passed as `values`
    column_weights = user_dict(
        review,
        user_1,
        [0.7 * review['user_rc'] / max_u_rc, 0.3 * review['user_useful'] / max_useful],
    )
    # column entries win if a key appears in both dictionaries
    user_features_data.append((review['user_id'], {**counter_weights, **column_weights}))
user_features = data_set.build_user_features(tqdm(user_features_data, desc="Building user features"))



Split the interactions matrix created earlier into train and test sets, with 80% of the interactions in the train set and 20% in the test set.

In [None]:
from lightfm.cross_validation import random_train_test_split

seed = 69
# Random 80/20 split of the interactions, made repeatable by the fixed seed
split_rng = np.random.RandomState(seed)
train, test = random_train_test_split(
    interactions,
    test_percentage=0.2,
    random_state=split_rng,
)
# Subtract from test the element-wise product of train and test, i.e. the
# cells that are non-zero in both matrices
shared_cells = train.multiply(test)
test = test - shared_cells

Define the hyperparameters and apply a grid search to the model to find the parameters that give the best AUC score.

In [None]:
# Grid search over LightFM hyperparameters
from itertools import product

# Candidate values for each hyperparameter
NUM_THREADS = [5]
NUM_COMPONENTS = [30, 50]
LEARNING_RATE = [0.01, 0.1]
NUM_EPOCHS = [10, 20]
ITEM_ALPHA = [1e-5, 1e-6]
loss = ['warp', 'logistic']

# Every combination of the candidates, in the positional order expected by
# train_evaluate_model. `product` returns a one-shot iterator.
grid = product(
    NUM_THREADS,
    NUM_COMPONENTS,
    LEARNING_RATE,
    NUM_EPOCHS,
    ITEM_ALPHA,
    loss,
)
# Function to train and evaluate the model using the given hyperparameters 
def train_evaluate_model(train, test, user_features, item_features, num_threads, num_components, learning_rate, num_epochs, item_alpha,loss):
    """Fit one LightFM model and score it on the held-out interactions.

    Args:
        train: Training interaction matrix.
        test: Held-out interaction matrix.
        user_features: User feature matrix passed to fit and evaluation.
        item_features: Item feature matrix passed to fit and evaluation.
        num_threads: Number of threads for fitting and evaluation.
        num_components: Latent dimensionality (`no_components`).
        learning_rate: Learning rate of the model.
        num_epochs: Number of training epochs.
        item_alpha: Penalty on item features.
        loss: LightFM loss name (e.g. 'warp' or 'logistic').

    Returns:
        Tuple (num_threads, num_components, learning_rate, num_epochs,
        item_alpha, mean test precision@5, mean test AUC, loss).
    """
    # `seed` is the module-level random seed, so every configuration is
    # initialized identically.
    model = LightFM(loss=loss, item_alpha=item_alpha, random_state=seed, no_components=num_components, learning_rate=learning_rate)
    model = model.fit(train, user_features=user_features, item_features=item_features, epochs=num_epochs, num_threads=num_threads)

    # Both metrics get the same evaluation arguments. In particular the
    # training interactions are passed to AUC as well as to precision, so
    # known training positives are left out of both rankings instead of
    # counting as negatives for AUC only.
    eval_kwargs = dict(
        train_interactions=train,
        item_features=item_features,
        user_features=user_features,
        num_threads=num_threads,
    )
    test_precision = precision_at_k(model, test, k=5, **eval_kwargs).mean()
    test_auc = auc_score(model, test, **eval_kwargs).mean()
    return (num_threads, num_components, learning_rate, num_epochs, item_alpha, test_precision, test_auc,loss)

# Train and evaluate one model per hyperparameter combination, printing the
# scores of each run as soon as it finishes
results = []
for params in grid:
    (num_threads, num_components, learning_rate,
     num_epochs, item_alpha, loss) = params
    result = train_evaluate_model(
        train, test, user_features, item_features,
        num_threads, num_components, learning_rate, num_epochs, item_alpha, loss,
    )
    results.append(result)
    print(f"num_threads={num_threads}, num_components={num_components}, learning_rate={learning_rate}, num_epochs={num_epochs}, item_alpha={item_alpha}, test_precision={result[5]:.4f}, test_auc={result[6]:.4f},best_func={loss}")

# Pick the run with the highest test AUC (position 6 of each result tuple)
best_result = max(results, key=lambda outcome: outcome[6])
print(f"\nBest hyperparameters: num_threads={best_result[0]}, num_components={best_result[1]}, learning_rate={best_result[2]}, num_epochs={best_result[3]}, item_alpha={best_result[4]}, test_precision={best_result[5]:.4f}, test_auc={best_result[6]:.4f},best_loss_func={best_result[7]}")

## Using the best hyperparameters obtained from grid search

In [None]:
from lightfm.evaluation import recall_at_k

# Fixed hyperparameters; only the number of training epochs is varied
num_threads = 5
num_components = 50
learning_rate = 0.1
num_epochs = [5, 10, 15, 20, 25, 30]
item_alpha = 1e-05
# One row of test metrics per epoch count; the row label is the epoch
# count formatted as a string
test_scores = pd.DataFrame(columns=['AUC', 'precision', 'recall'])
for i in num_epochs:
    # A fresh model is trained from scratch for every epoch count; after the
    # loop `model` is the one trained with the last (largest) value.
    model = LightFM(loss='warp', item_alpha=item_alpha, random_state=69, no_components=num_components, learning_rate=learning_rate)
    model = model.fit(train, user_features=user_features, item_features=item_features, epochs=i, num_threads=num_threads)

    # All three metrics share the same evaluation arguments, so training
    # positives are excluded from the ranking for AUC too (not only for
    # precision and recall).
    eval_kwargs = dict(
        train_interactions=train,
        item_features=item_features,
        user_features=user_features,
        num_threads=num_threads,
    )
    # Mean AUC, precision@1 and recall@1 on the test set
    test_auc = auc_score(model, test, **eval_kwargs).mean()
    precision = precision_at_k(model, test, k=1, **eval_kwargs).mean()
    recall = recall_at_k(model, test, k=1, **eval_kwargs).mean()
    # Store the scores in the dataframe
    test_scores.loc[f'{i}'] = [test_auc, precision, recall]
  


### Calculating auc score, precision and recall for every epoch

In [None]:
# Display the AUC / precision / recall table, one row per epoch count.
test_scores


In [None]:
def recommend(model, train, data_meta, user_ids, k, name, mapping, tag=None, user_features=None, item_features=None, num_threads=2):
    """Return the top-k recommended item names for each requested user.

    Every item column of `train` is scored with `model.predict`; the k
    best-scoring items are then looked up in `data_meta` by row position.

    Args:
        model: Fitted model exposing
            ``predict(user_id, item_ids, user_features=, item_features=, num_threads=)``.
        train: 2-D user-item interaction matrix; only its number of columns
            (the number of items) is used.
        data_meta: DataFrame with item metadata. Rows are addressed by
            position, never by index label - assumes row i describes the
            item with internal index i; verify against how the dataset was
            fitted.
        user_ids: Iterable of internal user indices to recommend for.
        k: Number of items to return per user.
        name: Column of `data_meta` holding the value to return per item.
        mapping: Item id mapping. Accepted for backward compatibility; not
            used.
        tag: Optional tag column name. Accepted for backward compatibility;
            not used.
        user_features: Optional user feature matrix forwarded to the model.
        item_features: Optional item feature matrix forwarded to the model.
        num_threads: Number of threads used for prediction.

    Returns:
        dict mapping each user id to a Series with the `name` values of that
        user's k highest-scoring items, best first.
    """
    n_items = train.shape[1]
    # Same candidate set for every user, so build it once.
    all_items = np.arange(n_items)

    recommendations = {}
    for user_id in user_ids:
        scores = model.predict(user_id, all_items, user_features=user_features, item_features=item_features, num_threads=num_threads)
        # positions of the k highest scores, in descending score order
        top_positions = np.argsort(-scores)[:k]
        # positional lookup (iloc) of the requested column, not a fixed 'name'
        recommendations[user_id] = data_meta.iloc[top_positions][name]

    return recommendations


In [None]:
# NOTE(review): `rf` is not defined or imported anywhere in this file, so the
# next line raises NameError unless `rf` is provided elsewhere; `user_index`
# is not used afterwards in the visible code.
user_index=list(set(rf.get_user_index(test)))
# Ask for 5 recommendations for user 49 (presumably an internal user index,
# since it is passed straight to model.predict - confirm) and keep only the
# recommended values. data_set.mapping()[2] is passed as the `mapping` argument.
a=recommend(model,train,business2,[49],5,'name',mapping=data_set.mapping()[2],tag='categories',
                              user_features = user_features,item_features=item_features).values()
# Show the recommendations as a column (cell output).
pd.DataFrame(a).transpose()

In [None]:
import matplotlib.pyplot as plt

# test_scores is indexed by the number of training epochs, so the x-axis
# shows epochs and the y-axis the metric value.
plt.plot(test_scores['AUC'])
plt.title('AUC')
plt.xlabel('epochs')
plt.ylabel('AUC')




In [None]:
# test_scores is indexed by the number of training epochs, so the x-axis
# shows epochs and the y-axis the metric value.
plt.plot(test_scores['precision'])
plt.title('precision')
plt.xlabel('epochs')
plt.ylabel('precision')

In [None]:
# test_scores is indexed by the number of training epochs, so the x-axis
# shows epochs and the y-axis the metric value.
plt.plot(test_scores['recall'])
plt.title('recall')
plt.xlabel('epochs')
plt.ylabel('recall')

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Hard-coded precision / AUC values (24 rows, alternating 'warp' and
# 'logistic'); they are not read from the `results` list.
# NOTE(review): presumably copied from the printed output of an earlier
# grid-search run - confirm, or build this frame from `results` instead.
data = pd.DataFrame({
    'test_precision': [0.0036, 0.0003, 0.0036, 0.0003, 0.0033, 0.0003, 0.0037, 0.0003, 0.0070, 0.0003, 0.0071, 0.0003, 0.0106, 0.0003, 0.0120, 0.0003, 0.0037, 0.0003, 0.0037, 0.0003, 0.0039, 0.0003, 0.0040, 0.0003],
    'test_auc': [0.6398, 0.5757, 0.6391, 0.5757, 0.7286, 0.5755, 0.9276, 0.9755, 0.7737, 0.5750, 0.7788, 0.9751, 0.8041, 0.9749, 0.9069, 0.9751, 0.9327, 0.5759, 0.9321, 0.5759, 0.9242, 0.5756, 0.6223, 0.5756],
    'loss_func': ['warp', 'logistic', 'warp', 'logistic', 'warp', 'logistic', 'warp', 'logistic', 'warp', 'logistic', 'warp', 'logistic', 'warp', 'logistic', 'warp', 'logistic', 'warp', 'logistic', 'warp', 'logistic', 'warp', 'logistic', 'warp', 'logistic']
})

# One row per loss function; pivot_table aggregates with its default
# function (the mean), giving the average precision and AUC per loss.
heatmap_data = pd.pivot_table(data, values=['test_precision', 'test_auc'], index='loss_func')

# Annotated heatmap of the two averaged metrics per loss function.
sns.heatmap(heatmap_data, cmap='YlGnBu', annot=True, annot_kws={"size": 10})