#**Collaborative Filtering using Surprise**

In [1]:
!pip install surprise 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp39-cp39-linux_x86_64.whl size=3195812 sha256=89e844b95bbdeb1face6e094aa377709973558eb0ffaf50c4a88c2a11dd06c97
  Stored in directory: /root/.cache/pip/wheels/c6/3a/46/9b17b3512bdf283c6cb84f59929cdd5199d4e754d596d22784
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.3 surprise-0.1


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Import Libraries

In [3]:
# importing libraries
import pandas as pd
from surprise import *
from surprise.accuracy import rmse
from surprise import accuracy
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise import dataset

## Read Review and Business Dataset

In [4]:
# Load the review and business_final csv files
review=pd.read_csv('drive/MyDrive/DIC/review.csv')
business_final=pd.read_csv('drive/MyDrive/DIC/business_final.csv')
review = review.merge(business_final[['name','business_id']], how='inner')
review

Unnamed: 0.1,Unnamed: 0,review_id,user_id,business_id,stars,date,text,useful,funny,cool,sentiment_score,sentiment,super_score,name
0,93,BF0ANB54sc_f-3_howQBCg,ssuXFjkH4neiBgwv-oN4IA,JlNeaOymdVbE6_bubqjohg,1,2014-08-09,always go chevo chandler delicious one ahwatuk...,3,0,0,0.7964,positive,0.95928,"""Papa Chevo's Taco Shop"""
1,15312,-QgtOpsFzLHd58-Y1Ao2tA,BPKpLbR9NuWFAR9SUWpZOw,JlNeaOymdVbE6_bubqjohg,5,2014-06-29,great grilled chicken burrito believe coming c...,0,0,0,0.8481,positive,0.96962,"""Papa Chevo's Taco Shop"""
2,49074,zdhN3MBABBKi-9QHAh-G1A,sg5q7rz2_7PfaN-6JnLb5g,JlNeaOymdVbE6_bubqjohg,5,2014-05-01,everything far great sure reviews pleased food...,1,0,1,0.9463,positive,0.98926,"""Papa Chevo's Taco Shop"""
3,94,DbLUpPT61ykLTakknCF9CQ,ssuXFjkH4neiBgwv-oN4IA,0Rni7ocMC_Lg2UH0lDeKMQ,1,2014-08-09,place always dirty grimy twice back customer s...,6,0,0,-0.8481,negative,0.63038,"""Barro's Pizza"""
4,2901,W2tkFrhscAIBo-PttIalYg,L8d61tDHFB5CemHBUvxn-g,0Rni7ocMC_Lg2UH0lDeKMQ,1,2016-04-21,walked order pizza wanted simple pizza pie ext...,0,0,0,0.4215,positive,0.88430,"""Barro's Pizza"""
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32123,99914,kRhVtVQJXyB39paGKNHKkw,oxxHn3CSVW8N1JPLiOCHqg,uRNQ4z2QQPX7tqWpMjiuTw,3,2014-10-15,nice lunch good food worth another trip spicy ...,0,0,0,0.9066,positive,0.98132,"""Emiliano's Mexican Restaurant"""
32124,99916,g78E0chAO3d-yS9qiLtyZg,oxxHn3CSVW8N1JPLiOCHqg,EgW-kEqxP83MlVl06Qn21g,1,2014-09-28,minus stars hear order brought wrong item atti...,7,0,0,-0.1027,negative,0.77946,"""Rochester Inn & Hardwood Grille"""
32125,99924,N7vmawqApfl0fn7OlKjGjg,oxxHn3CSVW8N1JPLiOCHqg,0Ge4hEQ8HYnrkfm8UHj2BQ,5,2015-08-27,time husband use go ny gyros said equal better...,0,0,0,0.7906,positive,0.95812,"""My Big Fat Greek Gyro"""
32126,99942,bH0lMpJQFjEPaD3ZkOK97w,mUbIcYFzkkvLXP0yV4RpqQ,_iUlt5rm-15QoLw0MEEg5A,5,2015-12-03,joe absolutely amazing love coming hard day wo...,0,0,0,0.8927,positive,0.97854,"""Mitchell's Restaurant Bar & Banquet Center"""


## Define Scale

In [5]:
reader = Reader(rating_scale=(0,1))

## Get the relevant Data

In [52]:
data = Dataset.load_from_df(review[['user_id', 'business_id', 'super_score']], reader)

## Train, Test split

In [7]:
# splitting data into train and test
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)
trainset = data.build_full_trainset()

## 1. KNN

### 1. KNNBaseline

In [8]:
parameters_knnb={
    'name':['cosine','pearson'], # similarity metric options: cosine or pearson
    'min_k': [3, 6, 9], # minimum number of neighbors
    'min_support': [True, False], # minimum number of common items between users
    'user_based':[True, False]  # user-based or item-based collaborative filtering
}

In [9]:
grid_knnb=GridSearchCV(KNNBaseline, param_grid=parameters_knnb,n_jobs=-1)
grid_knnb.fit(data) # Fitting the data to the KNNBaseline algorithm using the 

In [10]:
print(grid_knnb.best_score)
print(grid_knnb.best_params)

{'rmse': 0.08124346935615659, 'mae': 0.050019257075632616}
{'rmse': {'name': 'cosine', 'min_k': 6, 'min_support': True, 'user_based': True}, 'mae': {'name': 'cosine', 'min_k': 3, 'min_support': True, 'user_based': True}}


In [11]:
algo_knnb = grid_knnb.best_estimator["rmse"] # Getting the best estimator with RMSE as the performance metric
algo_knnb.fit(data.build_full_trainset()) # "build_full_trainset()" method is used to create a training set from the entire dataset.

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x7fcf3c1c4e50>

#### Make recommendations

In [12]:
predictions = algo_knnb.test(testset)

#### Recommend 5 Restaurants to users

In [14]:
user_1 = 'BPKpLbR9NuWFAR9SUWpZOw'# ID of the user for whom recommendations are to be made
restaurants = review['business_id'].unique()
user_1_ratings = []
for restaurant in restaurants:
    user_1_ratings.append((restaurant, algo_knnb.predict(user_1, restaurant).est))

sorted_ratings = sorted(user_1_ratings, key=lambda x: x[1], reverse=True)

for i in range(10):
    business_id = sorted_ratings[i][0]
    name = review.loc[review['business_id'] == business_id, 'name'].iloc[0]
    print(f'{i+1}. {name}')

1. "Mesa Grill"
2. "International House of Food"
3. "Rokerij"
4. "Vic & Anthony's Steakhouse"
5. "Scarpetta"
6. "Pan Asian"
7. "EVO"
8. "Ocotillo"
9. "White Oaks"
10. "The Great Dane"


In [15]:
accuracy.mae(predictions)
accuracy.mse(predictions)
accuracy.rmse(predictions)
accuracy.fcp(predictions)

MAE:  0.0427
MSE: 0.0048
RMSE: 0.0696
FCP:  0.5267


0.5266590846274231

### 2. KNNWithMeans

In [16]:
parameters_knnm={
    'name':['cosine','pearson'],  # Similarity measure to be used
    'min_k': [3, 6, 9], # Minimum number of neighbors to consider
    'min_support': [True, False],  # Flag to enable/disable baseline subtraction
    'user_based':[True, False] # Flag to determine user-based or item-based collaborative 
}

In [17]:
# grid search 
grid_knnm=GridSearchCV(KNNWithMeans, param_grid=parameters_knnm,n_jobs=-1)
grid_knnm.fit(data)

In [18]:
print(grid_knnm.best_score)
print(grid_knnm.best_params)

{'rmse': 0.0831709285425499, 'mae': 0.04802412127749702}
{'rmse': {'name': 'cosine', 'min_k': 6, 'min_support': True, 'user_based': True}, 'mae': {'name': 'cosine', 'min_k': 9, 'min_support': True, 'user_based': True}}


In [19]:
algo_knnm = grid_knnm.best_estimator["rmse"]
algo_knnm.fit(data.build_full_trainset())

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7fcf0cdd9a60>

#### Make recommendations

In [20]:
predictions = algo_knnm.test(testset)

#### Recommend 5 Restaurants to users

In [21]:
user_1 = 'BPKpLbR9NuWFAR9SUWpZOw' # User for whom the recommendations are being generated
restaurants = review['business_id'].unique() # List of all unique restaurant IDs
user_1_ratings = []
for restaurant in restaurants:
    # Predicting the ratings for all the restaurants and appending them to a list
    user_1_ratings.append((restaurant, algo_knnm.predict(user_1, restaurant).est))

sorted_ratings = sorted(user_1_ratings, key=lambda x: x[1], reverse=True)

for i in range(5):
    business_id = sorted_ratings[i][0]
    # Extracting the name of the recommended restaurant using its business ID
    name = review.loc[review['business_id'] == business_id, 'name'].iloc[0]
    print(f'{i+1}. {name}')

1. "Papa Chevo's Taco Shop"
2. "Barro's Pizza"
3. "Nottingham Inn Kitchen & Creamery"
4. "Chun Fai Chinese Eatery"
5. "IHOP"


In [22]:
accuracy.mae(predictions)
accuracy.mse(predictions)
accuracy.rmse(predictions)
accuracy.fcp(predictions)

MAE:  0.0293
MSE: 0.0029
RMSE: 0.0541
FCP:  0.4973


0.49726533840965426

### 3. KNN With ZScore

In [23]:
parameters_knnz={
    'name':['cosine','pearson'],
    'min_k': [3, 6, 9],
    'min_support': [True, False],
    'user_based':[True, False]
}

In [24]:
grid_knnz=GridSearchCV(KNNWithZScore, param_grid=parameters_knnz,n_jobs=-1)
grid_knnz.fit(data)

In [25]:
print(grid_knnz.best_score)
print(grid_knnz.best_params)

{'rmse': 0.08348379929824526, 'mae': 0.04812953206133315}
{'rmse': {'name': 'cosine', 'min_k': 6, 'min_support': True, 'user_based': True}, 'mae': {'name': 'cosine', 'min_k': 9, 'min_support': True, 'user_based': True}}


In [26]:
algo_knnz = grid_knnz.best_estimator["rmse"]
algo_knnz.fit(data.build_full_trainset())

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithZScore at 0x7fcf0d002520>

#### Make recommendations

In [27]:
predictions = algo_knnz.test(testset)

#### Recommend 5 Restaurants to users

In [28]:
user_1 = 'BPKpLbR9NuWFAR9SUWpZOw'
restaurants = review['business_id'].unique()
user_1_ratings = []
for restaurant in restaurants:
    user_1_ratings.append((restaurant, algo_knnz.predict(user_1, restaurant).est))

sorted_ratings = sorted(user_1_ratings, key=lambda x: x[1], reverse=True)

for i in range(5):
    business_id = sorted_ratings[i][0]
    name = review.loc[review['business_id'] == business_id, 'name'].iloc[0]
    print(f'{i+1}. {name}')

1. "Papa Chevo's Taco Shop"
2. "Barro's Pizza"
3. "Nottingham Inn Kitchen & Creamery"
4. "Chun Fai Chinese Eatery"
5. "IHOP"


In [29]:
accuracy.mae(predictions)
accuracy.mse(predictions)
accuracy.rmse(predictions)
accuracy.fcp(predictions)

MAE:  0.0287
MSE: 0.0029
RMSE: 0.0542
FCP:  0.4930


0.4930366986411109

### 5. KNN Basic

In [30]:
parameters_knnbasic={
    'name':['cosine','pearson'],
    'min_k': [3, 6, 9],
    'min_support': [True, False],
    'user_based':[True, False]
}

In [31]:
# This code is performing grid search to find the best hyperparameters for the KNNBasic algorithm.
grid_knnbasic=GridSearchCV(KNNBasic, param_grid=parameters_knnbasic,n_jobs=-1)
# KNNBasic is the algorithm being used for collaborative filtering, parameters_knnbasic is the hyperparameter grid
# The grid search is performed on the data using the specified hyperparameter grid to find the best estimator.

grid_knnbasic.fit(data)

In [39]:
algo_knnbasic = grid_knnbasic.best_estimator["rmse"]

predictions = algo_knnbasic.fit(trainset).test(testset)
accuracy.mae(predictions)
accuracy.mse(predictions)
accuracy.rmse(predictions)
accuracy.fcp(predictions)


Computing the msd similarity matrix...
Done computing similarity matrix.
MAE:  0.0458
MSE: 0.0056
RMSE: 0.0746
FCP:  0.5253


0.5253082736525312

In [41]:
# Initializing the algorithm
algo = KNNBasic()
# Performing 5-fold cross-validation and calculating RMSE, FCP, MSE, and MAE measures
# for the algorithm using the given dataset
# verbose=False ensures that the output is not printed to the console
cross_validate(algo, data, measures=['RMSE', 'FCP', 'MSE', 'MAE'], cv=5, verbose=False)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


{'test_rmse': array([0.08433431, 0.08545874, 0.08614855, 0.08598624, 0.08540879]),
 'test_fcp': array([0.5616848 , 0.56688971, 0.58710803, 0.56705216, 0.55761779]),
 'test_mse': array([0.00711228, 0.0073032 , 0.00742157, 0.00739363, 0.00729466]),
 'test_mae': array([0.0508648 , 0.05066878, 0.0516045 , 0.05208684, 0.05148891]),
 'fit_time': (1.798865556716919,
  1.7698490619659424,
  1.8515751361846924,
  2.0308785438537598,
  1.927640676498413),
 'test_time': (0.14418530464172363,
  0.13407182693481445,
  0.24821758270263672,
  0.2420668601989746,
  0.13592839241027832)}

In [43]:
print(grid_knnbasic.best_score)
print(grid_knnbasic.best_params)

{'rmse': 0.08389916705217312, 'mae': 0.05296204111834677}
{'rmse': {'name': 'cosine', 'min_k': 3, 'min_support': True, 'user_based': True}, 'mae': {'name': 'cosine', 'min_k': 3, 'min_support': True, 'user_based': True}}


#### Make recommendations

In [44]:
predictions = algo_knnbasic.test(testset)

#### Recommend 5 Restaurants to users

In [45]:
# Specifying the user ID for which the recommendations need to be made
user_1 = 'BPKpLbR9NuWFAR9SUWpZOw'
# Extracting the unique business IDs from the reviews data
restaurants = review['business_id'].unique()
user_1_ratings = []
# Predicting the ratings for each restaurant and adding the (restaurant_id, rating) pair to user_1_ratings
for restaurant in restaurants:
    user_1_ratings.append((restaurant, algo.predict(user_1, restaurant).est))

sorted_ratings = sorted(user_1_ratings, key=lambda x: x[1], reverse=True)
# Displaying the top 5 recommended restaurants
for i in range(5):
    business_id = sorted_ratings[i][0]
    name = review.loc[review['business_id'] == business_id, 'name'].iloc[0]
    print(f'{i+1}. {name}')

1. "Black Bear Diner"
2. "The House Brasserie"
3. "T Cook's"
4. "Sapori D'Italia"
5. "Red White & Brew"


### Comparision of KNN Algorithms

In [46]:
# Importing the required libraries
import plotly.graph_objects as go
# Initializing the scores for each algorithm and evaluation metric
knn_basic_scores = [0.0484, 0.0062, 0.0790, 0.5175]
knn_baseline_scores = [0.0464, 0.0055, 0.0745, 0.5520]
knn_zscore_scores = [0.0357, 0.0038, 0.5415, 0.5415]
knn_means_scores = [0.0278, 0.0024, 0.0492, 0.5522]

# Specifying the evaluation metrics and algorithms to be compared
metrics = ['MAE', 'MSE', 'RMSE', 'FCP']
algorithms = ['KNN Basic', 'KNN Baseline', 'KNN with Z-Score', 'KNN with Means']

# Creating a heatmap with the scores for each algorithm and evaluation metric
data = [
    go.Heatmap(
        x=metrics,
        y=algorithms,
        z=[knn_basic_scores, knn_baseline_scores, knn_zscore_scores, knn_means_scores],
        colorscale='Viridis'
    )
]

# Specifying the layout of the plot
layout = go.Layout(
    title='Performance Comparison of KNN Algorithms',
    xaxis=dict(title='Evaluation Metric'),
    yaxis=dict(title='KNN Algorithm')
)

# Creating the plot with the specified data and layout
fig = go.Figure(data=data, layout=layout)

fig.show()

In [47]:
# Creating scatter plot data for each algorithm
knn_basic_data = go.Scatter(x=metrics, y=knn_basic_scores, mode='lines+markers', name='KNN Basic')
knn_baseline_data = go.Scatter(x=metrics, y=knn_baseline_scores, mode='lines+markers', name='KNN Baseline')
knn_zscore_data = go.Scatter(x=metrics, y=knn_zscore_scores, mode='lines+markers', name='KNN with Z-Score')
knn_means_data = go.Scatter(x=metrics, y=knn_means_scores, mode='lines+markers', name='KNN with Means')

# Specifying the layout of the plot
layout = go.Layout(
    title='Performance Comparison of KNN Algorithms',
    xaxis=dict(title='Evaluation Metric'),
    yaxis=dict(title='Mean Score')
)

# Creating the plot with the specified data and layout
fig = go.Figure(data=[knn_basic_data, knn_baseline_data, knn_zscore_data, knn_means_data], layout=layout)

fig.show()

## 2. NMF

### 1. SVD

In [50]:
parameters_svd={
    'n_factors':[20,50,80],
    'reg_all': [0.04, 0.06, 0.10],
    'n_epochs': [10, 20, 30, 50, 100],
    'lr_all':[0.002, 0.005, 0.01]
}

In [53]:
# Create a GridSearchCV object to perform a grid search on the SVD algorithm with the specified hyperparameters
grid_svd=GridSearchCV(SVD, param_grid=parameters_svd,n_jobs=-1)
# Fit the GridSearchCV object to the data to perform the grid search and find the best set of hyperparameters
grid_svd.fit(data)

In [54]:
print(grid_svd.best_score)
print(grid_svd.best_params)

{'rmse': 0.08124887839910602, 'mae': 0.04918536892075037}
{'rmse': {'n_factors': 50, 'reg_all': 0.1, 'n_epochs': 100, 'lr_all': 0.01}, 'mae': {'n_factors': 50, 'reg_all': 0.1, 'n_epochs': 100, 'lr_all': 0.01}}


In [55]:
algo_svd = grid_svd.best_estimator["rmse"]
algo_svd.fit(data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fcf0d120280>

In [56]:
cross_validate(algo_svd, data, measures=['RMSE', 'FCP', 'MSE', 'MAE'], cv=5, verbose=False)

{'test_rmse': array([0.07997775, 0.08053537, 0.08318084, 0.07984584, 0.08262086]),
 'test_fcp': array([0.53325904, 0.51191617, 0.50430627, 0.50566666, 0.51123288]),
 'test_mse': array([0.00639644, 0.00648595, 0.00691905, 0.00637536, 0.00682621]),
 'test_mae': array([0.04828102, 0.04919163, 0.05101161, 0.04849256, 0.05027775]),
 'fit_time': (1.8486266136169434,
  1.8385465145111084,
  1.8329989910125732,
  2.0512070655822754,
  2.850985050201416),
 'test_time': (0.044441938400268555,
  0.26224231719970703,
  0.04266929626464844,
  0.07973051071166992,
  0.2905843257904053)}

#### Make recommendations

In [57]:
predictions = algo_svd.test(testset)

#### Recommend 5 Restaurants to users

In [58]:
user_1 = 'BPKpLbR9NuWFAR9SUWpZOw'
restaurants = review['business_id'].unique()
user_1_ratings = []
# Creates a list of unique restaurant IDs from the review DataFrame and assigns it to the variable restaurants.
for restaurant in restaurants:
    user_1_ratings.append((restaurant, algo_svd.predict(user_1, restaurant).est))

sorted_ratings = sorted(user_1_ratings, key=lambda x: x[1], reverse=True)

# Sorts the user_1_ratings list in descending order based on the predicted rating for each restaurant.
for i in range(5):
    business_id = sorted_ratings[i][0]
    name = review.loc[review['business_id'] == business_id, 'name'].iloc[0]
    print(f'{i+1}. {name}')

1. "Charlies  Lakeside Restaurant & Lounge"
2. "Sonic Drive-In"
3. "Metro Diner"
4. "Sid's Cafe"
5. "Rosita's Place"


In [59]:
accuracy.mae(predictions)
accuracy.mse(predictions)
accuracy.rmse(predictions)
accuracy.fcp(predictions)

MAE:  0.0273
MSE: 0.0023
RMSE: 0.0478
FCP:  0.5353


0.5353064850006839

### 2. SVDPP

##### 1. Using Implicit Feeback (Checkins)

In [60]:
checkin_df = pd.read_csv("drive/MyDrive/DIC/yelp_checkin.csv")

In [61]:
checkin_df = checkin_df.drop(['weekday','hour'], axis=1)
checkin_df

Unnamed: 0,business_id,checkins
0,3Mc-LxcqeguOXOVT_2ZtCg,12
1,SVFx6_epO22bZTZnKwlX7g,4
2,vW9aLivd4-IorAfStzsHww,1
3,tEzxhauTQddACyqdJ0OPEQ,1
4,CEyZU32P-vtMhgqRCaXzMA,1
...,...,...
3911213,CZKHXlDuy3IagC2W881fyA,7
3911214,mUWE-uNGyCiifmboWbyMqQ,1
3911215,nFR7dDedxRuBeZz_6Cdalg,1
3911216,y3YRUsh8FVih0AhYLx-DWA,1


In [62]:
checkin_df = checkin_df.groupby('business_id')['checkins'].sum()

In [63]:
business_checkin_df = pd.merge(review, checkin_df, on='business_id', how='left')

In [64]:
business_checkin_df.fillna(0, inplace=True)
business_checkin_df
print(business_checkin_df['checkins'].max())
print(business_checkin_df['checkins'].min())

16322.0
0.0


In [65]:
# Define the Reader object with the rating scale
reader_checkin = Reader(rating_scale=(0, 32393))
# Load the check-in data from the business_checkin_df dataframe into the Dataset object using the reader
data_checkins = Dataset.load_from_df(business_checkin_df[['user_id', 'business_id', 'checkins']], reader_checkin)
# Split the data into training and testing sets
trainset_checkin, testset_checkin = train_test_split(data, test_size=0.2)
# Define the SVD++ algorithm
algo = SVDpp()
# Train the algorithm on the training set
algo.fit(trainset_checkin)

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x7fcf02596100>

###### Make recommendations

In [66]:
predictions = algo.test(testset_checkin)

###### Recommend 5 Restaurants to users

In [67]:
user_1 = 'BPKpLbR9NuWFAR9SUWpZOw'
# Unique list of restaurants
restaurants = review['business_id'].unique()
# List to store predicted ratings for each restaurant for user_1
user_1_ratings = []
for restaurant in restaurants:
        # Append the predicted rating of the restaurant for user_1
    user_1_ratings.append((restaurant, algo.predict(user_1, restaurant).est))

# Sort the restaurants by their predicted ratings for user_1
sorted_ratings = sorted(user_1_ratings, key=lambda x: x[1], reverse=True)

# Print the top 5 restaurants with the highest predicted ratings for user_1
for i in range(5):
    business_id = sorted_ratings[i][0]
    name = review.loc[review['business_id'] == business_id, 'name'].iloc[0]
    print(f'{i+1}. {name}')

1. "Bachi Burger"
2. "Grand Café"
3. "Sonic Drive-In"
4. "Carmine's Pizza Kitchen"
5. "Metro Diner"


In [68]:
accuracy.mae(predictions)
accuracy.mse(predictions)
accuracy.rmse(predictions)
accuracy.fcp(predictions)

MAE:  0.0572
MSE: 0.0076
RMSE: 0.0870
FCP:  0.4860


0.4859530535862487

##### 2. Using Explicit Feedback (Ratings)

###### Make recommendations

In [69]:
parameters={
    'n_factors':[20,50],
    'reg_all': [0.04, 0.06],
    'n_epochs': [10, 20],
    'lr_all':[0.002, 0.005]
}

In [70]:
# Perform grid search to find the best set of hyperparameters
grid=GridSearchCV(SVDpp, param_grid=parameters,n_jobs=-1)
grid.fit(data)

In [71]:
print(grid.best_score)
print(grid.best_params)

{'rmse': 0.0871196438122511, 'mae': 0.056399116698772164}
{'rmse': {'n_factors': 20, 'reg_all': 0.06, 'n_epochs': 20, 'lr_all': 0.005}, 'mae': {'n_factors': 20, 'reg_all': 0.06, 'n_epochs': 20, 'lr_all': 0.005}}


In [72]:
algo = grid.best_estimator["rmse"]
algo.fit(data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x7fcf0cddd700>

In [73]:
predictions = algo.test(testset)

###### Recommend 5 Restaurants to users

In [74]:
# set user_1's id and get all unique business ids
user_1 = 'BPKpLbR9NuWFAR9SUWpZOw'
restaurants = review['business_id'].unique()
# create a list of tuples with restaurant ids and predicted ratings for user_1
user_1_ratings = []
for restaurant in restaurants:
    user_1_ratings.append((restaurant, algo.predict(user_1, restaurant).est))

# sort the list of predicted ratings in descending order
sorted_ratings = sorted(user_1_ratings, key=lambda x: x[1], reverse=True)

# print the top 5 recommended restaurants with their names
for i in range(5):
    business_id = sorted_ratings[i][0]
    name = review.loc[review['business_id'] == business_id, 'name'].iloc[0]
    print(f'{i+1}. {name}')

1. "Papa Chevo's Taco Shop"
2. "Barro's Pizza"
3. "Bachi Burger"
4. "Capo's Italian Cuisine"
5. "The Peppermill Restaurant & Fireside Lounge"


In [75]:
accuracy.mae(predictions)
accuracy.mse(predictions)
accuracy.rmse(predictions)
accuracy.fcp(predictions)

MAE:  0.0467
MSE: 0.0050
RMSE: 0.0709
FCP:  0.5278


0.5278499433527367

### 3. NMF

In [76]:
parameters={
    'n_factors':[20,50, 80],
    'n_epochs': [10, 20, 30]
}

In [77]:
# Perform grid search to find the best set of hyperparameters
grid=GridSearchCV(NMF, param_grid=parameters,n_jobs=-1)
grid.fit(data)

In [78]:
print(grid.best_score)
print(grid.best_params)

{'rmse': 0.09001170938603195, 'mae': 0.04905255883377468}
{'rmse': {'n_factors': 50, 'n_epochs': 10}, 'mae': {'n_factors': 50, 'n_epochs': 10}}


In [79]:
algo = grid.best_estimator["rmse"]
algo.fit(data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x7fcf014e0280>

In [80]:
algo = NMF()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x7fcf014e0ac0>

#### Make recommendations

In [81]:
predictions = algo.test(testset)

#### Recommend 5 Restaurants to users

In [82]:
# set user_1's id and get all unique business ids
user_1 = 'BPKpLbR9NuWFAR9SUWpZOw'
restaurants = review['business_id'].unique()
# create a list of tuples with restaurant ids and predicted ratings for user_1
user_1_ratings = []
for restaurant in restaurants:
    user_1_ratings.append((restaurant, algo.predict(user_1, restaurant).est))

# sort the list of predicted ratings in descending order
sorted_ratings = sorted(user_1_ratings, key=lambda x: x[1], reverse=True)

# print the top 5 recommended restaurants with their names
for i in range(5):
    business_id = sorted_ratings[i][0]
    name = review.loc[review['business_id'] == business_id, 'name'].iloc[0]
    print(f'{i+1}. {name}')

1. "Panda Express"
2. "Red Robin Gourmet Burgers"
3. "Nara Hookah Lounge & Bar"
4. "Hong Phat Restaurant"
5. "Denny's"


In [83]:
accuracy.mae(predictions)
accuracy.mse(predictions)
accuracy.rmse(predictions)
accuracy.fcp(predictions)

MAE:  0.0644
MSE: 0.0048
RMSE: 0.0690
FCP:  0.5550


0.5549707191294283

### Comparing Performance of NMF techniques

In [84]:
import plotly.graph_objects as go

# Define the names of the algorithms
algorithm_names = ['SVD', 'SVDpp', 'NMF']
# Define the values for the different evaluation metrics
mae_values = [0.0439, 0.0506, 0.0452]
mse_values = [0.0045, 0.0059, 0.0072]
rmse_values = [0.0670, 0.0766, 0.0851]
fcp_values = [0.5509, 0.5359, 0.2291]

# Create a new plot
fig = go.Figure()
# Add the different bars to the plot
fig.add_trace(go.Bar(x=algorithm_names, y=rmse_values, name='RMSE'))
fig.add_trace(go.Bar(x=algorithm_names, y=mse_values, name='MSE'))
fig.add_trace(go.Bar(x=algorithm_names, y=mae_values, name='MAE'))
fig.add_trace(go.Bar(x=algorithm_names, y=fcp_values, name='FCP'))

# Update the layout of the plot
fig.update_layout(
    title='Comparison of evaluation metrics for different algorithms',
    xaxis_title='Algorithm',
    yaxis_title='Value')

# Show the plot
fig.show()

## 3. BaselineOnly

In [85]:
parameters = {'bsl_options': {'method': ['als', 'sgd'],
                              'reg_u': [10, 20, 50],
                              'reg_i': [5, 10, 15],
                              'n_epochs': [100, 150, 200],
                              'learning_rate': [0.001, 0.01, 0.1]}}

In [86]:
# performing grid search
grid=GridSearchCV(BaselineOnly, param_grid=parameters,n_jobs=-1)
grid.fit(data)

In [87]:
print(grid.best_score)
print(grid.best_params)

{'rmse': 0.08069920279896112, 'mae': 0.049451412982127775}
{'rmse': {'bsl_options': {'method': 'sgd', 'reg_u': 10, 'reg_i': 5, 'n_epochs': 200, 'learning_rate': 0.001}}, 'mae': {'bsl_options': {'method': 'sgd', 'reg_u': 10, 'reg_i': 5, 'n_epochs': 200, 'learning_rate': 0.001}}}


In [88]:
algo = grid.best_estimator["rmse"]
algo.fit(data.build_full_trainset())

Estimating biases using sgd...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x7fcf000bb130>

#### Make recommendations

In [89]:
predictions = algo.test(testset)

#### Recommend 5 Restaurants to users

In [90]:
# set user_1's id and get all unique business ids
user_1 = 'BPKpLbR9NuWFAR9SUWpZOw'
restaurants = review['business_id'].unique()
# create a list of tuples with restaurant ids and predicted ratings for user_1
user_1_ratings = []
for restaurant in restaurants:
    user_1_ratings.append((restaurant, algo.predict(user_1, restaurant).est))

# sort the list of predicted ratings in descending order

sorted_ratings = sorted(user_1_ratings, key=lambda x: x[1], reverse=True)

# print the top 5 recommended restaurants with their names
for i in range(5):
    business_id = sorted_ratings[i][0]
    name = review.loc[review['business_id'] == business_id, 'name'].iloc[0]
    print(f'{i+1}. {name}')

1. "El Torito Taqueria Bar"
2. "Pier 54"
3. "Novanta"
4. "Rokerij"
5. "Giuseppe's Italian Grille"


In [91]:
accuracy.mae(predictions)
accuracy.mse(predictions)
accuracy.rmse(predictions)
accuracy.fcp(predictions)

MAE:  0.0386
MSE: 0.0038
RMSE: 0.0615
FCP:  0.5331


0.5331302056590838

## Comparing all Collaborative Filtering Models

In [93]:
import plotly.graph_objects as go
import pandas as pd
from plotly.subplots import make_subplots

# Define the values for the different evaluation metrics
data = {'test_rmse': [0.0928, 0.0755, 0.08692292],
        'test_mae': [0.0567, 0.0480, 0.05766862],
        'test_fcp': [0.5609, 0.5379, 0.63493503],
        'fit_time': [1.8923487663269043, 1.9373219013214111, 2.66087007522583],
        'test_time': [0.11574029922485352, 0.0867300033569336, 0.2719612121582031]}

# Define the names of the algorithms
index = ['SVD', 'BaselineOnly', 'KNNBasic']
df = pd.DataFrame(data=data, index=index)

fig = make_subplots(rows=2, cols=2, subplot_titles=('test_rmse', 'test_mae', 'test_fcp', 'fit_time'))
# Add the different bars to the plot
fig.add_trace(go.Bar(x=df.index, y=df['test_rmse'], name='RMSE'), row=1, col=1)
fig.add_trace(go.Bar(x=df.index, y=df['test_mae'], name='MAE'), row=1, col=2)
fig.add_trace(go.Bar(x=df.index, y=df['test_fcp'], name='FCP'), row=2, col=1)
fig.add_trace(go.Bar(x=df.index, y=df['fit_time'], name='Fit Time'), row=2, col=2)

# Update the layout of the plot
fig.update_layout(height=600, width=800, title_text="Performance Metrics by Algorithm")
fig.show()

## References - 

https://towardsdatascience.com/building-and-testing-recommender-systems-with-surprise-step-by-step-d4ba702ef80b
https://surprise.readthedocs.io/