# **Recommendation System Model**

## *Imports*

In [None]:
!pip install pandas
!pip install numpy
!pip install statistics
!pip install sklearn
!pip install scipy
!pip install seaborn
!pip install matplotlib
!pip install tqdm
!pip install surprise
!pip install lightfm
!pip install gensim
!pip install pickle
!pip install ./scikit-learn

In [None]:
import pandas as pd
import numpy as np
import statistics

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import pickle
import gensim

from typing import List

from sklearn.neighbors import BallTree
from sklearn.decomposition import NMF, MiniBatchNMF
from sklearn.model_selection import train_test_split
from sklearn.utils.extmath import randomized_svd
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.impute import SimpleImputer

from scipy.sparse import csr_matrix, coo_matrix
from scipy.sparse.linalg import svds
from surprise import Dataset, Reader

from gensim.models import Word2Vec, KeyedVectors

from smart_substitution_model import df_recipe

## **Data Set: User Interactions**

### Data Cleaning

The provided dataset contains a split for the test, train, and validation sets. Data contains dated user interactions with recipes found on food.com.

In [None]:
# Load the three pre-split interaction files.
split_paths = (
    'data/interactions_test.csv',
    'data/interactions_train.csv',
    'data/interactions_validation.csv',
)
test, train, validation = (pd.read_csv(path) for path in split_paths)

In [None]:
dataframes = [test, train, validation]

for df in dataframes:
    # Move the 'u' and 'i' columns to positions 0 and 1 respectively.
    for position, key in enumerate(('u', 'i')):
        df.insert(position, key, df.pop(key))
    # Drop the original id columns and the date; this mutates each frame in place.
    df.drop(columns=['user_id', 'date', 'recipe_id'], inplace=True)

In [None]:
# Rename 'u' -> 'user_id' and 'i' -> 'recipe_id' on every split.
id_column_names = {'u': 'user_id', 'i': 'recipe_id'}

train = train.rename(columns=id_column_names)
test = test.rename(columns=id_column_names)
validation = validation.rename(columns=id_column_names)

In [None]:
# Stack the three splits row-wise; each split keeps its own row labels.
combined = pd.concat(objs=[test, train, validation], axis='index')

### Trim Data

To reduce the sparsity of the rating matrix, users with fewer than a specified number of reviews were removed from the dataset. New train, test, and validation datasets were then created from the remaining interactions.

In [None]:
# Users with fewer than MIN_REVIEWS interactions are removed to reduce the
# sparsity of the rating matrix.
MIN_REVIEWS = 500

user_counts = combined['user_id'].value_counts().sort_index()
bad_users = user_counts[user_counts < MIN_REVIEWS].index.tolist()

# The number of remaining users is derived from the data instead of a
# hard-coded total, so the message stays correct if the input files change.
print(str(len(bad_users)) + " --> new dataset: " + str(len(user_counts) - len(bad_users)))

In [None]:
# Keep only interactions from users that were not flagged, then drop rows with missing values.
is_kept_user = ~combined['user_id'].isin(bad_users)
trimmed = combined.loc[is_kept_user].dropna()
print(trimmed.shape)

In [None]:
# Hold out 15% of the trimmed interactions for testing, then 15% of the
# remaining rows for validation. The fixed random_state keeps the splits reproducible.
train, test = train_test_split(trimmed, test_size=0.15, random_state=36)
train, validation = train_test_split(train, test_size=0.15, random_state=36)

print(f'total size: {len(train) + len(test) + len(validation)}')
print(f'train: {train.shape}')
print(f'test: {test.shape}')
print(f'validation: {validation.shape}')

### Create Rating Matrices

In [None]:
# train: users as rows, recipes as columns, ratings as values
rating_matrix = train.pivot_table(values='rating', index='user_id', columns='recipe_id')

In [None]:
# validate: users as rows, recipes as columns, ratings as values
rating_matrix_v = validation.pivot_table(values='rating', index='user_id', columns='recipe_id')

In [None]:
# Recipes that appear as columns in both the training and the validation matrix.
training_cols = set(rating_matrix.columns)
validation_cols = set(rating_matrix_v.columns)
common = list(training_cols & validation_cols)

In [None]:
# Restrict both matrices to the shared recipe columns so their shapes line up.
rating_matrix, rating_matrix_v = (
    matrix.reindex(columns=common, fill_value=0)
    for matrix in (rating_matrix, rating_matrix_v)
)

In [None]:
# Put the recipe columns of both matrices into the same ascending order.
rating_matrix, rating_matrix_v = (
    matrix.reindex(columns=sorted(matrix.columns))
    for matrix in (rating_matrix, rating_matrix_v)
)

In [None]:
# Plain NumPy views of the two rating matrices for the estimator.
rating_matrix_array = rating_matrix.to_numpy()
rating_matrix_array_v = rating_matrix_v.to_numpy()

### Visualize Rating Matrices

In [None]:
# Heatmap of the first 250 rows of the training rating matrix.
fig, ax = plt.subplots()
sns.heatmap(rating_matrix_array[:250], cmap='YlGnBu', ax=ax)
ax.set_title('Rating Matrix - Train')
ax.set_xlabel('Recipe')
ax.set_ylabel('User')
plt.show()

In [None]:
# Heatmap of the first 250 rows of the validation rating matrix.
fig, ax = plt.subplots()
sns.heatmap(rating_matrix_array_v[:250], cmap='YlGnBu', ax=ax)
ax.set_title('Rating Matrix - Validation')
ax.set_xlabel('Recipe')
ax.set_ylabel('User')
plt.show()

## Create Model

In [None]:
def create_model(n_components=20, alpha_W=0.0, alpha_H='same', l1_ratio=0.0):
    """Build an unfitted NMF estimator with this notebook's fixed solver settings.

    Args:
        n_components: Value forwarded to ``NMF(n_components=...)``.
        alpha_W: Value forwarded to ``NMF(alpha_W=...)``.
        alpha_H: Value forwarded to ``NMF(alpha_H=...)``.
        l1_ratio: Value forwarded to ``NMF(l1_ratio=...)``.

    Returns:
        The configured, not yet fitted, ``NMF`` instance.
    """
    # Settings that are identical for every model built in the experiments.
    fixed_settings = {
        'init': 'random',
        'solver': 'mu',
        'beta_loss': 'frobenius',
        'tol': 1e-4,
        'max_iter': 1000,
        'random_state': 10,
        'verbose': 0,
        'shuffle': False,
    }
    return NMF(
        n_components=n_components,
        alpha_W=alpha_W,
        alpha_H=alpha_H,
        l1_ratio=l1_ratio,
        **fixed_settings,
    )

In [None]:
def run_model(model):
    """Fit ``model`` on the training ratings and return its validation RMSE.

    The model is fitted on the notebook-level ``rating_matrix_array``; the
    validation matrix ``rating_matrix_array_v`` is then projected with
    ``model.transform`` and reconstructed with the fitted ``components_``.

    The error is measured between the actual validation ratings and the
    predictions, using only the entries that carry a rating (non-NaN). The
    previous version compared the predictions with the 0/1 "is rated" mask,
    which is not a rating error.

    Args:
        model: Estimator exposing ``fit_transform``, ``transform`` and
            ``components_`` (for example the result of ``create_model``).

    Returns:
        Root-mean-squared error over the observed validation ratings. The
        result is NaN when the validation matrix has no observed entry.
    """
    # Fitting populates model.components_ (the recipe factors).
    model.fit_transform(rating_matrix_array)
    recipe_features = model.components_

    val_set_transformed = model.transform(rating_matrix_array_v)
    predicted_val_ratings = np.dot(val_set_transformed, recipe_features)

    # Score only the (user, recipe) pairs that actually have a validation rating.
    observed = ~np.isnan(rating_matrix_array_v)
    errors = rating_matrix_array_v[observed] - predicted_val_ratings[observed]
    rmse = np.sqrt(np.mean(np.square(errors)))

    return rmse

In [None]:
# default model
# The settings stay as notebook-level names because a later cell reads one of
# them (`beta_loss`) when building the grid-search estimator.
n_components = 2
init, solver, beta_loss = 'random', 'mu', 'frobenius'
tol, max_iter, random_state = 1e-4, 1000, 10
alpha_W, alpha_H, l1_ratio = 0.0, 'same', 0.0
verbose, shuffle = 0, False

model = NMF(
    n_components=n_components,
    init=init,
    solver=solver,
    beta_loss=beta_loss,
    tol=tol,
    max_iter=max_iter,
    random_state=random_state,
    alpha_W=alpha_W,
    alpha_H=alpha_H,
    l1_ratio=l1_ratio,
    verbose=verbose,
    shuffle=shuffle,
)

In [None]:
# features
# fit_transform() fits the default model on the training matrix and returns the
# transformed training data, kept here as the user factors. The fitted
# components_ are kept as the recipe factors.
# NOTE(review): the training matrix comes from a pivot table, so unrated pairs
# are presumably NaN - confirm the installed scikit-learn build accepts them.
user_features = model.fit_transform(rating_matrix_array)
recipe_features = model.components_

In [None]:
# transform
# Project the validation matrix with the model fitted on the training data.
val_set_transformed = model.transform(rating_matrix_array_v)

In [None]:
# predict: reconstruct validation ratings as (validation user factors) x (recipe factors)
predicted_val_ratings = val_set_transformed @ recipe_features

In [None]:
# make mask: 1 where the validation matrix holds a rating, 0 where it is NaN
rating_matrix_v_masked = (~np.isnan(rating_matrix_array_v)).astype(int)

In [None]:
# visualize: first 100 rows of the predicted validation ratings, colour scale fixed to 0-5
fig, ax = plt.subplots()
sns.heatmap(predicted_val_ratings[:100], cmap='YlGnBu', vmin=0, vmax=5, ax=ax)
ax.set_title('Rating Matrix: Predictions')
ax.set_xlabel('Recipe')
ax.set_ylabel('User')
plt.show()

### Determination of Optimal Parameters + Evaluation

Here, GridSearchCV was employed, and the effect of individual parameters was also observed by refitting the model with a different value on each of several runs. The variables experimented with were:
- n_components
- alpha_H
- alpha_W
- l1_ratio

In [None]:
# evaluate
# Compare the predictions with the actual validation ratings, restricted to the
# entries that were rated (mask == 1). Comparing against the 0/1 mask itself
# does not measure rating error.
observed = rating_matrix_v_masked.astype(bool)
rmse = np.sqrt(mean_squared_error(rating_matrix_array_v[observed], predicted_val_ratings[observed]))
print('Calculated RMSE Value:', rmse)

In [None]:
# changing n_components
component_counts = list(range(1, 30))
rmse_list = []
reconstructed_errors = []

for num_comp in component_counts:
    model = create_model(n_components=num_comp)
    rmse_val = run_model(model)
    rmse_list.append(rmse_val)
    reconstructed_errors.append(model.reconstruction_err_)
    print('finished: #' + str(num_comp))

# Plot against the actual component counts. Plotting the bare list would put
# n_components=1 at x=0 and shift every reading by one.
plt.plot(component_counts, rmse_list)
plt.xlabel('n_components')
plt.ylabel('rmse')
plt.show()

plt.plot(component_counts, reconstructed_errors)
plt.xlabel('n_components')
plt.ylabel('reconst')
plt.show()

In [None]:
# changing alpha_H
# Geometric grid: 0.001, 0.04, 1.6, ... (each value is 40x the previous one).
alpha_H_values = [0.001 * 40 ** step for step in range(25)]
rmse_list = []
reconstructed_errors = []

for i, new_alpha_H in enumerate(alpha_H_values):
    model = create_model(alpha_H=new_alpha_H)
    rmse_val = run_model(model)
    rmse_list.append(rmse_val)
    reconstructed_errors.append(model.reconstruction_err_)
    print('finished: #' + str(i))

# Plot against the alpha values themselves; the grid is geometric, so the
# x-axis is logarithmic.
plt.plot(alpha_H_values, rmse_list)
plt.xscale('log')
plt.xlabel('alpha_H')
plt.ylabel('rmse')
plt.show()

plt.plot(alpha_H_values, reconstructed_errors)
plt.xscale('log')
plt.xlabel('alpha_H')
plt.ylabel('reconst')
plt.show()

In [None]:
# changing alpha_W
# Geometric grid: 0.001, 0.04, 1.6, ... (each value is 40x the previous one).
# create_model leaves alpha_H at 'same', so alpha_H follows alpha_W in this sweep.
alpha_W_values = [0.001 * 40 ** step for step in range(25)]
rmse_list = []
reconstructed_errors = []

for i, new_alpha_W in enumerate(alpha_W_values):
    model = create_model(alpha_W=new_alpha_W)
    rmse_val = run_model(model)
    rmse_list.append(rmse_val)
    reconstructed_errors.append(model.reconstruction_err_)
    print('finished: #' + str(i))

# Plot against the alpha values themselves; the grid is geometric, so the
# x-axis is logarithmic.
plt.plot(alpha_W_values, rmse_list)
plt.xscale('log')
plt.xlabel('alpha_W')
plt.ylabel('rmse')
plt.show()

plt.plot(alpha_W_values, reconstructed_errors)
plt.xscale('log')
plt.xlabel('alpha_W')
plt.ylabel('reconst')
plt.show()

In [None]:
# changing l1_ratio
# Explicit grid 0.0, 0.05, ..., 0.95, built without accumulating float steps.
# NOTE(review): create_model is called with its default alpha_W=0.0 and
# alpha_H='same'. With both regularisation strengths at zero, l1_ratio likely
# has no effect on the fit - confirm, and consider passing a non-zero alpha_W.
l1_ratio_values = [round(0.05 * step, 2) for step in range(20)]
rmse_list = []
reconstructed_errors = []

for i, new_l1_ratio in enumerate(l1_ratio_values):
    model = create_model(l1_ratio=new_l1_ratio)
    rmse_val = run_model(model)
    rmse_list.append(rmse_val)
    reconstructed_errors.append(model.reconstruction_err_)
    print('finished: #' + str(i))

# Plot against the l1_ratio values rather than the loop index.
plt.plot(l1_ratio_values, rmse_list)
plt.xlabel('l1_ratio')
plt.ylabel('rmse')
plt.show()

plt.plot(l1_ratio_values, reconstructed_errors)
plt.xlabel('l1_ratio')
plt.ylabel('reconst')
plt.show()

In [None]:
# GridSearchCV
# Parameter grid explored by the search.
model_params = {'n_components': [15,16,17,18,19,20,21,22,23,24,25],
                'beta_loss': ['frobenius', 'kullback-leibler'],
                'alpha_W': [0.001, 0.01, 0.1, 1],
                'alpha_H': [0.001, 0.01, 0.1, 1],
                'l1_ratio': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]}

# Settings shared by every candidate. n_components and beta_loss are supplied by
# the grid, so they are not set here. Passing `beta_loss` would make this cell
# depend on a variable defined in an earlier cell.
init = 'random'
solver = 'mu'
max_iter = 1000
random_state = 10
nmf_model = NMF(init=init, solver=solver, max_iter=max_iter, random_state=random_state)

In [None]:
def neg_reconstruction_rmse(estimator, X, y=None):
    """Score a fitted NMF by the negated RMSE of its reconstruction of ``X``.

    NMF has no ``predict`` method, so prediction-based scorer strings such as
    'neg_mean_squared_error' cannot be evaluated on it. This callable follows
    the ``scorer(estimator, X, y)`` signature accepted by GridSearchCV.

    Args:
        estimator: Fitted estimator exposing ``transform`` and ``components_``.
        X: Held-out matrix for the current fold; NaN entries are ignored.
        y: Unused; accepted for compatibility with the scorer signature.

    Returns:
        The negated RMSE over the non-NaN entries of ``X`` (greater is better).
    """
    reconstruction = np.dot(estimator.transform(X), estimator.components_)
    observed = ~np.isnan(X)
    errors = X[observed] - reconstruction[observed]
    return -np.sqrt(np.mean(np.square(errors)))


grid_search = GridSearchCV(nmf_model, model_params, scoring=neg_reconstruction_rmse)

In [None]:
# NMF is unsupervised, so the search is fitted on the training rating matrix
# itself and takes no target. The 0/1 mask and the earlier predictions are not
# training data for the factorisation.
grid_search.fit(rating_matrix_array)

print(grid_search.best_estimator_)
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", -grid_search.best_score_)

## Combined Model

Following data cleaning and finding optimal parameters for the NMF model, the recommendation system needs to be merged with the smart substitution algorithm. We can use `from smart_substitution_model import df_recipe` (as in the imports cell at the top of this notebook) to retrieve the dataframe with similarity scores.

In [None]:
# predicted_val_ratings was computed from the validation matrix, so its rows
# correspond to the validation users. Label them with the validation index and
# columns rather than the training ones.
predict_rating_matrix = pd.DataFrame(data=predicted_val_ratings, index=rating_matrix_v.index, columns=rating_matrix_v.columns)
# Long format: one row per (user, recipe) pair with its predicted rating.
final_predictions_df = predict_rating_matrix.stack().reset_index()
final_predictions_df.columns = ['user_id', 'recipe_id', 'rating']
final_predictions_df.head()

In [None]:
# Ask for a user id and select that user's predicted ratings.
demo_id = int(input('Input a number for a specified user: '))
is_demo_user = final_predictions_df['user_id'] == demo_id
df_user = final_predictions_df.loc[is_demo_user, ['recipe_id', 'rating']]
df_user.head()

In [None]:
# Turn the index of the imported frame into a regular 'recipe_id' column.
df_recipe = df_recipe.reset_index().rename(columns={'index': 'recipe_id'})
df_recipe.head()

In [None]:
# Join the user's predicted ratings with the recipe frame on recipe_id,
# then drop the 'ids' column.
final_rec = df_user.merge(df_recipe, on='recipe_id').drop(columns=['ids'])
final_rec.head(10)

In [None]:
# Scale the predicted rating by 1/5, then average it with the similarity score.
# The second lambda sees the already-scaled "rating" column.
final_rec = final_rec.assign(
    rating=lambda frame: frame["rating"] / 5,
    average=lambda frame: (frame["rating"] + frame["score"]) / 2,
)

## Final Top 10 Ratings

The two models were combined by using the user's predicted ratings from the NMF model and the similarity scores from the smart substitution model. We normalize each predicted rating by dividing it by 5 to bring it onto a 0-1 scale, then average the normalized rating and the similarity score to obtain a combined score. We then sort by this score to determine the top recommended recipes for the given user.

In [None]:
# Highest combined score first; show the ten best recipes.
final_rec = final_rec.sort_values("average", ascending=False)
final_rec.head(10)