In [15]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns
import os
from dotenv import load_dotenv
from supabase import create_client
from surprise import (
    Dataset,
    KNNBaseline,
    KNNBasic,
    KNNWithMeans,
    KNNWithZScore,
    NMF,
    SVD,
    Reader,
)
from surprise.model_selection import cross_validate, KFold, GridSearchCV, train_test_split
from tabulate import tabulate
import datetime
import time


In [2]:
def initialising_supabase():
    """
    Initializes the Supabase client using environment variables.

    Environment Variables:
        SUPABASE_URL (str): The URL of the Supabase project.
        SUPABASE_API_KEY (str): The API key for accessing the Supabase project.

    Returns:
        supabase.Client: An initialized Supabase client instance.
    """

    load_dotenv()
    SUPABASE_URL = os.getenv("SUPABASE_URL")
    SUPABASE_API_KEY = os.getenv("SUPABASE_API_KEY")
    supabase = create_client(SUPABASE_URL, SUPABASE_API_KEY)
    return supabase

In [3]:
def load_product_data(supabase=None):
    """
    Initializes the MongoDB client and connects to the specified database.

    Environment Variables:
        MONGODB_URI (str): The connection URI for MongoDB.
        FLIPKART (str): The name of the MongoDB database to connect to.

    Returns:
        pymongo.database.Database: A MongoDB database instance.
    """

    if supabase is None:
        supabase = initialising_supabase()
    # Load data from the flipkart_cleaned table in supabase
    catalogue_data = supabase.table('flipkart_cleaned_2k').select('*').execute().data
 
    return catalogue_data
flipkart = load_product_data()

def load_order_data(supabase=None): 
    """
    Loads product data from the 'flipkart_cleaned_2k' table in Supabase.

    Args:
        supabase (supabase.Client, optional): An initialized Supabase client. 
                                              If not provided, a new client is initialized.

    Returns:
        list[dict]: A list of dictionaries containing product data from the Supabase table.
    """
    
    if supabase is None:
        supabase = initialising_supabase()
    users_data = pd.DataFrame(supabase.table('synthetic_v2_2k').select('*').execute().data)
    return users_data
orderdata = load_order_data()

### Memory Based Recommendations

In [4]:
np.random.seed(0)
random.seed(0)

In [5]:
reader = Reader(rating_scale=(0, 5))
dataset = Dataset.load_from_df(orderdata[['User ID' ,'uniq_id', 'User rating for the product']], reader)

In [6]:
memory_algos = (
    KNNBasic(verbose=False),
    KNNWithMeans(verbose=False),
    KNNBaseline(verbose=False),
    KNNWithZScore(verbose=False),
)

In [7]:
kf =  KFold(random_state=0)
table = []

#summary of all the memory based algorithms based on k-nearest neighbours
for algo in memory_algos:
    start = time.time()
    out = cross_validate(algo, dataset, ["rmse", "mae"], kf)
    algo_name = algo.__class__.__name__
    mean_rmse = "{:.3f}".format(np.mean(out["test_rmse"]))
    mean_mae = "{:.3f}".format(np.mean(out["test_mae"]))

    new_line = [algo_name, mean_rmse, mean_mae]
    table.append(new_line)

header = ["Memory-Based Algorithm", "RMSE", "MAE"]
print(tabulate(table, header, tablefmt="pipe"))

| Memory-Based Algorithm   |   RMSE |   MAE |
|:-------------------------|-------:|------:|
| KNNBasic                 |  1.15  | 0.914 |
| KNNWithMeans             |  1.179 | 0.938 |
| KNNBaseline              |  1.15  | 0.914 |
| KNNWithZScore            |  1.176 | 0.937 |


In [8]:
trainset, testset = train_test_split(dataset, test_size=0.3, random_state=42)

sim_options = {
    'name': 'pearson',
    'user_based': False,
    'min_support': 5
}

memory_model = KNNBaseline(k=50, min_k=5, sim_options=sim_options)
memory_model.fit(trainset)
memory_predictions = memory_model.test(testset)

Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.


### Non-Negative Matrix Factorisation

In [9]:
model_nmf = NMF()
cv_results_nmf = cross_validate(model_nmf, dataset, cv=3)
nmf_mean_rmse = np.mean(cv_results_nmf["test_rmse"])
nmf_mean_mae = np.mean(cv_results_nmf["test_mae"])
table.append(["NMF", nmf_mean_rmse, nmf_mean_mae])
pd.DataFrame(cv_results_nmf).mean()

test_rmse    1.210964
test_mae     0.970133
fit_time     0.289852
test_time    0.022789
dtype: float64

### Singular Value Decomposition

In [10]:
model_svd = SVD()
cv_results_svd = cross_validate(model_svd, dataset, cv=3)
svd_mean_rmse = np.mean(cv_results_svd["test_rmse"])
svd_mean_mae = np.mean(cv_results_svd["test_mae"])
table.append(["SVD", svd_mean_rmse, svd_mean_mae])
pd.DataFrame(cv_results_svd).mean()

test_rmse    1.006604
test_mae     0.809042
fit_time     0.131919
test_time    0.046473
dtype: float64

#### With grid-search to optimize further

In [16]:
param_grid = {'n_factors': [80,100,120],
              'n_epochs': [5, 10, 20],
              'lr_all': [0.002, 0.005],
              'reg_all': [0.2, 0.4, 0.6]}

gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(dataset)
best_rmse = gs.best_score['rmse']
best_mae = gs.best_score['mae']
best_rmse_params = gs.best_params['rmse']
table.append(["SVD (GridSearch)", best_rmse, best_mae])
print(best_rmse_params)

{'n_factors': 80, 'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.6}


### Overall Results

In [13]:
# Convert to DataFrame for a single table
results_df = pd.DataFrame(table, columns=["Algorithm", "Mean RMSE", "Mean MAE"])

# Print the table
print(results_df)

          Algorithm Mean RMSE  Mean MAE
0          KNNBasic     1.150     0.914
1      KNNWithMeans     1.179     0.938
2       KNNBaseline     1.150     0.914
3     KNNWithZScore     1.176     0.937
4               NMF  1.210964  0.970133
5               SVD  1.006604  0.809042
6  SVD (GridSearch)  0.982635  0.792074
