## Loading Data

In [1]:
import kagglehub as kh
import sqlite3 as s3

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
path = kh.dataset_download("snap/amazon-fine-food-reviews")
print("Path to dataset files:", path)
conn = s3.connect(path+'/database.sqlite')

Path to dataset files: C:\Users\saura\.cache\kagglehub\datasets\snap\amazon-fine-food-reviews\versions\2


## Exploring Data

In [20]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split, GridSearchCV
from surprise import SVD
from surprise.model_selection import cross_validate
from surprise import accuracy
from collections import defaultdict
import pickle

In [4]:
df = pd.read_sql_query(""" SELECT * FROM Reviews""", conn)

In [5]:
print(df.head())
print(df.info())

   Id   ProductId          UserId                      ProfileName  \
0   1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   
1   2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   
2   3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres "Natalia Corres"   
3   4  B000UA0QIQ  A395BORC6FGVXV                             Karl   
4   5  B006K2ZZ7K  A1UQRSCLF8GW1T    Michael D. Bigham "M. Wassir"   

   HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
0                     1                       1      5  1303862400   
1                     0                       0      1  1346976000   
2                     1                       1      4  1219017600   
3                     3                       3      2  1307923200   
4                     0                       0      5  1350777600   

                 Summary                                               Text  
0  Good Quality Dog Food  I have bought several of the Vitality canned d...  
1 

In [6]:
df = df[['UserId', 'ProductId', 'Score']]
df.columns = ['user_id', 'item_id', 'rating']
print(df.head())

          user_id     item_id  rating
0  A3SGXH7AUHU8GW  B001E4KFG0       5
1  A1D87F6ZCVE5NK  B00813GRG4       1
2   ABXLMWJIXXAIN  B000LQOCH0       4
3  A395BORC6FGVXV  B000UA0QIQ       2
4  A1UQRSCLF8GW1T  B006K2ZZ7K       5


In [7]:
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)
print(f"Dataset size after cleaning: {df.shape}")

df['user_id'] = df['user_id'].astype(str)
df['item_id'] = df['item_id'].astype(str)
df['rating'] = df['rating'].astype(float)

# Keep users with at least 5 reviews
min_reviews = 5
user_counts = df['user_id'].value_counts()
df = df[df['user_id'].isin(user_counts[user_counts >= min_reviews].index)]

# Keep products with at least 5 reviews
item_counts = df['item_id'].value_counts()
df = df[df['item_id'].isin(item_counts[item_counts >= min_reviews].index)]

print(f"Dataset size after filtering: {df.shape}")

Dataset size after cleaning: (562630, 3)
Dataset size after filtering: (189648, 3)


In [8]:
# Define the reader with the rating scale
reader = Reader(rating_scale=(1, 5))

# Load the dataset into Surprise format
data = Dataset.load_from_df(df[['user_id', 'item_id', 'rating']], reader)

# Split into train (80%) and test (20%)
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)


In [9]:
# Train SVD
svd_model = SVD()
svd_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x257117f5640>

In [10]:
# Make predictions on the test set
predictions = svd_model.test(testset)

# Calculate RMSE (Root Mean Square Error)
rmse = accuracy.rmse(predictions)
print(f"Test RMSE: {rmse}")

RMSE: 0.7911
Test RMSE: 0.7910868477497006


In [12]:
# Load data
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['user_id', 'item_id', 'rating']], reader)

# Train-test split
trainset, testset = train_test_split(data, test_size=0.2)

# Train SVD model
model = SVD()
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x257117e5790>

In [13]:
# Make predictions on the test set
predictions = model.test(testset)

# Evaluate RMSE
rmse = accuracy.rmse(predictions)

RMSE: 0.7974


In [15]:
def get_top_n_recommendations(model, user_id, n=5):
    all_products = df['item_id'].unique()
    predictions = [(item, model.predict(user_id, item).est) for item in all_products]
    top_n = sorted(predictions, key=lambda x: x[1], reverse=True)[:n]
    return top_n

# Example: Get top 5 recommendations for user 123
recommendations = get_top_n_recommendations(model, user_id=123, n=5)
print(recommendations)

[('B000ED9L9E', 4.850989917835667), ('B001ONPMN2', 4.781103115979346), ('B000EDG3UE', 4.773104683554928), ('B000H7ELTW', 4.772782748736063), ('B0054TWQMM', 4.7576378360660145)]


In [18]:
param_grid = {
    'n_factors': [50, 100, 150],
    'n_epochs': [10, 20],
    'lr_all': [0.002, 0.005],
    'reg_all': [0.02, 0.1]
}

grid_search = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
grid_search.fit(data)

# Best model
best_model = grid_search.best_estimator['rmse']


In [21]:
# Save the trained model
with open("recommendation_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

print("Model saved successfully!")

Model saved successfully!


In [22]:
# Load the trained model
with open("recommendation_model.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)

print("Model loaded successfully!")

Model loaded successfully!
