In [1]:
# Clear every user-defined name from the interactive namespace; -f skips the confirmation prompt.
%reset -f

In [2]:
# Make sure the Python version is between 3.7 and 3.10 (inclusive).
# The version is printed for the record and, instead of relying on the reader
# to compare it by eye, a warning is raised when it falls outside that range.
import sys
import warnings

PY_MIN = (3, 7)
PY_MAX = (3, 10)

print(sys.version)

if not (PY_MIN <= sys.version_info[:2] <= PY_MAX):
    # Warn rather than abort: the notebook may still work, but it has not
    # been run on this interpreter version.
    warnings.warn(
        f"Python {sys.version_info.major}.{sys.version_info.minor} is outside "
        f"the expected range {PY_MIN[0]}.{PY_MIN[1]}-{PY_MAX[0]}.{PY_MAX[1]}."
    )

3.9.21 (main, Dec 11 2024, 16:35:24) [MSC v.1929 64 bit (AMD64)]


## Importing Libraries

In [None]:
#Standard library
from pathlib import Path

#Data import
import kagglehub as kh
import sqlite3 as s3

#Data Exploring
import pandas as pd
import numpy as np

#Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

#Data PreProcessing & Modeling
from surprise import Dataset, Reader
from surprise import SVD
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV
from surprise import accuracy
from collections import defaultdict
import pickle

## Loading Data

In [16]:
# Fetch the "snap/amazon-fine-food-reviews" dataset through kagglehub.
# The value returned by dataset_download is kept in `path` and printed as the
# location of the dataset files.
path = kh.dataset_download("snap/amazon-fine-food-reviews")
print("Path to dataset files:", path)
# (disabled) connection to the SQLite file expected inside the dataset folder:
# conn = s3.connect(path+'/database.sqlite')

Path to dataset files: C:\Users\saura\.cache\kagglehub\datasets\snap\amazon-fine-food-reviews\versions\2


In [None]:
# Read the whole Reviews table into a DataFrame.
# The connection is opened here (no earlier cell creates `conn`) and closed
# again as soon as the query has finished, so no handle is left dangling.
db_path = Path(path) / "database.sqlite"
if not db_path.is_file():
    # sqlite3.connect would silently create an empty database at a wrong path,
    # which then fails later with a confusing "no such table" error.
    raise FileNotFoundError(f"SQLite database not found: {db_path}")

conn = s3.connect(db_path)
try:
    df = pd.read_sql_query(""" SELECT * FROM Reviews""", conn)
finally:
    conn.close()

## Exploring Data

In [None]:
# Preview the first rows. A bare last expression lets Jupyter render the frame
# as a formatted table instead of the plain-text dump produced by print().
df.head()

In [None]:
# Column names, dtypes and non-null counts of the loaded table
df.info()

In [None]:
# Summary statistics of the loaded table
df.describe()

In [None]:
# Number of missing values in each column
df.isnull().sum()

## PreProcessing Data

In [None]:
#Drop null and duplicates
# Only the three columns used by the recommender are considered:
#  - a missing value in any other column is no reason to discard a rating;
#  - duplicates are judged per (user, product) pair, because comparing whole
#    rows would miss repeated ratings that differ in some other column
#    (presumably every row carries its own id -- TODO confirm).
# The first occurrence of each pair is kept. A new frame is assigned instead of
# mutating in place, so the cell can be re-run safely.
rating_cols = ['UserId', 'ProductId', 'Score']
df = (
    df.dropna(subset=rating_cols)
      .drop_duplicates(subset=['UserId', 'ProductId'], keep='first')
)

In [None]:
# Keep only the columns the recommender needs and give them the generic names
# used in the rest of the notebook. An explicit mapping ties each new name to
# its source column (no reliance on column order), and rename() returns a fresh
# frame, so later column assignments do not act on a slice of the original.
df = df[['UserId', 'ProductId', 'Score']].rename(
    columns={'UserId': 'user_id', 'ProductId': 'item_id', 'Score': 'rating'}
)

In [None]:
print(f"Dataset size after cleaning: {df.shape}")

# Cast all three columns in a single pass: ids as strings, ratings as floats.
df = df.astype({'user_id': str, 'item_id': str, 'rating': float})

# Retain only the users who wrote at least `min_reviews` reviews.
min_reviews = 50
user_counts = df['user_id'].value_counts()
active_users = user_counts.index[user_counts >= min_reviews]
df = df[df['user_id'].isin(active_users)]

# No item-level minimum-review filter is applied.

print(f"Dataset size after filtering: {df.shape}")

In [None]:
# Number of unique user ids and product ids left after filtering.
# The labels say "filtered data" because `df` has already been reduced to the
# users with enough reviews by the time this cell runs.
print('Number of unique USERS in filtered data = ', df['user_id'].nunique())
print('Number of unique ITEMS in filtered data = ', df['item_id'].nunique())

In [None]:
# The Reader is told the rating scale; its bounds are taken from the smallest
# and largest rating present in the frame.
rating_bounds = (df['rating'].min(), df['rating'].max())
reader = Reader(rating_scale=rating_bounds)

# Wrap the (user, item, rating) triples in a Surprise Dataset.
triple_cols = ['user_id', 'item_id', 'rating']
data = Dataset.load_from_df(df[triple_cols], reader)

## Data Visualization

In [None]:
# Bar chart: how often each rating value occurs, ordered by rating
rating_freq = df['rating'].value_counts().sort_index()

fig, ax = plt.subplots()
rating_freq.plot(kind='bar', ax=ax)
ax.set_title('Distribution of Ratings')
ax.set_xlabel('Rating')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Bar chart: the ten products with the largest number of ratings
top_items = df['item_id'].value_counts().nlargest(10)

fig, ax = plt.subplots()
top_items.plot(kind='bar', ax=ax)
ax.set_title('Top 10 Most Rated Products')
ax.set_xlabel('Item ID')
ax.set_ylabel('Number of Ratings')
plt.show()

In [None]:
### 1. User-Product Interaction Heatmap ###
# Users on the rows, products on the columns, ratings in the cells
# (pivot_table aggregates repeated pairs; pairs with no rating are filled with 0).
interaction_matrix = df.pivot_table(
    values='rating',
    index='user_id',
    columns='item_id',
    fill_value=0,
)

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(interaction_matrix, cmap="coolwarm", cbar=True, ax=ax)
ax.set_title('User-Product Interaction Heatmap')
ax.set_xlabel('Product ID')
ax.set_ylabel('User ID')
plt.show()

## Model Training

In [None]:
# Split into train (80%) and test (20%).
# random_state is fixed so the same split is produced on every run.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)


In [None]:
# Fit an SVD model with 50 latent factors on the training split;
# the seed is fixed so repeated runs give the same model.
svd_params = {'n_factors': 50, 'random_state': 42}
svd_model = SVD(**svd_params)
svd_model.fit(trainset)

In [None]:
# Predict a rating for every (user, item) pair held out in the test set
predictions = svd_model.test(testset)

# Calculate RMSE (Root Mean Square Error).
# verbose=False stops accuracy.rmse from printing its own "RMSE: ..." line,
# so the score is reported once, by the print below.
rmse = accuracy.rmse(predictions, verbose=False)
print(f"Test RMSE: {rmse}")

In [None]:
def get_top_n_recommendations(model, user_id, n=5, ratings=None, exclude_rated=True):
    """Return the `n` items with the highest predicted rating for one user.

    Parameters
    ----------
    model : object
        Fitted model exposing ``predict(user_id, item_id)`` whose result has
        an ``est`` attribute (the estimated rating).
    user_id :
        Raw user id, in the same form as the values of ``ratings['user_id']``.
    n : int, default 5
        Maximum number of recommendations; values below 1 yield an empty list.
    ratings : pandas.DataFrame, optional
        Frame with ``user_id`` and ``item_id`` columns supplying the candidate
        items. Defaults to the notebook-level ``df``.
    exclude_rated : bool, default True
        Skip items the user has already rated, so the list only contains
        products that are new to the user. Pass False to rank every item.

    Returns
    -------
    list of (item_id, estimated_rating) tuples, best first.
    """
    if ratings is None:
        ratings = df  # fall back to the ratings frame built earlier in the notebook

    candidates = ratings['item_id'].unique()
    if exclude_rated:
        already_rated = set(ratings.loc[ratings['user_id'] == user_id, 'item_id'])
        candidates = [item for item in candidates if item not in already_rated]

    scored = [(item, model.predict(user_id, item).est) for item in candidates]
    # max(n, 0) keeps a negative n from turning into a "drop the last k" slice.
    return sorted(scored, key=lambda pair: pair[1], reverse=True)[:max(n, 0)]

# Example: get the top 5 recommendations for a user that exists in the data.
# The trained model is `svd_model`, and user ids are strings after
# preprocessing, so an id taken from the frame is used instead of a
# hard-coded integer that cannot match any user.
example_user = df['user_id'].iloc[0]
recommendations = get_top_n_recommendations(svd_model, user_id=example_user, n=5)
print(recommendations)

In [None]:
# Hyper-parameter grid explored for SVD. random_state is pinned (same seed as
# the baseline model) so every candidate is initialised reproducibly.
param_grid = {
    'n_factors': [50, 100, 150],
    'n_epochs': [10, 20],
    'lr_all': [0.002, 0.005],
    'reg_all': [0.02, 0.1],
    'random_state': [42],
}

# 3-fold cross-validated search, scored by RMSE
grid_search = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
grid_search.fit(data)

print(f"Best CV RMSE: {grid_search.best_score['rmse']}")
print(f"Best parameters: {grid_search.best_params['rmse']}")

# Best model
# best_estimator is an SVD instance configured with the winning parameters but
# NOT yet trained; it has to be fitted before it can predict or be saved.
best_model = grid_search.best_estimator['rmse']
best_model.fit(data.build_full_trainset())


In [None]:
# Save the trained model
model_path = Path("./models/recommendation_model.pkl")
# open() does not create missing folders, so make sure ./models exists first.
model_path.parent.mkdir(parents=True, exist_ok=True)

with open(model_path, "wb") as model_file:
    pickle.dump(best_model, model_file)

print("Model saved successfully!")

In [None]:
# Restore the pickled model from disk.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open("./models/recommendation_model.pkl", mode="rb") as pkl_in:
    loaded_model = pickle.load(pkl_in)

print("Model loaded successfully!")