<a href="https://www.kaggle.com/code/shwetakolekar/retailrocket-recommender-system?scriptVersionId=163514301" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

import datetime
import calendar

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from surprise import Reader,Dataset,SVD
from surprise.model_selection import cross_validate,GridSearchCV
import random




* A recommender system is software designed to suggest items or content to users based on their preferences, behavior, or characteristics. The goal is to help users discover items they might be interested in, thereby enhancing the user experience, increasing engagement, and ultimately increasing sales. RetailRocket is an e-commerce platform that provides personalized recommendation solutions for e-commerce businesses.


* The Retailrocket dataset comes in three files:

    1. events.csv: This file contains the visitor-item interaction data
    2. item_properties.csv: This file contains item properties
    3. category_tree.csv: This file contains the category tree
    
* Here are the steps we perform in this model
    
    1. Understand the Dataset
    2. Exploratory Data Analysis
    3. Feature Engineering
    4. Data Preprocessing
    5. Tune Hyperparameters
    6. Train the SVD model
    7. Evaluate the Recommender System
    
    
*  Recommender systems come in two main types:
   1. Content based filtering
   2. Collaborative filtering
   
                                          
*  Here we use collaborative filtering, specifically the SVD matrix factorization technique. Matrix factorization decomposes the user-item matrix into two smaller matrices (user factors and item factors) whose product approximates the original.

    

# 1. Data Understanding

In [None]:

# Load the visitor-item interaction log and preview its first rows.
e_event=pd.read_csv('/kaggle/input/ecommerce-dataset/events.csv')
e_event.head()

In [None]:
# Convert the raw `timestamp` column into a datetime column.
# unit='ms' tells pandas to read the values as epoch milliseconds.

e_event['event_datetime']=pd.to_datetime(e_event['timestamp'],unit='ms')
e_event

In [None]:
# (number of rows, number of columns) of the events table.
e_event.shape

In [None]:
# Count missing values in each column of the events table.
e_event.isnull().sum()

In [None]:
# Number of rows per event type, returned as a DataFrame.
e_event.event.value_counts().reset_index()

In [None]:
# Check the time span covered by the dataset (earliest and latest event).

print('Start Date of Dataset: ' ,e_event['event_datetime'].min())
print('End Date of Dataset: ' ,e_event['event_datetime'].max())
      

In [None]:
# Concatenate the two item-property files into a single e_items frame.
# pd.concat keeps each part's own index here (no ignore_index), so index
# labels can repeat across the two parts.

e_items1=pd.read_csv('/kaggle/input/ecommerce-dataset/item_properties_part1.csv')
e_items2 = pd.read_csv('/kaggle/input/ecommerce-dataset/item_properties_part2.csv')
e_items=pd.concat([e_items1,e_items2])
e_items

In [None]:
# Parse the item-property `timestamp` column as epoch milliseconds.
e_items['event_datetime']=pd.to_datetime(e_items['timestamp'],unit='ms')
e_items

In [None]:
# Load the category tree; later cells use its `categoryid` and `parentid` columns.
e_category = pd.read_csv('/kaggle/input/ecommerce-dataset/category_tree.csv')
e_category

In [None]:
# (number of rows, number of columns) of the category tree.
e_category.shape

In [None]:
# First rows whose property is 'categoryid' and whose value is the string '570'.
e_items.loc[(e_items.property == 'categoryid')&(e_items.value== '570')].head()

#  2.Exploratory Data Analysis

In [None]:
# Left panel: distribution of rows by event type.
# Right panel: number of unique visitors per event type.

totalcases = e_event.shape[0]
sns.set_style('whitegrid')

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))
sns.histplot(x='event', data=e_event, bins=3, ax=ax1)
ax1.set_ylabel('Number of Events (Unit: million)')
ax1.set_title('Distribution by Event Type')

# Take each label and its count from the SAME value_counts() result.
# unique() lists events in order of first appearance while value_counts()
# is sorted by frequency, so zipping those two can put a percentage on
# the wrong bar.
event_counts = e_event['event'].value_counts()
for x, counts in zip(event_counts.index, event_counts.tolist()):
    text = str(round((counts / totalcases) * 100, 2)) + '%'
    ax1.text(x, counts, text, fontsize=12)


# unique number of visitors by event

events = e_event['event'].unique().tolist()
# The total number of distinct visitors does not depend on the event type,
# so compute it once instead of on every loop iteration.
total_visitors = len(e_event['visitorid'].unique())
unique_num = []
ratios = []
for event in events:
    uni_visit = len(e_event['visitorid'][e_event['event'] == event].unique())
    unique_num.append(uni_visit)
    ratios.append(uni_visit / total_visitors)

sns.barplot(x=events, y=unique_num, ax=ax2)

ratio_p = [str(round(r * 100, 2)) + '%' for r in ratios]
# Bars are drawn at positions 0..len(events)-1; enumerate avoids assuming
# there are exactly three event types.
for e, (c, r) in enumerate(zip(unique_num, ratio_p)):
    ax2.text(e, c, r, fontsize=12)

ax2.set_title('Unique Number of Visitors')
plt.show()

In [None]:
#Top 10 Viewed items
top_viewed_items = e_event[e_event['event'] == 'view']['itemid'].value_counts().head(10)

# Create the sized figure BEFORE plotting: plt.figure() always opens a new
# figure, so calling it after the bar plot leaves the plot at the default
# size and adds an empty extra figure.
plt.figure(figsize=(10, 6))
sns.barplot(x=top_viewed_items.index, y=top_viewed_items.values)
plt.title('Top 10 Viewed Items')
plt.xlabel('Item ID')
plt.ylabel('Number of Views')
plt.show()

In [None]:
#Top 10 sold items
top_sold_items = e_event[e_event['event'] == 'transaction']['itemid'].value_counts().head(10)

# Create the sized figure BEFORE plotting: plt.figure() always opens a new
# figure, so calling it after the bar plot leaves the plot at the default
# size and adds an empty extra figure.
plt.figure(figsize=(10, 6))
sns.barplot(x=top_sold_items.index, y=top_sold_items.values)
plt.title('Top 10 sold Items')
plt.xlabel('Item ID')
plt.ylabel('Number of transaction')
plt.show()

In [None]:
#Top 10 added_to_cart items
top_added_to_cart_items = e_event[e_event['event'] == 'addtocart']['itemid'].value_counts().head(10)

# Create the sized figure BEFORE plotting: plt.figure() always opens a new
# figure, so calling it after the bar plot leaves the plot at the default
# size and adds an empty extra figure.
plt.figure(figsize=(10, 6))
sns.barplot(x=top_added_to_cart_items.index, y=top_added_to_cart_items.values)
plt.title('Top 10 added_to_cart Items')
plt.xlabel('Item ID')
plt.ylabel('Number of addtocart')
plt.show()

In [None]:
# Top 10 most active users: the visitors with the most logged events.
top_active_users = e_event['visitorid'].value_counts().head(10)

plt.figure(figsize=(10, 6))
# barplot returns the Axes it drew on; set title and axis labels in one call.
sns.barplot(
    x=top_active_users.index,
    y=top_active_users.values,
).set(
    title='Top 10 Most active USers',
    xlabel='User Id',
    ylabel='Number of events',
)
plt.show()

In [None]:
# Events over time
# Resample into daily bins keyed on event_datetime; count() gives the number
# of non-null values per column for each day.
events_daily = e_event.resample('D', on='event_datetime').count()
plt.figure(figsize=(12, 6))
# The `event` column's daily count is used as the number of events per day.
events_daily['event'].plot()
plt.title('Events Over Time')
plt.xlabel('Date')
plt.ylabel('Number of Events')
plt.show()

*  **The plot shows the trend or pattern of how the number of events changes over time. Each point on the line represents the count of events on a specific day.**

In [None]:
# Rows that carry a transactionid are purchase events; filter once and reuse.
purchases = e_event[e_event.transactionid.notnull()]

# This counts distinct visitors who purchased, so label it as purchasers.
print('number of unique purchasers: ', len(purchases.visitorid.unique()))
# This counts purchase rows, not distinct transaction ids.
print('Total purchases : ', len(purchases))

In [None]:
# Convert the item-property timestamp (epoch milliseconds) into a readable datetime.
# NOTE: this repeats the same conversion already applied to e_items in an earlier cell.
e_items['event_datetime']=pd.to_datetime(e_items['timestamp'],unit='ms')
e_items

In [None]:
# (number of rows, number of columns) of the combined item-property table.
e_items.shape

In [None]:
# Left-join events to item properties on (itemid, timestamp), then left-join
# the category tree by matching `property` against `categoryid`.
# NOTE(review): the first join only attaches a property row recorded at exactly
# the same millisecond as the event - confirm that is intended.
# NOTE(review): `property` holds property names from item_properties (e.g.
# 'categoryid'), while the category id itself appears in `value` - confirm that
# `property` is the right key, otherwise `parentid` will be mostly NaN.
merged_events=e_event.merge(e_items, on=['itemid','timestamp'],how='left').merge(e_category,left_on='property', right_on='categoryid',how='left')
merged_events.head()

In [None]:
# Column dtypes and non-null counts of the merged table.
merged_events.info()

In [None]:
# Ten most frequent parent categories among 'view' events.
top_categories_views = (
    merged_events.loc[merged_events['event'] == 'view', 'parentid']
    .value_counts()
    .head(10)
)

# Nothing to plot when no view row has a parent category.
if top_categories_views.empty:
    print("No data available for Top Categories by Views")
else:
    plt.figure(figsize=(10, 6))
    sns.barplot(x=top_categories_views.index, y=top_categories_views.values)
    plt.title('Top 10 Categories by Views')
    plt.xlabel('Category ID')
    plt.ylabel('Number of Views')
    plt.show()

In [None]:
# Ten most frequent parent categories among 'transaction' events.
top_categories_transactions = (
    merged_events.loc[merged_events['event'] == 'transaction', 'parentid']
    .value_counts()
    .head(10)
)

# Nothing to plot when no transaction row has a parent category.
if top_categories_transactions.empty:
    print("No data available for Top Categories by Transactions")
else:
    plt.figure(figsize=(10, 6))
    sns.barplot(x=top_categories_transactions.index, y=top_categories_transactions.values)
    plt.title('Top 10 Categories by Transactions')
    plt.xlabel('Category ID')
    plt.ylabel('Number of Transactions')
    plt.show()

In [None]:
def findDay(x):
    """Return the weekday name (e.g. 'Monday') for a date-like value.

    `x` must provide a ``weekday()`` method returning 0 for Monday through
    6 for Sunday; that number indexes ``calendar.day_name``.
    """
    return calendar.day_name[x.weekday()]

In [None]:
# Derive calendar features (weekday name, year, month, day, hour, minute)
# from each event's datetime.
# The vectorized .dt accessor replaces one Python call per row per column,
# which is far faster on a large events table. day_name() returns English
# weekday names by default.
event_dt = e_event['event_datetime'].dt
e_event['day_of_week'] = event_dt.day_name()
e_event['year'] = event_dt.year
e_event['Month'] = event_dt.month
e_event['Day'] = event_dt.day
e_event['Hour'] = event_dt.hour
e_event['minute'] = event_dt.minute
e_event

In [None]:
# Summary statistics for the events table's columns.
e_event.describe()

In [None]:
def get_time_periods(hour):
    """Map an hour of the day to a named day period.

    Periods are half-open hour ranges: [3, 7) is 'Dawn', [7, 12) is
    'Morning', [12, 16) is 'Afternoon' and [16, 22) is 'Evening'.
    Any value outside those ranges yields 'Night'.
    """
    bands = (
        (3, 7, 'Dawn'),
        (7, 12, 'Morning'),
        (12, 16, 'Afternoon'),
        (16, 22, 'Evening'),
    )
    for start, stop, label in bands:
        if start <= hour < stop:
            return label
    return 'Night'

In [None]:
# Label every event with its day period (derived from the Hour column via
# get_time_periods) and count the events in each period.
e_event['Day period']=e_event['Hour'].map(get_time_periods)
e_event['Day period'].value_counts()

In [None]:
#Add-to-cart to transaction conversion rate
# Computed as (number of 'transaction' rows) / (number of 'addtocart' rows).
# This is an event-level ratio; it does not pair a purchase with its own
# add-to-cart event.

add_to_cart = e_event[e_event['event'] == 'addtocart']
transactions = e_event[e_event['event'] == 'transaction']
# Raises ZeroDivisionError if there are no 'addtocart' rows.
conversion_rate = len(transactions) / len(add_to_cart)

print(f"Add-to-cart to transaction conversion rate: {conversion_rate:.2%}")

1. **User-based features**: Extract information about user behavior, such as average time spent on the platform, average number of items viewed, and average number of items added to cart. We can also calculate the average time between actions (e.g., view, add to cart, and transaction).

2. **Item-based features**: Calculate item popularity based on the number of views, add-to-carts, and transactions. Additionally, we can calculate the average time an item spends in the cart before being purchased or removed.

3. **User-item interaction features**: Calculate the frequency of user-item interactions and the average time between interactions. These features help determine how often a user interacts with a specific item and how much time passes between interactions.

4. **Category-based features**: Calculate category popularity based on the number of views and transactions for items in each category. This information can be useful for identifying popular categories and understanding user preferences.

5. **Timestamp-based features**: Extract information from timestamps, such as day of the week, hour of the day, or time since the previous interaction.

# 3.Feature Engineering

In [None]:
# Work on a copy of the merged events so feature engineering does not
# modify merged_events.

data = merged_events.copy()
# DataFrame.drop returns a new frame rather than modifying in place;
# assign the result back so the duplicate datetime column is actually removed.
data = data.drop('event_datetime_y', axis=1)
data.head()

In [None]:
# (number of rows, number of columns) of the feature-engineering frame.
data.shape

In [None]:
# Seed NumPy's global random generator so the sample below is reproducible.
np.random.seed(1)

# Draw 100000 distinct index labels without replacement.
# NOTE(review): np.random.choice raises ValueError when size exceeds the
# number of available labels - confirm `data` always has at least 100000 rows.
rows_to_keep = np.random.choice(data.index, size=100000, replace=False)
sample_data = data.loc[rows_to_keep]

# Display the sampled DataFrame
sample_data.head()

In [None]:

#user_based Feature
# One row per visitor: number of interaction rows, distinct properties,
# distinct parent categories, and first / last / mean raw timestamp.

user_behavior = data.groupby('visitorid').agg({
    'itemid':'count',
    'property': 'nunique',
    'parentid':'nunique',
    'timestamp': ['min' ,'max','mean']
}).reset_index()

# Flatten the two-level column index produced by agg() into plain names.
user_behavior.columns = ['visitorid', 'item_count', 'property_count', 'category_count', 'first_interaction', 'last_interaction', 'avg_interaction_time']
# Differences of raw `timestamp` values, so in the timestamp's own unit.
user_behavior['time_on_platform'] = user_behavior['last_interaction'] - user_behavior['first_interaction']
# For a visitor with a single row the divisor is 0, so the result is NaN or inf.
user_behavior['time_between_actions'] = user_behavior['time_on_platform'] / (user_behavior['item_count'] - 1)

user_behavior

In [None]:
#item_based feature
# One row per item: interaction rows, number of distinct event types, and
# the count of each event type.
# NOTE(review): this is built from sample_data (the 100000-row sample), whereas
# the user and user-item features are built from the full `data` - confirm
# that difference is intended.
item_popularity = sample_data.groupby('itemid').agg({
    'visitorid':'count',
     'event': ['nunique',lambda  x : sum(x == 'view'),
              lambda x:sum(x =='addtocart'),
              lambda x:sum(x =='transaction')]
   }).reset_index()

# Flatten the two-level column index produced by agg() into plain names.
item_popularity.columns=['itemid','user_count','event_count','view_count','addtocart_count','transaction_count']
# Items with no add-to-cart events divide by zero here, giving NaN or inf.
item_popularity['conversion rate'] = item_popularity['transaction_count']/item_popularity['addtocart_count']
item_popularity

In [None]:
#user_item interaction feature
# One row per (visitor, item) pair: number of events and the first / last
# raw timestamp of their interaction.
user_item_interaction = data.groupby(['visitorid' ,'itemid']).agg({
    'event':'count',
    'timestamp' : ['min' ,'max']
    }).reset_index()
# Flatten the two-level column index produced by agg() into plain names.
user_item_interaction.columns =['visitorid','itemid' ,'interaction_count','first_interaction','last_interaction']
# Span between first and last interaction, in the timestamp's own unit.
user_item_interaction['time_between_interaction'] = user_item_interaction['last_interaction'] -user_item_interaction['first_interaction']
user_item_interaction

In [None]:
## Merge the features into a single DataFrame
# Drop helper columns that are not wanted as model features.
user_features = user_behavior.drop(['first_interaction', 'last_interaction'], axis=1)
item_features = item_popularity.drop(['view_count', 'addtocart_count', 'transaction_count'], axis=1)
interaction_features = user_item_interaction.drop(['first_interaction', 'last_interaction'], axis=1)

# merge() defaults to an inner join, so (visitor, item) pairs whose item is
# missing from item_features are dropped from `features`.
features = interaction_features.merge(user_features, on='visitorid').merge(item_features, on='itemid')
features.head()

In [None]:
# Replace infinity values with NaN
# (the ratio features produce inf when their divisor is 0)
features=features.replace([np.inf,-np.inf],np.nan)

# Fill NaN values with the median
# (features.median() is per column, so each column uses its own median)
features=features.fillna(features.median())
features.head()

# 4. Data Preprocessing

In [None]:
## Split the data
# 80/20 train/test split with a fixed random_state for reproducibility.
train_data ,test_data = train_test_split(features,test_size =0.2,random_state=42)


# Fit the scaler on the training split only, then apply it to the test split.
# NOTE(review): the scaled arrays are not used by any later cell visible here -
# the SVD cells read the unscaled train_data / test_data. Scaling every column
# also rescales the visitorid and itemid identifiers.
scaler = MinMaxScaler()
train_data_normalized = scaler.fit_transform(train_data)
test_data_normalized = scaler.transform(test_data)

In [None]:
# Collaborative filtering using matrix factorization (SVD)
# The "rating" passed to Surprise is the raw interaction_count, so the
# reader's rating_scale must span the observed counts. Surprise clips its
# predictions to this scale, so a fixed (0, 1) scale would cap every
# prediction at 1 regardless of the real counts.
rating_min = train_data['interaction_count'].min()
rating_max = train_data['interaction_count'].max()
reader = Reader(rating_scale=(rating_min, rating_max))
train_dataset = Dataset.load_from_df(
    train_data[['visitorid', 'itemid', 'interaction_count']], reader
)

# Convert train_dataset into the Trainset object that algo.fit() expects.
trainset = train_dataset.build_full_trainset()

# 5.Hyperparameter Tuning

In [None]:
# Set up a grid search over SVD hyperparameters.
# The grid has 3 * 2 * 2 * 2 = 24 combinations, each evaluated with 2-fold
# cross-validation on RMSE and MAE; n_jobs=-1 is passed to run folds in parallel.

param_grid = {'n_factors': [10, 50, 100], 'n_epochs': [20, 50], 'lr_all': [0.002, 0.005], 'reg_all': [0.02, 0.1]}

gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=2,n_jobs=-1)


In [None]:
# Fit the GridSearchCV object to the training dataset.
# It takes the Dataset (train_dataset), not the built trainset.
gs.fit(train_dataset)

In [None]:
# Best hyperparameters: the parameter combination with the lowest
# cross-validated RMSE.
best_params = gs.best_params['rmse']
# Metric name spelled correctly (was "RSME").
print(f"best hyperparameter for RMSE :{best_params}")

# 6. Model Training

* # SVD Model

In [None]:
# Train SVD with the tuned hyperparameters. best_params holds exactly the
# grid's keys (n_factors, n_epochs, lr_all, reg_all), so it is unpacked
# straight into the constructor.
algo = SVD(**best_params)
algo.fit(trainset)

# Model evaluation: build (user, item, true_rating) triples from the held-out
# split and predict each one.
testset = list(
    zip(
        test_data['visitorid'].values,
        test_data['itemid'].values,
        test_data['interaction_count'].values,
    )
)
prediction = algo.test(testset)

In [None]:
# Evaluation metrics: compare the true interaction counts of the test split
# with the model's estimates.
y_true = test_data['interaction_count']
y_pred = [pred.est for pred in prediction]

MAE = mean_absolute_error(y_true, y_pred)
MSE = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(MSE)

for line in (
    f"Mean Absolute Error: {MAE}",
    f"Mean Squared Error: {MSE}",
    f"Root Mean Squared Error: {rmse}",
):
    print(line)

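* **MAE and RMSE are measured in the same units as `interaction_count`; lower values mean the predicted counts are closer to the actual ones.**
*  **A low value on its own does not prove good recommendations — compare it with a simple baseline (e.g. always predicting the mean count) before drawing conclusions.**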

In [None]:
# Visualizing the results
## Top-N recommendations for a user

def get_top_n(prediction, n=10):
    """Group predictions by user and keep each user's n highest estimates.

    Args:
        prediction: iterable of 5-tuples ``(uid, iid, true_r, est, details)``.
        n: maximum number of items kept per user.

    Returns:
        A dict mapping each uid to a list of ``(iid, est)`` pairs sorted by
        ``est`` in descending order; ties keep their original order.
    """
    per_user = {}
    for uid, iid, _true_r, est, _details in prediction:
        per_user.setdefault(uid, []).append((iid, est))

    return {
        uid: sorted(pairs, key=lambda pair: pair[1], reverse=True)[:n]
        for uid, pairs in per_user.items()
    }

top_n = get_top_n(prediction, n=10)

# Pick a random user ID from the top_n dictionary and show their items.
# The user is drawn from top_n's own keys, so the lookup cannot raise
# KeyError. The real failure case is an empty top_n (no predictions), where
# random.choice raises IndexError on the empty list - guard for that instead.
# NOTE(review): top_n is built from predictions on the test set, so these are
# the user's highest-scored held-out items, not unseen catalogue items.
if top_n:
    user_id = random.choice(list(top_n.keys()))
    recommended_items = [iid for iid, _est in top_n[user_id]]
    print(f"Top 10 recommended items for user {user_id}: {recommended_items}")
else:
    print("No recommendations available: the prediction list is empty.")
    