<a href="https://www.kaggle.com/code/shwetakolekar/retailrocket-recommender-system?scriptVersionId=155167222" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from surprise import Reader,Dataset,SVD
from surprise.model_selection import cross_validate,GridSearchCV



The Retailrocket dataset comes in three files:

* events.csv: This file contains the visitor-item interaction data
* item_properties.csv: This file contains item properties (provided here in two parts: item_properties_part1.csv and item_properties_part2.csv)
* category_tree.csv: This file contains the category tree

In [None]:

# Load the visitor-item interaction log and preview its first rows.
events_csv_path = '/kaggle/input/ecommerce-dataset/events.csv'
e_event = pd.read_csv(events_csv_path)
e_event.head()

In [None]:
# Derive a datetime column by interpreting `timestamp` as milliseconds
# since the Unix epoch.
event_times = pd.to_datetime(e_event['timestamp'], unit='ms')
e_event['event_datetime'] = event_times
e_event

In [None]:
# (rows, columns) of the events table.
e_event.shape

In [None]:
# Count of missing values in each column of the events table.
e_event.isnull().sum()

In [None]:
# Number of rows per event type, as a two-column table.
event_type_counts = e_event['event'].value_counts()
event_type_counts.reset_index()

In [None]:
# Check the time span covered by the dataset.
first_event_time = e_event['event_datetime'].min()
last_event_time = e_event['event_datetime'].max()

print('Start Date of Dataset: ', first_event_time)
print('End Date of Dataset: ', last_event_time)
      

In [None]:
# Concatenate the two item-property files into a single `e_items` table.
e_items1 = pd.read_csv('/kaggle/input/ecommerce-dataset/item_properties_part1.csv')
e_items2 = pd.read_csv('/kaggle/input/ecommerce-dataset/item_properties_part2.csv')

# ignore_index=True renumbers the rows 0..n-1. Without it both parts keep
# their own 0-based labels, so the combined frame would carry duplicate
# index labels, which makes label-based selection and alignment ambiguous.
e_items = pd.concat([e_items1, e_items2], ignore_index=True)
e_items

In [None]:
# Derive a datetime column for the item-property rows by interpreting
# their own `timestamp` column as milliseconds since the Unix epoch.
item_times = pd.to_datetime(e_items['timestamp'], unit='ms')
e_items['event_datetime'] = item_times
e_items

In [None]:
# Load the category tree table.
category_csv_path = '/kaggle/input/ecommerce-dataset/category_tree.csv'
e_category = pd.read_csv(category_csv_path)
e_category

In [None]:
# (rows, columns) of the category tree table.
e_category.shape

In [None]:
# Two side-by-side charts:
#   left  - number of rows per event type, annotated with its share of all rows
#   right - number of distinct visitors per event type, annotated with its
#           share of all distinct visitors

totalcases = e_event.shape[0]
sns.set_style('whitegrid')

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

# --- distribution by event ---
sns.histplot(x='event', data=e_event, bins=3, ax=ax1)
ax1.set_ylabel('Number of Events (Unit: million)')
ax1.set_title('Distribution by Event Type')

# Iterate value_counts() directly so every label is paired with its own
# count. Zipping unique() (first-appearance order) with value_counts()
# (descending-count order) can attach a percentage to the wrong bar.
event_counts = e_event['event'].value_counts()
for event_name, counts in event_counts.items():
    text = str(round((counts / totalcases) * 100, 2)) + '%'
    ax1.text(event_name, counts, text, fontsize=12)

# --- unique number of visitors by event ---
events = e_event['event'].unique().tolist()
# The denominator is the same for every event type, so compute it once.
total_visitors = len(set(e_event['visitorid']))
unique_num = []
ratios = []
for event in events:
    uni_visit = len(e_event['visitorid'][e_event['event'] == event].unique())
    unique_num.append(uni_visit)
    ratios.append(uni_visit / total_visitors)

sns.barplot(x=events, y=unique_num, ax=ax2)

ratio_p = [str(round(r * 100, 2)) + '%' for r in ratios]
# enumerate() gives one bar position per event type instead of assuming
# there are exactly three of them.
for position, (visitor_count, label) in enumerate(zip(unique_num, ratio_p)):
    ax2.text(position, visitor_count, label, fontsize=12)

ax2.set_title('Unique Number of Visitors')
plt.show()

In [None]:
# Distinct visitors per event type, drawn on its own figure.
events = e_event['event'].unique().tolist()
# The denominator is the same for every event type, so compute it once.
total_visitors = len(set(e_event['visitorid']))
unique_num = []
ratios = []
for event in events:
    uni_visit = len(e_event['visitorid'][e_event['event'] == event].unique())
    unique_num.append(uni_visit)
    ratios.append(uni_visit / total_visitors)

# Create a fresh figure/axes for this cell. Reusing `ax2` from an earlier
# cell draws onto a figure that has already been shown, so plt.show() here
# would have nothing new to display.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=events, y=unique_num, ax=ax)

# Annotate each bar with its share of all distinct visitors.
ratio_p = [str(round(r * 100, 2)) + '%' for r in ratios]
for position, (visitor_count, label) in enumerate(zip(unique_num, ratio_p)):
    ax.text(position, visitor_count, label, fontsize=12)

ax.set_title('Unique Number of Visitors')
plt.show()

In [None]:
# Ten items with the most 'view' events.
top_viewed_items = e_event[e_event['event'] == 'view']['itemid'].value_counts().head(10)

# The sized figure must exist BEFORE drawing: plt.figure() opens a new,
# empty current figure, so calling it after barplot() leaves the chart at
# the default size and emits an extra blank figure.
plt.figure(figsize=(10, 6))
sns.barplot(x=top_viewed_items.index, y=top_viewed_items.values)
plt.title('Top 10 Viewed Items')
plt.xlabel('Item ID')
plt.ylabel('Number of Views')
plt.show()

In [None]:
# Ten items with the most 'transaction' events.
top_sold_items = e_event[e_event['event'] == 'transaction']['itemid'].value_counts().head(10)

# The sized figure must exist BEFORE drawing: plt.figure() opens a new,
# empty current figure, so calling it after barplot() leaves the chart at
# the default size and emits an extra blank figure.
plt.figure(figsize=(10, 6))
sns.barplot(x=top_sold_items.index, y=top_sold_items.values)
plt.title('Top 10 sold Items')
plt.xlabel('Item ID')
plt.ylabel('Number of transaction')
plt.show()

In [None]:
# Ten items with the most 'addtocart' events.
top_added_to_cart_items = e_event[e_event['event'] == 'addtocart']['itemid'].value_counts().head(10)

# The sized figure must exist BEFORE drawing: plt.figure() opens a new,
# empty current figure, so calling it after barplot() leaves the chart at
# the default size and emits an extra blank figure.
plt.figure(figsize=(10, 6))
sns.barplot(x=top_added_to_cart_items.index, y=top_added_to_cart_items.values)
plt.title('Top 10 added_to_cart Items')
plt.xlabel('Item ID')
plt.ylabel('Number of addtocart')
plt.show()

In [None]:
# Ten visitors with the most rows in the events table.
visitor_event_counts = e_event['visitorid'].value_counts()
top_active_users = visitor_event_counts.head(10)

plt.figure(figsize=(10, 6))
sns.barplot(
    x=top_active_users.index,
    y=top_active_users.values,
)
plt.xlabel('User Id')
plt.ylabel('Number of events')
plt.title('Top 10 Most active USers')
plt.show()

In [None]:
# Daily event volume: bucket rows by calendar day of `event_datetime` and
# count the non-null values of each column per bucket.
events_daily = e_event.resample('D', on='event_datetime').count()

# The `event` column's per-day count is what gets plotted.
daily_event_counts = events_daily['event']

plt.figure(figsize=(12, 6))
daily_event_counts.plot()
plt.xlabel('Date')
plt.ylabel('Number of Events')
plt.title('Events Over Time')
plt.show()

In [None]:
# Rows that carry a transaction id are the purchase rows; filter once and
# reuse the result for both figures.
purchase_rows = e_event[e_event['transactionid'].notnull()]

# The first figure counts distinct *visitors* who purchased, so it is
# labelled as purchasers rather than purchases.
print('Number of unique purchasers: ', len(purchase_rows['visitorid'].unique()))
print('Total purchases : ', len(purchase_rows))

In [None]:
# (Re)compute the datetime column of the item-property table from its
# `timestamp` column, interpreted as milliseconds since the Unix epoch.
item_property_times = pd.to_datetime(e_items['timestamp'], unit='ms')
e_items['event_datetime'] = item_property_times
e_items

In [None]:
# (rows, columns) of the combined item-property table.
e_items.shape

In [None]:
# Left-join item-property rows onto the events, then left-join the category
# tree onto the result. Left joins keep every event row even when nothing
# matches.
# NOTE(review): the first join requires BOTH `itemid` and `timestamp` to be
# equal in the two tables; presumably few events share an exact timestamp
# with a property row - confirm the match rate.
# NOTE(review): the second join compares `property` with `categoryid`;
# verify that these columns really hold comparable values and dtypes.
merged_events = pd.merge(
    pd.merge(e_event, e_items, on=['itemid', 'timestamp'], how='left'),
    e_category,
    left_on='property',
    right_on='categoryid',
    how='left',
)
merged_events.head()

In [None]:
# Column dtypes and non-null counts of the merged table.
merged_events.info()

In [None]:
# Top categories by views: the ten most frequent `parentid` values among
# rows whose event is 'view'.
is_view = merged_events['event'] == 'view'
top_categories_views = merged_events.loc[is_view, 'parentid'].value_counts().head(10)

if top_categories_views.empty:
    # Nothing to plot when no view row has a `parentid`.
    print("No data available for Top Categories by Views")
else:
    plt.figure(figsize=(10, 6))
    sns.barplot(
        x=top_categories_views.index,
        y=top_categories_views.values,
    )
    plt.xlabel('Category ID')
    plt.ylabel('Number of Views')
    plt.title('Top 10 Categories by Views')
    plt.show()

In [None]:
# Top categories by transactions: the ten most frequent `parentid` values
# among rows whose event is 'transaction'.
is_transaction = merged_events['event'] == 'transaction'
top_categories_transactions = (
    merged_events.loc[is_transaction, 'parentid'].value_counts().head(10)
)

if top_categories_transactions.empty:
    # Nothing to plot when no transaction row has a `parentid`.
    print("No data available for Top Categories by Transactions")
else:
    plt.figure(figsize=(10, 6))
    sns.barplot(
        x=top_categories_transactions.index,
        y=top_categories_transactions.values,
    )
    plt.xlabel('Category ID')
    plt.ylabel('Number of Transactions')
    plt.title('Top 10 Categories by Transactions')
    plt.show()

In [None]:
import datetime
import calendar

In [None]:
def findDay(x):
    """Return the weekday name of *x* (e.g. 'Monday').

    *x* must provide a ``weekday()`` method returning 0 for Monday through
    6 for Sunday; the result is looked up in ``calendar.day_name``.
    """
    weekday_index = x.weekday()
    return calendar.day_name[weekday_index]

In [None]:
# Break `event_datetime` into calendar parts.
# The vectorized `.dt` accessor computes each part for the whole column at
# once, instead of calling a Python function once per row for every new
# column.
event_time = e_event['event_datetime'].dt

e_event['day_of_week'] = event_time.day_name()  # weekday name, e.g. 'Monday'
e_event['year'] = event_time.year
e_event['Month'] = event_time.month
e_event['Day'] = event_time.day
e_event['Hour'] = event_time.hour
e_event['minute'] = event_time.minute
e_event

In [None]:
# Summary statistics of the events table.
e_event.describe()

In [None]:
def get_time_periods(hour):
    """Map an hour value to a named part of the day.

    Ranges are half-open: [3, 7) 'Dawn', [7, 12) 'Morning',
    [12, 16) 'Afternoon', [16, 22) 'Evening'; every other value
    is 'Night'.
    """
    if 3 <= hour < 7:
        return 'Dawn'
    if 7 <= hour < 12:
        return 'Morning'
    if 12 <= hour < 16:
        return 'Afternoon'
    if 16 <= hour < 22:
        return 'Evening'
    return 'Night'

In [None]:
# Label every event with the part of the day its `Hour` falls into, then
# show how many events each label received.
day_periods = e_event['Hour'].map(get_time_periods)
e_event['Day period'] = day_periods
e_event['Day period'].value_counts()

In [None]:
# Working copy of the merged table without the `event_datetime_y` column.
# DataFrame.drop returns a NEW frame and leaves its input untouched, so the
# result has to be assigned; calling drop() without keeping the result
# leaves the column in `data`. Because drop() already returns a separate
# frame, no extra .copy() is needed.
data = merged_events.drop('event_datetime_y', axis=1)
data

In [None]:
 #feature engineering 
#user_based Feature

# One row per visitor. Named aggregation yields the final column names
# directly, in this order.
per_visitor = data.groupby('visitorid')
user_behavior = per_visitor.agg(
    item_count=('itemid', 'count'),            # non-null itemid rows
    property_count=('property', 'nunique'),    # distinct `property` values
    category_count=('parentid', 'nunique'),    # distinct `parentid` values
    first_interaction=('timestamp', 'min'),
    last_interaction=('timestamp', 'max'),
    avg_interaction_time=('timestamp', 'mean'),
).reset_index()

# Span between a visitor's first and last timestamp.
user_behavior['time_on_platform'] = user_behavior['last_interaction'].sub(
    user_behavior['first_interaction']
)
# Average gap between consecutive rows: the span divided by the number of
# gaps (rows - 1).
gap_count = user_behavior['item_count'].sub(1)
user_behavior['time_between_actions'] = user_behavior['time_on_platform'].div(gap_count)

user_behavior

In [None]:
# Item-based features: one row per item.
# Named aggregation yields the final column names directly. The per-event
# counts use the vectorized Series.sum() on a boolean mask; the builtin
# sum() would iterate every element of every group in Python.
item_popularity = data.groupby('itemid').agg(
    user_count=('visitorid', 'count'),    # non-null visitorid rows (not distinct visitors)
    event_count=('event', 'nunique'),     # number of distinct event types seen
    view_count=('event', lambda s: s.eq('view').sum()),
    addtocart_count=('event', lambda s: s.eq('addtocart').sum()),
    transaction_count=('event', lambda s: s.eq('transaction').sum()),
).reset_index()

# Transactions per add-to-cart. Items with zero add-to-cart rows produce
# inf or NaN here.
item_popularity['conversion rate'] = (
    item_popularity['transaction_count'] / item_popularity['addtocart_count']
)
item_popularity

In [None]:
# User-item interaction features: one row per (visitor, item) pair.
# Named aggregation yields the final column names directly.
per_pair = data.groupby(['visitorid', 'itemid'])
user_item_interaction = per_pair.agg(
    interaction_count=('event', 'count'),      # non-null event rows for the pair
    first_interaction=('timestamp', 'min'),
    last_interaction=('timestamp', 'max'),
).reset_index()

# Span between the pair's first and last timestamp.
user_item_interaction['time_between_interaction'] = (
    user_item_interaction['last_interaction'].sub(
        user_item_interaction['first_interaction']
    )
)
user_item_interaction

In [None]:
## Merge the features into a single DataFrame
# Drop the helper columns that are not used as features.
user_features = user_behavior.drop(columns=['first_interaction', 'last_interaction'])
item_features = item_popularity.drop(
    columns=['view_count', 'addtocart_count', 'transaction_count']
)
interaction_features = user_item_interaction.drop(
    columns=['first_interaction', 'last_interaction']
)

# Attach the visitor-level features, then the item-level features, to each
# (visitor, item) row.
features = pd.merge(interaction_features, user_features, on='visitorid')
features = pd.merge(features, item_features, on='itemid')
features

In [None]:
# Turn +/- infinity into NaN so that both are handled by the same fill step.
infinite_values = [np.inf, -np.inf]
features = features.replace(infinite_values, np.nan)

# Fill every NaN with the median of its own column; the medians are
# computed after the infinities were removed.
column_medians = features.median()
features = features.fillna(column_medians)
features

# Data Preprocessing

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible.
train_data, test_data = train_test_split(
    features,
    test_size=0.2,
    random_state=42,
)

# Fit the min-max scaler on the training rows only, then apply that same
# scaling to both splits.
scaler = MinMaxScaler()
scaler.fit(train_data)
train_data_normalized = scaler.transform(train_data)
test_data_normalized = scaler.transform(test_data)