In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from IPython.display import display, Markdown, Latex
import plotly.express as px
import datetime

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)


## 0. Data Loading & Summary

In [None]:
# Load the five monthly event logs (Oct 2019 through Feb 2020), one DataFrame per file.
# All files live in the same dataset directory, so the prefix is spelled out once.
DATA_DIR = '/kaggle/input/ecommerce-events-history-in-cosmetics-shop'
df_1910 = pd.read_csv(DATA_DIR + '/2019-Oct.csv')
df_1911 = pd.read_csv(DATA_DIR + '/2019-Nov.csv')
df_1912 = pd.read_csv(DATA_DIR + '/2019-Dec.csv')
df_2001 = pd.read_csv(DATA_DIR + '/2020-Jan.csv')
df_2002 = pd.read_csv(DATA_DIR + '/2020-Feb.csv')

In [None]:
# Tag every row of each monthly frame with the "YYYY-MM" label of the file it came from.
for monthly_frame, month_label in (
    (df_1910, "2019-10"),
    (df_1911, "2019-11"),
    (df_1912, "2019-12"),
    (df_2001, "2020-01"),
    (df_2002, "2020-02"),
):
    monthly_frame['event_month'] = month_label

In [None]:
# Stack all monthly frames into one DataFrame.
# ignore_index=True gives the result a fresh 0..N-1 index; without it every month keeps
# its own 0-based index, so the combined frame would contain duplicate index labels and
# any label-based lookup (e.g. .loc[0]) would match one row per month.
df_alldata = pd.concat(
    [df_1910, df_1911, df_1912, df_2001, df_2002],
    ignore_index=True,
)

In [None]:
# Preview the first five rows of the combined data.
df_alldata.head(n=5)

In [None]:
# Render a one-line Markdown summary: the number of events per monthly file, then the
# grand total. The labels mirror the source file names (2019-Oct.csv ... 2020-Feb.csv);
# the last two months belong to 2020, not 2021.
monthly_frames = [
    ("2019-Oct", df_1910),
    ("2019-Nov", df_1911),
    ("2019-Dec", df_1912),
    ("2020-Jan", df_2001),
    ("2020-Feb", df_2002),
]
mdquery = ", ".join(
    label + " has " + str(len(frame)) + " events" for label, frame in monthly_frames
) + ". "
mdquery += "Totally " + str(len(df_alldata)) + " events."
display(Markdown(mdquery))

In [None]:
# Per-column tally of null entries in the combined data.
df_alldata.isna().sum()

Since there are no missing values in event_time, event_type, product_id, category_id, price, or user_id, we **do not need to call `dropna()`** on the DataFrame.

In [None]:
# Describe the distinct event types present in the combined log as a Markdown sentence.
event_types = df_alldata['event_type'].unique()
# Build the comma-separated list with str.join rather than a loop whose variable is
# named `type`: that name would shadow the built-in `type` for every later cell, and
# join also copes with an empty list of event types.
mdquery = (
    "In the eCommerce Events History, there are " + str(len(event_types))
    + " event types: **" + ", ".join(event_types) + "**."
)
display(Markdown(mdquery))

## 1. Event Counts by Month (Bar Chart Comparison)

In [None]:
# Number of events per (month, event type).
# Selecting "event_time" before calling count() counts only that one column instead of
# every column of the large combined frame. The result is unchanged: a Series named
# "event_time" whose values are event counts (later cells rely on that name).
event_type_count = df_alldata.groupby(["event_month", "event_type"])["event_time"].count()

In [None]:
# Bar chart of event counts per month, coloured by event type.
# The plotted column is named "event_time" only because it was produced by counting
# that column; relabel it so the y axis says what is actually shown.
fig = px.bar(
    event_type_count.reset_index(),
    x="event_month",
    y="event_time",
    color="event_type",
    title="Events by Month",
    labels={"event_time": "event count"},
)
fig.show()

November 2019 has the highest counts of view and purchase events. Given the Black Friday shopping season in November, this is to be expected in Western markets.

## 2. Monthly User Behavior Funnel Chart

In [None]:
# Draw one funnel chart per month from the per-month event counts, excluding
# 'remove_from_cart' and ordering the remaining event types from most to least frequent.
months = ['2019-10', '2019-11', '2019-12', '2020-01', '2020-02']
month_names = ["Oct 2019", "Nov 2019", "Dec 2019", "Jan 2020", "Feb 2020"]

# Loop-invariant: turn the (month, event type) index into ordinary columns once.
event_counts_flat = event_type_count.reset_index()

# zip pairs each month key with its display name, so the loop does not depend on a
# hard-coded list length.
for month, month_name in zip(months, month_names):
    user_behavior = (
        event_counts_flat
        .query("event_month == @month & event_type != 'remove_from_cart'")
        .sort_values("event_time", ascending=False)
    )
    fig = px.funnel(
        user_behavior,
        x='event_time',
        y='event_type',
        title="Online user behavior funnel analysis, " + month_name,
        # "event_time" holds counts here, so give the axis/hover a truthful label.
        labels={"event_time": "event count"},
    )
    fig.show()

## 3. TOP 5 Repurchased Products and Categories

In [None]:
# Keep only the purchase events, restricted to the columns used by the purchase analyses.
purchase_columns = ["product_id", "category_id", "price", "user_id", "user_session", "event_month"]
is_purchase = df_alldata["event_type"] == "purchase"
all_purchase = df_alldata.loc[is_purchase, purchase_columns]
all_purchase

### 3.1 TOP 5 Repurchased Products

In [None]:
# Step 1: number of purchase rows each user has for each product.
product_purchase_count = (
    all_purchase
    .groupby(["product_id", "user_id"])["event_month"]
    .count()
    .rename("purchase_count")
    .reset_index()
)

# Step 2: average of those per-user counts within each product, highest average first.
avg_product_purchase_count = (
    product_purchase_count
    .groupby("product_id")["purchase_count"]
    .mean()
    .rename("avg_purchase_count")
    .sort_values(ascending=False)
    .reset_index()
)

In [None]:
# The frame is already sorted in descending order, so the 5th row holds the cut-off
# value; every product at or above it is shown (ties with the 5th place are included).
top_5_count_value = avg_product_purchase_count["avg_purchase_count"].iloc[4]

mdquery = "The following table shows the Top 5 products with largest average repurchased count:"
display(Markdown(mdquery))

at_or_above_cutoff = avg_product_purchase_count["avg_purchase_count"] >= top_5_count_value
avg_product_purchase_count[at_or_above_cutoff]

### 3.2 TOP 5 Repurchased Categories

In [None]:
# Step 1: number of purchase rows each user has within each category.
category_purchase_count = (
    all_purchase
    .groupby(["category_id", "user_id"])["event_month"]
    .count()
    .rename("purchase_count")
    .reset_index()
)

# Step 2: average of those per-user counts within each category, highest average first.
avg_category_purchase_count = (
    category_purchase_count
    .groupby("category_id")["purchase_count"]
    .mean()
    .rename("avg_purchase_count")
    .sort_values(ascending=False)
    .reset_index()
)

In [None]:
# The frame is already sorted in descending order, so the 5th row holds the cut-off
# value; every category at or above it is shown (ties with the 5th place are included).
top_5_count_value = avg_category_purchase_count["avg_purchase_count"].iloc[4]

mdquery = "The following table shows the Top 5 categories with largest average repurchased count:"
display(Markdown(mdquery))

at_or_above_cutoff = avg_category_purchase_count["avg_purchase_count"] >= top_5_count_value
avg_category_purchase_count[at_or_above_cutoff]

## 4. MOST Purchased Product by UNIQUE User

In [None]:
# Count the distinct buyers of each product, then keep the product with the most buyers.
unique_buyers_per_product = (
    all_purchase
    .groupby("product_id")["user_id"]
    .nunique()
    .rename("user_buys")
)
most_purchased = unique_buyers_per_product.sort_values(ascending=False).reset_index().head(1)

# Row 0 is the single remaining (top-ranked) product.
top_product_id = most_purchased.at[0, "product_id"]
top_product_buyers = most_purchased.at[0, "user_buys"]
mdquery = (
    f"The MOST purchased product is **product #{top_product_id!s}**. "
    f"It is purchased by {top_product_buyers!s} UNIQUE users."
)
display(Markdown(mdquery))

## 5. Loyal Users Analysis

### 5.1 The User with the Highest Purchase Frequency

In [None]:
# Purchase frequency is measured as the number of distinct purchase timestamps per
# user: purchase rows that share one event_time count as a single purchase occasion.
# nunique() yields that figure directly, so there is no need to first build and sort an
# intermediate per-(user, timestamp) table whose counts and ordering are never used.
loyal_user = (
    df_alldata.query("event_type == 'purchase'")
    .groupby("user_id")["event_time"]
    .nunique()
    .rename("purchase_count")
    .sort_values(ascending=False)
    .reset_index()
    .head(1)
)

# Row 0 is the single remaining (top-ranked) user.
mdquery = "The user with the highest purchase frequency is **user #" + str(loyal_user.loc[0, "user_id"]) + "**. This user purchased " \
                                                                     + str(loyal_user.loc[0, "purchase_count"]) + " times in 5 months."
display(Markdown(mdquery))

### 5.2 Shopping Activity Analysis of the User Who Purchased the Most Items

In [None]:
# Count purchase rows (items bought) per user and keep the user with the highest count.
items_per_user = (
    all_purchase
    .groupby("user_id")["product_id"]
    .count()
    .rename("purchase_count")
)
loyal_user = items_per_user.sort_values(ascending=False).reset_index().head(1)

# Row 0 is the single remaining (top-ranked) user.
top_user_id = loyal_user.at[0, "user_id"]
top_user_items = loyal_user.at[0, "purchase_count"]
mdquery = (
    f"The user purchased the most number of items is **user #{top_user_id!s}**. "
    f"This user purchased {top_user_items!s} items in 5 months."
)
display(Markdown(mdquery))

## 6. Rate of cart/view, purchase/cart, and purchase/view

In [None]:
# Count events per (product, event type). Selecting "user_id" before count() counts only
# that one column instead of every column of the combined frame; the result is the same.
product_event = (
    df_alldata
    .groupby(["product_id", "event_type"])["user_id"]
    .count()
    .rename("count")
    .reset_index()
)
# Reshape to one row per product and one column per event type, then discard the
# 'remove_from_cart' column, which the rate calculations do not use.
# A product with no events of a given type gets NaN in that column.
product_event = (
    product_event
    .pivot(index='product_id', columns='event_type', values='count')
    .drop(columns=["remove_from_cart"])
)

In [None]:
# Add one ratio column per (numerator, denominator) pair of event-count columns;
# each new column is named "<numerator>/<denominator>".
for numerator, denominator in (
    ("cart", "view"),
    ("purchase", "cart"),
    ("purchase", "view"),
):
    product_event[numerator + "/" + denominator] = product_event[numerator] / product_event[denominator]

In [None]:
# Show the products ordered by purchase/view rate, highest rate first.
product_event.sort_values(by="purchase/view", ascending=False)

At this point, the purchase/view and purchase/cart rates show that a purchase can be made without a preceding view or cart event for the product (for example, a rate greater than 1 means there are more purchases than views or cart additions).  
**Possible future steps:**  
 - Analyze the sequence of events a user goes through to complete one or more purchases.  
 - Measure the average time it takes to complete a purchase (counted from view to purchase).