In [None]:
# 786
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px
import plotly.graph_objs as go

import plotly as py
from plotly import tools
from plotly.offline import iplot
# Switch plotly's offline module into notebook mode before any figure is drawn.
# NOTE(review): `py` here is the top-level `plotly` package (see `import plotly as py`), not a Python launcher.
py.offline.init_notebook_mode(connected = True)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Loading & Preparation

In [None]:
# Location of the raw CSV relative to the notebook's working directory.
csv_path = "../input/pakistans-largest-ecommerce-dataset/Pakistan Largest Ecommerce Dataset.csv"
# Both date columns are parsed to datetimes at load time so the `.dt` accessor works later on.
dt = pd.read_csv(
    csv_path,
    parse_dates=["created_at", "Working Date"],
    low_memory=False,
)
print("Data Dimensions are: ", dt.shape)
print("Columns: ", dt.columns)

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line to the output.
dt.info()

The data contains 1,048,574 rows, but most columns hold only 584,524 non-null records.

Roughly half of the rows are completely empty, so we will drop them. The tricky part is that we cannot drop every row that contains an NA, because the actual data set also contains a few NA entries that we need to keep.
Instead, we will drop only the rows in which all entries are null.

Also, we will drop last 5 empty columns.

In [None]:
# Cut off the five trailing columns, then discard rows in which every remaining value is NaN
# (rows with only some missing values are kept).
# NOTE: positional slicing is not idempotent - re-running this cell removes five more columns.
dt = dt.iloc[:, :-5].dropna(how='all')

The column name `MV` contains leading and trailing spaces that might cause problems when selecting it, so we will rename it first.

In [None]:
# Strip the padding from the ' MV ' header so the column can be addressed as plain 'MV'.
dt = dt.rename(columns={' MV ': 'MV'})
dt.columns

As we can see above, a few columns do not have the correct data type, so we need to cast them.

In [None]:
# Identifier columns become strings; quantity and calendar columns become integers.
# NOTE(review): astype(str) also stringifies missing identifiers - confirm that is acceptable downstream.
column_types = {
    'Customer ID': str,
    'item_id': str,
    'qty_ordered': int,
    'Year': int,
    'Month': int,
}
dt = dt.astype(column_types)
# 'MV' is intentionally left uncast for now:
# dt['MV'] = dt['MV'].astype(float, errors = 'raise')

In [None]:
# Show the last rows of the frame to eyeball the result of the casts above.
dt.tail()

### Let's look into summary of data
Data summary of numeric data

In [None]:
# Summary statistics using describe()'s default column selection.
dt.describe()

Data Summary of non-numeric data

In [None]:
# Summary statistics restricted to object and bool columns.
dt.describe(include=['object', 'bool'])

# Exploratory Analysis to Understand Data

In [None]:
# Order the rows chronologically by creation date.
dt = dt.sort_values(by='created_at')

### Few new features extracted

In [None]:
# One grouping by order date, reused for every daily aggregate below.
by_day = dt.groupby('created_at')
dtg = by_day['grand_total'].sum().reset_index()
dtq = by_day['qty_ordered'].sum().reset_index()
dtd = by_day['discount_amount'].sum().reset_index()
# non-numeric columns are counted (non-null entries per day) instead of summed
dts = by_day['sku'].count().reset_index()
dtst = by_day['status'].count().reset_index()

In [None]:
# Assemble the per-day frame: start from the daily grand total and attach the other aggregates.
p = pd.DataFrame(dtg)
daily_parts = (
    ('qty_ordered', dtq),
    ('discount_amount', dtd),
    ('sku', dts),
    ('status', dtst),
)
for column, frame in daily_parts:
    p[column] = frame[column]

# Running totals of each daily figure (target column -> source column).
running_totals = (
    ('cum_grand_total', 'grand_total'),
    ('cum_qty_ordered', 'qty_ordered'),
    ('cum_discount_amount', 'discount_amount'),
    ('cum_sku_cnt', 'sku'),
    ('cum_status_cnt', 'status'),
)
for target, source in running_totals:
    p[target] = p[source].cumsum()


In [None]:
# Date features derived from the order date of each daily row.
order_date = p['created_at'].dt
p['Dateofmonth'] = order_date.day
p['Month'] = order_date.month  # was assigned twice in the original cell; once is enough
# Series.dt.week is deprecated (and removed in pandas 2.0); isocalendar().week is the
# supported replacement and is what the later feature cell in this notebook already uses.
# Cast back to a plain int so the column keeps the dtype the old accessor produced.
p['Week'] = order_date.isocalendar().week.astype(int)
p['Dayofweek'] = order_date.dayofweek  # 0 = Monday ... 6 = Sunday
# Dayofweek // 5 is 1 only for Saturday (5) and Sunday (6), so the flag is 1.0 on Mon-Fri.
p['Weekdayflg'] = (p['Dayofweek'] // 5 != 1).astype(float)
p['Quarter'] = order_date.quarter
p['Dayofyear'] = order_date.dayofyear

In [None]:
# Preview the first rows of the per-day frame built above.
p.head()

## Daily Sales vs. Discount

In [None]:
# Daily grand total and daily discount amount drawn on one shared time axis.
fig = go.Figure()
for column in ('grand_total', 'discount_amount'):
    trace = go.Scatter(
        x=p['created_at'],
        y=p[column],
        mode='lines+markers',
        name=column,
    )
    fig.add_trace(trace)
fig.show()

### Cumulative Sums of Grand_Total and discount_amount

In [None]:
# Running totals of grand total and discount amount over time.
fig = go.Figure()

# Each trace is labelled with the column it plots; the first label used to read
# 'xcum_grand_total', a typo that showed up in the legend.
for column in ('cum_grand_total', 'cum_discount_amount'):
    fig.add_trace(go.Scatter(x=p['created_at'], y=p[column],
                             mode='lines+markers',
                             name=column))
fig.show()

**In the graphs above we can observe that sales were boosted when discount offers were initiated.**

But this conclusion can be misleading without looking into the item status.

In [None]:
# Grand total per year, split by order status (stacked bars, one colour per status).
n = dt.groupby(['Year' ,'status'])['grand_total'].sum().reset_index()
# The previous title "Long-Form Input" was a placeholder copied from the plotly examples.
fig = px.bar(n, x="Year", y="grand_total", color="status",
             title="Yearly Grand Total by Order Status")
fig.show()

**Order cancellations are high in every year. We need to drop cancelled items and recheck sales growth.**

Note: We will do this after looking into other data points. 

In [None]:
# Grand total per year, split by payment method (stacked bars, one colour per method).
n = dt.groupby(['Year' ,'payment_method'])['grand_total'].sum().reset_index()
# The previous title "Long-Form Input" was a placeholder copied from the plotly examples.
fig = px.bar(n, x="Year", y="grand_total", color="payment_method",
             title="Yearly Grand Total by Payment Method")
fig.show()

### Order Status

In [None]:
# Total grand total per order status, with the value printed above each bar.
n = dt.groupby('status', as_index=False)['grand_total'].sum()
fig = px.bar(n, x='status', y='grand_total', text='grand_total')
fig.update_traces(textposition='outside', texttemplate='%{text:.2s}')
# labels that cannot be drawn at 8pt or larger are hidden
fig.update_layout(uniformtext=dict(minsize=8, mode='hide'))
fig.show()

In [None]:
# Distribution of the daily grand total, one box per order status.
n = dt.groupby(['created_at', 'status'], as_index=False)['grand_total'].sum()
px.box(n, y="grand_total", color="status")

### Category Type

In [None]:
# Total grand total per product category, with the value printed above each bar.
n = dt.groupby('category_name_1', as_index=False)['grand_total'].sum()
fig = px.bar(n, x='category_name_1', y='grand_total', text='grand_total')
fig.update_traces(textposition='outside', texttemplate='%{text:.2s}')
# labels that cannot be drawn at 8pt or larger are hidden
fig.update_layout(uniformtext=dict(minsize=8, mode='hide'))
fig.show()

In [None]:
# Grand total per category, with one bar per order status placed side by side.
group_keys = ['category_name_1', 'status']
n = dt.groupby(group_keys)['grand_total'].sum().reset_index()
fig = px.bar(
    n,
    x="category_name_1",
    y="grand_total",
    color='status',
    barmode='group',
)
fig.show()

## Payment Methods


In [None]:
# Total grand total per payment method, with the value printed above each bar.
n = dt.groupby('payment_method', as_index=False)['grand_total'].sum()
fig = px.bar(n, x='payment_method', y='grand_total', text='grand_total')
fig.update_traces(textposition='outside', texttemplate='%{text:.2s}')
# labels that cannot be drawn at 8pt or larger are hidden
fig.update_layout(uniformtext=dict(minsize=8, mode='hide'))
fig.show()

## Growth Analysis

As we analysed above, we need to drop cancelled orders


In [None]:
# Find the index labels of every row whose status is exactly 'canceled' and remove them in place.
is_cancelled = (dt['status'] == 'canceled').to_numpy()
ord_cncl_ind = dt.index[is_cancelled]
dt.drop(index=ord_cncl_ind, inplace=True)
dt.shape

# Recomputing daily figures

In [None]:
# Rebuild the per-day aggregates from the current contents of `dt`.
by_day = dt.groupby('created_at')
dtg = by_day['grand_total'].sum().reset_index()
dtq = by_day['qty_ordered'].sum().reset_index()
dtd = by_day['discount_amount'].sum().reset_index()
# non-numeric columns are counted (non-null entries per day) instead of summed
dts = by_day['sku'].count().reset_index()
dtst = by_day['status'].count().reset_index()

# Per-day frame: daily grand total plus the other aggregates.
p = pd.DataFrame(dtg)
for column, frame in (
    ('qty_ordered', dtq),
    ('discount_amount', dtd),
    ('sku', dts),
    ('status', dtst),
):
    p[column] = frame[column]

# Running totals (target column -> source column).
for target, source in (
    ('cum_grand_total', 'grand_total'),
    ('cum_qty_ordered', 'qty_ordered'),
    ('cum_discount_amount', 'discount_amount'),
    ('cum_sku_cnt', 'sku'),
    ('cum_status_cnt', 'status'),
):
    p[target] = p[source].cumsum()


In [None]:
# Daily grand total and daily discount amount from the current per-day frame `p`.
fig = go.Figure()
for column in ('grand_total', 'discount_amount'):
    trace = go.Scatter(
        x=p['created_at'],
        y=p[column],
        mode='lines+markers',
        name=column,
    )
    fig.add_trace(trace)
fig.show()

## A quick view of a Regression model (OLS)

In [None]:
# Scatter of the daily grand total over time with an OLS trendline fitted by plotly.
scatter_options = dict(x='created_at', y='grand_total', trendline="ols")
fig = px.scatter(p, **scatter_options)
fig.show()
# Pull the fitted model(s) back out of the figure and display them as the cell output.
results = px.get_trendline_results(fig)
results

### Density Graph

In [None]:
# Density contour of the daily grand total over time, with marginal histograms on both axes.
n = dt.groupby('created_at', as_index=False)['grand_total'].sum()
px.density_contour(
    n,
    x="created_at",
    y="grand_total",
    marginal_x="histogram",
    marginal_y="histogram",
)

In [None]:
# Density contour of the daily ordered quantity over time, with marginal histograms.
n = dt.groupby('created_at')['qty_ordered'].sum().reset_index()
# The plotted value is the summed qty_ordered per day, not a count of orders,
# so the title now says so (it used to read "no of orders").
px.density_contour(n,x="created_at",y="qty_ordered",marginal_x="histogram",marginal_y="histogram",
                   title="Daily quantity ordered")

In [None]:
# Quantity ordered per day, category and status; marker size and colour encode quantity and status.
group_keys = ['created_at', 'category_name_1', 'status']
n = dt.groupby(group_keys)['qty_ordered'].sum().reset_index()
px.scatter(
    n,
    x="created_at",
    y="qty_ordered",
    color="status",
    size="qty_ordered",
    hover_data=['category_name_1', 'status'],
)


In [None]:
# Quantity ordered per day, one line per order status.
n = dt.groupby(['created_at', 'status'], as_index=False)['qty_ordered'].sum()
px.line(n, x="created_at", y="qty_ordered", color="status")

To be continued...

**You can fork this kernel and continue your analysis.**

**Way Forward**
* Data Cleansing at SKU and Status columns
* Segregate analysis by dropping Cancel status orders. 
* Quarterly, Monthly, Weekday and Weekend Analysis
* Seasonality Analysis
* What are the Trends in Top 10 Categories
* Weekly Moving Average Analysis

# Quarterly, Monthly, Weekday and Weekend Analysis

In [None]:
# Build the analysis frame on an independent copy of `dt`, so the feature columns added
# below never leak back into the source frame. Cancelled orders are filtered here as well,
# which keeps the cell correct even if the earlier drop was not executed.
pa = dt.loc[dt["status"] != "canceled"].copy()
df = pa.copy()

# Lower-case the original headers BEFORE adding features. Doing it afterwards turned the
# original 'Month' column into a second 'month' column next to the new one (duplicate
# labels); this way the assignment below simply overwrites the existing 'month' column.
df.columns = df.columns.str.lower()

# adding few more date features, all derived from the order creation date
created = df["created_at"].dt
df["month_name"] = created.month_name()
df["week_day_name"] = created.day_name()
df["week_day"] = created.weekday  # 0 = Monday
df["week"] = created.isocalendar().week
df["month_start"] = created.is_month_start
df["month_end"] = created.is_month_end
df["quarter"] = created.quarter
# These two flags were swapped in the original (start <- is_quarter_end and vice versa).
df["quarter_start"] = created.is_quarter_start
df["quarter_end"] = created.is_quarter_end
df["year_start"] = created.is_year_start
df["year_end"] = created.is_year_end
df["month"] = created.month

In [None]:
# Turnover and discount summed per calendar quarter (1-4) across all years.
quarterly = df[["grand_total","discount_amount"]].groupby(df.quarter).sum()
# Title typo fixed ("Quaterly" -> "Quarterly").
fig = px.bar(quarterly,x=quarterly.index,y=["grand_total","discount_amount"], title="Quarterly Turnover")
fig.show()

In [None]:
# Turnover and discount summed per month name across all years.
monthly = df[["grand_total","discount_amount"]].groupby(df.month_name).sum()
# groupby sorts the month names alphabetically (April, August, ...); put them back into
# calendar order so the bars read January -> December. Any label that is not in the
# list (e.g. a non-English name) is kept and appended at the end rather than dropped.
month_order = ["January", "February", "March", "April", "May", "June",
               "July", "August", "September", "October", "November", "December"]
ordered_months = [m for m in month_order if m in monthly.index]
ordered_months += [m for m in monthly.index if m not in month_order]
monthly = monthly.reindex(ordered_months)
fig = px.bar(monthly,x=monthly.index,y=["grand_total","discount_amount"], title="Monthly Turnover")
fig.show()

In [None]:
# Turnover and discount summed per day of the week across the whole period.
weekday = df[["grand_total","discount_amount"]].groupby(df.week_day_name).sum()
# groupby sorts the day names alphabetically (Friday, Monday, ...); restore the
# Monday -> Sunday order. Unknown labels are kept and appended at the end.
day_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
ordered_days = [d for d in day_order if d in weekday.index]
ordered_days += [d for d in weekday.index if d not in day_order]
weekday = weekday.reindex(ordered_days)
fig = px.bar(weekday,x=weekday.index,y=["grand_total","discount_amount"], title="Day-wise Turnover")
fig.show()

In [None]:
# Turnover and discount summed separately for month-end days (True) and all other days (False).
month_end = df[["grand_total","discount_amount"]].groupby(df.month_end).sum()
# Title spacing fixed ("otherdays" -> "other days").
fig = px.bar(month_end,x=month_end.index,y=["grand_total","discount_amount"], title="Month end days vs other days")
fig.show()

In [None]:
# Turnover and discount summed per year.
value_columns = ["grand_total", "discount_amount"]
year = df.groupby(df.year)[value_columns].sum()
fig = px.bar(year, x=year.index, y=value_columns, title="Yearly Turnover")
fig.show()

# Top Ten Categories in each Year

In [None]:
# Grand total per (year, category), computed once; then the ten largest categories of each year.
category_totals = df.groupby([df.year, df.category_name_1]).grand_total.sum()
y_2016, y_2017, y_2018 = (
    category_totals.loc[yr].nlargest(10) for yr in (2016, 2017, 2018)
)

In [None]:
from plotly.subplots import make_subplots

# 2x2 grid of 'domain' cells, the subplot type pie charts need
# (adopted from https://plotly.com/python/pie-charts/); the fourth cell stays empty.
specs = [[{'type': 'domain'} for _ in range(2)] for _ in range(2)]
fig = make_subplots(rows=2, cols=2, specs=specs)

# One pie per year: (series, title, grid row, grid column).
pies = (
    (y_2016, '2016', 1, 1),
    (y_2017, '2017', 1, 2),
    (y_2018, '2018', 2, 1),
)
for series, label, row, col in pies:
    fig.add_trace(go.Pie(labels=series.index, values=series, title=label), row, col)

fig.update(layout_title_text= "Yearly share of Top Ten Categories by Grand Total",
           layout_showlegend=True)
fig.show()

In [None]:
# Normalise the payment method labels to lower case before grouping on them.
df["payment_method"] = df["payment_method"].str.lower()
# Grand total per (year, payment method), computed once; then the ten largest of each year.
payment_totals = df.groupby([df.year, df.payment_method]).grand_total.sum()
y_2016pm, y_2017pm, y_2018pm = (
    payment_totals.loc[yr].nlargest(10) for yr in (2016, 2017, 2018)
)

# Top 10 most preferred payment methods

In [None]:
# 2x2 grid of 'domain' cells, the subplot type pie charts need
# (adopted from https://plotly.com/python/pie-charts/); the fourth cell stays empty.
# Relies on `make_subplots` having been imported by an earlier cell.
specs = [[{'type': 'domain'} for _ in range(2)] for _ in range(2)]
fig = make_subplots(rows=2, cols=2, specs=specs)

# One pie per year: (series, title, grid row, grid column).
pies = (
    (y_2016pm, '2016', 1, 1),
    (y_2017pm, '2017', 1, 2),
    (y_2018pm, '2018', 2, 1),
)
for series, label, row, col in pies:
    fig.add_trace(go.Pie(labels=series.index, values=series, title=label), row, col)

# Turn the pies into donuts and set what the hover box shows.
fig.update_traces(hole=.4, hoverinfo='label+percent+name',)
fig.update(layout_title_text= "Most used payments methods for each year",
           layout_showlegend=True)
fig.show()

#  Unveiling Mean reverting behaviour 

The growth analysis unravels some interesting patterns in the daily order count. However, one may ask whether the
order count keeps growing persistently or whether there is any mean-reverting behaviour. In simple words, if a time series exhibits mean reversion, it falls back to its long-run or short-run average value.
The average value acts as a magnetic force, pulling the series towards it. In order to unveil mean-reverting behaviour, we will carry out some moving average analysis.

# Moving Average Analysis for daily orders count

In [None]:
# Day-wise aggregation. The two value columns are selected BEFORE summing: summing every
# column first and selecting afterwards forces pandas to "sum" the string and datetime
# columns as well, which is slow and raises a TypeError on recent pandas versions.
orders_by_day = df.groupby("created_at")
dto = orders_by_day[["grand_total", "discount_amount"]].sum()
dto["opd"] = orders_by_day.size()  # number of rows (order lines) per day

# Simple moving averages of the daily order count.
# NOTE(review): the window counts rows of `dto`, i.e. days that have at least one order;
# it only equals calendar days if no date is missing - confirm.
dto["3 Days Moving Average"] = dto.opd.rolling(3).mean()  # window = 3
dto["7 Days Moving Average"] = dto.opd.rolling(7).mean()  # window = 7
dto["twenty_1_sma"] = dto.opd.rolling(window=21).mean()  # window = 21

# Exponentially weighted moving average with smoothing factor alpha
alpha=0.2
dto["EWM_Avg"] = dto.opd.ewm(alpha=alpha).mean()

In [None]:
from plotly.subplots import make_subplots

# Panel titles, top to bottom; also reused by the grand-total figure further down.
subplot_titles = ["3 days Simple Moving Average","7 days Simple Moving Average", 
                  "21 days Simple Moving Average",
                 f"Exponential weighted moving average with Alpha = {alpha}"]
fig = make_subplots(rows=4, cols=1,shared_yaxes=False,shared_xaxes=True,vertical_spacing=0.1,
                    subplot_titles=subplot_titles)

# One panel per smoother: the raw daily order count plus the smoothed series on top of it.
# (column in dto, legend label) - the last label used to be misspelled "EMW Average".
smoothers = [
    ("3 Days Moving Average", "3 Days MA"),
    ("7 Days Moving Average", "7 Days MA"),
    ("twenty_1_sma", "21 Days MA"),
    ("EWM_Avg", "EWM Average"),
]
for row, (column, label) in enumerate(smoothers, start=1):
    fig.add_scatter(x=dto.index, y=dto.opd, name="Orders Per Day", row=row, col=1)
    fig.add_scatter(x=dto.index, y=dto[column], name=label, row=row, col=1)

fig.update_layout(height=900,width=850, showlegend=True,
                  title_text="Moving Average Analysis for Order Count",
                 legend=dict( orientation="v"))
fig.show()

# Moving Average Analysis for daily Grand total (Turnover)

In [None]:
# Simple moving averages of the daily grand total: (target column, window length in rows).
daily_total = dto.grand_total
for column, window in (("tdayma_gt", 3), ("sdayma_gt", 7), ("twenty_1_sma_gt", 21)):
    dto[column] = daily_total.rolling(window=window).mean()

# Exponentially weighted moving average with smoothing factor alpha = 0.2
alpha = 0.2
dto["EWM_Grand_Total"] = daily_total.ewm(alpha=alpha).mean()

In [None]:
# Four stacked panels sharing the x axis. `make_subplots` and `subplot_titles` come from
# the order-count moving-average cell, which therefore has to run before this one.
fig = make_subplots(rows=4, cols=1,shared_yaxes=False,shared_xaxes=True,vertical_spacing=0.1,
                   subplot_titles=subplot_titles)

# One panel per smoother: the raw daily grand total plus the smoothed series on top of it.
# (column in dto, legend label) - the last label used to be misspelled "EMW Average".
smoothers = [
    ("tdayma_gt", "3 Days MA for Grand Total"),
    ("sdayma_gt", "7 Days MA for Grand Total"),
    ("twenty_1_sma_gt", "21 Days Moving Average"),
    ("EWM_Grand_Total", "EWM Average"),
]
for row, (column, label) in enumerate(smoothers, start=1):
    fig.add_scatter(x=dto.index, y=dto.grand_total, name="Daily Grand Total", row=row, col=1)
    fig.add_scatter(x=dto.index, y=dto[column], name=label, row=row, col=1)

fig.update_layout(height=900,width=850, showlegend=True,
                  title_text="Moving Average Analysis for Grand Total",
                 legend=dict( orientation="v",yanchor='top',xanchor="left"))
fig.show()

# Moving average analysis clearly indicates that both series, daily order counts and daily grand total, keep hovering around their average values, hence exhibiting mean reversion.

# Work in progress..... more to come