In [None]:
# 786
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px
import plotly.graph_objs as go

import plotly as py
from plotly import tools
from plotly.offline import iplot
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of each file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Loading & Preparation

In [None]:
# Read the raw CSV, parsing the two date columns while loading.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
dt = pd.read_csv(
    "../input/pakistans-largest-ecommerce-dataset/Pakistan Largest Ecommerce Dataset.csv",
    parse_dates=["created_at", "Working Date"],
    low_memory=False,
)

# Quick look at the size and the column labels of the loaded frame.
print("Data Dimensions are: ", dt.shape)
print("Columns: ", dt.columns)

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None,
# so it must not be wrapped in print() (that would emit a stray "None").
dt.info()

The data contains 1,048,574 rows, but most columns contain only 584,524 non-null records.

About half of the rows are completely empty, so we will drop them. The tricky part is that we cannot drop every row that has a missing value, because the actual data set also contains a few NA entries that we need to keep.
We will therefore drop only the rows in which all entries are null.

We will also drop the last five columns, which are empty.

In [None]:
# Cut off the five trailing columns, then discard the rows in which every
# remaining field is missing; rows with only some missing values are kept.
# NOTE: not idempotent - re-running this cell removes five more columns.
dt = dt.iloc[:, :-5].dropna(how='all')

The name of the column ` MV ` contains a leading and a trailing space, which might cause problems later. We will rename it first.

In [None]:
# Remove the padding around the ' MV ' column label, then show the resulting labels.
dt = dt.rename(columns={' MV ': 'MV'})
dt.columns

As we can see above, a few columns do not have the correct data type, so we need to cast them.

In [None]:
# Cast the identifier columns to strings and the quantity/calendar columns
# to integers in a single astype call.
dt = dt.astype({
    'Customer ID': str,
    'item_id': str,
    'qty_ordered': int,
    'Year': int,
    'Month': int,
})
# MV is not cast here; the float conversion below is left disabled.
# dt['MV'] = dt['MV'].astype(float, errors = 'raise')

In [None]:
# Display the last five rows of the frame.
dt.tail()

### Let's look at a summary of the data
Data summary of numeric columns

In [None]:
# With no arguments, describe() summarises only the numeric columns.
dt.describe()

Data Summary of non-numeric data

In [None]:
# Restrict the summary to the object and bool columns.
dt.describe(include=['object', 'bool'])

# Exploratory Analysis to Understand Data

In [None]:
# Order the rows by created_at; sort_values returns a new frame, so dt is rebound.
dt = dt.sort_values('created_at')

### A few new features extracted

In [None]:
# Group the rows once by created_at and reuse that grouping for every
# per-date aggregate below.
by_created = dt.groupby('created_at')

# Per-date sums.
dtg = by_created['grand_total'].sum().reset_index()
dtq = by_created['qty_ordered'].sum().reset_index()
dtd = by_created['discount_amount'].sum().reset_index()

# Per-date counts of non-null entries for the non-numeric columns.
dts = by_created['sku'].count().reset_index()
dtst = by_created['status'].count().reset_index()

In [None]:
# Assemble the per-date summary frame, starting from the grand_total sums.
# NOTE(review): pd.DataFrame(dtg) may share its data with dtg depending on the
# pandas version, so the columns added below can also show up on dtg - confirm.
p = pd.DataFrame(dtg)

# Attach the remaining per-date aggregates; every source frame comes from the
# same created_at grouping, so the rows line up by position.
for column, source in (
    ('qty_ordered', dtq),
    ('discount_amount', dtd),
    ('sku', dts),
    ('status', dtst),
):
    p[column] = source[column]

# Running totals of every aggregate.
for column, running_name in (
    ('grand_total', 'cum_grand_total'),
    ('qty_ordered', 'cum_qty_ordered'),
    ('discount_amount', 'cum_discount_amount'),
    ('sku', 'cum_sku_cnt'),
    ('status', 'cum_status_cnt'),
):
    p[running_name] = p[column].cumsum()



In [None]:
# Date features derived from the created_at timestamp of each row of p.
created_dt = p['created_at'].dt
p['Dateofmonth'] = created_dt.day
p['Month'] = created_dt.month
# Series.dt.week is deprecated and was removed in pandas 2.0; isocalendar().week
# gives the same ISO week number. It comes back as UInt32, so cast to int64 to
# keep the dtype that .dt.week used to produce.
p['Week'] = created_dt.isocalendar().week.astype('int64')
p['Dayofweek'] = created_dt.dayofweek # 0 = monday.
# 1.0 for Monday-Friday (0-4), 0.0 for Saturday/Sunday (5-6).
p['Weekdayflg'] = (p['Dayofweek'] < 5).astype(float)
p['Quarter'] = created_dt.quarter
p['Dayofyear'] = created_dt.dayofyear

In [None]:
# Preview the first five rows of the per-date summary frame.
p.head()

## Daily Sales vs. Discount

In [None]:
# Plot grand_total and discount_amount against created_at on one figure,
# one lines+markers trace per column, each named after its column.
fig = go.Figure()
for column in ('grand_total', 'discount_amount'):
    fig.add_trace(
        go.Scatter(
            x=p['created_at'],
            y=p[column],
            mode='lines+markers',
            name=column,
        )
    )
fig.show()

### Cumulative Sums of Grand_Total and discount_amount

In [None]:
# Plot the running totals of grand_total and discount_amount against created_at.
# Each trace is named after the column it shows (the first trace was previously
# labelled 'xcum_grand_total' because of a typo).
fig = go.Figure()
for column in ('cum_grand_total', 'cum_discount_amount'):
    fig.add_trace(
        go.Scatter(
            x=p['created_at'],
            y=p[column],
            mode='lines+markers',
            name=column,
        )
    )
fig.show()

**In the graphs above, we can observe that sales were boosted when discount offers were initiated.**

But this conclusion can be tempting to draw without first looking into the item status.

In [None]:
# Sum grand_total per (Year, status) and draw one bar per year, coloured by status.
# The title now describes the chart instead of the placeholder "Long-Form Input".
n = dt.groupby(['Year', 'status'])['grand_total'].sum().reset_index()
fig = px.bar(
    n,
    x="Year",
    y="grand_total",
    color="status",
    title="Grand total by year and order status",
)
fig.show()

**In each year, the share of cancelled orders is high. We need to drop the cancelled items and recheck the sales growth.**

Note: We will do this after looking into other data points.

In [None]:
# Sum grand_total per (Year, payment_method) and draw one bar per year,
# coloured by payment method.
# The title now describes the chart instead of the placeholder "Long-Form Input".
n = dt.groupby(['Year', 'payment_method'])['grand_total'].sum().reset_index()
fig = px.bar(
    n,
    x="Year",
    y="grand_total",
    color="payment_method",
    title="Grand total by year and payment method",
)
fig.show()

### Order Status

In [None]:
# Sum grand_total per order status and label each bar with its value.
n = dt.groupby(['status'])['grand_total'].sum().reset_index()
fig = (
    px.bar(n, x='status', y='grand_total', text='grand_total')
    # Two-significant-digit labels placed above the bars.
    .update_traces(texttemplate='%{text:.2s}', textposition='outside')
    # Hide labels that would have to shrink below size 8 to fit.
    .update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
)
fig.show()

In [None]:
# Sum grand_total per (created_at, status), then show the distribution of those
# per-date sums as one box per status.
n = (
    dt.groupby(['created_at', 'status'])['grand_total']
    .sum()
    .reset_index()
)
px.box(n, y="grand_total", color="status")

### Category Type

In [None]:
# Sum grand_total per category and label each bar with its value.
n = dt.groupby(['category_name_1'])['grand_total'].sum().reset_index()
fig = (
    px.bar(n, x='category_name_1', y='grand_total', text='grand_total')
    # Two-significant-digit labels placed above the bars.
    .update_traces(texttemplate='%{text:.2s}', textposition='outside')
    # Hide labels that would have to shrink below size 8 to fit.
    .update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
)
fig.show()

In [None]:
# Sum grand_total per (category, status) and draw the statuses side by side
# within each category.
n = (
    dt.groupby(['category_name_1', 'status'])['grand_total']
    .sum()
    .reset_index()
)
fig = px.bar(
    n,
    x="category_name_1",
    y="grand_total",
    color='status',
    barmode='group',
)
fig.show()

## Payment Methods


In [None]:
# Sum grand_total per payment method and label each bar with its value.
n = dt.groupby(['payment_method'])['grand_total'].sum().reset_index()
fig = (
    px.bar(n, x='payment_method', y='grand_total', text='grand_total')
    # Two-significant-digit labels placed above the bars.
    .update_traces(texttemplate='%{text:.2s}', textposition='outside')
    # Hide labels that would have to shrink below size 8 to fit.
    .update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
)
fig.show()

## Growth Analysis

As analysed above, we need to drop the cancelled orders before looking at growth.


In [None]:
# Find the index labels of the rows whose status is exactly 'canceled',
# remove those rows, and show the shape that remains.
is_cancelled = dt['status'] == 'canceled'
ord_cncl_ind = dt.loc[is_cancelled].index
dt = dt.drop(index=ord_cncl_ind)
dt.shape

Recomputing daily figures

In [None]:
# Rebuild the per-date aggregates from the current contents of dt, using a
# single created_at grouping for all of them.
by_created = dt.groupby('created_at')

# Per-date sums.
dtg = by_created['grand_total'].sum().reset_index()
dtq = by_created['qty_ordered'].sum().reset_index()
dtd = by_created['discount_amount'].sum().reset_index()

# Per-date counts of non-null entries for the non-numeric columns.
dts = by_created['sku'].count().reset_index()
dtst = by_created['status'].count().reset_index()

# Assemble the summary frame, starting from the grand_total sums. Every source
# frame comes from the same grouping, so the rows line up by position.
p = pd.DataFrame(dtg)
for column, source in (
    ('qty_ordered', dtq),
    ('discount_amount', dtd),
    ('sku', dts),
    ('status', dtst),
):
    p[column] = source[column]

# Running totals of every aggregate.
for column, running_name in (
    ('grand_total', 'cum_grand_total'),
    ('qty_ordered', 'cum_qty_ordered'),
    ('discount_amount', 'cum_discount_amount'),
    ('sku', 'cum_sku_cnt'),
    ('status', 'cum_status_cnt'),
):
    p[running_name] = p[column].cumsum()



In [None]:
# Plot grand_total and discount_amount from the current summary frame against
# created_at, one lines+markers trace per column, each named after its column.
fig = go.Figure()
for column in ('grand_total', 'discount_amount'):
    fig.add_trace(
        go.Scatter(
            x=p['created_at'],
            y=p[column],
            mode='lines+markers',
            name=column,
        )
    )
fig.show()

## A quick view of a Regression model (OLS)

In [None]:
# Scatter grand_total against created_at and overlay an ordinary-least-squares trendline.
fig = px.scatter(p, x= 'created_at', y = 'grand_total', trendline = "ols")
fig.show()
# Retrieve the fitted trendline results from the figure; the bare expression
# on the last line displays them as the cell output.
results = px.get_trendline_results(fig)
results

### Density Graph

In [None]:
# Sum grand_total per created_at value and draw a density contour of those sums
# over time, with a histogram on each margin.
n = dt.groupby('created_at')['grand_total'].sum().reset_index()
px.density_contour(
    n,
    x="created_at",
    y="grand_total",
    marginal_x="histogram",
    marginal_y="histogram",
)

In [None]:
# Graph for quantity
n = dt.groupby('created_at')['qty_ordered'].sum().reset_index()
px.density_contour(n,x="created_at",y="qty_ordered",marginal_x="histogram",marginal_y="histogram", title="no of orders")

In [None]:
# Sum qty_ordered per (created_at, category, status) and plot each sum as a
# marker sized by quantity and coloured by status; category and status are
# added to the hover box.
n = (
    dt.groupby(['created_at', 'category_name_1', 'status'])['qty_ordered']
    .sum()
    .reset_index()
)
px.scatter(
    n,
    x="created_at",
    y="qty_ordered",
    color="status",
    size="qty_ordered",
    hover_data=['category_name_1', 'status'],
)



In [None]:
# Sum qty_ordered per (created_at, status) and draw one line per status.
n = (
    dt.groupby(['created_at', 'status'])['qty_ordered']
    .sum()
    .reset_index()
)
px.line(n, x="created_at", y="qty_ordered", color="status")

To be continued...

**You can fork this kernel and continue your analysis.**

**Way Forward**
* Data Cleansing at SKU and Status columns
* Segregate analysis by dropping Cancel status orders. 
* Quarterly, Monthly, Weekday and Weekend Analysis
* Seasonality Analysis
* What are the Trends in Top 10 Categories
* Weekly Moving Average Analysis