In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib as mpl
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objs as go

import plotly as py
from plotly import tools
from plotly.offline import iplot
from plotly.subplots import make_subplots
import seaborn as sns
sns.set()

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file that Kaggle mounted under the input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the raw order export; `created_at` is parsed into datetimes while reading.
data = pd.read_csv(
    r'../input/pakistans-largest-ecommerce-dataset/Pakistan Largest Ecommerce Dataset.csv',
    parse_dates=['created_at'],
)
print("Data Dimensions are: ", data.shape)

# Normalise the headers: lower-case them and turn spaces into underscores.
data.columns = data.columns.str.lower().str.replace(" ", "_")
print("Columns: ", data.columns)

In [None]:
# Preview the first rows of the loaded frame to sanity-check the read and the renamed headers.
data.head()

In [None]:
# Data exploration: column names, non-null counts and dtypes in a single summary.
data.info()

Let's check the data types of the columns.

In [None]:
# Show the dtype pandas inferred for every column.
data.dtypes

**Checking for missing values**

In [None]:
# Count the missing entries in each column of the raw frame.
null_counts = data.isna().sum()
print(null_counts)

In [None]:
# Visualizing the percentage of null values per column.
# isnull().mean() is a 0-1 fraction, so scale it by 100 to match the
# "Percentage" label on the y-axis.
missing_pct = data.isnull().mean().mul(100)
missing_pct.plot.bar(figsize=(12,6))
plt.ylabel('Percentage of missing values')
plt.xlabel('Variables')
plt.title('Quantifying missing data')

**The visualization above shows the share of missing values for each column in the dataset:**
1. Columns Unnamed 21 to Unnamed 25 are 100% null
2. Column sales_commission_code has more than 50% null values
3. All the remaining columns have about 40% null values

In [None]:
# Remove the five entirely empty trailing columns, then the rows that carry
# no values at all, and finally tidy up two awkward column names.
empty_columns = ["unnamed:_21", "unnamed:_22", "unnamed:_23", "unnamed:_24", "unnamed:_25"]
data = (
    data
    .drop(columns=empty_columns)
    .dropna(how='all', axis=0)
    .rename(columns={"_mv_": "mv", "category_name_1": "category_name"})
)

In [None]:
# Re-count missing entries per column now that empty rows/columns are gone.
remaining_nulls = data.isna().sum()
print(remaining_nulls)

In [None]:
# Inspect the distinct raw values of the categorical `status` column
# (unique() also reports NaN if the column has missing entries).
data['status'].unique()

In [None]:
# Count how often each raw `status` occurs within every `bi_status` group.
data.groupby('bi_status')['status'].value_counts()

**From the observations above, we conclude that all statuses falling under the Gross group can be marked as Cancelled, while orders in the Net and Valid groups can be considered Completed.**

In [None]:
# Treat the literal "\N" placeholder in `status` as a cancelled order.
# The raw string r'\\N' is the regex for a backslash followed by N; because
# regex=True, every match inside a status string is substituted.
data['status'] = data['status'].replace(r'\\N', 'Cancelled', regex=True)

In [None]:
# For simplicity we merge all raw statuses into Completed, Cancelled and Refund.
dict_status = {'Completed':['complete','closed','received','cod','paid','exchange','payment_review','pending','processing','holded','pending_paypal'],'Refund':['order_refunded','refund'], 'Cancelled':['canceled','fraud',np.nan]}

# Flatten the group -> [raw statuses] table into a raw status -> group lookup.
# NaN is skipped here and handled by fillna below, since NaN makes a poor dict key.
status_lookup = {
    raw: group
    for group, raw_values in dict_status.items()
    for raw in raw_values
    if not pd.isna(raw)
}

# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: the inplace form acts on a temporary object and is not
# guaranteed to update `data` (it is a no-op under pandas Copy-on-Write).
data['status'] = data['status'].replace(status_lookup)
# Missing statuses are listed under 'Cancelled' in dict_status.
data['status'] = data['status'].fillna('Cancelled')

In [None]:
# Number of rows per status after the merge step.
data['status'].value_counts()

In [None]:
# Bar chart of how many rows carry each status.
status_counts = data['status'].value_counts()
status_counts.plot.bar()
plt.title('Status  - Distinct Counts')
plt.ylabel('Count')
plt.xticks(rotation=0)

# **Let's check the columns for duplicate entries and non-relevant data**

In [None]:
# Look at the candidate columns side by side to judge which ones are redundant.
columns_to_review = [
    'created_at', 'working_date', 'sku', 'qty_ordered', 'price',
    'grand_total', 'mv', 'discount_amount', 'sales_commission_code', 'customer_id',
]
data[columns_to_review].head()

**We can drop the columns working_date, mv, sales_commission_code and other irrelevant columns, because all the relevant information is already available in created_at, grand_total and discount_amount.**

In [None]:
# Drop the columns judged redundant or irrelevant for the analysis.
redundant_columns = ['working_date', 'mv', 'increment_id', 'bi_status', 'sales_commission_code']
data = data.drop(columns=redundant_columns)

In [None]:
# Re-check the column summary after dropping the redundant columns.
data.info()

In [None]:
# Missing-value count per remaining column.
data.isnull().sum()

**Checking null/not defined values in categorical variable category_name**

In [None]:
# Frequency of every category value, with NaN kept as its own bucket.
category_counts = data['category_name'].value_counts(dropna=False)
print("Count Different Categories: ")
print(category_counts)

In [None]:
# Collect every distinct SKU that occurs on at least one row whose category is the '\N' placeholder.
is_placeholder_category = data['category_name'] == r'\N'
skunique = data.loc[is_placeholder_category, 'sku'].unique().tolist()

In [None]:
# Pull all rows for those SKUs (any category) and see which real categories they also appear under.
has_placeholder_sku = data['sku'].isin(skunique)
sku_nil = data[has_placeholder_sku]
sku_nil['category_name'].value_counts()

In [None]:
# SKUs that carry the '\N' placeholder category also occur under real categories.
# For each real category (in the order listed below) copy that category onto the
# placeholder rows of its SKUs; placeholder rows that stay unmatched are handled
# in the next step.
cat=["Men's Fashion",'Others','Superstore','Mobiles & Tablets',"Women's Fashion",'Entertainment','Appliances']
dict_sku={}
for key in cat:
    # SKUs seen under this category among the rows collected in sku_nil.
    dict_sku[key] = sku_nil.loc[sku_nil['category_name'] == key, 'sku'].unique().tolist()
    # The placeholder mask is recomputed every pass: rows filled by an earlier
    # category must not be overwritten by a later one.
    still_placeholder = data['category_name'] == r'\N'
    data.loc[data['sku'].isin(dict_sku[key]) & still_placeholder, 'category_name'] = key


In [None]:
# Whatever is still tagged with the '\N' placeholder, or has no category at all,
# goes into the generic 'Others' bucket.
data.loc[data['category_name'] == r'\N', 'category_name'] = 'Others'
# fillna is the direct way to replace missing values; the regex flag used
# previously has no meaning for NaN.
data['category_name'] = data['category_name'].fillna('Others')

In [None]:
# Row count per category once every '\N' / null category has been reassigned.
category_totals = data['category_name'].value_counts(dropna=False)
category_totals.plot.bar(figsize=(12,6))
plt.title('Sales Category Wise')
plt.ylabel('Sales')

**Checking categorical variable sku for null entries**

In [None]:
# Show the rows whose SKU is missing.
data[data['sku'].isnull()]

In [None]:
# Of the 20 rows with a null SKU, most are cancelled or refunded orders with
# grand_total 0, so the missing SKUs are tagged with the marker "sku_nan".
# Assign the result back: fillna(..., inplace=True) on a column selection is
# not guaranteed to update `data` (it is a no-op under pandas Copy-on-Write).
data['sku'] = data['sku'].fillna("sku_nan")

In [None]:
# Show the rows whose customer_id is missing (customer_since can be inspected on the same rows).
data[data['customer_id'].isnull()]

In [None]:
# Most rows with a null customer_id are cancelled or refunded orders.
# Replace the null customer_id with "0" and the null customer_since with "1-2018".
# Assign the results back: fillna(..., inplace=True) on a column selection is
# not guaranteed to update `data` (it is a no-op under pandas Copy-on-Write).
# NOTE(review): "0" is a string; if customer_id is numeric this leaves the
# column with mixed types. Confirm whether a numeric 0 is wanted.
data['customer_id'] = data['customer_id'].fillna("0")
data['customer_since'] = data['customer_since'].fillna("1-2018")

In [None]:
# Negative grand totals are floored at zero; all other values stay as they are.
data['grand_total'] = data['grand_total'].clip(lower=0)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns after cleaning.
print(data.describe())

# **Exploring different consumer patterns**

In [None]:
# Filter all completed orders for the distribution analysis.
# .copy() makes this an independent frame: later cells add and overwrite
# columns on dataordercomp, and without the copy those writes would target a
# slice of `data` (SettingWithCopyWarning, results not guaranteed).
dataordercomp = data[data['status'] == 'Completed'].copy()

In [None]:
# Price density per order status for rows with a positive price.
# `fill=True` replaces the deprecated `shade=True` keyword of sns.kdeplot.
g = (
    sns.FacetGrid(data[data['price'] > 0], hue='status', height=5, aspect=2)
    .map(sns.kdeplot, 'price', fill=True)
    .add_legend()
)
# Zoom in on the 0-5000 price range.
plt.xlim(0,5000)


**The ratio of order cancellations and refunds increases above a price tag of 2000/-**

In [None]:
# For rows priced above 5000, compare the density of completed vs cancelled orders.
# `fill=True` replaces the deprecated `shade=True` keyword of sns.kdeplot.
g = (
    sns.FacetGrid(data[data['price'] > 5000], hue='status', height=5, aspect=2)
    .map(sns.kdeplot, 'price', fill=True)
    .add_legend()
)

# Restrict both axes so the low-density tail stays readable.
plt.xlim(5000,60000)
plt.ylim(0.00000,0.00006)


**Ratio of order cancellation is high for orders greater than 5000 but there is improvement in price range 20000 to 25000**

In [None]:
# Scatter plot of category and price for completed orders.
fig = px.scatter(data_frame=dataordercomp, y='price', x='category_name')
fig.show()

In [None]:
def displaygraph(df,groupbycol,groupbycriteria,title,xlabel,ylabel):
    """Draw one lines+markers trace per distinct value of `groupbycriteria`.

    For every distinct value found in df[groupbycriteria], the matching rows
    are grouped by `groupbycol` and counted; the counts are plotted against
    the `groupbycol` values as a separate trace named after that value.

    Parameters:
        df: source DataFrame.
        groupbycol: column whose values form the x-axis.
        groupbycriteria: column whose distinct values each become a trace.
        title, xlabel, ylabel: figure title and axis titles.

    The figure is shown with fig.show(); nothing is returned.
    """
    fig = go.Figure()

    for level in df[groupbycriteria].unique().tolist():
        # Rows for this trace, counted per x-axis value.
        subset = df[df[groupbycriteria] == level]
        counts = subset.groupby(groupbycol)[groupbycriteria].count().reset_index()
        fig.add_trace(
            go.Scatter(
                x=counts[groupbycol],
                y=counts[groupbycriteria],
                mode='lines+markers',
                name=level,
            )
        )

    fig.update_layout(
        title_text=title,
        xaxis_title_text=xlabel,
        yaxis_title_text=ylabel,
    )
    fig.show()
    

In [None]:
# Order status counts per month-year.
displaygraph(
    df=data,
    groupbycol='m-y',
    groupbycriteria='status',
    title='Order Status by Month',
    xlabel='Month',
    ylabel='Count',
)

In [None]:
# Order status counts per category.
displaygraph(
    df=data,
    groupbycol='category_name',
    groupbycriteria='status',
    title='Order Status by Category',
    xlabel='Order Category',
    ylabel='Count',
)

In [None]:
# Order status counts per payment method.
displaygraph(
    df=data,
    groupbycol='payment_method',
    groupbycriteria='status',
    title='Order Status by Payment Method',
    xlabel='Payment Method',
    ylabel='Count',
)

In [None]:
# Work with whole-number prices, then rank the categories by summed price.
dataordercomp['price'] = dataordercomp['price'].astype(int)
category_sales = (
    dataordercomp
    .groupby('category_name')['price']
    .sum()
    .sort_values(ascending=False)
)
print('Categories Highest Income')
print(category_sales)
category_sales.plot.bar(figsize=(12,6))
plt.title('Categories Sorted by Sales')
plt.ylabel('Sales')


In [None]:
# Price distribution (violin plots) for the top income-generating categories.
lscategory=["Mobiles & Tablets",'Appliances','Entertainment']

# One violin trace per category, keyed by category name.
dict_traces={}
for n in lscategory:
    dfcat=dataordercomp[dataordercomp['category_name']==n]
    dict_traces[n] = go.Violin(x=dfcat['category_name'],y =dfcat['price'], meanline_visible = True)

cdata=list(dict_traces.values())

# Build the figure first and only then style it. Previously update_layout ran
# before this figure existed, so the title and axis settings were applied to
# whatever `fig` an earlier cell had left behind.
fig = go.Figure(data = cdata)
fig.update_layout(
    title_text='Distribution of Top Income generating Categories',
    yaxis_title_text='price', # yaxis label
    showlegend=False
    )
iplot(fig)

**Actual customer base for all top categories is below 40K with average price of around 11K for Mobiles & Tablets, 8,701 for Appliances and 17.7K for Entertainment. 
Upper fence for Mobiles & Tablets:34k
Upper fence for Appliances:19.6k
Upper fence for Entertainment:39.9k**

In [None]:
# Silence pandas' chained-assignment warning for the column additions below.
pd.set_option('mode.chained_assignment', None)

# Calendar quarter of the order date as 'Q1'..'Q4'. Built directly from the
# quarter number rather than via replace(..., inplace=True) on a column
# selection, which is not guaranteed to update the frame.
dataordercomp['quarter'] = 'Q' + dataordercomp['created_at'].dt.quarter.astype(str)
# Combined key: the value of `fy`, a dash, then the quarter label.
dataordercomp['q-y'] = dataordercomp['fy'].astype(str) + '-' + dataordercomp['quarter']

In [None]:
# Summed price of completed orders per quarter.
quarterly_totals = dataordercomp.groupby(['q-y'])['price'].sum()
qigroup = quarterly_totals.reset_index(name='Total Sales')
fig = px.bar(
    qigroup,
    x='q-y',
    y='Total Sales',
    title="Quarterly Sales",
    labels={'q-y':'Quarters'},
)
fig.show()

In [None]:
# Summed price per quarter and category, ordered by ascending sales before
# being handed to the stacked bar chart.
qcgroup = (
    dataordercomp
    .groupby(['q-y','category_name'], sort=True)['price']
    .sum()
    .reset_index(name='Sales')
    .sort_values(by=['Sales'], ascending=[True])
)
fig = px.bar(
    qcgroup,
    x='q-y',
    y='Sales',
    color='category_name',
    title="Quarterly Category Wise Sale",
    labels={'q-y':'Quarters','category_name':'Category name'},
)
fig.show()

**Top 4 consistent profitable categories are Mobiles & Tablets, Appliances, Entertainment and Women's Fashion**

In [None]:
# The 15 most frequent (sku, category, price) combinations among completed orders in FY18-Q4.
in_fy18_q4 = dataordercomp['q-y'] == 'FY18-Q4'
q4_items = dataordercomp.loc[in_fy18_q4, ['sku','category_name','price']]
dfq4 = q4_items.value_counts().head(15).reset_index(name='Number of Orders')
dfq4 = dfq4.sort_values(by=['Number of Orders'], ascending=[False])
print('Top 15 selling Items of Quarter-FY18-Q4')
print(dfq4)
fig = px.bar(
    dfq4,
    x='category_name',
    y='Number of Orders',
    color='sku',
    title="Top 15 selling Items of Quarter-FY18-Q4",
    labels={'q-y':'Quarters','category_name':'Category name'},
    hover_name="sku",
    hover_data=["category_name", "Number of Orders", "price"],
)
fig.show()

In [None]:
# Number of completed order rows per quarter and category, ordered by ascending count.
orders_per_quarter = dataordercomp.groupby('q-y')['category_name'].value_counts()
qorders = (
    orders_per_quarter
    .reset_index(name='Number of Orders')
    .sort_values(by=['Number of Orders'], ascending=[True])
)
fig = px.bar(
    qorders,
    x='q-y',
    y='Number of Orders',
    color='category_name',
    title="Quarterly Category Wise Sale-Number of Orders",
    labels={'q-y':'Quarters','category_name':'Category name'},
)
fig.show()

In [None]:
def human_format(num):
    """Format a number with two decimals and a thousands suffix.

    The value is divided by 1000 until its magnitude drops below 1000 or the
    largest suffix is reached, e.g. 1500 -> '1.50K', 1234567 -> '1.23M'.

    Parameters:
        num: int or float to format (may be negative).

    Returns:
        str: the scaled value formatted as '%.2f' followed by one of
        '', 'K', 'M', 'B', 'T', 'P'. Values too large for 'P' keep the 'P'
        suffix with a larger mantissa instead of raising IndexError.
    """
    # add more suffixes if you need them
    suffixes = ['', 'K', 'M', 'B', 'T', 'P']
    magnitude = 0
    # Stop at the last suffix so huge inputs cannot index past the list.
    while abs(num) >= 1000 and magnitude < len(suffixes) - 1:
        magnitude += 1
        num /= 1000.0
    return '%.2f%s' % (num, suffixes[magnitude])


In [None]:
# Completed orders priced above fifty thousand, kept as a one-column frame so
# pabovefifty['price'] is a plain column lookup (previously a Series was
# stored inside a Series under the label 'price').
pabovefifty = dataordercomp.loc[dataordercomp['price'] > 50000, ['price']]

# Split the prices into five equal-frequency intervals and count orders per interval.
# rename_axis names the interval column explicitly, so the result does not
# depend on what reset_index calls the index column in a given pandas version.
pbins = (
    pd.qcut(pabovefifty['price'], q=5)
    .value_counts()
    .rename_axis('price')
    .reset_index(name='No of Orders')
)
# Interval objects are turned into strings so they can serve as pie labels.
pbins['price'] = pbins['price'].astype(str)
fig = px.pie(pbins, values='No of Orders', names='price', title='Price Intervals For Orders having Price above Fifty Thousand- Total Sales: Rs%s' % human_format(pabovefifty['price'].sum()))
fig.show()

In [None]:
# Completed orders priced below fifty thousand, kept as a one-column frame so
# pbelowfifty['price'] is a plain column lookup (previously a Series was
# stored inside a Series under the label 'price').
# NOTE(review): orders priced at exactly 50000 match neither this filter nor
# the "above fifty thousand" one. Confirm whether that is intended.
pbelowfifty = dataordercomp.loc[dataordercomp['price'] < 50000, ['price']]

# Split the prices into five equal-frequency intervals and count orders per interval.
# rename_axis names the interval column explicitly, so the result does not
# depend on what reset_index calls the index column in a given pandas version.
pbins = (
    pd.qcut(pbelowfifty['price'], q=5)
    .value_counts()
    .rename_axis('price')
    .reset_index(name='No of Orders')
)
# Interval objects are turned into strings so they can serve as pie labels.
pbins['price'] = pbins['price'].astype(str)
fig = px.pie(pbins, values='No of Orders', names='price', title='Price Intervals For Orders having Price below Fifty Thousand- Total Sales: Rs%s' % human_format(pbelowfifty['price'].sum()))
fig.show()

In [None]:
# Time-based sales analysis: number of completed order rows per `created_at` value.
orders_per_timestamp = dataordercomp.groupby(['created_at']).size()
df = orders_per_timestamp.to_frame(name='order_count')


In [None]:
# Order counts summed into weekly buckets.
weekly = df.resample('W').sum()
weekly.plot(figsize=(12,6))
plt.xlabel('Date')
plt.ylabel('Weekly Order count');

In [None]:
# Daily order counts, smoothed with a centred 30-day rolling sum.
daily = df.resample('D').sum()
daily.rolling(30, center=True).sum().plot(figsize=(12,6))
# The series is a 30-day rolling sum of daily counts, so label it as such
# (the previous label described a mean hourly count).
plt.ylabel('30-day rolling order count')
plt.xlabel('Date');

In [None]:
# Average order count per day of the week (dayofweek 0 is Monday).
weekday_labels = ['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun']
weekday_means = df.groupby(df.index.dayofweek).mean()
by_weekday = weekday_means.set_axis(weekday_labels, axis=0)
by_weekday.plot(figsize=(12,6));

The peak sales months are November 2016 and November 2017, and Friday has, on average, the highest number of orders of any weekday. This may be due to Friday sales.

In [None]:
# Daily order counts within November 2016 (partial-string date selection on the index).
november_2016 = df.loc['11-2016']
daily = november_2016.resample('D').sum()
daily.plot(figsize=(12,6))
plt.title('Daily Order count for November-2016')
plt.xlabel('Date')
plt.ylabel('Order count');

In [None]:
# Daily order counts within November 2017 (partial-string date selection on the index).
november_2017 = df.loc['11-2017']
daily = november_2017.resample('D').sum()
daily.plot(figsize=(12,6))
plt.title('Daily Order count for November-2017')
plt.xlabel('Date')
plt.ylabel('Order count');

Working on further pattern exploration. Please comment.