In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objs as go

In [None]:
# Location of the raw CSV inside the Kaggle input mount.
csv_path = '/kaggle/input/pakistans-largest-ecommerce-dataset/Pakistan Largest Ecommerce Dataset.csv'
# low_memory=False makes pandas read the file in one pass instead of in chunks.
df = pd.read_csv(csv_path, low_memory=False)

In [None]:
# Peek at three random rows; the fixed seed keeps the displayed rows the same on every run.
df.sample(3, random_state=42)

In [None]:
# Size of the raw frame as a (rows, columns) tuple.
df.shape

In [None]:
# Summary of the raw frame (column dtypes and non-null counts) to spot empty columns and blank rows.
df.info()

# Dropping Null Columns & NA Data
* First, we drop the last five columns, which are empty.
* Second, the data contains 1048575 rows but most columns contain only 584524 records, so we drop the rows in which every column is null.


In [None]:
# Remove the empty trailing columns and the completely blank rows.
# how='all' only drops a column/row when every value in it is missing, so,
# unlike the positional slice df.iloc[:, :-5], re-running this cell is a no-op
# instead of silently cutting five more (real) columns.
# NOTE(review): assumes the only fully-empty columns are the five trailing ones — confirm with df.info().
df = df.dropna(axis='columns', how='all').dropna(axis='index', how='all')

In [None]:
# Shape after dropping the empty columns and blank rows.
df.shape  # the author observed 584524 records and 21 columns at this point

# Setting Column Names & Types
* Convert the column names to lower case, remove leading & trailing spaces with strip(), and use '_' to separate words
* Rename the column 'category_name_1' to 'category'
* Convert the date columns to datetime, and the quantity, year and month columns to int

In [None]:
# Normalise the headers (lower-case, trimmed, spaces -> underscores),
# then give the category column a shorter name.
df = (
    df.rename(columns=lambda label: label.lower().strip().replace(' ', '_'))
      .rename(columns={'category_name_1': 'category'})
)
df.columns

In [None]:
# Parse the two date columns, then cast the count-like columns to int.
df = df.assign(
    created_at=lambda frame: pd.to_datetime(frame['created_at']),
    working_date=lambda frame: pd.to_datetime(frame['working_date']),
).astype({'qty_ordered': int, 'year': int, 'month': int})
df.dtypes

In [None]:
# Frequency of each distinct order status (value_counts skips missing values by default).
df.status.value_counts()

# Consolidating the Status Column
* The statuses complete, received, cod & paid all appear to mean a completed order, so we relabel all of them as complete
* refund & order_refunded appear to mean the same thing, so we relabel refund as order_refunded

In [None]:
# Map every alias onto its canonical status label in a single replace call.
status_aliases = {
    'received': 'complete',
    'paid': 'complete',
    'cod': 'complete',
    'refund': 'order_refunded',
}
df['status'] = df['status'].replace(status_aliases)
df['status'].value_counts()

In [None]:
# Total order value per year, split by order status.
value_by_year_status = df.groupby(['year', 'status'])['grand_total'].agg('sum').reset_index()
fig = px.bar(
    value_by_year_status,
    x='year',
    y='grand_total',
    color='status',
    title='Year Wise Order-Status',
)
fig.show()

In [None]:
# Total order value per category, split by order status.
value_by_category_status = df.groupby(['category', 'status'])['grand_total'].agg('sum').reset_index()
fig = px.bar(
    value_by_category_status,
    x='category',
    y='grand_total',
    color='status',
    title='Category Wise Order-Status',
)
fig.show()

# Observations
* In every year and in every category, order cancellation is very high.

In [None]:
# The same labelled bar chart twice: first by number of orders, then by order value.
category_charts = (
    ('count', 'Top Category as Per Number of Orders'),
    ('sum', 'Top Category as Per Value of Orders'),
)
for how, chart_title in category_charts:
    per_category = df.groupby(['category'])['grand_total'].agg(how).reset_index()
    fig = px.bar(per_category, y='grand_total', x='category', text='grand_total', title=chart_title)
    # Show each bar's value above it in short SI notation; hide labels that would not fit.
    fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
    fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
    fig.show()

# Observations about Top Categories
1. By number of orders, the top categories are Mobiles & Tablets, Men's Fashion, Women's Fashion and then Appliances.
2. By value of orders, the top categories are Mobiles & Tablets, Appliances, Entertainment, Women's Fashion and then Computing.
3. Men's Fashion has 50% more orders than Women's Fashion, but by value of orders it is the other way round (vice versa).


In [None]:
# Number of orders per (year, category).
n = df.groupby(['year', 'category']).grand_total.count().reset_index()
fig = px.bar(n, x='year', y='grand_total', color='category', title='Year Wise Top Category as Per Number of Orders')
fig.show()

# Order value per (year, category): aggregated once and reused for both the
# stacked bar chart and the bubble chart below.
n = df.groupby(['year', 'category']).grand_total.sum().reset_index()
fig = px.bar(n, x='year', y='grand_total', color='category', title='Year Wise Category as Per Value of Orders')
fig.show()

# Same aggregate as a bubble chart; marker size encodes the order value.
fig = px.scatter(n, x='year', y='grand_total', color='category', size='grand_total', title='Year Wise Top Category as Per Value of Orders')
fig.show()

In [None]:
# Overwrite the month column with the three-letter month name derived from created_at.
df['month'] = df['created_at'].dt.month_name().str.slice(stop=3)
# Calendar order for the x axis; plotly would otherwise order months by first appearance in the data.
month_order = 'Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec'.split()

In [None]:
# Number of orders per (month, category); category_orders keeps the months in calendar order.
n = df.groupby(['month', 'category']).grand_total.count().reset_index()
fig = px.bar(n, x='month', y='grand_total', color='category', title='Month Wise Top Category as Per Number of Orders', category_orders={'month': month_order})
fig.show()

# Order value per (month, category): aggregated once and reused for both the
# stacked bar chart and the bubble chart below.
n = df.groupby(['month', 'category']).grand_total.sum().reset_index()
fig = px.bar(n, x='month', y='grand_total', color='category', title='Month Wise Category as Per Value of Orders', category_orders={'month': month_order})
fig.show()

# Same aggregate as a bubble chart; marker size encodes the order value.
fig = px.scatter(n, x='month', y='grand_total', color='category', size='grand_total', title='Month Wise Top Category as Per Value of Orders', category_orders={'month': month_order})
fig.show()

# Observation about Month Wise Orders
1. November is the best month for orders, followed by March and May. This is because November has the "Black (Good) Friday" sales, March has the 23rd March sales, and May has the Labor Day sales.

In [None]:
# Payment-method mix per year: first by number of orders, then by order value.
for how, chart_title in (
    ('count', 'Year Wise Payment Method as Per Number of Orders'),
    ('sum', 'Year Wise Payment Method as Per Value of Orders'),
):
    by_year_payment = df.groupby(['year', 'payment_method'])['grand_total'].agg(how).reset_index()
    fig = px.bar(by_year_payment, x='year', y='grand_total', color='payment_method', title=chart_title)
    fig.show()

In [None]:
# Order status totals: first by number of orders, then by order value.
status_charts = (
    ('count', 'Order Status as Per Number of Orders'),
    ('sum', 'Order Status as Per Value of Orders'),
)
for how, chart_title in status_charts:
    per_status = df.groupby(['status'])['grand_total'].agg(how).reset_index()
    fig = px.bar(per_status, x='status', y='grand_total', text='grand_total', title=chart_title)
    # Show each bar's value above it in short SI notation; hide labels that would not fit.
    fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
    fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
    fig.show()

In [None]:
# Total order value per payment method.
n = df.groupby(['payment_method']).grand_total.sum().reset_index()
# Titled like every other chart in the notebook so the figure stands on its own.
fig = px.bar(n, x='payment_method', y='grand_total', title='Payment Method as Per Value of Orders')

fig.show()

# Growth Analysis after Deleting Canceled Orders
* Since order cancellation is very high in every year and in every category, I delete the canceled orders in order to analyse the real sales pattern


In [None]:
# Index labels of the rows whose status is exactly 'canceled'.
order_canceled_index = df.index[df['status'].eq('canceled')]
# Rebind df to a frame without those rows rather than mutating it in place.
df = df.drop(index=order_canceled_index)

In [None]:
# Shape of the frame once the canceled orders have been removed.
df.shape

In [None]:
# Category ranking again on the current df: by number of orders, then by order value.
for how, chart_title in [
    ('count', 'Top Category as Per Number of Orders'),
    ('sum', 'Top Category as Per Value of Orders'),
]:
    category_totals = df.groupby(['category'])['grand_total'].agg(how).reset_index()
    fig = px.bar(
        category_totals,
        y='grand_total',
        x='category',
        text='grand_total',
        title=chart_title,
    )
    # Value labels above the bars in short SI notation; labels that would not fit are hidden.
    fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
    fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
    fig.show()

In [None]:
# Keep df in chronological order for the time-series views that follow.
df = df.sort_values('created_at')

# New dataset p: one row per created_at value, built in a single groupby pass.
# Named aggregation produces all four measures together, so the columns cannot
# drift out of alignment the way four separately built frames could.
#   grand_total / qty_ordered / discount_amount -> sum over the group
#   status                                      -> number of non-null status entries
p = (
    df.groupby('created_at')
      .agg(
          grand_total=('grand_total', 'sum'),
          qty_ordered=('qty_ordered', 'sum'),
          discount_amount=('discount_amount', 'sum'),
          status=('status', 'count'),
      )
      .reset_index()
)

# Cumulative sums of each measure over time.
p['cum_gt'] = p['grand_total'].cumsum()
p['cum_qty'] = p['qty_ordered'].cumsum()
p['cum_disc'] = p['discount_amount'].cumsum()
p['cum_stat'] = p['status'].cumsum()

In [None]:
# Display the per-created_at aggregate table with its cumulative columns.
p

# Daily Sales Vs Discount

In [None]:
# Order value against discount amount for each created_at value.
fig = go.Figure(data=[
    go.Scatter(x=p['created_at'], y=p[column], mode='lines+markers', name=column)
    for column in ('grand_total', 'discount_amount')
])
fig.show()

# Running totals of the same two measures.
fig = go.Figure(data=[
    go.Scatter(x=p['created_at'], y=p[column], mode='lines+markers', name=label)
    for column, label in (('cum_gt', 'cum_grand_total'), ('cum_disc', 'cum_discount_amount'))
])
fig.show()

# Observations
* The graphs above show that sales were boosted when discount offers were introduced.

# A Quick View of a Regression Model (OLS)

In [None]:
# Scatter of order value over time with an ordinary-least-squares trendline overlaid.
fig = px.scatter(
    data_frame=p,
    x='created_at',
    y='grand_total',
    trendline='ols',
)
fig.show()

# Pull the fitted trendline results back out of the figure and display them.
results = px.get_trendline_results(fig)
results

# Density Graph

In [None]:
# Density of total order value over time, with marginal histograms on both axes.
value_over_time = df.groupby('created_at')['grand_total'].agg('sum').reset_index()
px.density_contour(
    value_over_time,
    x='created_at',
    y='grand_total',
    marginal_x='histogram',
    marginal_y='histogram',
)

In [None]:
# Density of total quantity ordered over time, with marginal histograms on both axes.
quantity_over_time = df.groupby('created_at')['qty_ordered'].agg('sum').reset_index()
px.density_contour(
    quantity_over_time,
    x='created_at',
    y='qty_ordered',
    marginal_x='histogram',
    marginal_y='histogram',
    title='No of Orders',
)

In [None]:
# Quantity ordered per (created_at, category, status); colour shows status, marker size shows quantity.
quantity_by_group = (
    df.groupby(['created_at', 'category', 'status'])['qty_ordered']
      .agg('sum')
      .reset_index()
)
px.scatter(
    quantity_by_group,
    x='created_at',
    y='qty_ordered',
    color='status',
    size='qty_ordered',
    hover_data=['category', 'status'],
)

In [None]:
# Quantity ordered over time, one line per order status.
quantity_by_status = (
    df.groupby(['created_at', 'status'])['qty_ordered']
      .agg('sum')
      .reset_index()
)
px.line(quantity_by_status, x='created_at', y='qty_ordered', color='status')