In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import FuncFormatter

# Apply seaborn's default styling (no arguments passed) before any figure is drawn.
sns.set()

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file found.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for name in names_in_folder:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Loading the dataset

* Load the dataset into a dataframe
* Parse the date columns
* Ignore the last five columns; they have null values and are not required for our analysis
* Drop rows in which every value is missing
* Show the information about the columns

In [None]:
# Read only the first 21 columns of the CSV and parse the two date columns while loading.
csv_path = '/kaggle/input/pakistans-largest-ecommerce-dataset/Pakistan Largest Ecommerce Dataset.csv'
df = pd.read_csv(
    csv_path,
    low_memory=False,
    usecols=list(range(21)),
    parse_dates=['created_at', 'Working Date'],
)
# Discard rows in which every selected column is missing.
df = df.dropna(how='all')
df.info()

## Rename columns

Remove whitespace from the column names and make them all lower case so the data is easier to read and process.

In [None]:
# New headers, assigned positionally: the order must match the columns as loaded.
df.columns = [
    'item_id',
    'status',
    'created_at',
    'sku',
    'price',
    'qty_ordered',
    'grand_total',
    'increment_id',
    'category_name_1',
    'sales_commission_code',
    'discount_amount',
    'payment_method',
    'working_date',
    'bi_status',
    'mv',
    'year',
    'month',
    'customer_since',
    'my',
    'fy',
    'customer_id',
]
# Cast the year and month columns to 32-bit integers.
df = df.astype({column: 'int32' for column in ('year', 'month')})
df.head()

## Top categories

Show the top categories in a bar chart.

In [None]:
# Count rows per category, keep the five largest counts, and chart them.
top_categories = df['category_name_1'].value_counts().nlargest(n=5)
ax = top_categories.plot.bar()
ax.set_title('Top categories');

## Year wise order status statistics

Find the order status statistics over the years and compare them in a bar chart.

In [None]:
# Distinct years present in the data, ascending; one subplot is drawn per year.
years = sorted(df.year.unique())
# squeeze=False makes subplots() always return a 2-D array of Axes. Without it a
# single-year dataset yields a bare Axes object and `axis[i]` fails. Taking row 0
# keeps `axis` a 1-D array that is indexed exactly as before.
fig, axis = plt.subplots(1, len(years), sharey=True, figsize=(10,5), squeeze=False)
axis = axis[0]

def plotstatus(year, ax):
    """Draw the five most frequent order statuses of one year as bars on ``ax``.

    Reads the notebook-level ``df``; ``year`` is compared against ``df.year``
    and ``ax`` is the matplotlib Axes that receives the bars and the title.
    """
    statuses_in_year = df.loc[df['year'] == year, 'status']
    top_counts = statuses_in_year.value_counts().nlargest(n=5)
    ax.bar(top_counts.index, top_counts.values)
    ax.set_title(f'Year {year!s}')
    # Rotate the x tick labels so they do not overlap.
    ax.tick_params(axis='x', labelrotation=90)

# Fill the panels left to right, one year per subplot.
for panel, year_value in zip(axis, years):
    plotstatus(year_value, panel)

fig.suptitle('Year wise order status')
plt.show()

In [None]:
# Count orders per (status, year) and pivot the years into columns (missing pairs -> 0).
order_price_status = df.groupby(['status', 'year']).size().unstack('year', fill_value=0)
# Sort by every year column actually present, in column order, instead of a
# hard-coded [2016, 2017, 2018] list that raises KeyError once the data covers
# other years. The result is rebound rather than sorted in place, so re-running
# the cell always starts from a freshly built table.
order_price_status = order_price_status.sort_values(
    by=order_price_status.columns.tolist(), ascending=False
)
# Chart only the five top-ranked statuses, one bar per year.
ax = order_price_status.head().plot(kind='bar')
ax.set_title('Year wise order status');

## Year wise payment method statistics

Find the payment method statistics over the years and compare them in a bar chart.

In [None]:
# One subplot per year. squeeze=False makes subplots() always return a 2-D array
# of Axes, so a single-year dataset no longer yields a bare Axes that breaks
# `axis[i]`. Taking row 0 keeps `axis` a 1-D array that is indexed as before.
fig, axis = plt.subplots(1, len(years), sharey=True, figsize=(10,5), squeeze=False)
axis = axis[0]

def plotstatus(year, ax):
    """Draw the five most frequent payment methods of one year as bars on ``ax``.

    Reads the notebook-level ``df``; ``year`` is compared against ``df.year``
    and ``ax`` is the matplotlib Axes that receives the bars and the title.

    NOTE(review): this rebinds ``plotstatus``, replacing the order-status
    version defined earlier in the notebook; re-running that earlier cell's
    loop after this cell would plot payment methods instead.
    """
    methods_in_year = df.loc[df['year'] == year, 'payment_method']
    top_counts = methods_in_year.value_counts().nlargest(n=5)
    ax.bar(top_counts.index, top_counts.values)
    ax.set_title(f'Year {year!s}')
    # Rotate the x tick labels so they do not overlap.
    ax.tick_params(axis='x', labelrotation=90)

# Fill the panels left to right, one year per subplot.
for panel, year_value in zip(axis, years):
    plotstatus(year_value, panel)

fig.suptitle('Year wise payment method')
plt.show()

## Revenue over the years

Calculate and show the revenue over the years in millions

In [None]:
def millions_formatter(x, pos):
    """Format a tick value as whole millions followed by `` m``.

    ``x`` is the raw tick value; ``pos`` is the tick position required by the
    ``FuncFormatter`` callback signature and is not used.
    """
    in_millions = x / 1_000_000
    return '{:0.0f} m'.format(in_millions)

# Revenue per year: sum of grand_total over rows whose status is 'complete'.
completed_orders = df.loc[df['status'] == 'complete']
price = completed_orders.groupby(['year'])['grand_total'].sum().astype(int)

fig, ax = plt.subplots(figsize=(3,4))
ax.set_title('Revenue by year in millions')
# Show the y axis in whole millions rather than raw currency units.
ax.yaxis.set_major_formatter(FuncFormatter(millions_formatter))
ax.bar(price.index, price.values);