In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input/'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

In [None]:
# Load the raw CSV once and leave it untouched in `data`;
# every later cleaning step works on the independent copy `df`.
csv_path = '/kaggle/input/pakistans-largest-ecommerce-dataset/Pakistan Largest Ecommerce Dataset.csv'
data = pd.read_csv(csv_path)
df = data.copy()

## Step 1: Data Pre-processing

In [None]:
# Preview the first rows of the working copy.
df.head()

In [None]:
# Preview the last rows of the working copy.
df.tail()

In [None]:
# List the column labels exactly as they were read from the CSV.
df.columns

In [None]:
# Show the dtype pandas inferred for each column.
df.dtypes

In [None]:
# Checking for missing / NaN values: number of null entries in each column
df.isnull().sum()

In [None]:
# Visual inspection of missingness: plot the boolean null mask of the whole frame
# (one heatmap column per DataFrame column, row tick labels and colour bar switched off).
null_mask = df.isnull()
sns.heatmap(
    null_mask,
    yticklabels=False,
    cbar=False,
    cmap='viridis',
)

##### Observations
- Out of 26 columns, last 5 columns in the dataset contain NaN values for all records
- Records at 464051 indices (from the bottom) contain NaN values for all columns
- ' MV ' is an ambiguous column name with extra spaces
- Some of the columns have incorrect data types

##### Actions
- Last 5 columns need to be dropped from the dataset
- 464051 rows, containing NaN values need to be dropped from the dataset
- Renamed the columns ' MV ' and 'category_name_1' to 'MV' and 'category_name'

In [None]:
# Remove the five empty trailing columns, discard rows that have no item_id,
# and give the two awkwardly named columns clean labels.
empty_columns = ["Unnamed: 21", "Unnamed: 22", "Unnamed: 23", "Unnamed: 24", "Unnamed: 25"]
df = (
    df.drop(columns=empty_columns)
      .dropna(subset=["item_id"])
      .rename(columns={" MV ": "MV", "category_name_1": "category_name"})
)

##### Dropping duplicate entries, if any, from the dataset

In [None]:
# Remove rows that duplicate an earlier row across all columns.
df=df.drop_duplicates()

##### Basic data quality and integrity checks

In [None]:
# Basic integrity checks, computed with vectorised comparisons instead of a
# Python-level loop over every row.
# Comparisons against NaN evaluate to False, so missing values are not counted.
non_positive_qty = (df['qty_ordered'] <= 0).sum()
negative_price = (df['price'] < 0).sum()
print("The number of rows with negative or zero Quantity:", non_positive_qty)
print("The number of rows with negative Price:", negative_price)

##### Convert all values in 'sku' column to upper case for uniformity

In [None]:
# Upper-case every SKU so the same code written in different cases compares equal.
df['sku']=df['sku'].str.upper()

#### Exploring all columns, finding and Imputing Null Values
#### Categorical Variables

In [None]:
# Frequency of each raw order status label (NaN entries are not counted by default).
df['status'].value_counts()

##### Observations
- There are a lot of labels for 'status' column.
- Need to check if any relationship exists between 'status' and 'BI Status' columns

In [None]:
# Count the order status labels separately within each 'BI Status' group.
df.groupby('BI Status')['status'].value_counts()

##### Observations
- All transactions marked as either **'complete' or 'closed'**, fall in the **'Net' category** for 'BI Status'
- All transactions marked as **'received','paid','cod','exchanged' or something related to refund** are marked in **'Valid' category**
- All transactions marked as **either 'canceled' or something to do with an incomplete transaction** are marked in **'Gross' category**
- '#REF!' looks like an erroneous label.

##### Actions
**Replace values inside the 'status' column by creating new labels**

- **'complete','closed','received','paid','cod'** will belong to category **'Completed'**
- **'order_refunded','refund', 'exchange'** will belong to category **'Refund'**
- **'pending','payment_review','processing','holded','pending_paypal','\N'** will belong to **'Pending'**
- **'canceled'** will belong to **'Cancelled'**
- **'fraud'** will belong to **'Fraud'**
**Also replace the '#REF!' entry with 'Net' in 'BI Status'**

In [None]:
# Collapse the raw order states into five business-level labels.
# A single dictionary-driven replace handles every exact-match label in one pass
# instead of re-scanning the whole column once per label.
status_labels = {
    'complete': 'Completed',
    'closed': 'Completed',
    'received': 'Completed',
    'paid': 'Completed',
    'cod': 'Completed',
    'order_refunded': 'Refund',
    'refund': 'Refund',
    'exchange': 'Refund',
    'pending': 'Pending',
    'payment_review': 'Pending',
    'processing': 'Pending',
    'holded': 'Pending',
    'pending_paypal': 'Pending',
    'fraud': 'Fraud',
    'canceled': 'Cancelled',
}
df['status'] = (
    df['status']
    .replace(status_labels)
    # The literal backslash-N placeholder keeps its original regex-based replacement.
    .replace(r'\\N', 'Pending', regex=True)
)

In [None]:
# Frequency of each order status label after the relabelling step.
df['status'].value_counts()

In [None]:
# Fold the spreadsheet error marker '#REF!' into the 'Net' category.
bi_status_fixes = {'#REF!': 'Net'}
df['BI Status'] = df['BI Status'].replace(bi_status_fixes)

In [None]:
# Frequency of each 'BI Status' label after the '#REF!' correction.
df['BI Status'].value_counts()

##### Handling Null values in 'status' column

In [None]:
# Show the rows whose order status is missing.
status_missing = df['status'].isna()
df.loc[status_missing]

##### Observation
- 15 NaN values in 'status' column have 'Gross' in the BI column meaning all these transactions are not valid

##### Actions
- Replacing NaN values with label **'Cancelled'** in line with our understanding of the data

In [None]:
# Label orders without a status as 'Cancelled'.
# The filled Series is assigned back to the column: calling fillna(inplace=True)
# on a selected column is chained assignment, which pandas deprecates and which
# leaves df unchanged when Copy-on-Write is enabled.
df['status'] = df['status'].fillna("Cancelled")

#### Handling NaN values in 'category_name' column

In [None]:
# Frequency of each product category label (NaN entries are not counted by default).
df['category_name'].value_counts()

##### Observations
- There are 164 NaN values in the **'category_name'** column that can be filled using some information from **'sku'** column. Not doing it right now
- 7850 transactions have a unicode label associated with them.
- 164 transactions have NaN values.

##### Actions
- Replacing the unicode label and NaN values with label 'Unknown'

In [None]:
# Map both the backslash-N placeholder and missing values to one 'Unknown' category.
# The result is assigned back to the column: fillna(inplace=True) on a selected
# column is chained assignment, which pandas deprecates and which leaves df
# unchanged when Copy-on-Write is enabled.
df['category_name'] = (
    df['category_name']
    .replace(r'\\N', 'Unknown', regex=True)
    .fillna("Unknown")
)

#### Handling NaN values in 'sku' column

In [None]:
# Show the rows that have no SKU.
sku_missing = df['sku'].isna()
df.loc[sku_missing]

##### Observations
- 20 NaN values for **'sku'** exist in the dataset and these values can be replaced.

##### Action
- Replace NaN values with a new sku code **'Missing'**

In [None]:
# Give rows without a SKU the placeholder code 'Missing'.
# The filled Series is assigned back to the column: fillna(inplace=True) on a
# selected column is chained assignment, which pandas deprecates and which
# leaves df unchanged when Copy-on-Write is enabled.
df['sku'] = df['sku'].fillna("Missing")

#### Handling missing values in 'Sales_commission_code' column

In [None]:
# Frequency of each sales commission code (NaN entries are not counted by default).
df['sales_commission_code'].value_counts()

In [None]:
# Show the rows that have no sales commission code.
commission_missing = df['sales_commission_code'].isna()
df.loc[commission_missing]

##### Observations
- The column has a large number of NaN values and there are more than 7000 types of values in this column
- The column does not seem to add any value for further analysis and can be dropped at a later stage
- At this stage, NaN values as well as unicode labels can be replaced with 'Missing'

##### Actions
- Replacing NaN and unicode values with **'Missing'**

In [None]:
# Replace missing values and the backslash-N placeholder with 'Missing'.
# The result is assigned back to the column: fillna(inplace=True) on a selected
# column is chained assignment, which pandas deprecates and which leaves df
# unchanged when Copy-on-Write is enabled.
df['sales_commission_code'] = (
    df['sales_commission_code']
    .fillna("Missing")
    .replace(r'\\N', 'Missing', regex=True)
)

#### Handling missing values in 'Customer ID' and 'Customer Since' columns

In [None]:
# Show the rows that have no customer identifier.
customer_missing = df['Customer ID'].isna()
df.loc[customer_missing]

##### Observations
- There are a total of 11 rows where the 'Customer ID' column is NaN and exactly the same rows in 'Customer since' are also NaN, which makes sense and shows that these columns have a relationship.
- All 11 records are from FY18, with the first record from 01-2018.
- To keep these records in the dataset for analysis, a placeholder 'Customer ID' value of '0' can be assigned, with '1-2018' assigned to all of these records in the 'Customer Since' column

##### Actions
- Replaced 'Customer ID' with value **'0'** and 'Customer Since' with value **'1-2018'** for all NaN values

In [None]:
# Placeholder values for rows that carry no customer information.
# Each filled Series is assigned back to its column: fillna(inplace=True) on a
# selected column is chained assignment, which pandas deprecates and which
# leaves df unchanged when Copy-on-Write is enabled.
# NOTE(review): "0" is a string placeholder; 'Customer ID' is cast to str in a later cell.
# NOTE(review): confirm "1-2018" matches the format of the existing 'Customer Since' values.
df['Customer ID'] = df['Customer ID'].fillna("0")
df['Customer Since'] = df['Customer Since'].fillna("1-2018")

#### Checking for Null values again and setting appropriate datatypes

In [None]:
# Recount null entries per column to check the effect of the imputation steps.
df.isnull().sum()

#### Convert the datatypes of columns

In [None]:
# Cast identifier columns to strings and the calendar / quantity columns to integers
# in a single astype call driven by a column -> dtype mapping.
# NOTE(review): a float column cast to str keeps its decimal part (e.g. a trailing ".0") - confirm this is acceptable for the ID columns.
column_dtypes = {
    "item_id": "str",
    "Month": "int",
    "Year": "int",
    "qty_ordered": "int",
    "Customer ID": "str",
    "increment_id": "str",
}
df = df.astype(column_dtypes)

In [None]:
# Summarise the column dtypes and non-null counts of the cleaned frame.
df.info()

In [None]:
# Renumber the rows 0..n-1 after the drops above.
# drop=True discards the old index instead of inserting it as an extra 'index'
# column; without it, re-running this cell keeps adding columns ('level_0') and
# then fails because the column already exists.
df = df.reset_index(drop=True)

## Step 2: Exploratory Data Analysis

### Is there a relationship between Payment method and Order Status?

##### We can explore the relationship between payment method and order status through the following steps

1. Examining both column labels
2. See yearly trends in Transactions and Sales by Payment Method
3. See yearly trends in Transactions and Sales by Order Status
4. Examine the combined effect of both on Transactions and Sales with and without the time dimension
5. Make a conclusion based on our observations from the trends

##### Examine both column labels
We have already explored and modified the labels of the 'status' column during pre-processing. Let's examine the **'payment_method'** column labels

In [None]:
# Frequency of each raw payment method label.
df['payment_method'].value_counts()

##### Observations
- **'Easypay'** and **'Easypay_MA'** can be combined under the label **'Easypay'**
- **'cod'** and **'cashatdoorstep'** can be combined under 'cod'
- **'marketingexpense'** and **'financesettlement'** having very low transactions can be combined under **'Others'**

##### Actions
- Combine 'Easypay' and 'Easypay_MA'
- Combine 'cod' and 'cashatdoorstep'
- Combine 'marketingexpense' and 'financesettlement' under 'Others'

In [None]:
# Merge near-duplicate and rarely used payment method labels via one lookup table.
payment_method_aliases = {
    'Easypay_MA': 'Easypay',
    'cashatdoorstep': 'cod',
    'marketingexpense': 'Others',
    'financesettlement': 'Others',
}
df['payment_method'] = df['payment_method'].replace(payment_method_aliases)

In [None]:
# Frequency of each payment method label after merging.
df['payment_method'].value_counts()

#### Examine yearly trends in Transactions and Sales by Payment Method

In [None]:
import plotly.express as px

# Per fiscal year and payment method: number of rows ('count') and total of grand_total ('sum'),
# computed in one named aggregation. `temp` is reused by the next plotting cell.
temp = (
    df.groupby(['FY', 'payment_method'])
      .agg(count=('grand_total', 'size'), sum=('grand_total', 'sum'))
      .reset_index()
)
fig = px.bar(
    temp,
    x="FY",
    y="count",
    color="payment_method",
    title="Yearly Transactions by Payment method",
)
fig.show()

In [None]:
# Stacked bars of summed grand_total per fiscal year, split by payment method.
# Relies on `temp` from the preceding aggregation cell.
fig = px.bar(
    temp,
    x="FY",
    y="sum",
    color="payment_method",
    title="Yearly Potential Revenue by Payment method",
)
fig.show()

##### Observations

**Transactions**
- **'cod' is the dominant method** for order placement over both FY17 and FY18
- **Payaxis** had a higher share for FY17 which has decreased in FY18
- **New digital or e-payment** methods like **Easypay, Easypay_voucher and bank alfalah** have started getting traction in FY18

**Total Sales**
- **Potential Revenue** has almost **doubled in FY18 as compared to FY17** due to new digital / e-payment methods.

##### Actions
- See the combined effect of Payment method with Order Status

#### Examine yearly trends in Transactions and Sales by Order Status

In [None]:
# Per fiscal year and order status: number of rows ('count'), total of grand_total ('sum'),
# and their ratio. `temp` is reused by the two plotting cells that follow.
temp = (
    df.groupby(['FY', 'status'])
      .agg(count=('grand_total', 'size'), sum=('grand_total', 'sum'))
      .reset_index()
)
temp['Sales per Transaction'] = temp['sum'] / temp['count']
fig = px.bar(
    temp,
    x="FY",
    y="count",
    color="status",
    title="Yearly Transactions by Order Status",
)
fig.show()

In [None]:
# Stacked bars of summed grand_total per fiscal year, split by order status.
# Relies on `temp` from the preceding aggregation cell.
fig = px.bar(
    temp,
    x="FY",
    y="sum",
    color="status",
    title="Yearly Potential Revenue by Order Status",
)
fig.show()

In [None]:
# Stacked bars of grand_total per row (sum / count) per fiscal year, split by order status.
# Relies on `temp` from the order-status aggregation cell.
fig = px.bar(
    temp,
    x="FY",
    y="Sales per Transaction",
    color="status",
    title="Yearly Sale per Transactions by Order Status",
)
fig.show()

##### Observations
- Trend for transactions and revenue is the same for both FY17 and FY18, but increase in revenue through **completed orders has almost doubled**, as seen from **Sales per transaction metric.**
- However, **revenue lost has also increased** from FY17 to FY18 (12k to 14.6K) per transaction, which is a worrying metric for the business.
- Trend for **Refund** is almost the same over both years.

##### Actions
- Examine the combined effect of order status and payment method on transaction / order qty and Sales
- Examine 'Completed','Cancelled' and 'Refund' transactions for payment method over time to get any insights

#### Relationship between Payment method and Order Status Frequency

In [None]:

# Total ordered quantity for every (payment method, order status) pair.
qty_groups = df.groupby(['payment_method', 'status'])['qty_ordered']
df1 = qty_groups.sum().reset_index(name='count')
fig = px.bar(
    df1,
    x="payment_method",
    y="count",
    color="status",
    title="Qty Ordered by Payment method",
)
fig.show()

##### Observations
- **Highest ordered qty** happened through **cod** 
- **Majority of Completed transactions** are also happening through **cod**
- Overall most ordered qty **(more than 50%)** through **Payaxis, jazzwallet, Easypay (which now includes Easypay_MA) and bankalfalah** are getting cancelled
- Most 'Refund' ordered qty taking place for 'cod'

##### Actions
- Is there a similar trend for Revenue as well because businesses are more interested in revenue than both number of transactions and ordered qty?

In [None]:
# Summed grand_total for every (payment method, order status) pair.
sales_groups = df.groupby(['payment_method', 'status'])['grand_total']
df1 = sales_groups.sum().reset_index(name='sum')
fig = px.bar(
    df1,
    x="payment_method",
    y="sum",
    color="status",
    title="Total Sales by Payment method",
)
fig.show()

##### Observations
- Unlike the qty ordered, **cod is not the highest contributor** towards potential revenue; it is **payaxis**
- For **completed** orders, overall **cod** still has a major share, with **easypay_voucher** the 2nd biggest contributor.
- For **cancelled** orders, **Payaxis, Easypay and bankalfalah** have a much higher contribution than the rest which clearly shows that these technologies have **technology integration issues** on the E-commerce store website.
- Bulk of the **Refunds** are happening through **cod** payment method
- **Cash payments (cod) and voucher based payment methods (easypay_voucher and jazzvoucher)** make up majority of the revenue generated through **Completed transactions** and the rest of the technologies have a major share towards **revenue loss via Cancelled transactions**

#### Actions
- Check the yearly trend for payment methods vs order status and compare it with the overall trend. It is better to split the dataset for Completed, Cancelled and Refunds and see if there is any trend

#### Yearly Sales for Completed Orders by Payment method

In [None]:
# Completed orders only: summed grand_total per fiscal year and payment method,
# plus each method's percentage share of its fiscal year's total (shown as bar text).
is_completed = df['status'].eq('Completed')
df1 = df.loc[is_completed, ['FY', 'payment_method', 'grand_total']]
df2 = df1.groupby(['FY', 'payment_method'])['grand_total'].sum().reset_index(name='sum')
yearly_total = df2.groupby('FY')['sum'].transform('sum')
df2['%'] = 100 * df2['sum'] / yearly_total
share_labels = df2['%'].map('{0:1.2f}%'.format)
fig = px.bar(
    df2,
    x="FY",
    y="sum",
    color="payment_method",
    text=share_labels,
    title="Yearly Sales for Completed Orders",
)
fig.show()

##### Observations
- In FY17, **cod, payaxis and jazzvoucher** were responsible for almost **87% of generated revenue**. This combined contribution has **reduced to half (almost 42%)** in FY2018
- **cod** share has almost reduced to half from FY17 to FY18. A 4% decrease also seen for payaxis
- In FY18, almost **55% of Sales contribution** was coming from **Easypay, Easypay_voucher and Bank AlFalah combined**, which has significantly increased from FY17

#### Yearly Revenue Lost through Cancelled Orders by Payment method

In [None]:
# Cancelled orders only: summed grand_total per fiscal year and payment method,
# plus each method's percentage share of its fiscal year's total (shown as bar text).
is_cancelled = df['status'].eq('Cancelled')
df1 = df.loc[is_cancelled, ['FY', 'payment_method', 'grand_total']]
df2 = df1.groupby(['FY', 'payment_method'])['grand_total'].sum().reset_index(name='sum')
yearly_total = df2.groupby('FY')['sum'].transform('sum')
df2['%'] = 100 * df2['sum'] / yearly_total
share_labels = df2['%'].map('{0:1.2f}%'.format)
fig = px.bar(
    df2,
    x="FY",
    y="sum",
    color="payment_method",
    text=share_labels,
    title="Yearly Revenue Lost through Cancelled Orders",
)
fig.show()

##### Observations
- For both years, **cod share is less than 10% towards revenue losses**.
- In both years, various **digital / e-payment methods** are contributing around **90% towards revenue losses from cancellations**
- Revenue losses due to payaxis have **decreased from 49% to 20%** from FY17 to FY18.

##### Actions
- Examine the monthly trend of cancelled transactions and revenue loss for FY18

#### Yearly Revenue Lost through Refund Orders by Payment method

In [None]:
# Refunded orders only: summed grand_total per fiscal year and payment method,
# plus each method's percentage share of its fiscal year's total (shown as bar text).
is_refund = df['status'].eq('Refund')
df1 = df.loc[is_refund, ['FY', 'payment_method', 'grand_total']]
df2 = df1.groupby(['FY', 'payment_method'])['grand_total'].sum().reset_index(name='sum')
yearly_total = df2.groupby('FY')['sum'].transform('sum')
df2['%'] = 100 * df2['sum'] / yearly_total
share_labels = df2['%'].map('{0:1.2f}%'.format)
fig = px.bar(
    df2,
    x="FY",
    y="sum",
    color="payment_method",
    text=share_labels,
    title="Yearly Revenue Lost through Refunded Orders",
)
fig.show()

##### Observations
- For both FY17 and FY18, Refunds are dominated by cod; the percentage share has decreased, but the amount is almost the same

### Conclusion
- E-commerce store users used **Cash and voucher based transactions** as the preferred method for FY17 and FY18 in terms of revenue generation through **Completed** transactions, but the **cod** payments %age saw a downward trend in FY18
- **Digital or E-payment methods were mainly responsible for making the revenue earned in FY18 double what it was in FY17**. However, due to a large number of **cancelled** transactions associated with these methods, there is a **strong possibility that the web portal faced integration challenges that resulted in many cancelled transactions**
- **Digital / E-payment have been a driver in revenue growth** but at the same time resulted in **more cancellations and potential revenue lost**.