In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input/'):
    full_paths = (os.path.join(folder, name) for name in names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

In [None]:
# Load the raw CSV once; `data` stays untouched and all cleaning happens on the copy `df`.
CSV_PATH = '/kaggle/input/pakistans-largest-ecommerce-dataset/Pakistan Largest Ecommerce Dataset.csv'
data = pd.read_csv(CSV_PATH)
df = data.copy(deep=True)

## Step 1: Data Pre-processing

In [None]:
# Preview the first five rows
df.head(n=5)

In [None]:
# Preview the last five rows
df.tail(n=5)

In [None]:
# List the column labels of the DataFrame
df.columns

In [None]:
# Show the dtype pandas inferred for each column
df.dtypes

In [None]:
# Count the missing (NaN) entries in each column
df.isna().sum()

In [None]:
# Draw the missing-value mask of every column as a heatmap
# (row tick labels and the colour bar are switched off).
sns.heatmap(
    data=df.isna(),
    cmap='viridis',
    cbar=False,
    yticklabels=False,
)

##### Observations
- Out of 26 columns, last 5 columns in the dataset contain NaN values for all records
- The last 464051 rows of the dataset contain NaN values in every column
- ' MV ' is an ambiguous column name with extra spaces
- Some of the columns have incorrect data types

##### Actions
- Last 5 columns need to be dropped from the dataset
- 464051 rows, containing NaN values need to be dropped from the dataset
- The columns ' MV ' and 'category_name_1' need to be renamed to 'MV' and 'category_name'

In [None]:
# Remove the five unnamed trailing columns, discard rows that have no item_id,
# and tidy up two column names — expressed as one chain instead of in-place calls.
unnamed_columns = ["Unnamed: 21", "Unnamed: 22", "Unnamed: 23", "Unnamed: 24", "Unnamed: 25"]
df = (
    df.drop(columns=unnamed_columns)
    .dropna(subset=["item_id"])
    .rename(columns={" MV ": "MV", "category_name_1": "category_name"})
)

##### Dropping duplicate entries, if any, from the dataset

In [None]:
# Keep only the first occurrence of each fully duplicated row
df = df.drop_duplicates(keep='first')

##### Basic data quality and integrity checks

In [None]:
# Sanity checks on quantity and price. The comparisons are vectorised, so the
# counts are computed without iterating over every row in Python.
non_positive_qty = (df['qty_ordered'] <= 0).sum()
negative_price = (df['price'] < 0).sum()

print("The number of rows with negative or zero Quantity:", non_positive_qty)
print("The number of rows with negative Price:", negative_price)

##### Convert all values in 'sku' column to upper case for uniformity

In [None]:
# Upper-case every SKU so that differently-cased codes compare equal
df = df.assign(sku=lambda frame: frame['sku'].str.upper())

#### Exploring all columns, finding and Imputing Null Values
#### Categorical Variables

In [None]:
# Frequency of each raw label in the 'status' column
raw_status = df['status']
raw_status.value_counts()

##### Observations
- There are a lot of labels for 'status' column.
- Need to check if any relationship exists between 'status' and 'BI Status' columns

In [None]:
# Frequency of each order status within every 'BI Status' group
status_within_bi = df.groupby('BI Status')['status']
status_within_bi.value_counts()

##### Observations
- All transactions marked as either **'complete' or 'closed'**, fall in the **'Net' category** for 'BI Status'
- All transactions marked as **'received', 'paid', 'cod', 'exchange' or something related to refund** fall in the **'Valid' category**
- All transactions marked as **either 'canceled' or something to do with an incomplete transaction** fall in the **'Gross' category**
- '#REF!' looks like an erroneous label.

##### Actions
**Replace values inside the 'status' column by creating new labels**

- **'complete','closed','received','paid','cod'** will belong to category **'Completed'**
- **'order_refunded','refund', 'exchange'** will belong to category **'Refund'**
- **'pending','payment_review','processing','holded','pending_paypal','\N'** will belong to **'Pending'**
- **'canceled'** will belong to **'Cancelled'**
- **'fraud'** will belong to **'Fraud'**

**Also replace the '#REF!' entry with 'Net' in 'BI Status'**

In [None]:
# Collapse the raw order-status labels into five business categories using one
# exact-match lookup instead of sixteen separate full-column replace() passes.
STATUS_GROUPS = {
    'Completed': ['complete', 'closed', 'received', 'paid', 'cod'],
    'Refund': ['order_refunded', 'refund', 'exchange'],
    # '\\N' is the two-character placeholder (backslash + N) found in the raw data.
    'Pending': ['pending', 'payment_review', 'processing', 'holded', 'pending_paypal', '\\N'],
    'Fraud': ['fraud'],
    'Cancelled': ['canceled'],
}

# Invert the grouping into a flat {raw label -> category} dictionary.
STATUS_RELABEL = {
    raw_label: category
    for category, raw_labels in STATUS_GROUPS.items()
    for raw_label in raw_labels
}

# Values not present in the lookup (including NaN) pass through unchanged;
# missing statuses are handled in a later cell.
df['status'] = df['status'].replace(STATUS_RELABEL)

In [None]:
# Re-check the 'status' label frequencies
current_status = df['status']
current_status.value_counts()

In [None]:
# Fold the '#REF!' label into the 'Net' category
df['BI Status'] = df['BI Status'].replace({'#REF!': 'Net'})

In [None]:
# Frequency of each 'BI Status' label
bi_status = df['BI Status']
bi_status.value_counts()

##### Handling Null values in 'status' column

In [None]:
# Show the rows whose order status is missing
status_is_missing = df['status'].isna()
df[status_is_missing]

##### Observation
- 15 NaN values in 'status' column have 'Gross' in the BI column meaning all these transactions are not valid

##### Actions
- Replacing NaN values with label **'Cancelled'** in line with our understanding of the data

In [None]:
# Label the remaining missing statuses as 'Cancelled'.
# The result is assigned back to the column: calling fillna(inplace=True) on the
# df['status'] selection is chained assignment, which is deprecated in pandas 2.x
# and leaves `df` unchanged once Copy-on-Write is enabled.
df['status'] = df['status'].fillna("Cancelled")

#### Handling NaN values in 'category_name' column

In [None]:
# Frequency of each product-category label
category_labels = df['category_name']
category_labels.value_counts()

##### Observations
- There are 164 NaN values in the **'category_name'** column; they could be filled using information from the **'sku'** column, but this is deferred for now.
- 7850 transactions carry the placeholder label **'\N'** (the "unicode label") instead of a real category.
- 164 transactions have NaN values.

##### Actions
- Replacing the unicode label and NaN values with label 'Unknown'

In [None]:
# Map both the '\N' placeholder and NaN to 'Unknown'.
# The filled Series is assigned back to the column; the previous
# df['category_name'].fillna(..., inplace=True) was chained assignment, which is
# deprecated in pandas 2.x and has no effect on `df` under Copy-on-Write.
df['category_name'] = (
    df['category_name']
    .replace(r'\\N', 'Unknown', regex=True)
    .fillna("Unknown")
)

#### Handling NaN values in 'sku' column

In [None]:
# Show the rows that have no SKU
sku_is_missing = df['sku'].isna()
df[sku_is_missing]

##### Observations
- 20 NaN values for **'sku'** exist in the dataset and these values can be replaced.

##### Action
- Replace NaN values with a new sku code **'Missing'**

In [None]:
# Give rows without a SKU the placeholder code 'Missing'.
# Assign the result back instead of using fillna(inplace=True) on the column
# selection, which is deprecated chained assignment in pandas 2.x and does not
# modify `df` under Copy-on-Write.
df['sku'] = df['sku'].fillna("Missing")

#### Handling missing values in 'sales_commission_code' column

In [None]:
# Frequency of each sales commission code
commission_codes = df['sales_commission_code']
commission_codes.value_counts()

In [None]:
# Show the rows that have no sales commission code
commission_is_missing = df['sales_commission_code'].isna()
df[commission_is_missing]

##### Observations
- The column has a large number of NaN values and there are more than 7000 types of values in this column
- The column does not seem to add any value for further analysis and can be dropped at a later stage
- At this stage, NaN values as well as unicode labels can be replaced with 'Missing'

##### Actions
- Replacing NaN and unicode values with **'Missing'**

In [None]:
# Replace NaN and the '\N' placeholder with 'Missing' (same order as before:
# fill first, then substitute the placeholder).
# The result is assigned back to the column; fillna(inplace=True) on the
# df['sales_commission_code'] selection is deprecated chained assignment in
# pandas 2.x and silently does nothing under Copy-on-Write.
df['sales_commission_code'] = (
    df['sales_commission_code']
    .fillna("Missing")
    .replace(r'\\N', 'Missing', regex=True)
)

#### Handling missing values in 'Customer ID' and 'Customer Since' columns

In [None]:
# Show the rows that have no customer identifier
customer_is_missing = df['Customer ID'].isna()
df[customer_is_missing]

##### Observations
- There are a total of 11 rows where the 'Customer ID' column is NaN, and exactly the same rows are NaN in 'Customer Since', which makes sense and shows that these columns are related.
- All 11 records are from FY18, with the first record from 01-2018.
- To keep these records in the dataset for analysis, a placeholder 'Customer ID' value of '0' can be assigned, with '1-2018' assigned to the same records in the 'Customer Since' column

##### Actions
- Replaced NaN values in 'Customer ID' with **'0'** and in 'Customer Since' with **'1-2018'**

In [None]:
# Give customer-less rows the placeholder ID "0" and the join period "1-2018".
# Both results are assigned back to their columns: fillna(inplace=True) on a
# df[...] selection is deprecated chained assignment in pandas 2.x and does not
# update `df` under Copy-on-Write.
# NOTE(review): "0" is a string; 'Customer ID' is cast to str in a later cell —
# confirm the mixed representation before the cast is acceptable.
df['Customer ID'] = df['Customer ID'].fillna("0")
df['Customer Since'] = df['Customer Since'].fillna("1-2018")

#### Checking for Null values again and setting appropriate datatypes

In [None]:
# Re-count the missing values per column after the imputation steps
df.isna().sum()

#### Convert the datatypes of columns

In [None]:
# Cast the identifier columns to strings and the calendar / quantity columns to
# integers with a single astype() call.
# NOTE(review): if an identifier column was parsed as float, casting to str keeps
# the trailing '.0' — confirm that is acceptable downstream.
target_dtypes = {
    "item_id": "str",
    "Month": "int",
    "Year": "int",
    "qty_ordered": "int",
    "Customer ID": "str",
    "increment_id": "str",
}
df = df.astype(target_dtypes)

In [None]:
# Print the DataFrame summary to review the column dtypes after the casts
df.info()

In [None]:
# Renumber the rows 0..n-1 after the drops above.
# drop=True discards the old index instead of inserting it as an extra 'index'
# column, and makes the cell safe to re-run (without it, a second run adds a
# 'level_0' column and a third run raises).
df = df.reset_index(drop=True)

## Step 2: Exploratory Data Analysis

#### Transactions by Order Status

In [None]:
import plotly.graph_objects as go

# Tally the transactions in each order status, largest group first.
df1 = (
    df.groupby('status')
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)
total_transactions = df1['count'].sum()
df1['Percentage'] = 100 * df1['count'] / total_transactions

# Each bar is annotated with its share of all transactions (two decimals).
share_labels = df1['Percentage'].map(lambda pct: f'{pct:1.2f}%')
status_bar = go.Bar(
    x=df1['status'],
    y=df1['count'],
    text=share_labels,
    textposition='auto',
)

fig = go.Figure(data=[status_bar])
fig.update_layout(
    title="Transactions by Order Status",
    xaxis_title="Order Status",
    yaxis_title="count",
)
fig.show()

##### Observations
- Highest number of transactions **(315K or 54%)** belong to the **Completed** category
- A very high number of transactions **(201K or 34%)** are getting cancelled
- A sizeable number of transactions **(67K or 11.5%)** have some sort of refund associated.
- Almost **46%** of transactions are not getting completed for some reason, meaning a lot can be done to **improve the conversion ratio**

##### Actions
- Analyze any relationship between **'Order Status' and 'Product Category'** so that it can be seen which product transactions are getting completed or cancelled. Also which products have the most refunds associated.
- Analyze the cancellation and refunds as Revenue lost
- Analyze any relationship between **'Order Status' and 'Payment Method'**

#### Transactions by Category Name

In [None]:
# Tally the transactions in each product category, largest group first.
df1 = (
    df.groupby('category_name')
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)
total_transactions = df1['count'].sum()
df1['Percentage'] = 100 * df1['count'] / total_transactions

# Each bar is annotated with its share of all transactions (two decimals).
share_labels = df1['Percentage'].map(lambda pct: f'{pct:1.2f}%')
category_bar = go.Bar(
    x=df1['category_name'],
    y=df1['count'],
    text=share_labels,
    textposition='auto',
)

fig = go.Figure(data=[category_bar])
fig.update_layout(
    title="Transactions by Product Category",
    xaxis_title="Product Category",
    yaxis_title="count",
)
fig.show()

##### Observations
- Highest number of transactions **(115K or 20%)** happened for **Mobile & Tablets** whereas least number of transactions happened for **Books**

#### Combined impact of Category Name and Order Status on Transactions

In [None]:
import plotly.express as px
# Transaction counts per product category, coloured by order status
# (barmode='relative').
px.histogram(
    data_frame=df,
    x='category_name',
    color='status',
    barmode='relative',
    title="Product Category wise Order Status",
)

##### Observations
**Completed Orders**
- **Mens Fashion** has the highest number of transactions followed closely by **Mobile & Tablets**

**Cancelled Orders**
- Common amongst all the Product categories, but more transactions are cancelled than completed for Mobile & Tablets, Others, Entertainment and Unknown Product categories.
- The highest number of cancelled transactions belongs to **Mobile & Tablets**, where cancellations even exceed the number of completed transactions.

**Refunds**
- Highest number of refunds happen for **Mobile & Tablets, Men's Fashion and Women's Fashion**

##### Actions
- Explore the same trends for 'Total Sales' and see if there is any symmetry or transactions do not have the same monetary impact

#### Does Sales Amount follow the same pattern as Transactions??

In [None]:
# Sum of grand_total over every transaction, regardless of order status
overall_sales = df['grand_total'].sum()
print(' Total Sales for all Transactions (inclusive of Discounts): ', overall_sales)

In [None]:
# Total grand_total per order status, largest first, plus each status' share.
df1 = (
    df.groupby('status')['grand_total']
    .sum()
    .reset_index(name='sum')
    .sort_values(by='sum', ascending=False)
)
overall_sum = df1['sum'].sum()
df1['Percentage'] = 100 * df1['sum'] / overall_sum

# Red bars (fill and 1px outline), annotated with the percentage share.
red = 'rgba(255, 0, 0, 1)'
share_labels = df1['Percentage'].map(lambda pct: f'{pct:1.2f}%')
sales_bar = go.Bar(
    x=df1['status'],
    y=df1['sum'],
    text=share_labels,
    marker={'color': red, 'line': {'color': red, 'width': 1}},
    textposition='auto',
)

fig = go.Figure(data=[sales_bar])
fig.update_layout(
    title="Total Sales by Order Status",
    xaxis_title="Order Status",
    yaxis_title="Total Sales",
)
fig.show()

##### Observations
- Almost **58% or Rs 2.9 Bn** of the total sales amount is **'cancelled'** which has a much higher percentage than the number of **cancelled transactions**.
- E-commerce store has earned **32.73% or Rs 1.6 Bn** worth of revenue from Sales against a potential revenue of **Rs 4.98 Bn**

##### Actions
- A percentage wise comparison of 'Transactions' and 'Total Sales' needs to be done for better understanding

In [None]:
# Share of total sales value per order status
df1 = df.groupby('status')['grand_total'].sum().reset_index(name='sum')
df1['Percentage'] = 100 * df1['sum'] / df1['sum'].sum()

# Share of transaction count per order status
df2 = df.groupby('status').size().reset_index(name='count')
df2['Percentage'] = 100 * df2['count'] / df2['count'].sum()

# One bar trace per metric. Each trace takes its x values from its own frame, so
# the pairing of status and percentage does not rely on df1 and df2 sharing a
# row order.
# The list is named `traces` (not `data`) so that the raw DataFrame `data`
# loaded at the top of the notebook is not overwritten.
traces = [
    go.Bar(
        x=df2['status'],
        y=df2['Percentage'],
        name='Transactions',
    ),
    go.Bar(
        x=df1['status'],
        y=df1['Percentage'],
        name='Total Sales',
        marker=dict(color='rgba(255, 0, 0, 1)'),
    ),
]

fig = go.Figure(data=traces)

fig.update_layout(
    title="Transactions and Total Sales by Order Status",
    xaxis_title="Order Status",
    yaxis_title="% of Total",
)

fig.show()

##### Observation
- **'Cancelled'** transactions have much higher monetary value than **'Completed'** transactions.
- Almost **9% of revenue** is lost through **'Refund'**. **'Fraud' and 'Pending'** have minor contribution

##### Actions
- Explore the Total Sales generated only by the 'Completed' order status
- Explore the Total Sales generated by order status other than 'Completed'

In [None]:
# Restrict to completed transactions and keep only the two columns needed.
is_completed = df['status'].eq('Completed')
temp = df.loc[is_completed, ['category_name', 'grand_total']]

# Total grand_total per product category, largest first, plus each category's share.
df1 = (
    temp.groupby('category_name')['grand_total']
    .sum()
    .reset_index(name='sum')
    .sort_values(by='sum', ascending=False)
)
completed_sum = df1['sum'].sum()
df1['Percentage'] = 100 * df1['sum'] / completed_sum

# Red bars annotated with the percentage share (two decimals).
share_labels = df1['Percentage'].map(lambda pct: f'{pct:1.2f}%')
category_sales_bar = go.Bar(
    x=df1['category_name'],
    y=df1['sum'],
    text=share_labels,
    marker={'color': 'rgba(255, 0, 0, 1)'},
    textposition='auto',
)

fig = go.Figure(data=[category_sales_bar])
fig.update_layout(
    title="Total Sales of Completed Transactions by Product Category",
    xaxis_title="Product Category",
    yaxis_title="Total Sales",
)
fig.show()

#### Best Selling Product Category

##### Observations
- **Mobile & Tablets** is the Best Selling category as it has the highest contribution to sales
- **Mens Fashion**, despite having the most completed transactions, is the **5th highest contributor**
- **Appliances** and **Entertainment** have 2nd and 3rd highest contribution to revenue, although the number of completed transactions for these products is low.
- The **Top 5** product categories contribute almost **78% of the overall revenue**

##### Actions
- Dive deep in the best selling category to check the associated 'sku' column and see if top products can be pinpointed

#### To be continued