In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file found under the Kaggle input directory,
# then print one path per line.
input_files = [
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input/')
    for name in names
]
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

In [None]:
# Load the raw CSV once into `data`; all cleaning below is done on the
# separate working copy `df`, so `data` stays as loaded.
DATA_PATH = '/kaggle/input/pakistans-largest-ecommerce-dataset/Pakistan Largest Ecommerce Dataset.csv'
data = pd.read_csv(DATA_PATH)
df = data.copy()

## Step 1: Data Pre-processing

In [None]:
# Preview the first rows of the working copy
df.head()

In [None]:
# Preview the last rows of the working copy
df.tail()

In [None]:
# List the column labels exactly as they were read from the CSV
df.columns

In [None]:
# Show the dtype pandas inferred for each column
df.dtypes

In [None]:
# Checking for missing / NaN values: number of null entries in each column
df.isnull().sum()

In [None]:
# Doing a visual inspection of all columns: heatmap of the boolean null mask
# (row tick labels and the colour bar are switched off)
sns.heatmap(df.isnull(),yticklabels=False,cbar=False,cmap='viridis')

##### Observations
- Out of 26 columns, last 5 columns in the dataset contain NaN values for all records
- 464051 records (at the bottom of the dataset) contain NaN values in all columns
- ' MV ' is an ambiguous column name with extra spaces
- Some of the columns have incorrect data types

##### Actions
- Last 5 columns need to be dropped from the dataset
- 464051 rows, containing NaN values need to be dropped from the dataset
- The columns ' MV ' and 'category_name_1' need to be renamed to 'MV' and 'category_name'

In [None]:
# 1. remove the five "Unnamed" columns,
# 2. discard rows that have no item_id,
# 3. give ' MV ' and 'category_name_1' cleaner names.
unnamed_columns = ["Unnamed: 21", "Unnamed: 22", "Unnamed: 23", "Unnamed: 24", "Unnamed: 25"]
df = (
    df.drop(columns=unnamed_columns)
      .dropna(subset=["item_id"], axis=0)
      .rename(columns={" MV ": "MV", "category_name_1": "category_name"})
)

##### Dropping duplicate entries, if any, from the dataset

In [None]:
# Remove rows that are exact duplicates across all columns
# (drop_duplicates defaults: compare every column, keep the first occurrence)
df=df.drop_duplicates()

##### Basic data quality and integrity checks

In [None]:
# Sanity checks on the numeric columns. The comparisons are vectorised, so the
# counts come from a single pass in pandas rather than a Python-level loop
# over every row.
print("The number of rows with negative or zero Quantity:", (df['qty_ordered'] <= 0).sum())
print("The number of rows with negative Price:", (df['price'] < 0).sum())

##### Convert all values in 'sku' column to upper case for uniformity

In [None]:
# Upper-case every SKU so codes that differ only in letter case become identical
df['sku']=df['sku'].str.upper()

#### Exploring all columns, finding and Imputing Null Values
#### Categorical Variables

In [None]:
# Frequency of each raw 'status' label
df['status'].value_counts()

##### Observations
- There are a lot of labels for 'status' column.
- Need to check if any relationship exists between 'status' and 'BI Status' columns

In [None]:
# Within each 'BI Status' group, count how many rows carry each 'status' label
df.groupby('BI Status')['status'].value_counts()

##### Observations
- All transactions marked as either **'complete' or 'closed'**, fall in the **'Net' category** for 'BI Status'
- All transactions marked as **'received','paid','cod','exchanged' or something related to refund** are marked in **'Valid' category**
- All transactions marked as **either 'canceled' or something to do with an incomplete transaction** are marked in the **'Gross' category**
- '#REF!' looks like an erroneous label.

##### Actions
**Replace values inside the 'status' column by creating new labels**

- **'complete','closed','received','paid','cod'** will belong to category **'Completed'**
- **'order_refunded','refund', 'exchange'** will belong to category **'Refund'**
- **'pending','payment_review','processing','holded','pending_paypal','\N'** will belong to **'Pending'**
- **'canceled'** will belong to **'Cancelled'**
- **'fraud'** will belong to **'Fraud'**
**Also replace the '#REF!' entry with 'Net' in 'BI Status'**

In [None]:
# Collapse the raw order-status labels into five business-level groups.
# One dict-based replace() performs a single exact-match pass over the column
# instead of one pass per label; labels not listed here (and NaN) are left
# untouched.
status_groups = {
    'Completed': ['complete', 'closed', 'received', 'paid', 'cod'],
    'Refund': ['order_refunded', 'refund', 'exchange'],
    'Pending': ['pending', 'payment_review', 'processing', 'holded', 'pending_paypal'],
    'Fraud': ['fraud'],
    'Cancelled': ['canceled'],
}
status_map = {raw: group for group, raws in status_groups.items() for raw in raws}
df['status'] = df['status'].replace(status_map)

# The backslash-N marker is matched with a regex (the pattern \\N means a
# literal backslash followed by "N") and is mapped to 'Pending' as well.
df['status'] = df['status'].replace(r'\\N', 'Pending', regex=True)

In [None]:
# Frequency of each 'status' label after the relabelling step
df['status'].value_counts()

In [None]:
# Fold the '#REF!' label of 'BI Status' into 'Net'
df['BI Status'] = df['BI Status'].replace('#REF!', 'Net')

In [None]:
# Frequency of each 'BI Status' label
df['BI Status'].value_counts()

##### Handling Null values in 'status' column

In [None]:
# Show the rows whose 'status' is missing
df[df['status'].isnull()]

##### Observation
- 15 NaN values in 'status' column have 'Gross' in the BI column meaning all these transactions are not valid

##### Actions
- Replacing NaN values with label **'Cancelled'** in line with our understanding of the data

In [None]:
# Label missing statuses as 'Cancelled'.
# The filled column is assigned back rather than using
# df['status'].fillna(..., inplace=True): that form is chained assignment, which
# pandas 2.x warns about and which leaves `df` unchanged under Copy-on-Write.
df['status'] = df['status'].fillna("Cancelled")

#### Handling NaN values in 'category_name' column

In [None]:
# Frequency of each 'category_name' label (value_counts leaves NaN out by default)
df['category_name'].value_counts()

##### Observations
- There are 164 NaN values in the **'category_name'** column that can be filled using some information from **'sku'** column. Not doing it right now
- 7850 transactions have a unicode label associated with them.
- 164 transactions have NaN values.

##### Actions
- Replacing the unicode label and NaN values with label 'Unknown'

In [None]:
# Map both the backslash-N marker and real NaN values to 'Unknown'.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on the df['category_name'] selection is chained assignment and does not
# update `df` under pandas Copy-on-Write.
df['category_name'] = (
    df['category_name']
    .replace(r'\\N', 'Unknown', regex=True)  # regex \\N == literal backslash + "N"
    .fillna("Unknown")
)

#### Handling NaN values in 'sku' column

In [None]:
# Show the rows whose 'sku' is missing
df[df['sku'].isnull()]

##### Observations
- 20 NaN values for **'sku'** exist in the dataset and these values can be replaced.

##### Action
- Replace NaN values with a new sku code **'Missing'**

In [None]:
# Give rows without a SKU the placeholder code 'Missing'.
# Assigned back instead of fillna(..., inplace=True) on the column selection,
# which is chained assignment and is a no-op under pandas Copy-on-Write.
df['sku'] = df['sku'].fillna("Missing")

#### Handling missing values in 'Sales_commission_code' column

In [None]:
# Frequency of each 'sales_commission_code' value
df['sales_commission_code'].value_counts()

In [None]:
# Show the rows whose 'sales_commission_code' is missing
df[df['sales_commission_code'].isnull()]

##### Observations
- The column has a large number of NaN values and there are more than 7000 types of values in this column
- The column does not seem to add any value for further analysis and can be dropped at a later stage
- At this stage, NaN values as well as unicode labels can be replaced with 'Missing'

##### Actions
- Replacing NaN and unicode values with **'Missing'**

In [None]:
# Replace NaN first, then the backslash-N marker, with the placeholder 'Missing'.
# Both steps are chained and assigned back once; fillna(..., inplace=True) on the
# column selection is chained assignment and would not reach `df` under pandas
# Copy-on-Write.
df['sales_commission_code'] = (
    df['sales_commission_code']
    .fillna("Missing")
    .replace(r'\\N', 'Missing', regex=True)  # regex \\N == literal backslash + "N"
)

#### Handling missing values in 'Customer ID' and 'Customer Since' columns

In [None]:
# Show the rows whose 'Customer ID' is missing
df[df['Customer ID'].isnull()]

##### Observations
- There are a total of 11 rows where the 'Customer ID' column is NaN and exactly the same rows in 'Customer since' are also NaN, which makes sense and shows that these columns have a relationship.
- All 11 records are from FY18, with the first record from 01-2018.
- For keeping the records in dataset for analysis, a fake 'Customer ID' value of '0' can be assigned with '01-2018' assigned to all records in 'Customer Since' column

##### Actions
- Replaced 'Customer ID' with value **'0'** and 'Customer Since' with value **'01-2018'** for all NaN values

In [None]:
# Fill the missing customer fields with placeholder values.
# Results are assigned back to the columns: fillna(..., inplace=True) on a
# df[...] selection is chained assignment and does not modify `df` under pandas
# Copy-on-Write.
# NOTE(review): the placeholder ID is the string "0"; if the existing IDs are
# numeric the column becomes mixed string/number until it is cast - confirm
# this is the intended representation.
df['Customer ID'] = df['Customer ID'].fillna("0")
# NOTE(review): the accompanying markdown says '01-2018' but the value filled
# here is "1-2018" - confirm which one matches the format used in the column.
df['Customer Since'] = df['Customer Since'].fillna("1-2018")

#### Checking for Null values again and setting appropriate datatypes

In [None]:
# Count the null entries per column again
df.isnull().sum()

#### Convert the datatypes of columns

In [None]:
# Target dtype for each column that needs an explicit cast:
# identifier-like columns become strings, counters become integers.
column_dtypes = {
    "item_id": "str",
    "Month": "int",
    "Year": "int",
    "qty_ordered": "int",
    "Customer ID": "str",
    "increment_id": "str",
}
for column, dtype in column_dtypes.items():
    df[column] = df[column].astype(dtype)

# Parse the order timestamp so the .dt accessors below are available.
df['created_at'] = pd.to_datetime(df['created_at'])

## creating new columns to drill down the time dimension
order_dates = df['created_at']
df['day_of_week'] = order_dates.dt.weekday  # 0 = monday.
# Integer-dividing by 5 gives 1 only for days 5 and 6 (Saturday/Sunday), so the
# flag is the string 'True' on weekdays and 'False' on weekends.
is_not_weekend = df['day_of_week'] // 5 != 1
df['weekday_flag'] = is_not_weekend.astype(str)
df['date_of_month'] = order_dates.dt.day

In [None]:
# Print the frame summary to check the dtypes after the conversions
df.info()

In [None]:
# Renumber the rows 0..n-1 after the row drops above.
# drop=True discards the old index instead of inserting it as an extra 'index'
# column; it also keeps this cell safe to re-run (without it, each re-run adds
# another index column and eventually raises).
df = df.reset_index(drop=True)

## Step 2: Exploratory Data Analysis

### Is there a correlation between Order Status and Payment Methods

#### From the notebook on relationship between Order Status and Payment Methods, it was concluded that

- E-commerce store users used Cash and voucher based transactions as the preferred method for FY17 and FY18 in terms of revenue generation through Completed transactions, but the cod payments %age saw a downward trend in FY18
- Digital or E-payment methods were mainly responsible for making the revenue earned in FY18 double than it was in FY17. However, due to a large number of cancelled transactions associated with these methods, there is a strong possibility that the web portal faced integration challenges and resulted in many cancelled transactions
- Digital / E-payment have been a driver in revenue growth but at the same time resulted in more cancellations and potential revenue lost.

#### Let's try to reduce the number of labels in both columns, especially labels with few entries, to see if some kind of relationship / correlation can be explored

#### Exploring the 'payment_methods' feature

In [None]:
# Frequency of each raw 'payment_method' label
df['payment_method'].value_counts()

##### Observations

- **'Easypay' and 'Easypay_MA'** can be combined under the label 'Easypay'
- **'cod' and 'cashatdoorstep'** can be combined under 'cod'
- **'marketingexpense', 'financesettlement', 'productcredit', 'internetbanking', 'mygateway', 'mcblite', 'ublcreditcard', 'apg'** can be combined under 'Others' as all of these have very few entries in dataset

##### Actions
- Combine 'Easypay' and 'Easypay_MA'
- Combine 'cod' and 'cashatdoorstep'
- Combine 'marketingexpense', 'financesettlement', 'productcredit', 'internetbanking', 'mygateway', 'mcblite', 'ublcreditcard', 'apg' under 'Others'

In [None]:
# Build one old-label -> new-label mapping and apply it in a single replace().
payment_method_map = {
    'Easypay_MA': 'Easypay',   # merge the two Easypay variants
    'cashatdoorstep': 'cod',   # merge with cash on delivery
}
# Methods with very few entries are pooled under 'Others'.
rare_methods = ['marketingexpense','financesettlement','productcredit', 'internetbanking', 'mygateway', 'mcblite', 'ublcreditcard', 'apg']
payment_method_map.update(dict.fromkeys(rare_methods, 'Others'))
df['payment_method'] = df['payment_method'].replace(payment_method_map)

In [None]:
# Frequency of each 'payment_method' label after grouping
df['payment_method'].value_counts()

#### Exploring the 'status' feature

In [None]:
# Frequency of each 'status' label before the small groups are merged
df['status'].value_counts()

##### Observations

- 'Pending' and 'Fraud' can also be combined under 'Cancelled' as these are very few in number and do not contribute to the revenue.

##### Actions
- Combine 'Pending' and 'Fraud' under 'Cancelled'

In [None]:
# Fold the two small groups into 'Cancelled'; every other label is unchanged.
small_status_groups = {'Pending': 'Cancelled', 'Fraud': 'Cancelled'}
df['status'] = df['status'].replace(small_status_groups)

In [None]:
# Frequency of each 'status' label after merging the small groups
df['status'].value_counts()

#### Now performing Chi-squared test to examine the relationship between Order Status and Payment Method

To understand what is Chi-Squared Test and how it is used for statistical evaluation, check the link

https://www.statisticshowto.com/probability-and-statistics/chi-square/

#### Chi-Squared Test

If Statistic >= Critical Value: significant result, categorical variables are dependent. If Statistic < Critical Value: not significant result, categorical variables are independent.

In [None]:
import scipy.stats as stats
import plotly.express as px
# Contingency table: one row per payment method, one column per order status.
df1 = pd.crosstab(index=df['payment_method'], columns=df['status'])
# Observed cell counts as a plain array.
observed = df1.to_numpy()
# chi2_contingency returns (statistic, p-value, degrees of freedom, expected
# frequencies); element 3 holds the counts expected under independence.
val = stats.chi2_contingency(df1)
expected = val[3]

In [None]:
# Line chart of the contingency table: the crosstab index (payment methods) is
# passed as x and the crosstab column labels (order statuses) as y -
# presumably one trace per status column.
fig = px.line(df1, x=df1.index.values, y=df1.columns.values)
fig.show()

In [None]:
from scipy.stats import chi2

# Pearson chi-squared statistic: (observed - expected)^2 / expected, summed over
# EVERY cell of the contingency table.
# `chi_square` holds the contribution of each status column; the statistic is
# the total over all of them (adding only the first two entries would silently
# ignore any further status columns).
chi_square = ((observed - expected) ** 2 / expected).sum(axis=0)
chi_square_statistic = chi_square.sum()

# Significance level alpha = 0.05, i.e. a 95% confidence level
alpha = 0.05
no_of_rows, no_of_columns = df1.shape
# Degrees of freedom of the test: (rows - 1) * (columns - 1)
ddof = (no_of_rows - 1) * (no_of_columns - 1)

# Reject independence when the statistic reaches the critical value, or
# equivalently when the p-value is at most alpha.
critical_value = chi2.ppf(q=1 - alpha, df=ddof)
# Survival function = 1 - cdf, computed without the precision loss of the
# explicit subtraction for very small p-values.
p_value = chi2.sf(chi_square_statistic, df=ddof)

In [None]:
RELATED_MSG = "There is a relationship between Payment Method and Order Status"
UNRELATED_MSG = "There is no relationship between Payment Method and Order Status"

# Decision by critical value: dependent when the statistic reaches it.
print(RELATED_MSG if chi_square_statistic >= critical_value else UNRELATED_MSG)

# Decision by p-value: dependent when it does not exceed alpha.
print(RELATED_MSG if p_value <= alpha else UNRELATED_MSG)

##### Observations
- Both parameters for chi-squared test result validation indicate that there is a statistical relationship between Payment Method and Order Status
- However, the test alone does not tell us how strong the association between Payment Method and Order Status is. As both are categorical variables, Pearson's correlation coefficient cannot be used to quantify it.
- A Python library dython gives a set of data analysis tools that calculates categorical-categorical relationship between features and can be used in this case to provide an answer to our question. The link to the library and associated documentation can be seen here

https://pypi.org/project/dython/#description  

In [None]:
!pip install dython

In [None]:
from dython.nominal import associations

# Work on a separately named 3-column selection so the full cleaned frame stays
# available in `df` (overwriting `df` here would break re-running any earlier
# cell that needs the other columns).
categorical_df = df[['status','category_name','payment_method']]
# Compute and plot the pairwise association measures between the three columns.
associations(categorical_df)

### Conclusion

- The result in the heatmap above has been plotted using 3 categorical columns from the dataset. The library uses Cramer's V or Cramer's phi as the underlying measure which gives a measure of association between the categorical variables. Details on this statistical measure can be read on the following link
https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V
- The heatmap above gives a value of **0.40** between **Payment Method** and **Order Status** where 0 corresponds to no association between the variables and 1 corresponds to complete association.
- The line plot for the contingency table (crosstab) also supports the heatmap result: a clear trend can be seen for a few labels, but not for the others.