In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file found.
input_root = '/kaggle/input/'
for folder, _subfolders, files in os.walk(input_root):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the raw CSV once; `data` stays untouched and all cleaning is done on the `df` copy.
csv_path = '/kaggle/input/pakistans-largest-ecommerce-dataset/Pakistan Largest Ecommerce Dataset.csv'
data = pd.read_csv(csv_path)
df = data.copy()

## Step 1: Data Pre-processing

In [None]:
# Preview the first rows of the working copy.
df.head()

In [None]:
# Preview the last rows of the working copy.
df.tail()

In [None]:
# List the column labels exactly as read from the CSV.
df.columns

In [None]:
# Show the dtype pandas inferred for each column.
df.dtypes

In [None]:
# Count missing (NaN) values per column.
df.isnull().sum()

In [None]:
# Visualise the missing-value mask of the whole frame: one heatmap column per DataFrame column.
missing_mask = df.isnull()
sns.heatmap(missing_mask, yticklabels=False, cbar=False, cmap='viridis')

##### Observations
- Out of 26 columns, the last 5 columns in the dataset contain NaN values for all records
- The last 464051 rows (at the bottom of the dataset) contain NaN values in all columns
- ' MV ' is an ambiguous column name with extra spaces
- Some of the columns have incorrect data types

##### Actions
- Last 5 columns need to be dropped from the dataset
- 464051 rows, containing NaN values need to be dropped from the dataset
- Renamed the columns ' MV ' and 'category_name_1' to 'MV' and 'category_name'

In [None]:
# Drop the trailing "Unnamed: 21".."Unnamed: 25" columns, discard rows that have
# no item_id, and normalise two column names.
unnamed_cols = [f"Unnamed: {i}" for i in range(21, 26)]
df = (
    df.drop(columns=unnamed_cols)
      .dropna(subset=["item_id"])
      .rename(columns={" MV ": "MV", "category_name_1": "category_name"})
)

##### Dropping duplicate entries, if any, from the dataset

In [None]:
# Remove rows that are exact duplicates across all columns (the first occurrence is kept).
df=df.drop_duplicates()

##### Basic data quality and integrity checks

In [None]:
# Basic sanity checks on quantity and price.
# The comparisons are vectorised rather than looping over the Series in Python:
# same counts, but evaluated in one pass by pandas.
non_positive_qty = (df['qty_ordered'] <= 0).sum()
negative_price = (df['price'] < 0).sum()
print("The number of rows with negative or zero Quantity:", non_positive_qty)
print("The number of rows with negative Price:", negative_price)

##### Convert all values in 'sku' column to upper case for uniformity

In [None]:
# Upper-case every SKU so that codes differing only by letter case compare equal.
df['sku']=df['sku'].str.upper()

#### Exploring all columns, finding and Imputing Null Values
#### Categorical Variables

In [None]:
# Frequency of each raw label in the 'status' column (NaN is not counted by default).
df['status'].value_counts()

##### Observations
- There are a lot of labels for 'status' column.
- Need to check if any relationship exists between 'status' and 'BI Status' columns

In [None]:
# Count the 'status' labels within each 'BI Status' group to see how the two columns relate.
df.groupby('BI Status')['status'].value_counts()

##### Observations
- All transactions marked as either **'complete' or 'closed'**, fall in the **'Net' category** for 'BI Status'
- All transactions marked as **'received','paid','cod','exchange' or something related to refund** are marked in **'Valid' category**
- All transactions marked as **either 'canceled' or something to do with an incomplete transaction** are marked in **'Gross' category**
- '#REF!' looks like an erroneous label.

##### Actions
**Replace values inside the 'status' column by creating new labels**

- **'complete','closed','received','paid','cod'** will belong to category **'Completed'**
- **'order_refunded','refund', 'exchange'** will belong to category **'Refund'**
- **'pending','payment_review','processing','holded','pending_paypal','\N'** will belong to **'Pending'**
- **'canceled'** will belong to **'Cancelled'**
- **'fraud'** will belong to **'Fraud'**
**Also replace the '#REF!' entry with 'Net' in 'BI Status'**

In [None]:
# Collapse the raw order statuses into five consolidated labels.
# Exact-match relabelling is done in one pass through a lookup table.
status_labels = {
    'complete': 'Completed',
    'closed': 'Completed',
    'received': 'Completed',
    'paid': 'Completed',
    'cod': 'Completed',
    'order_refunded': 'Refund',
    'refund': 'Refund',
    'exchange': 'Refund',
    'pending': 'Pending',
    'payment_review': 'Pending',
    'processing': 'Pending',
    'holded': 'Pending',
    'pending_paypal': 'Pending',
    'fraud': 'Fraud',
    'canceled': 'Cancelled',
}
df['status'] = df['status'].replace(status_labels)
# The literal backslash-N placeholder is matched as a regular expression and becomes 'Pending'.
df['status'] = df['status'].replace(r'\\N', 'Pending', regex=True)

In [None]:
# Re-check the label frequencies in 'status'.
df['status'].value_counts()

In [None]:
# Relabel the '#REF!' entry in 'BI Status' as 'Net'.
df['BI Status'] = df['BI Status'].replace('#REF!', 'Net')

In [None]:
# Frequency of each label in 'BI Status'.
df['BI Status'].value_counts()

##### Handling Null values in 'status' column

In [None]:
# Show the rows whose 'status' is missing.
df[df['status'].isnull()]

##### Observation
- 15 NaN values in 'status' column have 'Gross' in the BI column meaning all these transactions are not valid

##### Actions
- Replacing NaN values with label **'Cancelled'** in line with our understanding of the data

In [None]:
# Fill missing statuses with 'Cancelled'.
# The result is assigned back to the column instead of calling fillna(inplace=True)
# on `df['status']`: the chained in-place form raises a FutureWarning on pandas 2.x
# and leaves `df` unchanged once Copy-on-Write is enabled.
df['status'] = df['status'].fillna("Cancelled")

#### Handling NaN values in 'category_name' column

In [None]:
# Frequency of each label in 'category_name'.
df['category_name'].value_counts()

##### Observations
- There are 164 NaN values in the **'category_name'** column that can be filled using some information from **'sku'** column. Not doing it right now
- 7850 transactions carry the placeholder label '\N' (referred to in this notebook as the unicode label).
- 164 transactions have NaN values.

##### Actions
- Replacing the unicode label and NaN values with label 'Unknown'

In [None]:
# Replace the backslash-N placeholder and any NaN in 'category_name' with 'Unknown'.
# fillna is chained and assigned back rather than called with inplace=True on
# `df['category_name']`, which is a chained in-place update that pandas 2.x warns
# about and that has no effect on `df` under Copy-on-Write.
df['category_name'] = (
    df['category_name']
    .replace(r'\\N', 'Unknown', regex=True)
    .fillna("Unknown")
)

#### Handling NaN values in 'sku' column

In [None]:
# Show the rows whose 'sku' is missing.
df[df['sku'].isnull()]

##### Observations
- 20 NaN values for **'sku'** exist in the dataset and these values can be replaced.

##### Action
- Replace NaN values with a new sku code **'Missing'**

In [None]:
# Fill missing SKUs with the sentinel code 'Missing'.
# Assigned back explicitly: fillna(inplace=True) on `df['sku']` is a chained in-place
# update, deprecated in pandas 2.x and ineffective under Copy-on-Write.
df['sku'] = df['sku'].fillna("Missing")

#### Handling missing values in 'Sales_commission_code' column

In [None]:
# Frequency of each value in 'sales_commission_code'.
df['sales_commission_code'].value_counts()

In [None]:
# Show the rows whose 'sales_commission_code' is missing.
df[df['sales_commission_code'].isnull()]

##### Observations
- The column has a large number of NaN values and there are more than 7000 types of values in this column
- The column does not seem to add any value for further analysis and can be dropped at a later stage
- At this stage, NaN values as well as unicode labels can be replaced with 'Missing'

##### Actions
- Replacing NaN and unicode values with **'Missing'**

In [None]:
# Replace NaN and the backslash-N placeholder in 'sales_commission_code' with 'Missing'.
# Both steps are assigned back to the column; the original fillna(inplace=True) on
# `df['sales_commission_code']` was a chained in-place update that pandas 2.x warns
# about and that does not modify `df` under Copy-on-Write.
df['sales_commission_code'] = (
    df['sales_commission_code']
    .fillna("Missing")
    .replace(r'\\N', 'Missing', regex=True)
)

#### Handling missing values in 'Customer ID' and 'Customer Since' columns

In [None]:
# Show the rows whose 'Customer ID' is missing.
df[df['Customer ID'].isnull()]

##### Observations
- There are a total of 11 rows where the 'Customer ID' column is NaN and exactly the same rows in 'Customer since' are also NaN, which makes sense and shows that these columns have a relationship.
- All 11 records are from FY18, with the first record from 01-2018.
- For keeping the records in dataset for analysis, a fake 'Customer ID' value of '0' can be assigned with '01-2018' assigned to all records in 'Customer Since' column

##### Actions
- Replaced NaN values in 'Customer ID' with **'0'** and NaN values in 'Customer Since' with **'1-2018'** (January 2018, written without a leading zero in the code below)

In [None]:
# Give rows without a customer a placeholder ID of "0" and a 'Customer Since' of "1-2018".
# The filled Series are assigned back to their columns: fillna(inplace=True) on a column
# selection is a chained in-place update, deprecated in pandas 2.x and a no-op on `df`
# under Copy-on-Write.
df['Customer ID'] = df['Customer ID'].fillna("0")
df['Customer Since'] = df['Customer Since'].fillna("1-2018")

#### Checking for Null values again and setting appropriate datatypes

In [None]:
# Count the missing values per column again.
df.isnull().sum()

#### Convert the datatypes of columns

In [None]:
# Cast identifier columns to strings and counter columns to integers in a single pass.
df = df.astype({
    "item_id": "str",
    "Month": "int",
    "Year": "int",
    "qty_ordered": "int",
    "Customer ID": "str",
    "increment_id": "str",
})
df['created_at'] = pd.to_datetime(df['created_at'])

## creating new columns to drill down the time dimension
df['day_of_week'] = df['created_at'].dt.dayofweek.astype(str)  # 0 = Monday
df['date_of_month'] = df['created_at'].dt.day
# Series.dt.week was removed in pandas 2.0; isocalendar().week yields the same ISO week
# number. It comes back as a nullable UInt32, so it is cast to a plain int for the
# groupby/crosstab/plot steps that use it.
# NOTE(review): the int cast assumes 'created_at' contains no NaT at this point — confirm.
df['Week'] = df['created_at'].dt.isocalendar().week.astype(int)

In [None]:
# Summarise the frame: column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Replace the row labels with a fresh 0..n-1 index; because drop=True is not passed,
# the previous labels are kept as a new 'index' column.
# NOTE(review): this cell is not idempotent — re-running it inserts further index
# columns. Confirm whether reset_index(drop=True) was intended.
df = df.reset_index()

## Step 2: Exploratory Data Analysis

### Is there a correlation between Order Date and Item Category?

- In order to explore this relationship, it is important to see the dimension of time with multiple levels of granularity like Month number, Week number, Day of week and Date of month.
- Also the item category labels need to be explored further

#### Exploring the Item Category

In [None]:
# Frequency of each item category.
df['category_name'].value_counts()

##### Observations
- The Item Categories have sufficient labels which are distinctive and cannot be reduced further
- It has already been established from other notebooks that cancelled transactions are more numerous than completed transactions and do not contribute towards revenue.
- Cancelled transactions occur at the same time as Completed transactions and are mostly driven by payment methods
- It is better to perform the rest of the analysis for **'Completed transactions'** and for **FY17 and FY18** as one month's data for FY19 can bias the results

##### Actions
- Explore the number of transactions by different time dimensions

#### Transactions by Month

In [None]:
import plotly.express as px

# Analysis subset: completed transactions outside FY19, restricted to the
# time-dimension columns plus the item category.
is_completed = df['status'] == 'Completed'
not_fy19 = df['FY'] != 'FY19'
analysis_cols = ['Month', 'Week', 'date_of_month', 'day_of_week', 'category_name']
temp = df.loc[is_completed & not_fy19, analysis_cols]

In [None]:
# Count transactions per (Month, category) and express each category's count
# as a percentage of that month's total.
df1 = (
    temp.groupby(['Month', 'category_name'])
        .size()
        .reset_index(name='count')
)
month_totals = df1.groupby('Month')['count'].transform('sum')
df1['Percentage'] = 100 * df1['count'] / month_totals
bar_labels = df1['Percentage'].map('{0:1.2f}%'.format)

fig = px.bar(
    df1,
    x="Month",
    y="count",
    color="category_name",
    text=bar_labels,
    title="Monthly Transactions by Item Category",
)
# Arrow annotations placed at hand-picked (month, count) positions on the chart.
for peak_x, peak_y, peak_label in [
    (11, 85000, "Annual Black Friday Sales"),
    (3, 35000, "23rd March deals"),
    (5, 35000, "Eid-ul-Fitr"),
]:
    fig.add_annotation(x=peak_x, y=peak_y, text=peak_label, showarrow=True, arrowhead=1)
fig.show()

In [None]:
# Month x category transaction counts drawn as a heatmap on a 15x6 inch canvas.
crosstab = pd.crosstab(temp['Month'], temp['category_name'])
heat_fig, heat_ax = plt.subplots(figsize=(15, 6))
sns.heatmap(crosstab, cmap="YlGnBu", ax=heat_ax)

##### Observations
- A substantial increase in transactions seen in **month of November** across all categories, most probably driven by **Annual Black Friday sales**
- Smaller peaks seen in **March and May** which are driven by **23rd March and Eid-ul-Fitr Sales**
- **Appliances, Fashion products for both men and women and Mobiles & Tablets** see a significant increase for the **Black Friday period** while most other categories also see an increase for number of transactions
- **'Others'** category is most active during the **23rd March peak**, which points to some sort of limited time offer or deal. 
- **Entertainment** and **Appliances** also have higher numbers around the March and May peaks.

#### Transactions by Week Number

In [None]:
# Count transactions per (Week, category) and express each category's count
# as a percentage of that week's total.
df1 = (
    temp.groupby(['Week', 'category_name'])
        .size()
        .reset_index(name='count')
)
week_totals = df1.groupby('Week')['count'].transform('sum')
df1['Percentage'] = 100 * df1['count'] / week_totals
bar_labels = df1['Percentage'].map('{0:1.2f}%'.format)

fig = px.bar(
    df1,
    x="Week",
    y="count",
    color="category_name",
    text=bar_labels,
    title="Weekly Transactions by Item Category",
)
# Arrow annotations placed at hand-picked (week, count) positions on the chart.
for peak_x, peak_y, peak_label in [
    (47, 52000, "Annual Black Friday Sales"),
    (20, 13000, "Eid-ul-Fitr"),
]:
    fig.add_annotation(x=peak_x, y=peak_y, text=peak_label, showarrow=True, arrowhead=1)
fig.show()

In [None]:
# Week x category transaction counts drawn as a heatmap on a 15x6 inch canvas.
crosstab = pd.crosstab(temp['Week'], temp['category_name'])
heat_fig, heat_ax = plt.subplots(figsize=(15, 6))
sns.heatmap(crosstab, cmap="YlGnBu", ax=heat_ax)

##### Observations
- Significant peak seen for **Week 46 and 47**, which represent the **Black Friday** for FY17 and FY18 respectively
- Another peak seen in **Week 20**, which represents the Eid-ul-Fitr period.

#### Transactions by Date of Month

In [None]:
# Count transactions per (date of month, category) and express each category's
# count as a percentage of that date's total.
df1 = (
    temp.groupby(['date_of_month', 'category_name'])
        .size()
        .reset_index(name='count')
)
date_totals = df1.groupby('date_of_month')['count'].transform('sum')
df1['Percentage'] = 100 * df1['count'] / date_totals
bar_labels = df1['Percentage'].map('{0:1.2f}%'.format)

fig = px.bar(
    df1,
    x="date_of_month",
    y="count",
    color="category_name",
    text=bar_labels,
    title="Daily Transactions by Item Category",
)
fig.show()

In [None]:
# Date-of-month x category transaction counts drawn as a heatmap on a 15x6 inch canvas.
crosstab = pd.crosstab(temp['date_of_month'], temp['category_name'])
heat_fig, heat_ax = plt.subplots(figsize=(15, 6))
sns.heatmap(crosstab, cmap="YlGnBu", ax=heat_ax)

##### Observations
- Week 3 and Week 4 of the month have considerably more transactions than Week 1 and Week 2. However, this is biased by the **Black Friday** and **23rd March deal dates**, which fall in Week 4 every year. The **Eid-ul-Fitr dates** for **FY17 and FY18 also happened in Week 3 and Week 4**, but there is still a significant pattern seen in the plot above.
- The increase is most significant for **Appliances, Fashion Products, Entertainment, Home & Living and Mobiles & Tablets**, which means consumers purchase these products more in Week 3 and Week 4 than the rest of the month.
- **Books, Kids & Baby, School & Education products** do not follow this pattern

#### Transactions by Day of Week

In [None]:
# Count transactions per (day of week, category) and express each category's
# count as a percentage of that weekday's total. 'day_of_week' holds the weekday
# number as a string, with 0 = Monday.
df1 = temp.groupby(['day_of_week', 'category_name']).size().reset_index(name='count')
df1['Percentage'] = 100 * df1['count'] / df1.groupby('day_of_week')['count'].transform('sum')
# The title previously duplicated the date-of-month chart ("Daily Transactions ..."),
# which made the two figures indistinguishable; it now names the weekday dimension.
fig = px.bar(
    df1,
    x="day_of_week",
    y="count",
    color="category_name",
    text=df1['Percentage'].map('{0:1.2f}%'.format),
    title="Transactions by Day of Week and Item Category",
)
fig.show()

In [None]:
# Day-of-week x category transaction counts drawn as a heatmap on a 15x6 inch canvas.
crosstab = pd.crosstab(temp['day_of_week'], temp['category_name'])
heat_fig, heat_ax = plt.subplots(figsize=(15, 6))
sns.heatmap(crosstab, cmap="YlGnBu", ax=heat_ax)

##### Observations
- **Day 4 (Friday)** has significantly more transactions than other weekdays, which is partly because of the **Black Friday sales** impact in the dataset, but the start of the weekend factor may also have a role. 
- Rest of the days have no significant pattern, other than **Sunday**, where transactions have been **lowest**
- **Men's Fashion** as well as **Women's Fashion** products have **more %age of orders** over the weekend than on week days

### Statistical relationship between Order Date and Item Category

In [None]:
import scipy.stats as stats

# Contingency table of date-of-month (rows) against item category (columns),
# built from the full cleaned frame `df`.
# NOTE(review): the charts above use the filtered `temp` subset, while this test
# uses every row of `df` — confirm that is intended.
day_col, category_col = df['date_of_month'], df['category_name']
df1 = pd.crosstab(day_col, category_col)
# chi2_contingency returns (statistic, p-value, dof, expected frequencies); index 3 is
# the table of expected counts.
val = stats.chi2_contingency(df1)
observed = df1.to_numpy()
expected = val[3]

In [None]:
from scipy.stats import chi2
# Pearson chi-squared statistic: (observed - expected)^2 / expected summed over EVERY
# cell of the contingency table. `chi_square` keeps the per-column subtotals (one
# entry per item category), as before.
chi_square = ((observed - expected) ** 2. / expected).sum(axis=0)
# The previous code added only chi_square[0] + chi_square[1], i.e. the first two
# category columns, while the degrees of freedom below cover the whole table.
# The statistic must include all columns.
chi_square_statistic = chi_square.sum()

# Significance level alpha = 0.05 (i.e. a 95% confidence level)
alpha = 0.05
no_of_rows = df1.shape[0]
no_of_columns = df1.shape[1]
# Degrees of freedom of the test: (rows - 1) * (columns - 1).
ddof = (no_of_rows - 1) * (no_of_columns - 1)

critical_value = chi2.ppf(q=1 - alpha, df=ddof)
# The survival function equals 1 - cdf but stays accurate for very small p-values,
# where 1 - cdf would round to exactly 0.
p_value = chi2.sf(chi_square_statistic, df=ddof)

In [None]:
related_msg = "There is a relationship between Order Date and Item Category"
unrelated_msg = "There is no relationship between Order Date and Item Category"

# Criterion 1: compare the test statistic with the critical value.
print(related_msg if chi_square_statistic >= critical_value else unrelated_msg)

# Criterion 2: compare the p-value with the significance level.
print(related_msg if p_value <= alpha else unrelated_msg)

##### Observations
- Both parameters for chi-squared test result validation indicate that there is a statistical relationship between Order Date and Item Category


### Conclusion

- The chi-squared result shows that there is some statistical relationship between **Order Date and Item Category**, however, any correlation between the two is either weak or non-existent.
- On a yearly level, the **last Friday of November** has increased transactions across all item categories. Similar trends, with a smaller peak, are seen on **Eid-ul-Fitr and 23rd March**.
- On a monthly level, **Week 3 and Week 4 have higher transactions** than **Week 1 and 2** across all item categories especially **Mobile & Tablets, Entertainment, Appliances, Home and Living, Superstore, Beauty and Grooming and Fashion Products for Men and Women**
- On a weekly level, **Friday** has higher transactions than other days with **Sunday having the least** transactions across all categories
- **Men's and Women's Fashion Products** have higher transactions over the **weekends** than weekdays
- E-commerce retailers can use this knowledge to do **marketing campaigns** and **plan product launches** at a weekly and monthly level.