In [None]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Plotly and Cufflinks setup

#!pip install plotly
#!pip install cufflinks
#!pip install chart_studio

#import chart_studio.plotly as py
import plotly.graph_objs as go

from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
print(__version__) #requires version >= 1.9.0

import cufflinks as cf

#for Notebooks
init_notebook_mode(connected = True)

#for offline use
cf.go_offline()

# Loading Data

In [None]:
path = "../input/pakistans-largest-ecommerce-dataset/Pakistan Largest Ecommerce Dataset.csv"

In [None]:
# NOTE(review): this silences every warning for the rest of the session, which
# also hides deprecation warnings raised by the plotting and pandas calls below;
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [None]:
# Load the CSV at `path`. "Customer Since" and "M-Y" are parsed as dates, and the
# strings "NaN", "NaT" and " -   " are read as missing values.
df = pd.read_csv(path,  parse_dates = ["Customer Since", "M-Y"], low_memory = False, na_values = ["NaN", 'NaT', ' -   '])

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
df.head(5)

In [None]:
len(df)

In [None]:
df.info()

# Data Pre-Processing

### Dropping Unnecessary Columns

In [None]:
df.columns

#### Dropping Columns = ['Unnamed: 21', 'Unnamed: 22', 'Unnamed: 23', 'Unnamed: 24', 'Unnamed: 25']

In [None]:
# Pick the placeholder columns by name instead of by position: a positional
# slice of the last five columns would select five real columns if this cell
# were run again after the drop.
drop_columns_0 = df.columns[df.columns.astype(str).str.startswith('Unnamed')]
drop_columns_0

In [None]:
# Remove the placeholder columns selected above from the working frame.
df.drop(columns = drop_columns_0, inplace = True)

#### Dropping Columns "item_id", "created_at", "increment_id", "sales_commission_code", "Working Date", "BI Status", "Year", "Month", "FY"

In [None]:
drop_columns_1 = ["item_id", "created_at", "increment_id", "sales_commission_code", "Working Date" , "BI Status" , "Year", "Month", "FY"]

In [None]:
# Remove the columns listed in drop_columns_1 from the working frame.
df.drop(columns = drop_columns_1, inplace = True)

### Rearranging Columns

In [None]:
# Keep only the analysis columns, in reading order.
column_order = ['Customer ID', 'sku', 'category_name_1', 'status', 'qty_ordered',  'price', 'grand_total',
                'discount_amount', ' MV ', 'payment_method', 'Customer Since', 'M-Y']

# .copy() makes df an independent frame, so the in-place edits and column
# assignments in later cells do not act on a slice of the loaded frame.
df = df[column_order].copy()

#### Check For Missing Data

In [None]:
# Missing-value count per column, shown as a single wide row.
missing = df.isnull().sum().to_frame()
missing.T

In [None]:
# Total number of missing cells across the whole frame.
missing_per_column = df.isnull().sum()
total_missing_values = missing_per_column.sum()
total_missing_values

### Dropping Rows with Missing Values

In [None]:
# Discard every row that has at least one missing value.
df.dropna(axis = 'index', how = 'any', inplace = True)

In [None]:
#Checking again for missing values
missing = pd.DataFrame(df.isnull().sum())
missing.transpose()

In [None]:
total_missing_values = df.isnull().sum().sum()
total_missing_values

In [None]:
df.head(5)

In [None]:
df[' MV '].isnull().sum()#.sum()

### Extract Year and Month from 'M-Y' Column

In [None]:
# Split the parsed 'M-Y' dates into integer Year and Month columns using the
# vectorised .dt accessor instead of a per-row Python lambda.

# Extract Year
df['Year'] = df['M-Y'].dt.year.astype(int)

# Extract Month
df['Month'] = df['M-Y'].dt.month.astype(int)

# 'M-Y' is now fully represented by the two columns above.
df.drop('M-Y', axis = 1, inplace = True)

### Looking for 0 (zeros) in integer or float value columns

In [None]:
df[df['qty_ordered'] == 0]['qty_ordered'].value_counts()

In [None]:
df[df['price'] == 0]['price'].value_counts()

In [None]:
# Count rows whose ' MV ' value equals the string '0'.
# NOTE(review): the neighbouring zero checks compare against the number 0, this one
# against text — confirm the dtype of ' MV ' (and whether values carry padding).
df[df[' MV '] == '0'][' MV '].value_counts()

In [None]:
df[df['grand_total'] == 0]['grand_total'].value_counts()

In [None]:
# Found 9465 'grand_total' cells equal to 0.0.
# Replace each of them with the ' MV ' value from the same row.

In [None]:
def replace_zeros(x, y):
    """Return the fallback ``y`` when ``x`` equals zero, otherwise ``x`` itself."""
    return y if x == 0 else x

In [None]:
df.columns

In [None]:
# Fill zero grand_total cells from ' MV ' in one vectorised pass instead of a
# per-row apply.
# ' MV ' is coerced to numbers first so grand_total stays numeric. Assumption to
# confirm: ' MV ' may be stored as text (it is compared against the string '0'
# in an earlier cell); values that cannot be parsed become NaN.
mv_numeric = pd.to_numeric(
    df[' MV '].astype(str).str.replace(',', '', regex = False).str.strip(),
    errors = 'coerce',
)
df['grand_total'] = df['grand_total'].mask(df['grand_total'] == 0, mv_numeric)

In [None]:
#Checking again
df[df['grand_total'] == 0]['grand_total'].value_counts()

### DataFrame Without 0(Zeros) and Null Values

In [None]:
df.head(5)

In [None]:
len(df)

# Task 2: Visualize payment method and order status frequency

## Overall Payment Methods Count from 2016-2018

In [None]:
df['payment_method'].nunique()

In [None]:
pd.DataFrame(df['payment_method'].unique()).transpose()

In [None]:
# Order count per payment method, most frequent first.
# value_counts() already returns the counts in descending order. Sorting by the
# column name 'payment_method' is avoided because the name of the counts column
# differs between pandas versions.
df_payment_method = df['payment_method'].value_counts(ascending = False).to_frame()
df_payment_method.transpose()

In [None]:
# Bar chart of the overall payment-method counts, drawn with the 'bmh' style.
plt.style.use('bmh')

payment_plot_options = dict(
    kind = 'bar',
    title = 'Overall Payment Methods Counts from 2016-2018',
    xlabel = 'Payment Methods',
    ylabel = 'Count',
    figsize = (10, 5),
)
df_payment_method.plot(**payment_plot_options)
plt.show()


## Yearly Payment Methods

### (A) Yearly Payment Methods

In [None]:
# Preparing Data: one row per (payment method, year) pair with its order count
df_payMethod_year = pd.DataFrame(df[['payment_method', 'Year']].value_counts())
df_payMethod_year = df_payMethod_year.reset_index()
df_payMethod_year.columns = ['Payment Methods', 'Year', 'Count']
# Only the 20 largest (method, year) pairs are plotted
data = df_payMethod_year.sort_values(by = 'Count', ascending = False).head(20)


# Plot
fig, ax = plt.subplots(figsize = (16, 6))

# x and y are passed by keyword (recent seaborn releases reject them
# positionally) and the target axes is passed explicitly.
sns.barplot(x = 'Payment Methods', y = 'Count', hue = 'Year', data = data, ax = ax)

# add the annotation
# Only the second-to-last bar container is labelled (presumably one hue level,
# i.e. one year — confirm this is the intended series).
ax.bar_label(ax.containers[-2], fmt = '\n%.0f', label_type = 'edge')

# add Labels
ax.set(xlabel = 'Payment Methods')
ax.set(ylabel = 'Count')
ax.set(title = 'Yearly Payment Methods Count from 2016-2018')
ax.set_xticklabels(ax.get_xticklabels(), rotation = 0, horizontalalignment = 'center')
ax.legend(loc = 1)
plt.tight_layout()
plt.show()

### (B) Yearly Payment Methods

In [None]:
# Prepare Data
years = sorted(df_payMethod_year['Year'].unique())
mycolors = ['tab:red', 'tab:blue', 'tab:green']

# Plot: one panel per year. squeeze=False keeps the axes array 2-D, so the
# loop also works when the data holds a single year.
fig, axes = plt.subplots(1, len(years), sharey = False, figsize = (16, 4), squeeze = False)

for i, (panel, year) in enumerate(zip(axes[0], years)):

    # df_payMethod_year already has one row per (method, year), so filtering
    # and sorting is enough; no further counting is needed.
    data = (
        df_payMethod_year[df_payMethod_year['Year'] == year]
        .sort_values(by = 'Count', ascending = False)
    )

    # Cycle through the palette so more years than colours cannot raise IndexError
    panel.bar(data['Payment Methods'], data['Count'], color = mycolors[i % len(mycolors)])

    # labels (the x axis shows payment methods, not order statuses)
    panel.set(xlabel = 'Payment Methods')
    panel.set(ylabel = 'Count')
    panel.set(title = 'Year ' + str(year))
    panel.tick_params(labelrotation = 90, axis = 'x')

plt.tight_layout()
plt.show()

## Overall Order Status Counts from 2016-2018

In [None]:
order_status = pd.DataFrame(df['status'].unique())
order_status.transpose()

In [None]:
df['status'].nunique()

In [None]:
# Order count per status as a one-column frame.
status_counts = df['status'].value_counts()
df_order_status = status_counts.to_frame()

In [None]:
df_order_status.transpose()

In [None]:
plt.style.use('ggplot')
# df_order_status comes from value_counts(), which already orders statuses from
# most to least frequent, so the bars need no extra sorting. The former
# sort_columns argument expected a bool (it was handed a DataFrame) and is no
# longer accepted by current pandas plotting.
df_order_status.plot.bar(title = 'Overall Order Status Counts from 2016-2018',
                         xlabel = 'Status',
                         ylabel = 'Count',
                         figsize = (10, 4))
plt.show()

## Yearly Orders Status

### (A) Yearly Order Status

In [None]:
# One row per (status, year) pair with the number of orders in that pair.
status_year_counts = df[['status', 'Year']].value_counts()
df_status_year = status_year_counts.reset_index()
df_status_year.columns = ['Status', 'Year', 'Count']

In [None]:
# Grouped bar chart of the first 15 (status, year) rows of df_status_year.
fig, ax = plt.subplots(figsize = (15, 5))

top_status_rows = df_status_year.head(15)
sns.barplot(data = top_status_rows, x = 'Status', y = 'Count', hue = 'Year', ax = ax)

# add the annotation (second-to-last bar container only)
labelled_bars = ax.containers[-2]
ax.bar_label(labelled_bars, fmt = '\n%.0f', label_type = 'edge')

# add Labels
ax.set(xlabel = 'Order Status', ylabel = 'Count', title = 'Yearly Order Status')
ax.set_xticklabels(ax.get_xticklabels(), rotation = 30, horizontalalignment = 'center')
ax.legend(loc = 1)
plt.tight_layout()
plt.show()

### (B) Yearly Order Status

In [None]:
# prepare data
years = sorted(df_status_year['Year'].unique())
mycolors = ['tab:red', 'tab:blue', 'tab:green', 'tab:orange', 'tab:brown', 'tab:grey', 'tab:pink', 'tab:olive', 'deeppink', 'steelblue', 'firebrick', 'mediumseagreen']

# Plot: one panel per year on a shared y axis. squeeze=False keeps the axes
# array 2-D, so the loop also works when the data holds a single year.
fig, axes = plt.subplots(1, len(years), sharey = True, figsize = (16, 5), squeeze = False)

for i, (panel, year) in enumerate(zip(axes[0], years)):

    # df_status_year already has one row per (status, year); take that year's
    # five most frequent statuses directly.
    data = (
        df_status_year.loc[df_status_year['Year'] == year, ['Status', 'Count']]
        .sort_values(by = 'Count', ascending = False)
        .head(5)
    )

    # Cycle through the palette so more years than colours cannot raise IndexError
    panel.bar(data['Status'], data['Count'], color = mycolors[i % len(mycolors)])

    panel.set(xlabel = 'Order Status')
    panel.set(ylabel = 'Count')
    panel.set(title = 'Year ' + str(year))
    panel.tick_params(labelrotation = 30, axis = 'x')

plt.tight_layout()
plt.show()

In [None]:
# Contingency table: payment methods as rows, order statuses as columns.
payment_method_status = pd.crosstab(index = df['payment_method'], columns = df['status'])
payment_method_status

In [None]:
# Plotly layout for the payment-method / status bar chart: titles, legend and size.
layout = go.Layout(title = "Payment Methods/Status", 
                   xaxis = {'title': 'Payment Methods'}, 
                   yaxis = {'title': 'Count'}, 
                   showlegend = True, 
                   width = 1000, 
                   height = 500,)

# Interactive bar chart of the payment_method_status table, drawn through the
# .iplot accessor using the layout above.
payment_method_status.iplot(kind = 'bar', layout = layout) 

# Available colorscale names: Greens, Greys, YlGnBu, YlOrRd, Bluered, RdBu, Reds, Blues, Picnic, Rainbow, Portland, Jet, Hot, Blackbody, Earth, Electric, Viridis, Cividis.


In [None]:
# import plotly.express as px
# fig = px.bar(payment_method_status)
# fig.update_xaxes(title = 'Payment Method', title_font = dict(size = 18, family = 'Courier', color = 'crimson'))
# fig.update_yaxes(title = 'Count', title_font = dict(size = 18, family = 'Courier', color = 'crimson'))
# fig.
# fig.show()

# Task 3: Correlation between Payment Method and Order Status

In [None]:
# Annotated heat map of order counts per (payment method, status) pair.
plt.figure(figsize = (14, 7))
sns.set_theme(context = 'notebook', style = 'darkgrid', palette = 'bright', font = 'sans-serif', font_scale = 1, color_codes = True, rc = None)

# Heat map appearance collected in one place
heatmap_options = dict(
    cmap = 'coolwarm',
    robust = True,
    annot = True,
    annot_kws = {'size':13},
    fmt = ".0f",
    linecolor = 'white',
    linewidths = 1,
    cbar = True,
    square = False,
    xticklabels = True,
    yticklabels = True,
)
sns.heatmap(payment_method_status, **heatmap_options)

plt.xlabel("Status")
plt.ylabel("Payment Methods")
# Caption placed just above the grid in data coordinates
plt.text(0,-1, "Heat Map", fontsize = 25, color = 'Black', fontstyle = 'italic')
plt.show()

# Task 4:  Find Correlation Between Order Date and Item Category

In [None]:
#df[df['Year'] ==  2016][['category_name_1', 'Month']].sort_values(by = 'Month', ascending = True)

In [None]:
# Category with its order year and month, sorted by year and then month.
category_date_columns = ['category_name_1', 'Year', 'Month']
df_cat_orderDate = pd.DataFrame(df[category_date_columns]).sort_values(by = ['Year', 'Month'])
df_cat_orderDate

In [None]:
# Build a "YYYY-MM" label for every order. Months are zero-padded so the
# crosstab columns come out in calendar order: unpadded labels sort as text,
# which places '2016-10' ahead of '2016-7'.
order_date = (
    df['Year'].astype(str) + "-" + df['Month'].astype(str).str.zfill(2)
).rename('Order_Date')

# Count orders per (category, order date), with 'All' margins. The table is
# built from df rather than by overwriting its own input, so this cell can be
# re-run safely.
df_cat_orderDate = pd.crosstab(df['category_name_1'], order_date, margins = True)
df_cat_orderDate

In [None]:
# Annotated heat map of order counts per (item category, order date) pair.
plt.figure(figsize = (20, 7))
sns.set_theme(context = 'notebook', style = 'darkgrid', palette = 'bright', font = 'sans-serif', font_scale = 1, color_codes = True, rc = None)

# Heat map appearance collected in one place
category_heatmap_options = dict(
    cmap = 'viridis',
    robust = True,
    annot = True,
    annot_kws = {'size':13},
    fmt = ".0f",
    linecolor = 'white',
    linewidths = 1,
    cbar = True,
    square = False,
    xticklabels = True,
    yticklabels = True,
)
sns.heatmap(df_cat_orderDate, **category_heatmap_options)

plt.xlabel("Order Date (Year-Month)")
plt.ylabel("Item Categories")
# Caption placed just above the grid in data coordinates
plt.text(0,-1, "Heat Map", fontsize = 25, color = 'Black', fontstyle = 'italic')
plt.show()