In [None]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Relative location of the raw e-commerce CSV that is loaded into `df` below.
path = "../input/pakistans-largest-ecommerce-dataset/Pakistan Largest Ecommerce Dataset.csv"

In [None]:
# Silences every warning category for the rest of the session.
# NOTE(review): this also hides pandas/seaborn deprecation and chained-assignment
# warnings raised by later cells — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
# Load the raw CSV. The two date-like columns are parsed while reading, and the
# placeholder ' -   ' is treated as missing alongside 'NaN' and 'NaT'.
df = pd.read_csv(
    path,
    parse_dates = ["Customer Since", "M-Y"],
    na_values = ["NaN", 'NaT', ' -   '],
    low_memory = False,
)

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
df.head(5)

In [None]:
df.tail(5)

In [None]:
len(df)

In [None]:
df.info()

### Dropping Unnecessary Columns

In [None]:
df.columns

#### Dropping Columns = ['Unnamed: 21', 'Unnamed: 22', 'Unnamed: 23', 'Unnamed: 24', 'Unnamed: 25']

In [None]:
# Pick the placeholder columns by their 'Unnamed' prefix instead of by position.
# A positional slice (df.columns[-5:]) would select five *real* columns if this
# cell were re-run after the drop in the next cell has already happened.
drop_columns_0 = df.columns[df.columns.str.startswith('Unnamed')]
drop_columns_0

In [None]:
# Remove the columns collected in drop_columns_0 from df (in place).
df.drop(columns = drop_columns_0, inplace = True)

#### Dropping Columns "item_id", "created_at", "increment_id", "sales_commission_code", "Working Date", "BI Status", "Year", "Month", "FY"

In [None]:
drop_columns_1 = ["item_id", "created_at", "increment_id", "sales_commission_code", "Working Date" , "BI Status" , "Year", "Month", "FY"]

In [None]:
# Drop the columns listed in drop_columns_1 (in place). errors='ignore' keeps the
# cell re-runnable: once the columns are gone a second execution is a no-op
# instead of raising KeyError.
df.drop(columns = drop_columns_1, inplace = True, errors = 'ignore')

### Rearranging Columns

In [None]:
# Keep only the columns used in the analysis, in presentation order.
# .copy() makes the result an independent frame, so the in-place edits and
# column assignments in later cells do not operate on a slice of the raw frame.
df = df[['Customer ID', 'sku', 'category_name_1', 'status', 'qty_ordered',  'price', 'grand_total',
       'discount_amount', ' MV ', 'payment_method', 'Customer Since', 'M-Y']].copy()

#### Check For Missing Data

In [None]:
# Missing-value count per column, displayed as one wide row.
missing = df.isnull().sum().to_frame()
missing.T

In [None]:
# Total number of missing cells across the whole frame.
total_missing_values = df.isna().sum().sum()
total_missing_values

### Dropping Rows with Missing Values

In [None]:
# Discard every row that has at least one missing value (row-wise, 'any' are the defaults).
df.dropna(inplace = True)

In [None]:
# Re-check after the dropna: per-column missing counts as one wide row.
missing = df.isnull().sum().to_frame()
missing.T

In [None]:
# Re-check after the dropna: total number of missing cells.
total_missing_values = df.isna().sum().sum()
total_missing_values

In [None]:
df.head(5)

In [None]:
# Number of missing entries left in the ' MV ' column.
df[' MV '].isna().sum()

### Extract Year and Month from 'M-Y' Column

In [None]:
# Derive integer Year / Month columns from 'M-Y' with the vectorised .dt accessor
# instead of calling a Python lambda once per row.
month_year = pd.to_datetime(df['M-Y'])

# Extract Year
df['Year'] = month_year.dt.year.astype('int64')

# Extract Month
df['Month'] = month_year.dt.month.astype('int64')

# 'M-Y' is redundant once split. Because the column is removed in place, this
# cell cannot be executed a second time without reloading df.
df.drop('M-Y', axis = 1, inplace = True)

### Looking for 0 (zeros) in integer or float value columns

In [None]:
# Tally of rows whose ordered quantity is exactly zero.
df.loc[df['qty_ordered'] == 0, 'qty_ordered'].value_counts()

In [None]:
# Tally of rows whose price is exactly zero.
df.loc[df['price'] == 0, 'price'].value_counts()

In [None]:
# Zero check for ' MV '. The previous test compared against the string '0', which
# never matches if the column is numeric and misses text variants such as ' 0 '
# or '0.0'. Coercing to numbers first makes the check independent of the dtype
# the CSV reader chose (thousands separators and padding are stripped —
# NOTE(review): assumes ',' is the only grouping character; confirm against the data).
mv_numeric = pd.to_numeric(
    df[' MV '].astype(str).str.replace(',', '', regex = False).str.strip(),
    errors = 'coerce',
)
df.loc[mv_numeric == 0, ' MV '].value_counts()

In [None]:
# Tally of rows whose grand total is exactly zero.
df.loc[df['grand_total'] == 0, 'grand_total'].value_counts()

In [None]:
# Found 9465 rows where 'grand_total' is 0.0.
# Replace each of those zeros with the value from the same row's ' MV ' column.

In [None]:
def replace_zeros(x, y):
    """Return ``y`` when ``x`` compares equal to zero, otherwise return ``x``.

    Parameters
    ----------
    x : value to keep unless it equals 0.
    y : fallback handed back unchanged when ``x`` equals 0.
    """
    return y if x == 0 else x

In [None]:
df.columns

In [None]:
# Fill zero grand totals from ' MV ', vectorised instead of a row-wise apply.
# ' MV ' is coerced to numbers first so that a text value is never copied into
# the numeric 'grand_total' column (which would silently turn it into a mixed
# object column). Rows whose ' MV ' cannot be parsed keep their original 0.
# NOTE(review): assumes ',' is the only thousands separator in ' MV ' — confirm.
mv_numeric = pd.to_numeric(
    df[' MV '].astype(str).str.replace(',', '', regex = False).str.strip(),
    errors = 'coerce',
)
needs_fill = (df['grand_total'] == 0) & mv_numeric.notna()
df['grand_total'] = df['grand_total'].mask(needs_fill, mv_numeric)

In [None]:
# Re-check: tally of rows whose grand total is still exactly zero.
df.loc[df['grand_total'] == 0, 'grand_total'].value_counts()

## DataFrame Without 0(Zeros) and Null Values

In [None]:
df.head(5)

In [None]:
len(df)

# Returning Customers or Customers Who Bought Items Frequently

##### Count of unique and frequent customers

In [None]:
unique_customers = df['Customer ID'].nunique()
unique_customers

In [None]:
# Number of rows minus number of distinct customer IDs, i.e. how many rows belong
# to an ID that has already appeared in an earlier row.
# NOTE(review): this is a count of repeat *rows*, not of distinct returning
# customers; (df['Customer ID'].value_counts() > 1).sum() would give the latter.
frequent_customers = len(df['Customer ID']) - unique_customers 
frequent_customers

##### IDs of Unique and Frequent Customers

In [None]:
# Distinct customer IDs as a plain Python list; only its length is printed.
unique_customer_IDs = pd.unique(df['Customer ID']).tolist()
print(len(unique_customer_IDs))

In [None]:
# IDs of customers that occur in more than one row.
# duplicated(keep = 'first') flags every occurrence of an ID *after* its first
# one, so an ID with n rows would be listed n-1 times; drop_duplicates() reduces
# the selection to a single entry per returning customer.
repeat_rows = df["Customer ID"].duplicated(keep = 'first')
frequent_customers_IDs = df.loc[repeat_rows, "Customer ID"].drop_duplicates()
print(len(frequent_customers_IDs))

In [None]:
# Split the customer base into two disjoint groups so the slices add up to all
# customers: IDs that occur in more than one row vs. IDs that occur exactly once.
# (Previously a row count was plotted against a customer count.)
# NOTE(review): a row may be a line item rather than a whole order — confirm what
# "frequent" should mean for this dataset.
rows_per_customer = df['Customer ID'].value_counts()
repeat_customer_count = int((rows_per_customer > 1).sum())
one_time_customer_count = int((rows_per_customer == 1).sum())

fig, ax = plt.subplots(figsize = (8, 4))

values = [repeat_customer_count, one_time_customer_count]
labels = ['Frequent Customers', 'One-time Customers']

ax.pie(values,
       labels = labels,
       labeldistance = 1.15, # distance of the labels from the centre of the pie
       explode = (0, 0.1), # pull out the second wedge only
       wedgeprops = { 'linewidth' : 1, 'edgecolor' : 'black'},
       autopct = '%1.2f%%',
       shadow = True,
       startangle = 90)

ax.legend(labels, loc = "right")
ax.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle
fig.tight_layout()
plt.show()

# Product Categories

In [None]:
no_of_product_categories = df['category_name_1'].nunique()
no_of_product_categories

### Plot Product Categories with Av. Price Per Category

In [None]:
fig, ax = plt.subplots(figsize = (16, 8))

# One bar per product category; bar height is seaborn's default estimate of 'price'.
sns.barplot(data = df, x = 'category_name_1', y = 'price', capsize = 0.001, ax = ax)
ax.set_title('Product Categories Vs Av. Price')

# Write the bar height above each bar of the last container.
ax.bar_label(ax.containers[-1], label_type = 'edge', fmt = 'Mean:\n%.2f')

# Axis titles, plus slanted tick labels so long category names stay readable.
ax.set(xlabel = 'Product Categories', ylabel = 'Price')
ax.set_xticklabels(ax.get_xticklabels(), horizontalalignment = 'right', rotation = 30)

plt.show()

# Highest Price of Product, Name and Category

In [None]:
df['price'].max()

In [None]:
# Rows carrying the maximum price. The maximum is computed from the data instead
# of being pasted in as a float literal, so the cell stays correct if the data
# or the cleaning steps change.
df.loc[df['price'] == df['price'].max(), ['category_name_1', 'sku', 'price']]

# Question 1: Best Selling Category

## 1 (a) Best Seller Category Yearly

#### Total No. of Orders Per Year

In [None]:
df['Year'].value_counts()

In [None]:
# Row counts per (category, year): categories as rows, years as columns.
cat_year_wise = pd.crosstab(index = df['category_name_1'], columns = df['Year'])
cat_year_wise

#### Yearly Best Seller Category

In [None]:
# For every year column, the category (row label) with the highest count.
print("Best Selling Categories Year wise:\n", cat_year_wise.idxmax(axis = 0))

In [None]:
# value_counts() is sorted by descending frequency, so its first label is the
# most frequent category over the whole frame.
print("Overall Best Selling Category from 2016 to 2018 is:", df['category_name_1'].value_counts().index[0])

In [None]:
fig, ax = plt.subplots(figsize=(16, 6))

# Row count per category, split by year. The target axes is passed explicitly so
# the plot is guaranteed to land on the `ax` that is labelled below rather than
# on whatever pyplot currently considers the active axes.
sns.countplot(x = 'category_name_1', data = df, hue = 'Year', ax = ax).set(title = 'Product Categories Vs Count')

# add Labels
ax.set(xlabel = 'Product Categories')
ax.set(ylabel = 'Count')
ax.set_xticklabels(ax.get_xticklabels(), rotation = 30, horizontalalignment = 'right')

plt.show()

## 1 (b) Highest Order Cancellation per Category

In [None]:
#Count of orders per status
df['status'].value_counts()

In [None]:
#Total No. of Status
df['status'].nunique()

In [None]:
#status Names
df['status'].unique()

In [None]:
# Row counts per (category, status): categories as rows, order statuses as columns.
cat_status_wise = pd.crosstab(index = df['category_name_1'], columns = df['status'])

In [None]:
# Expose the category labels (the crosstab index) as a regular 'Categories'
# column so they can be referenced by name when plotting.
# The former bare `cat_status_wise.reset_index(drop = True)` call was removed:
# its result was discarded, so it never changed the frame. Keeping the index
# also keeps this cell safe to re-run.
cat_status_wise['Categories'] = cat_status_wise.index
cat_status_wise

In [None]:
# Cancelled-order counts per category, bars sorted from most to fewest.
fig, ax = plt.subplots(figsize = (15, 6))

canceled_order = cat_status_wise.sort_values('canceled', ascending = False).Categories
sns.barplot(data = cat_status_wise, x = 'Categories', y = 'canceled',
            order = canceled_order, capsize = 0.001, ax = ax)
ax.set_title('Order Cancellations per Category')

# Write the count above each bar of the last container.
ax.bar_label(ax.containers[-1], label_type = 'edge', fmt = '\n%.0f')

# Axis titles and slanted tick labels.
ax.set(xlabel = 'Product Categories', ylabel = 'Cancellation')
ax.set_xticklabels(ax.get_xticklabels(), horizontalalignment = 'right', rotation = 30)
plt.tight_layout()
plt.show()

## 1 (c) Highest Order Refunded per Category

In [None]:
#Highest Order Refunded
fig, ax = plt.subplots(figsize=(15, 6))

# Refunded-order counts per category, bars sorted from most to fewest.
sns.barplot(x = 'Categories', y = 'order_refunded', data = cat_status_wise, capsize = 0.001, ax = ax, 
            order = cat_status_wise.sort_values('order_refunded', ascending = False).Categories).set(title = 'Orders Refunded per Category')

# add the annotation
ax.bar_label(ax.containers[-1], fmt = '\n%.0f', label_type = 'edge')

# add Labels (the y axis shows refunds; it was previously labelled 'Cancellation',
# carried over from the cancellation plot)
ax.set(xlabel = 'Product Categories')
ax.set(ylabel = 'Orders Refunded')
ax.set_xticklabels(ax.get_xticklabels(), rotation = 30, horizontalalignment = 'right')
plt.tight_layout()
plt.show()

## 1 (d) Most Orders for Product and Top 10 Most Orders for Products

In [None]:
#Most Ordered Product
df['sku'].value_counts().idxmax()

In [None]:
#Top 10 Most Sold Products from 2016-2018
# Row count per sku, most frequent first.
most_sold = pd.DataFrame(df['sku'].value_counts())

# Take the first ten rows as an independent copy so that adding a column below
# does not write into a slice of `most_sold`.
top_10_most_sold = most_sold.iloc[:10].copy()
top_10_most_sold['Top 10 Most Ordered Products'] = top_10_most_sold.index
# (A bare `reset_index(drop = True)` used to sit here; its result was discarded,
# so the sku labels remain the index exactly as before.)
top_10_most_sold.columns = ['Count', 'Top 10 Most Ordered Products'] #Renaming Columns
top_10_most_sold

In [None]:
fig, ax = plt.subplots(figsize=(10, 5))

# Refer to the columns by name (rather than passing Series *and* `data`) and draw
# on the axes created above so the labels below apply to this plot.
sns.barplot(x = 'Top 10 Most Ordered Products', y = 'Count', data = top_10_most_sold, ax = ax)

# add Labels
ax.set(xlabel = "Top 10 Most Ordered Products")
ax.set(ylabel = 'Count')
ax.set_xticklabels(ax.get_xticklabels(), rotation = 30, horizontalalignment = 'right')
plt.tight_layout()
plt.show()


In [None]:
# Row counts per (sku, year), displayed with the largest 2016 counts first.
sku_by_year = pd.crosstab(index = df['sku'], columns = df['Year'])
sku_by_year.sort_values(2016, ascending = False)

In [None]:
#Most Ordered Product in 2016
sku_by_year[2016].idxmax()

In [None]:
#Most Ordered Product in 2017
sku_by_year[2017].idxmax()

In [None]:
#Most Ordered Product in 2018
sku_by_year[2018].idxmax()

In [None]:
# For every year, the sku with the highest row count.
# NOTE(review): this rebinds `sku_by_year` from the crosstab DataFrame to a
# Series, so the per-year cells above fail if they are re-run after this one.
sku_by_year = pd.crosstab(index = df['sku'], columns = df['Year']).idxmax(axis = 0)
sku_by_year

In [None]:
#Highest Orders Each Year
# Count non-null 'sku' entries per year. Selecting the column before aggregating
# avoids counting every other column only to throw the results away.
by_year = df.groupby("Year")['sku'].count()
by_year

In [None]:
#print(plt.style.available)

In [None]:
# Switch the global matplotlib style, then draw the yearly counts as bars.
plt.style.use('ggplot')
by_year.plot.bar(title = 'No. of Orders per Year')
plt.show()

## 1 (e) Most Qty of Products Ordered

In [None]:
# Three-column view (product, year, quantity) used for the per-year quantity totals.
# The same selection is built again further down as `df_sku_year`.
sku = pd.DataFrame(df[['sku', 'Year', 'qty_ordered']])
sku

In [None]:
#Highest qty_ordered Each Year
by_sku = sku.groupby('Year').sum()
by_sku

In [None]:
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and newer
# releases no longer accept the old names, so use whichever one this
# installation provides.
pastel_style = 'seaborn-v0_8-pastel' if 'seaborn-v0_8-pastel' in plt.style.available else 'seaborn-pastel'
plt.style.use(pastel_style)
by_sku.plot.bar()
plt.title('Qty/items Ordered per Year')
plt.show()

In [None]:
# Independent copy of the three columns needed for the per-product analysis; the
# 'Year' column of this frame is overwritten in a later cell, which must not
# touch (or warn about) the parent frame.
df_sku_year = df[['sku', 'Year', 'qty_ordered']].copy()
df_sku_year

In [None]:
df_sku_year.info()

In [None]:
# Store the year as text so it behaves as a categorical label (used as `hue` in the plot).
df_sku_year['Year'] = df_sku_year['Year'].astype(str)

In [None]:
# Inspect the Python type of the first 'Year' value. Positional access is used
# because rows were removed by dropna earlier without resetting the index, so
# the label 0 is not guaranteed to exist.
type(df_sku_year['Year'].iloc[0])

In [None]:
df_sku_year.info()

In [None]:
# Total quantity per (sku, year), largest first; keep the top 16 combinations
# and turn the group keys back into ordinary columns.
sku_year = (
    df_sku_year
    .groupby(['sku', 'Year'])
    .sum()
    .sort_values(by = ['qty_ordered', 'Year'], ascending = False)
    .head(16)
    .reset_index()
)
sku_year

In [None]:
fig, ax = plt.subplots(figsize = (10, 8))

# Quantity per product, one bar per year. The axes is passed explicitly so the
# labels set below are applied to the axes the bars were drawn on.
sns.barplot(x = 'sku', y = 'qty_ordered', hue = 'Year', data = sku_year, ax = ax)

# add labels
ax.set(xlabel = 'Products Qty Ordered')
ax.set(ylabel = 'Count')
ax.set_xticklabels(ax.get_xticklabels(), rotation = 30, horizontalalignment = 'right')
plt.tight_layout()
plt.show()