In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Location of the raw Kaggle CSV and the columns that should be parsed as dates.
DATA_PATH = "/kaggle/input/pakistans-largest-ecommerce-dataset/Pakistan Largest Ecommerce Dataset.csv"
DATE_COLUMNS = ["created_at", "Working Date"]

# Load the whole file into a single DataFrame, parsing the date columns on read.
ecommerce = pd.read_csv(DATA_PATH, parse_dates=DATE_COLUMNS)

In [None]:
# (rows, columns) of the frame exactly as it was read from the CSV.
ecommerce.shape

### The data contains 1048574 rows, but most columns contain only 584524 non-null records.

In [None]:
# Per-column dtype and non-null count; used to spot empty rows and columns.
ecommerce.info()

### Notice that about half of the rows are completely empty, and there are also 5 completely null columns, which we will drop. The tricky part is that we can't drop every row that contains an NA, because the actual data set also contains a few NA entries that we need to keep. We will drop only the rows where all entries are null.


In [None]:
# Count the missing values in every column.
ecommerce.isna().sum()

In [None]:
# Drop the five trailing "Unnamed: N" columns, which hold no data.
# The result is reassigned instead of using `inplace=True`, and
# `errors="ignore"` skips columns that are already gone, so re-running this
# cell no longer fails with a KeyError.
# (Positional alternative: ecommerce.iloc[:, :-5].)
EMPTY_COLUMNS = [f"Unnamed: {number}" for number in range(21, 26)]
ecommerce = ecommerce.drop(columns=EMPTY_COLUMNS, errors="ignore")

### Now see completely empty unnamed columns are dropped.

In [None]:
# List the remaining column labels to confirm the "Unnamed: N" columns are gone.
ecommerce.columns

### drop all empty rows.

In [None]:
# Show the last five rows (tail() defaults to n=5).
ecommerce.tail()

## We will drop those NA values where all entries are Null.

In [None]:
# Remove only the rows in which every column is missing; rows that have at
# least one real value are kept even if some of their fields are NA.
ecommerce = ecommerce.dropna(axis=0, how="all")

In [None]:
# Show the last five rows again, now that fully empty rows have been removed.
ecommerce.tail()

### fill some null rows with mode value of a column.

In [None]:
# Missing values that remain per column after the empty rows were dropped.
ecommerce.isna().sum()

In [None]:
# Most frequent order status; this is the value used to fill the gaps.
ecommerce.status.mode()

In [None]:
# Fill missing order statuses with the column's most frequent value.
# The filled column is assigned back to the frame: calling
# `fillna(..., inplace=True)` on `ecommerce["status"]` acts on an intermediate
# object and does not update `ecommerce` under pandas Copy-on-Write.
# The mode is computed from the data rather than hard-coded, so the cell
# stays correct if the data changes.
status_mode = ecommerce["status"].mode().iloc[0]
ecommerce["status"] = ecommerce["status"].fillna(status_mode)

In [None]:
# Most frequent category; this is the value used to fill the gaps.
ecommerce.category_name_1.mode()

In [None]:
# Fill missing categories with the column's most frequent value.
# The filled column is assigned back to the frame: calling
# `fillna(..., inplace=True)` on `ecommerce["category_name_1"]` acts on an
# intermediate object and does not update `ecommerce` under pandas
# Copy-on-Write. The mode is computed from the data rather than hard-coded.
category_mode = ecommerce["category_name_1"].mode().iloc[0]
ecommerce["category_name_1"] = ecommerce["category_name_1"].fillna(category_mode)

### There is extra space in MV column remove spaces from column.

In [None]:
# Normalise two column names: strip the padding from " MV " and give
# `created_at` the more descriptive name `order_date`.
# An exact-name `rename` replaces the substring `str.replace` over the whole
# column index, so no other column label can be altered by accident. Labels
# that are absent (for example on a re-run) are simply ignored.
ecommerce = ecommerce.rename(columns={" MV ": "MV", "created_at": "order_date"})

In [None]:
# Inspect the dtype pandas chose for each column before casting.
ecommerce.dtypes

### As we can see above, a few columns do not have the correct data type. We need to cast them to the appropriate data types.

In [None]:
def _id_to_text(value):
    """Return an identifier as text, leaving missing values missing.

    Integer-valued floats are rendered without the trailing ".0"
    (211131.0 -> "211131"); any other non-missing value goes through ``str``.
    """
    if pd.isna(value):
        return value
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


# Identifiers are labels, not quantities, so they are stored as text.
# A plain `astype(str)` would turn missing IDs into the literal string "nan"
# and, for float-typed columns, produce values such as "211131.0".
# NOTE(review): the ID columns are presumably float64 because the file was
# read while it still contained fully empty rows - confirm with `dtypes`.
for id_column in ("Customer ID", "item_id"):
    ecommerce[id_column] = ecommerce[id_column].map(_id_to_text)

# Quantities and calendar fields are whole numbers.
ecommerce = ecommerce.astype({"qty_ordered": int, "Month": int, "Year": int})

In [None]:
# Re-inspect the dtypes to confirm the casts took effect.
ecommerce.dtypes

## Task 1

# Best Selling category


In [None]:
# Lift the row limit so that long Series/DataFrame outputs are shown in full.
pd.set_option("display.max_rows", None)

In [None]:
# Peek at the first ten category values.
ecommerce["category_name_1"].iloc[:10]

## Mobiles & Tablets are Best Selling category in Ecommerce.
#### These seven categories (Mobiles & Tablets, Men's Fashion, Women's Fashion, Appliances, Superstore, Beauty & Grooming and Soghaat) contributed about 70% of the best-selling items. These categories sell the most compared to the other categories.

In [None]:
# Number of order rows per category, most frequent first.
ecommerce.category_name_1.value_counts()

In [None]:
# Bar chart of order rows per category, most frequent first.
category_counts = ecommerce["category_name_1"].value_counts()
ax = category_counts.plot.bar(figsize=(12, 6), title="Best Selling categories")
ax.set_ylabel("Total Order")
plt.show()

## Task 2 

# Visualize Payment Methods versus Order Status

In [None]:
# Preview the first five rows of the cleaned frame.
ecommerce.head(5)

In [None]:
# Number of order rows per status, most frequent first.
ecommerce.status.value_counts()

In [None]:
# Bar chart of how often each order status occurs.
status_counts = ecommerce["status"].value_counts()
ax = status_counts.plot.bar(figsize=(12, 6), title="Order Status Frequency", color='red')
ax.set_xlabel("status category")
ax.set_ylabel("total counts")
plt.show()

# Order Status per Year

### In 2016, 74610 orders were completed, which is a good sign for the e-commerce business. On the other hand, 39624 orders were canceled and 14034 orders were refunded.
### In 2017, more orders were completed than in any other year: about 123489. In 2017, 94781 orders were also canceled and 34119 orders were received. 3370 orders were refunded in 2017, which is the year with the highest number of refunds.
### In 2018, the fewest orders were completed: about 35601. However, received orders were at their maximum compared to the other years, at 40286. Canceled orders were also at their maximum in this year: about 67004 orders were canceled. 1285 orders were refunded in 2018.


In [None]:
# Distinct years present in the data.
ecommerce.Year.unique()

In [None]:
# Order statuses for each year (names kept for any later cells that use them).
stats_2016 = ecommerce.loc[ecommerce["Year"] == 2016, "status"]
stats_2017 = ecommerce.loc[ecommerce["Year"] == 2017, "status"]
stats_2018 = ecommerce.loc[ecommerce["Year"] == 2018, "status"]

# One panel per year, drawn as a bar chart of status counts.
# `plt.hist` treats the string categories as positions and splits them into
# 10 equal-width bins by default, so bars do not line up one-to-one with the
# statuses and, with more than ten statuses, several share one bar. Counting
# with `value_counts()` gives exactly one bar per status.
status_by_year = [(2016, stats_2016), (2017, stats_2017), (2018, stats_2018)]
fig, axes = plt.subplots(1, len(status_by_year), figsize=(24, 10))
for ax, (year, statuses) in zip(axes, status_by_year):
    statuses.value_counts().plot.bar(ax=ax, rot=90, title=f"Order Status in {year}")
    ax.set_xlabel('Order status')
    ax.set_ylabel('Total no of orders')
plt.show()


In [None]:
# Number of order rows per payment method, most frequent first.
ecommerce.payment_method.value_counts()

### Replace the abbreviation `cod` with its full name, `cash_on_delivery`.

In [None]:
# Spell out the "cod" payment method as "cash_on_delivery".
# `Series.replace` matches whole values only, whereas `.str.replace` rewrites
# every occurrence of the substring "cod" and could therefore corrupt any
# other payment-method name that happens to contain those letters.
ecommerce["payment_method"] = ecommerce["payment_method"].replace({"cod": "cash_on_delivery"})

In [None]:
# Cross-tabulate payment method against order status so that the chart shows
# what its title promises. The previous plot only showed how often each
# payment method occurs, with no status information at all.
status_by_payment = pd.crosstab(ecommerce["payment_method"], ecommerce["status"])

# Order the bars by total orders, the same ordering value_counts() produced.
bar_order = status_by_payment.sum(axis=1).sort_values(ascending=False).index

# Stacked bars: total height is the number of orders for the payment method,
# and each segment is one order status. "tab20" supplies enough distinct
# colours when there are many statuses.
ax = status_by_payment.loc[bar_order].plot.bar(
    stacked=True,
    figsize=(14, 6),
    colormap="tab20",
    title="Payment method vs order status",
)
ax.set_xlabel("payment category")
ax.set_ylabel("total counts")
plt.show()

## Task 3

# Payment Method and Status Co-relation

### Now, Plot corelation between completed orders and Payment Methods

In [None]:
# Completed orders counted per payment method, as a one-column frame.
is_complete = ecommerce["status"] == "complete"
corr_comp = (
    ecommerce.loc[is_complete, "payment_method"]
    .value_counts()
    .to_frame("payment_method")
)
corr_comp

In [None]:
# Bar chart: completed orders per payment method.
ax = corr_comp.plot.bar(
    figsize=(12, 6),
    title="Payment Methods vs. Completed Order",
    color='brown',
)
ax.set_xlabel("Payment Methods")
ax.set_ylabel("Number of Completed Orders")
plt.show()


### Now, Plot corelation between cancelled orders and Payment Methods

In [None]:
# Canceled orders counted per payment method, as a one-column frame.
is_canceled = ecommerce["status"] == "canceled"
corr_can = (
    ecommerce.loc[is_canceled, "payment_method"]
    .value_counts()
    .to_frame("payment_method")
)
corr_can

In [None]:
# Bar chart: canceled orders per payment method.
ax = corr_can.plot.bar(
    figsize=(12, 6),
    title="Payment Methods vs. Canceled Order",
    color='blue',
)
ax.set_xlabel("Payment Methods")
ax.set_ylabel("Number of Canceled Orders")
plt.show()


In [None]:
# Orders with status "order_refunded" counted per payment method.
is_order_refunded = ecommerce["status"] == "order_refunded"
corr_ord_ref = (
    ecommerce.loc[is_order_refunded, "payment_method"]
    .value_counts()
    .to_frame("payment_method")
)
corr_ord_ref

### Now, plot the correlation between refunded orders and payment methods

In [None]:
# Bar chart: orders with status "order_refunded" per payment method.
ax = corr_ord_ref.plot.bar(
    figsize=(12, 6),
    title="Payment Methods vs. Refunded Order",
)
ax.set_xlabel("Payment Methods")
ax.set_ylabel("Number of Refunded Orders")
plt.show()

In [None]:
# Received orders counted per payment method, as a one-column frame.
is_received = ecommerce["status"] == "received"
corr_received = (
    ecommerce.loc[is_received, "payment_method"]
    .value_counts()
    .to_frame("payment_method")
)
corr_received

### Now, plot the correlation between received orders and payment methods

In [None]:
# Bar chart: received orders per payment method.
ax = corr_received.plot.bar(
    figsize=(12, 6),
    title="Payment Methods vs Received Orders",
    color='Green',
)
ax.set_xlabel("Payment Methods")
ax.set_ylabel("Number of Received Orders")
plt.show()

In [None]:
# Orders with status "refund" counted per payment method.
is_refund = ecommerce["status"] == "refund"
corr_refund = (
    ecommerce.loc[is_refund, "payment_method"]
    .value_counts()
    .to_frame("payment_method")
)
corr_refund

### Now, plot the correlation between orders with status "refund" and payment methods

In [None]:
# Bar chart: orders with status "refund" per payment method.
ax = corr_refund.plot.bar(
    figsize=(12, 6),
    title="Payment Methods vs Order Refund",
)
ax.set_xlabel("Payment Methods")
ax.set_ylabel("Number of Refund Orders")
plt.show()

## We need to perform casting.for seeing these object columns correlation.

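### `.cat.codes` converts each category from its string representation into an integer code. The codes are assigned separately for each column, following the sorted order of that column's distinct values, and a missing value gets the code -1. For example, in the payment method column the first method in sorted order is replaced with 0, the next with 1, and so on; the status column is encoded in the same way, independently of the payment method column.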

In [None]:
# Add an integer-coded copy of each categorical column so it can take part in
# the numeric correlation matrix. New columns are named "<column>_correlation".
for source_column in ("payment_method", "status"):
    as_category = ecommerce[source_column].astype("category")
    ecommerce[source_column + "_correlation"] = as_category.cat.codes

##  We can say that there is 'no or weak' correlation  between Payment method and Status columns.

In [None]:
# Heatmap of pairwise correlations between the numeric columns.
# Only numeric columns are passed to `corr()`: since pandas 2.0 it no longer
# drops non-numeric columns silently and raises on the string columns in this
# frame. `select_dtypes` also works on older pandas versions.
plt.figure(figsize=(16, 8))
numeric_corr = ecommerce.select_dtypes(include="number").corr()
sns.heatmap(numeric_corr, annot=True)
plt.show()

## Let us remove some unnecessary information by creating a good representative correlation map.

In [None]:
# Correlation heatmap with the redundant upper triangle (and diagonal) hidden.
plt.figure(figsize=(18, 10))

# Only numeric columns are correlated: since pandas 2.0 `corr()` raises on the
# string columns in this frame instead of dropping them silently.
corr = ecommerce.select_dtypes(include="number").corr()

# True above and on the diagonal -> those cells are not drawn.
mask = np.triu(np.ones_like(corr, dtype=bool))

plt.title('Correlation Analysis')
# Reuse `corr` rather than computing the matrix a second time. The argument-less
# plt.xticks()/plt.yticks() calls were no-ops and have been removed.
sns.heatmap(corr, mask=mask, annot=True, lw=1, linecolor='white')
plt.show()

## see there is no correlation in these columns.

In [None]:
# Scatter of the integer-coded payment method against the integer-coded status.
fig = plt.figure(figsize=(10, 6))
payment_codes = ecommerce["payment_method_correlation"]
status_codes = ecommerce["status_correlation"]
plt.scatter(payment_codes, status_codes)
plt.xticks(rotation=60)
plt.show()

## Task 4

# Find a correlation between order date and item category

## correlation between order_date and category_name is 'negative strong'.clearly shown in heatmap.

In [None]:
# Add an integer-coded copy of the order date and of the item category so both
# can take part in the numeric correlation matrix ("<column>_corr").
for source_column in ("order_date", "category_name_1"):
    as_category = ecommerce[source_column].astype("category")
    ecommerce[source_column + "_corr"] = as_category.cat.codes

In [None]:
# Heatmap of pairwise correlations between the numeric columns.
# Only numeric columns are passed to `corr()`: since pandas 2.0 it no longer
# drops non-numeric columns silently and raises on the string columns in this
# frame. `select_dtypes` also works on older pandas versions.
plt.figure(figsize=(16, 8))
numeric_corr = ecommerce.select_dtypes(include="number").corr()
sns.heatmap(numeric_corr, annot=True, cmap='viridis')
plt.show()

## Let us remove some unnecessary information by creating a good representative correlation map.

In [None]:
# Correlation heatmap with the redundant upper triangle (and diagonal) hidden.
plt.figure(figsize=(18, 10))

# Only numeric columns are correlated: since pandas 2.0 `corr()` raises on the
# string columns in this frame instead of dropping them silently.
corr = ecommerce.select_dtypes(include="number").corr()

# True above and on the diagonal -> those cells are not drawn.
mask = np.triu(np.ones_like(corr, dtype=bool))

plt.title('Correlation Analysis')
# Reuse `corr` rather than computing the matrix a second time. The argument-less
# plt.xticks()/plt.yticks() calls were no-ops and have been removed.
sns.heatmap(corr, mask=mask, annot=True, lw=1, linecolor='white', cmap='viridis')
plt.show()

In [None]:
# Scatter of the integer-coded order date against the integer-coded category.
# A title, axis labels and plt.show() are added so the figure is readable on
# its own and the cell no longer prints the PathCollection repr, matching the
# other scatter cell in this notebook.
plt.figure(figsize=(16, 8))
plt.scatter(ecommerce["order_date_corr"], ecommerce["category_name_1_corr"])
plt.title("Order date vs item category (category codes)")
plt.xlabel("order_date_corr")
plt.ylabel("category_name_1_corr")
plt.show()


## Working on further more analysis.