In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns


# Load the order export from the Kaggle input directory into the
# notebook-wide frame `dt`.
csv_path = "/kaggle/input/gufhtugu-publications-dataset-challenge/GP Orders - 5.csv"
dt = pd.read_csv(csv_path, encoding='UTF-8')

# Silence every warning category for the rest of the notebook session.
import warnings
warnings.filterwarnings('ignore')

# Basic Data Exploration

In [None]:
# Preview the first rows of the frame (head() defaults to five rows).

dt.head()

In [None]:
# Print the frame's dimensions; dt.shape is a (rows, columns) tuple.
print("dimensions are : ", dt.shape)

In [None]:
# Summarise column names, dtypes and non-null counts (shows missing values).
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly: wrapping it in print() only appended a stray "None".

dt.info()

Data contains null values.

In [None]:
# Display every row that has a missing value in at least one column.
dt[dt.isna().any(axis=1)]

In [None]:
# Discard every row that has a missing value in any column.
dt = dt.dropna(axis=0, how="any")

In [None]:
# Swap the export's spaced headers for underscore-style names so the
# columns are easier to reference in the rest of the notebook.
column_aliases = {
    "Order Number": "Order_Number",
    "Order Status": "Order_Status",
    "Book Name": "Book_Name",
    "Order Date & Time": "Date_Time",
    "Payment Method": "Payment_Method",
    "Total items": "Total_Items",
    "Total weight (grams)": "Weight",
}
dt = dt.rename(columns=column_aliases)
dt.columns

In [None]:
# Cast the text columns to str and parse the order timestamp.

text_columns = ["Order_Status", "Book_Name", "City", "Payment_Method"]
dt[text_columns] = dt[text_columns].astype(str)

# NOTE(review): the timestamp format is inferred by pandas; confirm that
# day/month order is parsed as intended for this export.
dt["Date_Time"] = pd.to_datetime(dt["Date_Time"].astype(str))

# The former `dt["Book_Name"].str.encode('utf-8')` line was removed: its result
# was never assigned, so it had no effect on the frame.

# info() prints its own report and returns None, so it is not wrapped in print().
dt.info()

In [None]:
# Order the rows chronologically, then derive calendar features from the
# parsed "Date_Time" column.

dt = dt.sort_values('Date_Time')

stamp = dt["Date_Time"].dt  # datetime accessor shared by all derived columns

dt['Date'] = stamp.date
dt['Year'] = stamp.year
dt["Month"] = stamp.month_name()
dt["Day"] = stamp.day_name()
dt['Hour'] = stamp.hour
dt['Time'] = stamp.time
# "MY" is the year-month bucket rendered as text, e.g. "2020-08".
dt["MY"] = stamp.to_period('M').dt.strftime('%Y-%m')

dt.columns

In [None]:
# Split orders that list several titles ("/"-separated) so that every row
# carries exactly one book title.

print('No of rows BEFORE splitting : ', len(dt.index))

# One list of titles per order, exploded to one title per entry; each entry
# keeps the index label of the order it came from.
scol = dt['Book_Name'].str.split('/').explode()
scol.name = 'Book_Name'

# Joining on the index repeats the other columns once per title.
dt = dt.drop(columns='Book_Name').join(scol)

print('No of rows AFTER splitting : ', len(dt.index))

#ref:https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.split.html
#ref:https://stackoverflow.com/questions/40955656/how-does-python-split-function-works/40955737

Now we have only one book title per row.

In [None]:
# List the 50 cities with the most rows, to spot spellings that refer to
# the same city.

city_counts = dt.groupby("City")["Order_Number"].count()
topcities = (
    city_counts
    .reset_index()
    .sort_values("Order_Number", ascending=False)
    .head(50)
)

topcities

In [None]:
# Collapse the case variants of the three largest cities into one spelling.

city_aliases = {
    'karachi': 'Karachi', 'KARACHI': 'Karachi',
    'lahore': 'Lahore', 'LAHORE': 'Lahore',
    'ISLAMABAD': 'Islamabad', 'islamabad': 'Islamabad',
}

print("Before rename:", len(dt["City"].unique()))
dt['City'] = dt['City'].replace(city_aliases)
print("AFTER rename:", len(dt["City"].unique()))

# Top 10 cities by number of orders. Distinct order numbers are counted
# because an earlier cell splits multi-title orders into one row per title;
# a plain row count would count such an order once per book.
topcities = (
    dt.groupby("City")["Order_Number"]
    .nunique()
    .reset_index()
    .sort_values("Order_Number", ascending=False)
    .head(10)
)

topcities
#https://datatofish.com/replace-values-pandas-dataframe/

In [None]:
#freebks=dt[dt["Book_Name"].str.contains("Free")]
#freebks
#some books were distributed as free in lucky draw. we will see later 
#what useful information we can extract from this point

In [None]:
# List the 30 most frequent titles to check for inconsistent spellings.

title_counts = dt.groupby("Book_Name")["Order_Number"].count()
temp = (
    title_counts
    .reset_index()
    .sort_values("Order_Number", ascending=False)
    .head(30)
)
temp



In [None]:
# Several spellings refer to the same book; map each variant onto one
# canonical title and report how many distinct titles remain.

title_aliases = {
    'انٹرنیٹ سے پیسہ کمائیں؟- مستحقین زکواة': 'انٹرنیٹ سے پیسہ کمائیں',
    'ڈیٹا سائنس ۔ ایک تعارف': 'Data Science',
    'ڈیٹا سائنس': 'Data Science',
    'Python Programming- Release Date: August 14, 2020': 'Python Programming',
}

print("Before rename:", len(dt["Book_Name"].unique()))
dt['Book_Name'] = dt['Book_Name'].replace(title_aliases)

print("AFTER rename:", len(dt["Book_Name"].unique()))

In [None]:
# Final look at the first ten rows of the cleaned frame before the analysis.
dt.head(10)

The data is now clean, so we are good to go. The next step is to address the queries.

# What is the best-selling book?

In [None]:
# Bar chart of the 20 titles that appear on the most order rows.
bnon = (
    dt.groupby("Book_Name")["Order_Number"]
    .count()
    .reset_index()
    .sort_values("Order_Number", ascending=False)
    .head(20)
)

fig, ax = plt.subplots()
fig.set_size_inches([10, 6])
ax = sns.barplot(data=bnon, x="Book_Name", y="Order_Number", color="deepskyblue")
ax.set_xticklabels(bnon["Book_Name"], rotation=90)
ax.set_xlabel("Books Titles", fontsize=13)
ax.set_ylabel("Number of Order", fontsize=13)
ax.set_title('Top 20 Selling Books', fontsize=16)
plt.show()

# Visualize order status frequency

In [None]:

# Number of orders in each order status.
# Distinct order numbers are counted because an earlier cell splits
# multi-title orders into one row per title; a plain row count would count
# such an order once per book.
dtos = dt.groupby(["Order_Status"])["Order_Number"].nunique().reset_index()

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x="Order_Status", y="Order_Number", data=dtos, color="deepskyblue", ax=ax)
# Rotate the labels seaborn already placed instead of re-assigning them by hand.
ax.tick_params(axis="x", labelrotation=90)
ax.set_title('Category-wise Total Orders', fontsize=16)
ax.set_ylabel("Number of Orders", fontsize=13)
ax.set_xlabel("Order Categories", fontsize=13)
plt.show()

In [None]:
# Bar chart of the top-city table built earlier in the notebook.

cion = topcities  # same object; the name is reused by a later cell

fig, ax = plt.subplots()
fig.set_size_inches([10, 6])
ax = sns.barplot(data=cion, x="City", y="Order_Number", color="deepskyblue")
ax.set_xticklabels(cion["City"], rotation=90)
ax.set_xlabel("Top 10 Cities", fontsize=13)
ax.set_ylabel("Number of Orders", fontsize=13)
ax.set_title('City-wise Total Orders', fontsize=16)

plt.show()

In [None]:
# Row counts per order status, with one bar per payment method.
# (Plot idea copied from someone else's notebook.)
sns.countplot(data=dt, x="Order_Status", hue="Payment_Method")

# Correlation between date and time with order status

In [None]:
# Date-wise order status: number of orders per month ("MY", formatted
# YYYY-MM) for each order status.
# Distinct order numbers are counted because an earlier cell splits
# multi-title orders into one row per title; a plain row count would count
# such an order once per book.
dton = (
    dt.groupby(["Order_Status", "MY"])["Order_Number"]
    .nunique()
    .reset_index()
    .sort_values("MY")  # YYYY-MM strings sort chronologically
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x="MY",
    y="Order_Number",
    hue="Order_Status",
    hue_order=["Completed", "Returned", "Cancelled"],
    data=dton,
    palette="muted",
    ax=ax,
)
# Rotate the labels only after the bars exist. The previous code called
# set_xticklabels on the still-empty axes with the repeated "MY" column, so
# the labels were overwritten by the plot (or mismatched with the ticks).
ax.tick_params(axis="x", labelrotation=60)
ax.set_title('Date-wise Order Status', fontsize=16)
ax.set_xlabel("Dates", fontsize=13)
ax.set_ylabel("Number of Orders", fontsize=13)
plt.show()

# Correlation between city and order status

In [None]:

# Order status breakdown for the cities listed in `topcities`.

# Distinct order numbers are counted because an earlier cell splits
# multi-title orders into one row per title; a plain row count would count
# such an order once per book.
cios = (
    dt.groupby(["City", "Order_Status"])["Order_Number"]
    .nunique()
    .reset_index()
    .sort_values("Order_Number", ascending=False)
)
cios = cios[cios["City"].isin(topcities["City"])]

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x="City",
    y="Order_Number",
    hue="Order_Status",
    data=cios,
    palette="muted",
    # Fix the category order to the top-city ranking so the axis matches the
    # city-wise totals chart.
    order=list(topcities["City"]),
    ax=ax,
)
# Only rotate the labels seaborn generated. The previous code overwrote them
# with cion["City"], whose order can differ from the plotted category order
# and therefore could attach the wrong city name to a group of bars.
ax.tick_params(axis="x", labelrotation=90)
ax.set_title('City-wise Order Status', fontsize=16)
ax.set_ylabel("Number of Orders", fontsize=13)
ax.set_xlabel("Top 10 Cities", fontsize=13)
plt.show()

# Find any hidden patterns that are counter-intuitive for a layman

# Can we predict number of orders, or book names in advance?

