**Task Details:**
Find out Top 10 selling books.

**Expected Submission:**
The names of Top Selling Books

**Evaluation:**
Show some visualizations and see if you can break down the top-selling books by province and city.

# Loading data

In [None]:
#Loading of Dataset and required Libraries

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns


# Location of the orders export inside the Kaggle input directory.
orders_csv = "/kaggle/input/gufhtugu-publications-dataset-challenge/GP Orders - 5.csv"

# Read the raw orders; `dt` is the working frame used by the cells below.
dt = pd.read_csv(orders_csv, encoding="utf_8")

# Silence every warning category for the rest of the session.
import warnings
warnings.filterwarnings(action='ignore')

# Basic Data Exploration

In [None]:
# Preview the first five rows of the loaded data
dt.head(n=5)

In [None]:
# Report the size of the frame as (rows, columns)
n_rows, n_cols = dt.shape
print("dimensions are : ", (n_rows, n_cols))

So the data contains 19,239 rows and 8 columns.

In [None]:
# Show the column names, dtypes and non-null counts (nulls show up as a
# non-null count lower than the number of rows).
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
dt.info()

"Non-Null Count" of few columns shows the presence of null values

# Data cleaning

**Handling of missing values.**

In [None]:
# Count the missing values in each column and draw the counts as a bar chart
missing_per_column = dt.isna().sum()
missing_per_column.plot.bar()

These 3 columns contain missing values.

In [None]:
rows_before = len(dt)
print("Before drop, total rows are: ", rows_before)

# Discard every row that has a missing value in any column
dt = dt.dropna()

rows_after = len(dt)
print("After drop, total rows are: ", rows_after)

# Per-column count of nulls that remain after the drop
remaining_nulls = dt.isna().sum()
print(remaining_nulls)

Now the data doesn't contain any missing values.

**Rename the columns**

Let's rename the columns to more appropriate names.

In [None]:
# Map each original header to a shorter, underscore-separated name
new_names = {
    'Order Number': 'Order_Number',
    "Order Status": "Order_Status",
    "Book Name": "Book_Name",
    "Order Date & Time": "Date_Time",
    "Payment Method": "Payment_Method",
    "Total items": "Total_Items",
    "Total weight (grams)": "Weight",
}
dt = dt.rename(columns=new_names)
print("After rename, columns names are: ", "\n", "\n", dt.columns)

**The "Date_Time" column has the "object" type; we will change it to the datetime64 type.**

In [None]:
# Parse the "Date_Time" strings into pandas datetime values
dt['Date_Time'] = pd.to_datetime(dt['Date_Time'])

# Confirm the new dtype. DataFrame.info() prints its own report and returns
# None, so it is called directly; print(dt.info()) would add a stray "None".
dt.info()

# **Handling Inconsistent Data in Books Name**

"Book_Name" column contains more than one book. lets split it.

In [None]:


# Split multi-book orders into one row per book title, transliterate the Urdu
# titles, and merge near-duplicate spellings so each book is counted once.

# Imported in this cell so the cell can be run on its own.
from fuzzywuzzy import process

# An order lists its books separated by '/'. explode() yields one row per title
# while keeping the order's index label, so the join below repeats the order's
# other columns for every title it contains.
# Whitespace around each title is stripped so that "A / B" and "A/B" produce
# the same titles (otherwise " B" would miss the exact-match renames below).
split_titles = dt['Book_Name'].str.split('/').explode().str.strip()
# Empty fragments (e.g. from a trailing or doubled '/') are not books.
split_titles = split_titles[split_titles != '']
dt = dt.drop(columns='Book_Name').join(split_titles, how='inner')

# Manual replacement of Urdu titles with Roman-English names (exact matches only).
urdu_to_roman = {
    'انٹرنیٹ سے پیسہ کمائیں': 'Internet Sy Pysy Kamaen',
    'انٹرنیٹ سے پیسہ کمائیں؟- مستحقین زکواة': 'Internet Sy Pysy Kamaen',
    'ڈیٹا سائنس': 'Data Science',
    'ڈیٹا سائنس ۔ ایک تعارف': 'Data Science',
    'مشین لرننگ': 'Machine Learning',
    '(C++) ++سی': '(C++)',
}
dt['Book_Name'] = dt['Book_Name'].replace(urdu_to_roman)

# The 20 most frequent titles act as the canonical spellings for fuzzy matching.
top_bks = dt["Book_Name"].value_counts().head(20).reset_index()
top_bks.columns = ['Book_Name', 'Sold_Qty']
all_bks = dt["Book_Name"].unique()

# Build a single variant -> canonical mapping, walking the top titles from most
# to least frequent. A top title that was already absorbed by a more frequent
# one is skipped, and a title is never re-assigned once it is canonical or
# merged. Renaming rows inside the loop instead would let a less frequent
# spelling that is itself in the top 20 take over the rows already merged into
# the more frequent spelling.
canonical_titles = set()
merged_into = {}
for title in top_bks['Book_Name']:
    if title in merged_into:
        continue
    canonical_titles.add(title)
    for match in process.extract(title, all_bks, limit=len(all_bks)):
        candidate, score = match[0], match[1]
        if score > 90 and candidate not in canonical_titles and candidate not in merged_into:
            merged_into[candidate] = title

# Apply all merges in one pass; titles without a mapping are left unchanged.
dt['Book_Name'] = dt['Book_Name'].map(lambda name: merged_into.get(name, name))

# The split repeated index labels; give every row its own label again.
dt = dt.reset_index(drop=True)
print("Top 10 unique Books are: \n", dt["Book_Name"].value_counts().head(10))

# **Handling Inconsistent Data in City Names**

The "City" column contains many typos. Let's fix them.

In [None]:
# Normalise city spellings: a few manual fixes, then fuzzy-merge the remaining
# variants onto the most frequent spellings.

# Imported in this cell so the cell can be run on its own.
from fuzzywuzzy import process

# Manual fixes for case variants and an abbreviation (exact matches only).
dt['City'] = dt['City'].replace({
    'karachi': 'Karachi',
    'KARACHI': 'Karachi',
    'FSD': 'Faisalabad',
    'lahore': 'Lahore',
    'LAHORE': 'Lahore',
})

# The 20 most frequent city names act as the canonical spellings.
fuzz_top_City = dt["City"].value_counts().head(20).reset_index()
fuzz_top_City.columns = ['City', 'Sold_Qty']
fuzz_all_City = dt["City"].unique()

# Remove typos in the city names. The variant -> canonical mapping is built
# once, walking the top cities from most to least frequent. A top city that was
# already absorbed by a more frequent spelling is skipped, and a name is never
# re-assigned once it is canonical or merged. Renaming rows inside the loop
# instead would let a less frequent spelling that is itself in the top 20 take
# over the merged rows, which would then miss the exact-key province lookup
# done later in the notebook.
canonical_cities = set()
merged_into = {}
for city in fuzz_top_City['City']:
    if city in merged_into:
        continue
    canonical_cities.add(city)
    for match in process.extract(city, fuzz_all_City, limit=len(fuzz_all_City)):
        candidate, score = match[0], match[1]
        if score > 90 and candidate not in canonical_cities and candidate not in merged_into:
            merged_into[candidate] = city

# Apply all merges in one pass; names without a mapping are left unchanged.
dt['City'] = dt['City'].map(lambda name: merged_into.get(name, name))

print("Top 10 Cities are: \n", dt["City"].value_counts().head(10))

**Creating new dataset**

In [None]:
# Ten most frequent cities and book titles (index = name, value = row count)
cty = dt['City'].value_counts().head(10)
bks = dt['Book_Name'].value_counts().head(10)

# tcb = order counts per (city, book) pair, largest first,
# restricted to the top 10 cities and top 10 books
pair_counts = dt.groupby(["City", "Book_Name"])["Order_Number"].count()
tcb = pair_counts.reset_index().sort_values("Order_Number", ascending=False)

in_top_books = tcb["Book_Name"].isin(bks.index)
in_top_cities = tcb["City"].isin(cty.index)
tcb = tcb[in_top_books & in_top_cities]
tcb.head()

**To add another column for Province against each city**

In [None]:
# Attach a Province to every (city, book) row of tcb and build the per-book,
# per-city and per-province order totals.
prov = {'Karachi': "Sindh", 'Lahore': "Punjab", 'Islamabad': "Islamabad", 'Rawalpindi': "Punjab", 'Faisalabad': "Punjab",
       'Peshawar': "KPK", 'Multan': "Punjab", 'Gujranwala': "Punjab", 'Sialkot': "Punjab", 'Hyderabad': "Sindh"}

# assign()/rename() build a new frame instead of writing a column into the
# filtered frame, and the count column is renamed by name rather than by
# position, so the result does not depend on the column order.
tcb = tcb.assign(Province=tcb["City"].map(prov)).rename(columns={"Order_Number": "Total_Order"})

# A city that is missing from `prov` gets a NaN province and would silently
# drop out of the province totals below, so report any such city.
unmapped = sorted(tcb.loc[tcb["Province"].isna(), "City"].unique())
if unmapped:
    print("Cities without a province mapping: ", unmapped)

# Order totals per book, per city and per province, largest first.
topbks = tcb.groupby("Book_Name")["Total_Order"].sum().reset_index().sort_values("Total_Order", ascending=False)
topcty = tcb.groupby("City")["Total_Order"].sum().reset_index().sort_values("Total_Order", ascending=False)
toppro = tcb.groupby("Province")["Total_Order"].sum().reset_index().sort_values("Total_Order", ascending=False)

tcb

For ease of display, the Urdu names have been replaced with Roman English.

**Now our data is clean and we are ready to visualize**

# Top 10 Ordered Books

In [None]:
# Bar chart of the order totals in topbks, one bar per book
fig, ax = plt.subplots(figsize=(18, 12))
sns.barplot(data=topbks, x="Book_Name", y="Total_Order", ci=None, ax=ax)

ax.set_title('Top 10 Ordered Books ', fontsize=20)
ax.set_xlabel("Books Titles", fontsize=18)
ax.set_ylabel("Number of Order(s)", fontsize=18)

# Label the bars with the book titles, rotated so long titles do not overlap
ax.set_xticklabels(topbks["Book_Name"], rotation=90, fontsize=18)
plt.show()

The graph shows that "Internet Sy Pysy Kamaen" is the top-selling book, ordered more than 1,600 times.
"Sukkur To Florida" is in 10th position.

# Top 10 Cities - Order Wise

In [None]:

# Bar chart of the order totals in topcty, one bar per city
fig, ax = plt.subplots(figsize=(18, 12))
sns.barplot(data=topcty, x="City", y="Total_Order", ci=None, ax=ax)

ax.set_title('Top 10 Cities - Order Wise', fontsize=20)
ax.set_xlabel("City Name(s)", fontsize=18)
ax.set_ylabel("Number of Order(s)", fontsize=18)

# Label the bars with the city names, rotated to vertical
ax.set_xticklabels(topcty["City"], rotation=90)
plt.show()

The graph shows that most orders were received from Karachi, with more than 2,500 orders.

# Top Province - Order Wise

In [None]:
# Bar chart of the order totals in toppro, one bar per province
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=toppro, x="Province", y="Total_Order", ci=None, ax=ax)

ax.set_title('Top Province - Order Wise', fontsize=20)
ax.set_xlabel("Province Name(s)", fontsize=18)
ax.set_ylabel("Number of Order(s)", fontsize=18)

# Label the bars with the province names, rotated to vertical
ax.set_xticklabels(toppro["Province"], rotation=90)
plt.show()

The graph shows that the highest number of orders was received from Punjab, followed by Sindh, Islamabad and KPK.

# Top 10 Books - City wise

In [None]:
# Point plot of orders per book, one coloured line per city
fig, ax = plt.subplots(figsize=(18, 9))
sns.pointplot(data=tcb, x="Book_Name", y="Total_Order", hue="City", ax=ax)

ax.set_title('Top 10 Books - City wise', fontsize=20)
ax.set_xlabel("Books Name", fontsize=18)
ax.set_ylabel("Number of Order(s)", fontsize=18)

# Turn the book titles vertical so they do not overlap
plt.xticks(rotation=90)
plt.show()

The graph shows that "Internet Sy Pysy Kamaen" is the top-selling book in almost all the top cities except Lahore, where sales of "Sukkur to Florida" are slightly higher than those of "Internet Sy Pysy Kamaen". C++ is the least-selling book among the top 10 books.

In [None]:
# Grouped bar chart of orders per book, one bar colour per city
fig, ax = plt.subplots(figsize=(18, 9))
sns.barplot(data=tcb, x="Book_Name", y="Total_Order", hue="City", ax=ax)

ax.set_title('Top 10 Books - City wise', fontsize=20)
ax.set_xlabel("Books Name", fontsize=18)
ax.set_ylabel("Number of Order(s)", fontsize=18)

# Turn the book titles vertical so they do not overlap
plt.xticks(rotation=90)
plt.show()

# Top 10 Books - Province wise

In [None]:

# Point plot of total orders per book, one coloured line per province.

# The grid style has to be set before the axes are created, otherwise it does
# not apply to this figure (only to figures created afterwards).
sns.set_style("whitegrid")
fig, ax = plt.subplots()

# tcb holds one row per (city, book) pair, so a province with several cities
# contributes several rows per book. seaborn's default estimator is the mean,
# which would plot the average per city; estimator=sum plots the province's
# total orders instead.
# NOTE(review): re-check the conclusion written under this chart after re-running.
ax = sns.pointplot(x="Book_Name", y="Total_Order", data=tcb, hue="Province", estimator=sum, ci=None)
fig.set_size_inches(18, 9)
ax.set_title('Top 10 Books - Province wise', fontsize=20)
ax.set_xlabel("Books Name", fontsize=18)
ax.set_ylabel("Number of Order(s)", fontsize=18)
# Turn the book titles vertical so they do not overlap
plt.xticks(rotation=90)
plt.show()

The graph shows that, for all the books, the maximum orders were received from Sindh province.

In [None]:
# Grouped bar chart of total orders per book, one bar colour per province.
fig, ax = plt.subplots()

# tcb holds one row per (city, book) pair, so a province with several cities
# contributes several rows per book. seaborn's default estimator is the mean,
# which would draw the average per city; estimator=sum draws the province's
# total orders instead.
ax = sns.barplot(x="Book_Name", y="Total_Order", data=tcb, hue="Province", estimator=sum, ci=None)
fig.set_size_inches(18, 9)
ax.set_title('Top 10 Books - Province wise', fontsize=20)
ax.set_xlabel("Books Name", fontsize=18)
ax.set_ylabel("Number of Order(s)", fontsize=18)
# Turn the book titles vertical so they do not overlap
plt.xticks(rotation=90)
plt.show()

In [None]:
# Swarm plot: one point per row of tcb (a city/book pair), coloured by province
fig, ax = plt.subplots(figsize=(18, 9))
sns.swarmplot(data=tcb, x="Book_Name", y="Total_Order", hue="Province", ax=ax)

# Anchor the legend's upper-left corner at the top-right corner of the axes
ax.legend(bbox_to_anchor=(1, 1), loc=2)

ax.set_title('Top 10 Books - Province wise', fontsize=20)
ax.set_xlabel("Books Name", fontsize=18)
ax.set_ylabel("Number of Order(s)", fontsize=18)

# Turn the book titles vertical so they do not overlap
plt.xticks(rotation=90)
plt.show()

Thanks for viewing my notebook. You are welcome to leave any comments or suggestions to further improve my work.