In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px


Loading the latest version of the Gufhtugu Publications orders data

In [None]:
# Relative path of the orders CSV that this notebook analyses.
orders_csv_path = "../input/gufhtugu-publications-dataset-challenge/GP Orders - 5.csv"

# Read the comma-separated, UTF-8 encoded file into the working frame and preview it.
df = pd.read_csv(orders_csv_path, sep=',', encoding="utf-8")
df.head()

Checking the dimensions

In [None]:
# (number of rows, number of columns) of the frame as loaded, before any cleaning.
df.shape

Checking for the null values

In [None]:
# Per-column count of missing entries (isna is the same check as isnull).
df.isna().sum()

Looking into the records where "Book Name" column has null values

In [None]:
# Show every order whose 'Book Name' entry is missing.
df.loc[df['Book Name'].isna()]

Looking into the records where "City" column has null values

In [None]:
# Show every order whose 'City' entry is missing.
df.loc[df['City'].isna()]

Looking into the records where "Payment Method" column has null values

In [None]:
# Show every order whose 'Payment Method' entry is missing.
df.loc[df['Payment Method'].isna()]

Dropping the rows with null values

In [None]:
# Keep only the orders in which every column is populated.
df = df.dropna(axis='index', how='any')

Again checking the dimensions

In [None]:
# Frame dimensions after the rows containing nulls were removed.
df.shape

Ensuring there are no null values

In [None]:
# Per-column count of missing entries; every count should now be zero.
df.isna().sum()

## **Splitting the Book Name values**

In [None]:
# Build a long-format lookup table with one row per (order, book) pair.
# 'Book Name' stores all titles of an order in one '/'-separated string, so each
# string is split into a list and the list is exploded into separate rows.
# explode() is used instead of DataFrame(...).stack(): stack() only removes the
# padding None values through its legacy default, which newer pandas releases
# change, whereas explode() never creates padding in the first place.
new_df = (
    df[['Order Number']]
    .assign(**{'Book Name': df['Book Name'].str.split('/')})
    .explode('Book Name')
    .reset_index(drop=True)
)

# Trim stray whitespace around each title and discard empty fragments (such as the
# one produced by a trailing '/'), so the same title is never counted under two keys.
new_df['Book Name'] = new_df['Book Name'].str.strip()
new_df = new_df[new_df['Book Name'] != ''].reset_index(drop=True)

In [None]:
# Display the table holding one row per (Order Number, Book Name) pair.
new_df

Merging the new df with split values into the original dataframe 

In [None]:
# Attach every split title to the remaining columns of its order via 'Order Number'.
# Both inputs carry a 'Book Name' column, so pandas suffixes them:
# 'Book Name_x' comes from df (unsplit), 'Book Name_y' from new_df (one title per row).
book_df = df.merge(new_df, on="Order Number")

Peek into the merged data

In [None]:
# Preview the first five merged rows (five is the default for head()).
book_df.head()

Checking to ensure that data has been split properly

In [None]:
# Spot-check one order (number 71420) to confirm its titles sit on separate rows.
book_df.loc[book_df['Order Number'].eq(71420)]

Dropping the old Book Name column and renaming the new one

In [None]:
# Remove the original, unsplit 'Book Name' column that the merge suffixed with _x.
# Re-binding the name (instead of inplace=True) together with errors='ignore' makes
# this cell safe to run more than once: a second run no longer raises KeyError
# because the column is already gone.
book_df = book_df.drop(columns=['Book Name_x'], errors='ignore')

In [None]:
# Give the one-title-per-row column (suffixed _y by the merge) its plain name back.
book_df = book_df.rename(columns={'Book Name_y': 'Book Name'})

Counting the books sold and listing the top 10

In [None]:
# Number of order rows per title, as a two-column frame: 'Book Name', 'count'.
book_counts = book_df.groupby('Book Name').size().reset_index(name='count')

# Pick the ten rows with the highest count, ordered from most to least sold.
# DataFrame.nlargest selects the rows directly; feeding index *labels* into the
# position-based .iloc only works while the index happens to be 0..n-1.
top_10_books = book_counts.nlargest(10, 'count')

In [None]:
# Display the ten titles with the highest counts.
top_10_books

Resetting the index

In [None]:
# Replace the leftover row labels with a clean 0..n-1 index and keep the result.
# `drop` is a boolean flag and must be given by keyword: recent pandas releases
# accept only `level` positionally, so a positional second argument raises TypeError.
# The result is assigned back because reset_index returns a new frame; without the
# assignment the reset would only affect the displayed output.
top_10_books = top_10_books.reset_index(drop=True)
top_10_books

Fetching the complete records of top 10 books

In [None]:
# Keep the complete order rows that belong to one of the top-10 titles.
in_top_10 = book_df['Book Name'].isin(top_10_books['Book Name'])

# reset_index() without drop=True renumbers the rows and preserves the previous
# row labels in a new 'index' column.
top_rs = book_df.loc[in_top_10].reset_index()

In [None]:
# Peek at a random handful of City values to see how inconsistently they are written.
# A fixed random_state keeps the displayed sample identical across Restart & Run All,
# and the min() guard avoids a ValueError when fewer than 10 rows are available.
top_rs['City'].sample(n=min(10, len(top_rs)), random_state=42)

Removing the records where the City entry contains digits (garbage values)

In [None]:
# Discard rows whose City contains any digit; such entries are treated as garbage.
# .copy() makes the filtered result an independent frame, so the later
# top_rs.loc[..., 'City'] = ... assignments modify it directly instead of writing
# into a slice of another frame (which triggers SettingWithCopyWarning).
has_digit = top_rs['City'].str.contains(r'[0-9]')
top_rs = top_rs[~has_digit].copy()


Building boolean masks for records whose City is a spelling variant of the same city

In [None]:
def _city_matches_any(city_names, variants):
    """Return a boolean mask that is True where a city string contains any variant.

    Parameters
    ----------
    city_names : pd.Series
        Series of city strings to search.
    variants : list of str
        Spellings to look for. Each one is matched as a literal, case-sensitive
        substring (regex=False), so characters such as '.' in "P.I.B Colony" or
        "A. K" match only themselves instead of acting as regex wildcards.

    Returns
    -------
    pd.Series
        Boolean Series on the same index as `city_names`; missing values count as
        no match.
    """
    mask = pd.Series(False, index=city_names.index)
    for variant in variants:
        mask |= city_names.str.contains(variant, regex=False, na=False)
    return mask


# One mask per city, each listing the spellings observed for that city.
# All masks are computed from the same, not yet normalised, City column.
_city_col = top_rs['City']
khi_index = _city_matches_any(_city_col, ["karachi", "karqchi", "Karachi", "Karwchi", "gulshan e hadeed", "KARACHI", "P.I.B Colony", "sader town"])
lhr_index = _city_matches_any(_city_col, ["lahore", "Lahore", "Lhr", "LHR"])
rwp_index = _city_matches_any(_city_col, ["Rawalpindi", "RAWALPINDI", "rawalpindi", "Rwp"])
fsd_index = _city_matches_any(_city_col, ["Faisalabad", "faisalabad", "Fesalabad", "faslabad"])
sargodha_index = _city_matches_any(_city_col, ["Sargodha", "sargodha"])
atk_index = _city_matches_any(_city_col, ["attock", "Attock"])
skh_index = _city_matches_any(_city_col, ["sheikhupura", "Sheikhupura"])
muz_index = _city_matches_any(_city_col, ["Muzffarabad", "Muzaffarabad"])
suk_index = _city_matches_any(_city_col, ["Sukkur", "sukkur"])
mul_index = _city_matches_any(_city_col, ["Multan", "multan"])
hyd_index = _city_matches_any(_city_col, ["hydrabad", "Hyderabad", "HYDERABAD", "Hyd"])
ming_index = _city_matches_any(_city_col, ["Mingaora", "Mingora"])
gujw_index = _city_matches_any(_city_col, ["Gujranwala", "guranwala"])
ajk_index = _city_matches_any(_city_col, ["AJK", "A. K", "Azad", "azad kashmir", "ajk"])
kas_index = _city_matches_any(_city_col, ["kasure", "Kasur", "kasur", "KASUR"])
pish_index = _city_matches_any(_city_col, ["Pishin", "pishin"])
pesh_index = _city_matches_any(_city_col, ["Peshawar", "peshawar"])
isb_index = _city_matches_any(_city_col, ["Islamabad", "islamabad", "islamababd"])
guj_index = _city_matches_any(_city_col, ["Gujrat", "gujrat", "Gujrtq"])
dadu_index = _city_matches_any(_city_col, ["Dadu", "dadu"])


Replacing the variants with one canonical name per city

In [None]:
# Pair every variant mask with the canonical spelling it should be replaced by.
# The order is significant: a row selected by more than one mask ends up with the
# name of the last matching pair in this list.
mask_and_canonical_name = [
    (khi_index, "Karachi"),
    (lhr_index, 'Lahore'),
    (rwp_index, 'Rawalpindi'),
    (fsd_index, 'Faisalabad'),
    (sargodha_index, 'Sargodha'),
    (atk_index, 'Attock'),
    (skh_index, 'Sheikhupura'),
    (muz_index, 'Muzaffarabad'),
    (suk_index, 'Sukkur'),
    (mul_index, 'Multan'),
    (hyd_index, 'Hyderabad'),
    (ming_index, 'Mingora'),
    (gujw_index, 'Gujranwala'),
    (ajk_index, 'AJK'),
    (kas_index, 'Kasur'),
    (pish_index, 'Pishin'),
    (pesh_index, 'Peshawar'),
    (isb_index, 'Islamabad'),
    (guj_index, 'Gujrat'),
    (dadu_index, 'Dadu'),
]

# Overwrite the City value of every selected row with the canonical spelling.
for variant_mask, canonical_name in mask_and_canonical_name:
    top_rs.loc[variant_mask, 'City'] = canonical_name

Grouping Books and Cities

In [None]:
# Only the title and the city are needed for the city-wise tally.
city_wise_book_df = top_rs.loc[:, ['Book Name', 'City']]

# Count the rows of every (title, city) combination and expose the tally as a
# regular 'count' column next to the two grouping columns.
pair_sizes = city_wise_book_df.groupby(['Book Name', 'City']).size()
city_wise_grouped_df = pair_sizes.rename('count').reset_index()
city_wise_grouped_df


## Visualising the City-wise data

In [None]:
# Bubble chart of the city-wise tally: one marker per (title, city) pair, coloured
# by title and sized by the number of rows counted for that pair.
# Columns are referenced by name so that axis and legend labels always come from
# the column names, and the figure gets a title so it can be read on its own.
fig = px.scatter(
    city_wise_grouped_df,
    x='Book Name',
    y='City',
    color='Book Name',
    size='count',
    title='Top 10 books: orders per city',
)

# Fixed, tall canvas so the long list of cities on the y-axis stays legible.
fig.update_layout(autosize=False, width=1200, height=2000)
fig.show()