In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

In [None]:
# Read the raw order export into a DataFrame and preview its first rows.
data_file = pd.read_csv(
    '../input/gufhtugu-publications-dataset-challenge/GP Orders - 4.csv',
    delimiter=',',
    encoding="utf-8",
)
data_file.head()

In [None]:
data_file.shape  # (rows, columns) of the raw order table

In [None]:
data_file.info()  # column dtypes and non-null counts before any cleaning

In [None]:
data_file.isnull().sum()  # number of missing values in each column

In [None]:
# Discard (in place) every order row that has no 'Book Name'.
data_file.dropna(axis='index', how='any', subset=['Book Name'], inplace=True)

In [None]:
# Discard (in place) every order row that has no 'City (Billing)'.
data_file.dropna(axis='index', how='any', subset=['City (Billing)'], inplace=True)

In [None]:
data_file.isnull().sum()  # 'Book Name' and 'City (Billing)' should now report zero missing values

In [None]:
data_file.info()  # rows missing 'Book Name' or 'City (Billing)' were dropped above, so both columns' non-null counts now equal the total row count

# Q1. What is the best-selling book?

In [None]:
from itertools import chain

def chainer(s):
    """Flatten a Series of '/'-separated strings into one flat list of parts.

    Each element of ``s`` is split on '/', and the resulting pieces are
    concatenated in their original order, e.g. ``['a/b', 'c']`` becomes
    ``['a', 'b', 'c']``.
    """
    return [piece for pieces in s.str.split('/') for piece in pieces]

# Number of '/'-separated titles contained in each order row.
lens = data_file['Book Name'].str.split('/').map(len)

# Long-format frame with one row per (order, title): every per-order column is
# repeated once per title of that order, while 'Book Name' holds the flattened
# titles. The dict is filled in the final column order.
column_order = ['Order Number', 'Order Status', 'Book Name', 'Order Date', 'City (Billing)']
long_columns = {}
for column in column_order:
    if column == 'Book Name':
        long_columns[column] = chainer(data_file[column])
    else:
        long_columns[column] = np.repeat(data_file[column], lens)
df = pd.DataFrame(long_columns)

In [None]:
df.shape  # (rows, columns) after expanding orders to one row per book title

In [None]:
len(df["Book Name"].unique())  # number of distinct book titles after splitting

In [None]:
df['Order Status'].unique()  # distinct status labels present in the data

In [None]:
df['Book Name'].value_counts().max()  # row count of the most frequently ordered title, counted across every order status

In [None]:
# Only completed orders count as sales, so keep just those rows.
is_completed = df['Order Status'] == 'Completed'
df1 = df[is_completed]
df1.head(10)

In [None]:
df1['Book Name'].value_counts().max()  # row count of the best-selling title (completed orders only)

In [None]:
df1['Book Name'].value_counts().idxmax()  # title with the most completed-order rows, i.e. the best-selling book

In [None]:
df1['Book Name'].value_counts()  # completed-order row count for every book title

In [None]:
# Bar chart of the 10 best-selling books (completed orders only).
# The columns are named explicitly because value_counts() labels its result
# differently across pandas versions: the counts are called 'Book Name' before
# pandas 2.0 but 'count' from 2.0 on, so book_chart['Book Name'] raised a
# KeyError on newer pandas.
book_chart = (
    df1['Book Name']
    .value_counts()
    .nlargest(10)
    .rename_axis('Book Name')
    .reset_index(name='Count')
)
px.bar(book_chart, x='Book Name', y='Count', title='Most Selling Books')

# Q2. Visualize order status frequency

In [None]:
df['Order Status'].unique()  # distinct status labels that the chart below will show

In [None]:
df.groupby('Order Status')['Order Status'].agg('count')  # number of rows per order status

In [None]:
# Bar chart of how many rows fall under each order status.
# The column is passed as x= explicitly: newer seaborn releases accept only
# `data` as a positional argument, so the bare positional Series is no longer
# interpreted as the variable to count.
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(x=df['Order Status'], ax=ax)
ax.set_title('Order Status')

# Q3. Find a correlation between city and order status

In [None]:
len(df['City (Billing)'].unique())  # distinct city spellings before any normalisation

In [None]:
# Lower-case every city name so differently-cased spellings collapse together.
lowercase_cities = df['City (Billing)'].str.lower()
df['City (Billing)'] = lowercase_cities
df['City (Billing)']

In [None]:
len(df['City (Billing)'].unique())  # number of distinct cities after lower-casing

In [None]:
# Trim leading/trailing whitespace so that padded and unpadded spellings of the
# same city are counted together. The earlier x.strip('') passed an empty
# character set, which removes nothing at all.
df['City (Billing)'] = df['City (Billing)'].str.strip()

# Rows per city, most frequent first.
city_stats = df['City (Billing)'].value_counts(ascending=False)
city_stats.head()

In [None]:
# Re-derive the completed-orders subset so it picks up the city clean-up above.
completed_rows = df['Order Status'] == 'Completed'
df1 = df[completed_rows]
df1.head()

In [None]:
# The ten billing cities with the most completed-order rows.
completed_city_counts = df1["City (Billing)"].value_counts()
top_10_cities = completed_city_counts.nlargest(10).to_frame()
top_10_cities

In [None]:
# Replace the textual order status in `df` with an integer code:
# 0 = canceled, 1 = completed, 2 = returned.
combine = [df]
titlemapping = {'Canceled':0, 'Completed':1,'Returned':2}
for frame in combine:
    # Guard against re-running this cell: once the column holds integer codes,
    # mapping it through the text labels again would turn every value into NaN
    # and then into 0, silently marking every order as canceled.
    if not pd.api.types.is_integer_dtype(frame['Order Status']):
        # NOTE(review): any label missing from titlemapping also becomes 0
        # (canceled) through fillna(0) — confirm that is intended.
        frame['Order Status'] = (
            frame['Order Status']
            .map(titlemapping)
            .fillna(0)
            .astype(int)
        )

In [None]:
df[['City (Billing)','Order Status']].groupby(['Order Status']).agg(['count'])  # number of rows per encoded status (codes as defined in titlemapping)

In [None]:
# Rows whose status code is 0 (canceled).
canceled_mask = df['Order Status'] == 0
df2 = df[canceled_mask]
df2.head()

In [None]:
# Rows whose status code is 1 (completed).
completed_mask = df['Order Status'] == 1
df3 = df[completed_mask]
df3.head()

In [None]:
# Rows whose status code is 2 (returned).
returned_mask = df['Order Status'] == 2
df4 = df[returned_mask]
df4.head()

In [None]:
df5 = pd.concat([df2, df4])  # canceled rows (df2) followed by returned rows (df4) in a single frame

In [None]:
# Cities with the most canceled orders (df2 holds the rows with status code 0).

# The columns are named explicitly because value_counts() labels its counts
# 'City (Billing)' before pandas 2.0 but 'count' from 2.0 on, so indexing the
# frame with ['City (Billing)'] raised a KeyError on newer pandas.
top_10_cities_cancelled = (
    df2["City (Billing)"]
    .value_counts()
    .nlargest(10)  # was 15, contradicting the "top 10" variable name and axis title
    .rename_axis('City (Billing)')
    .reset_index(name='Count')
)

fig = px.bar(top_10_cities_cancelled, x='City (Billing)', y='Count',
             title='Cities Rate for Cancelled Bought Books',
             custom_data=['Count', 'City (Billing)'])

fig.update_xaxes(title="Top 10 Cities cancelled Ordered Books Under Guftugu Publications",title_font=dict(size=18, family='Courier', color='crimson'), linecolor='black', mirror=True,gridcolor='red')
fig.update_yaxes(title="Books Count",title_font=dict(size=18, family='Courier', color='crimson'), linecolor='black', mirror=True,gridcolor='red')
# Uniform bar colour and a two-line hover label showing the city and its count.
fig.update_traces(marker_color='purple',
                  hovertemplate="<br>".join([
                      "City: %{x}",
                      "Count: %{y}",
    ]))


fig.update_layout(hovermode="x unified")
fig.show()

In [None]:
# Cities with the most returned orders.

# Use df4 (status code 2, returned). The earlier version counted df3, which
# holds the completed orders, so the chart did not show returns at all.
# The columns are named explicitly because value_counts() labels its counts
# 'City (Billing)' before pandas 2.0 but 'count' from 2.0 on.
top_15_cities_returned = (
    df4["City (Billing)"]
    .value_counts()
    .nlargest(15)
    .rename_axis('City (Billing)')
    .reset_index(name='Count')
)


# The title now says 15 to match the number of cities actually plotted.
fig = px.bar(top_15_cities_returned, x='City (Billing)', y='Count',
             title='Top 15 Cities From Where Books have been returned',
             custom_data=['Count', 'City (Billing)'])

fig.update_xaxes(title="Cities from Books Returned Under Guftugu Publications",title_font=dict(size=18, family='Courier', color='crimson'), linecolor='black', mirror=True,gridcolor='red')
fig.update_yaxes(title="Books Count",title_font=dict(size=18, family='Courier', color='crimson'), linecolor='black', mirror=True,gridcolor='red')
# Uniform bar colour and a two-line hover label showing the city and its count.
fig.update_traces(marker_color='purple',
                  hovertemplate="<br>".join([
                      "City: %{x}",
                      "Count: %{y}",
    ]))

fig.update_layout(hovermode="x unified")
fig.show()

In [None]:
# This notebook is for learning purposes; it also includes code adapted from other people's work.

# To be continued: the next questions will be solved soon.