In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Challenge Gufhtugu Dataset**

Q1: What is the best-selling book?

Q2: Visualize order status frequency

Q3: Find a correlation between date and time with order status

Q4: Find a correlation between city and order status

Q5: Find any hidden patterns that are counter-intuitive for a layman

Q6: Can we predict number of orders, or book names in advance?

In [None]:
# Import necessary libraries
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Load the orders CSV into `df` (comma-separated, UTF-8) and preview the first rows.
df = pd.read_csv(
    "../input/gufhtugu-publications-dataset-challenge/GP Orders - 5.csv",
    sep=',',
    encoding="utf-8",
)
df.head()

In [None]:
# Report how many rows and columns the dataset has.
Row, Col = df.shape[0], df.shape[1]
print('There are {} Rows and {} columns'.format(Row, Col))


In [None]:
# Print the column names, non-null counts and dtypes of `df`
# (a quick way to spot columns with missing values before cleaning).
df.info()

In [None]:
# Show the summary statistics produced by describe() with its default arguments.
df.describe()

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Count the missing values in every column.
df.isna().sum()

In [None]:
# Number of order rows per distinct 'Book Name' value, most frequent first.
df['Book Name'].value_counts()

In [None]:
# Impute missing book names with the most frequent title.
# Note: every imputed row is later counted as a sale of that title, so this
# raises the top title's count by the number of missing values.
best_book = df['Book Name'].value_counts().index[0]
df.loc[df['Book Name'].isna(), 'Book Name'] = best_book

In [None]:
# Number of order rows per distinct 'City' value, most frequent first.
df['City'].value_counts()

In [None]:
# Impute missing cities with the most frequently occurring city.
best_city = df['City'].value_counts().index[0]
df.loc[df['City'].isna(), 'City'] = best_city

In [None]:
# Number of order rows per distinct 'Payment Method' value, most frequent first.
# Useful for spotting differently spelled labels that mean the same method.

df['Payment Method'].value_counts()

In the 'Payment Method' column, the values 'Cash on delivery' and 'Cash on Delivery (COD)' refer to the same payment method, so we merge them into the single value 'Cash on Delivery'.

In [None]:
# Merge 'Cash on delivery' and 'Cash on Delivery (COD)' into the single label
# 'Cash on Delivery'.
# The result is assigned back to the column rather than using
# `df[col].replace(..., inplace=True)`: that form mutates an intermediate
# object selected from `df`, which emits a warning in pandas 2.x and does not
# update `df` at all when copy-on-write is enabled.
df['Payment Method'] = df['Payment Method'].replace({
    'Cash on delivery': 'Cash on Delivery',
    'Cash on Delivery (COD)': 'Cash on Delivery',
})

In [None]:
# Impute missing payment methods with the most frequently used method.
best_method = df['Payment Method'].value_counts().index[0]
df['Payment Method'] = df['Payment Method'].fillna(best_method)

# Show which method was used for the imputation. A bare expression is only
# rendered by the notebook when it is the last statement of the cell.
best_method

In [None]:
# Re-count missing values per column to confirm the imputation above worked.
df.isna().sum()

# 1 - Top 10 Best Selling Books

In [None]:
# Ten most frequent 'Book Name' values across all orders (any order status).
# value_counts() is applied to the column directly: explode() only expands
# list-like cells, and the values loaded from the CSV are plain strings, so
# calling it first returned the column unchanged.
# NOTE(review): if one cell can hold several titles joined by a separator,
# split it into a list first (Series.str.split) and then explode so each title
# is counted on its own — TODO confirm against the data.
# This rebinds `best_book` (previously the single most frequent title) to a
# Series of counts; the seaborn chart cell reads it in this form.
best_book = df['Book Name'].value_counts().head(10)
best_book

In [None]:
# Horizontal bar chart (pandas/matplotlib) of the ten titles with the most
# completed orders, smallest bar at the bottom.
(
    df.loc[df['Order Status'] == 'Completed', 'Book Name']
    .value_counts()
    .iloc[:10]
    .sort_values()
    .plot.barh(title='Top 10 Best Selling Books')
)


In [None]:
# Seaborn bar chart of the top-10 book counts held in `best_book`.
fig, ax = plt.subplots(figsize=(15, 6))
sns.barplot(x=best_book.index, y=best_book.values, ax=ax)
ax.set_xlabel('Books Name', fontsize=20)
ax.set_ylabel('Selling Frequency', fontsize=20)
# Long titles overlap when horizontal, so the tick labels are drawn vertically.
ax.set_xticklabels(best_book.index, rotation='vertical', fontsize=10)
plt.show()

# 2) Visualize order status frequency

In [None]:
# Preview the first five rows of the cleaned DataFrame.
df.head()

In [None]:
# Count how many orders fall under each 'Order Status' value; the result is
# kept in `orders` for the bar chart in the next cell.
orders = df['Order Status'].value_counts()
orders

In [None]:
# Visualize the order status frequencies held in `orders` as a bar chart.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=orders.index, y=orders.values, ax=ax)
ax.set_xlabel('Order Status', fontsize= 20)
ax.set_ylabel('Orders Frequency', fontsize=20)
ax.set_xticklabels(orders.index, rotation='vertical', fontsize=10)
plt.show()

# 3) Find a correlation between date and time with order status

In [None]:
# Extract the calendar year from each order timestamp and count orders per year.
year = (
    pd.DatetimeIndex(df['Order Date & Time'])
    .year
    .value_counts()
)
year

In [None]:
# Visualize the number of orders per year (bars follow the order of `year`,
# i.e. most frequent year first).
fig, ax = plt.subplots(figsize=(15, 6))
year.plot.bar(ax=ax, title='Orders Year Wise')

In [None]:
# Map each order timestamp to its weekday name and count orders per weekday.
days = (
    pd.DatetimeIndex(df['Order Date & Time'])
    .day_name()
    .value_counts()
)
days

In [None]:
# Visualize the number of orders per weekday (most frequent weekday first).
fig, ax = plt.subplots(figsize=(12, 6))
days.plot.bar(ax=ax, title='Order Day Wise')

# 4) Find the correlation between city and order status

In [None]:
# Keep only the completed orders in `city`, then count them per city.
city = df.loc[df['Order Status'].eq('Completed')]
city['City'].value_counts()

In [None]:
# Bar chart of the 20 cities with the most completed orders.
complete = city['City'].value_counts().head(20)
fig, ax = plt.subplots(figsize=(12, 6))
complete.plot.bar(ax=ax, title='Completed orders citywise')


In [None]:
# Keep only the cancelled orders in `cancel`, then count them per city.
cancel = df.loc[df['Order Status'].eq('Cancelled')]
cancel['City'].value_counts()

In [None]:
# Bar chart of the 20 cities with the most cancelled orders.
# The counts are derived from `df` rather than from `cancel` itself: rebinding
# `cancel` to its own value_counts() made this cell fail with a KeyError on a
# second run, because `cancel` was then a Series with no 'City' entry.
# `cancel` still ends up holding the top-20 counts, as before.
cancel = df.loc[df['Order Status'] == 'Cancelled', 'City'].value_counts().head(20)
plt.figure(figsize=(12,6))
cancel.plot(kind='bar', title='Cancelled orders citywise')

In [None]:
# Keep only the returned orders in `ret`, then count them per city.
ret = df.loc[df['Order Status'].eq('Returned')]
ret['City'].value_counts()

In [None]:
# Bar chart of the 20 cities with the most returned orders.
# The counts are derived from `df` rather than from `ret` itself: rebinding
# `ret` to its own value_counts() made this cell fail with a KeyError on a
# second run, because `ret` was then a Series with no 'City' entry.
# `ret` still ends up holding the top-20 counts, as before.
ret = df.loc[df['Order Status'] == 'Returned', 'City'].value_counts().head(20)
plt.figure(figsize=(12,6))
ret.plot(kind='bar', title='Returned orders citywise')

In [None]:
# Completion rate (%) for the (up to) 20 cities with the most orders.
#
# `orders`    : total orders per city, top 20 by order volume.
#               (This rebinds the name used earlier for the order-status counts.)
# `completed` : completed orders for those same cities, in the same order;
#               a city with no completed order gets 0.
# `percent`   : list of completed / total * 100, one entry per city in `orders`.
#
# The two counts are matched by city name. Pairing them by rank would compare
# the i-th most-completed city with the i-th most-ordered city, which need not
# be the same city, and indexing a city-labelled Series with integers relies on
# a deprecated positional fallback.
orders = df['City'].value_counts().head(20)
completed = (
    df.loc[df['Order Status'] == 'Completed', 'City']
    .value_counts()
    .reindex(orders.index, fill_value=0)
)
percent = (completed * 100 / orders).tolist()

In [None]:
# Bar chart of the completion percentage for each city in `orders`.
# The x axis is the city names. Passing the completed-order counts as x makes
# seaborn treat those numbers as categories: cities with equal counts collapse
# into one bar and the bars no longer match the city tick labels.
fig, ax = plt.subplots(figsize=(15, 6))
city_names = list(orders.index)
sns.barplot(x=city_names, y=percent, ax=ax)
ax.set_xlabel('City Name', fontsize=20)
ax.set_ylabel('Percentage Frequency', fontsize=20)
ax.set_xticklabels(city_names, rotation='vertical', fontsize=10)
plt.show()