In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Challenges**

1) Best selling books <br>
2) Visualize order status frequency <br>
3) Find a correlation between date and time with order status <br>
4) Find the correlation between city and order status

In [None]:
# Import necessary libraries
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Read the orders CSV into a DataFrame and preview its first five rows
orders_csv = '/kaggle/input/gufhtugu-publications-dataset-challenge/GP Orders - 5.csv'
dataset = pd.read_csv(orders_csv)
dataset.head()

In [None]:
# Report the dataset's dimensions as a (rows, columns) tuple
dataset.shape

As shown above, the dataset contains 19239 rows and 8 columns in total

In [None]:
# Print a per-column overview of the dataset (column names, non-null counts, dtypes)
dataset.info()

In [None]:
# Show the summary statistics that pandas' describe() computes for the dataset
dataset.describe()

In [None]:
# Number of distinct values in each column
dataset.nunique()

In [None]:
# Count the missing (null) values in each column
dataset.isnull().sum()

The 'Book Name' column contains 2 nulls, so they can be replaced with the best-selling book in the list. The 'City' column contains 1 null, which will likewise be replaced with the most frequent city. The 'Payment Method' column contains 10 nulls, which will also be replaced with the most commonly used payment method.

In [None]:
# Number of rows per book name, most frequent first
dataset['Book Name'].value_counts()

In [None]:
# Impute missing book names with the most frequently ordered title
book_counts = dataset['Book Name'].value_counts()
best_book = book_counts.index[0]
dataset['Book Name'] = dataset['Book Name'].fillna(value=best_book)

In [None]:
# Number of rows per city, most frequent first

dataset['City'].value_counts()

In [None]:
# Impute missing cities with the city that appears most often
city_counts = dataset['City'].value_counts()
best_city = city_counts.index[0]
dataset['City'] = dataset['City'].fillna(value=best_city)

In [None]:
# Number of rows per payment method, most frequent first

dataset['Payment Method'].value_counts()

In 'Payment Method' there are two values, 'Cash on delivery' and 'Cash on Delivery (COD)', which mean the same thing. So, we will combine these two values into a single value, 'Cash on Delivery'.

In [None]:
# Combine 'Cash on delivery' and 'Cash on Delivery (COD)' into one 'Cash on Delivery' label.
# The result is assigned back to the column instead of calling replace(..., inplace=True)
# on the selected column: that form is a chained assignment, which pandas warns about
# and which leaves `dataset` unchanged once Copy-on-Write is active.
cod_aliases = {
    'Cash on delivery': 'Cash on Delivery',
    'Cash on Delivery (COD)': 'Cash on Delivery',
}
dataset['Payment Method'] = dataset['Payment Method'].replace(cod_aliases)

In [None]:
# Impute missing payment methods with the most frequently used one
method_counts = dataset['Payment Method'].value_counts()
best_method = method_counts.index[0]
dataset['Payment Method'] = dataset['Payment Method'].fillna(value=best_method)

In [None]:
# Re-count the missing values per column to check whether any nulls remain
dataset.isnull().sum()

# 1) Top 10 best selling books

In [None]:
# Ten most frequent book names.
# The former .explode() call was dropped: 'Book Name' holds plain strings (read from
# the CSV, nulls filled with a string), and explode() only expands list-like cells,
# so it returned the column unchanged.
# NOTE(review): if one cell can list several titles, split them before counting
# (e.g. .str.split(<delimiter>).explode()) - delimiter to be confirmed against the data.
# `best_book` is rebound here from the single most frequent title to this Series.
best_book = dataset['Book Name'].value_counts().head(10)
best_book

In [None]:
# Bar chart of the ten most-ordered books, drawn through pandas/Matplotlib
fig, ax = plt.subplots(figsize=(12, 6))
best_book.plot(kind='bar', title='Top 10 Best Selling Books', ax=ax)
ax.set_xlabel('Books Name')
ax.set_ylabel('Each book frequency')

In [None]:
# Draw the same top-10 ranking with Seaborn
fig, ax = plt.subplots(figsize=(15, 6))
sns.barplot(x=best_book.index, y=best_book.values, ax=ax)
ax.set_xlabel('Books Name', fontsize=20)
ax.set_ylabel('Selling Frequency', fontsize=20)
ax.set_xticklabels(best_book.index, rotation='vertical', fontsize=10)
plt.show()

# 2) Visualize order status frequency

In [None]:
# Preview the first five rows of the cleaned dataset
dataset.head()

In [None]:
# Count how many orders fall under each order status, most frequent first
orders = dataset['Order Status'].value_counts()
orders

In [None]:
# Bar chart of how often each order status occurs
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=orders.index, y=orders.values, ax=ax)
ax.set_xlabel('Order Status', fontsize=20)
ax.set_ylabel('Orders Frequency', fontsize=20)
ax.set_xticklabels(orders.index, rotation='vertical', fontsize=10)
plt.show()

# 3) Find a correlation between date and time with order status

In [None]:
# Extract the year from each 'Order Date & Time' value and count the orders per year
order_years = pd.DatetimeIndex(dataset['Order Date & Time']).year
year = order_years.value_counts()
year


In [None]:
# Bar chart of the number of orders per year
fig, ax = plt.subplots(figsize=(12, 6))
year.plot(kind="bar", title="Orders yearwise", ax=ax)

In [None]:
# Count the orders placed on each day of the week
order_weekdays = pd.DatetimeIndex(dataset['Order Date & Time']).day_name()
days = order_weekdays.value_counts()
days

In [None]:
# Visualize the number of orders for each day of the week
fig, ax = plt.subplots(figsize=(12, 6))
days.plot(kind="bar", title="Orders daywise", ax=ax)


The maximum number of orders is placed on weekends.

In [None]:
# Yearly order counts, with one bar per order status (Completed, Cancelled, Returned)
order_year = pd.DatetimeIndex(dataset['Order Date & Time']).year
fig, ax = plt.subplots(figsize=(18, 9))
sns.countplot(x=order_year, data=dataset, hue="Order Status", ax=ax)
ax.set_title('Yearly Orders Status (Frequency)', fontsize=20)
ax.set_xlabel("Years", fontsize=18)
ax.set_ylabel("Number of Order(s)", fontsize=18)
plt.show()

# 4) Find the correlation between city and order status

In [None]:
# Keep only the completed orders and count them per city
is_completed = dataset['Order Status'] == 'Completed'
com = dataset[is_completed]
com['City'].value_counts()

In [None]:
# Bar chart of the 20 cities with the most completed orders
comp = com['City'].value_counts().head(20)
fig, ax = plt.subplots(figsize=(12, 6))
comp.plot(kind='bar', title='Completed orders citywise', ax=ax)

In [None]:
# Keep only the cancelled orders and count them per city
is_cancelled = dataset['Order Status'] == 'Cancelled'
canc = dataset[is_cancelled]
canc['City'].value_counts()

In [None]:
# Bar chart of the 20 cities with the most cancelled orders.
# The top-20 counts get their own name: rebinding `canc` (the DataFrame of cancelled
# orders) to this Series made the cell raise a KeyError on 'City' when run a second
# time, because the Series no longer has a 'City' column to select.
canc_top = canc['City'].value_counts().head(20)
fig, ax = plt.subplots(figsize=(12, 6))
canc_top.plot(kind='bar', title='Cancelled orders citywise', ax=ax)

In [None]:
# Keep only the returned orders and count them per city
is_returned = dataset['Order Status'] == 'Returned'
retr = dataset[is_returned]
retr['City'].value_counts()

In [None]:
# Bar chart of the 20 cities with the most returned orders.
# The top-20 counts get their own name: rebinding `retr` (the DataFrame of returned
# orders) to this Series made the cell raise a KeyError on 'City' when run a second
# time, because the Series no longer has a 'City' column to select.
retr_top = retr['City'].value_counts().head(20)
fig, ax = plt.subplots(figsize=(12, 6))
retr_top.plot(kind='bar', title='Returned orders citywise', ax=ax)

# Citywise orders completion percentage (%)

In [None]:
# Order completion percentage for the 20 cities with the most orders.
# `completed` is re-indexed to the cities in `orders`, so each percentage divides a
# city's completed orders by that same city's total. The top-20 ranking by total
# orders and the top-20 ranking by completed orders need not list the same cities in
# the same order, so pairing them by position could mix counts from different cities.
# Label-aligned arithmetic also avoids integer-position lookups on a label-indexed
# Series (deprecated in pandas) and works when there are fewer than 20 cities.
# NOTE: `orders` is rebound here from the order-status counts to per-city totals.
orders = dataset['City'].value_counts().head(20)
completed = (
    dataset.loc[dataset['Order Status'] == 'Completed', 'City']
    .value_counts()
    .reindex(orders.index, fill_value=0)  # cities with no completed order count as 0
)
percent = (completed * 100 / orders).tolist()

In [None]:
# Bar chart of the order completion percentage per city.
# The categories on the x axis are the city names in `orders.index`, in the same order
# as the entries of `percent`. Passing the completed-order counts as `x` made seaborn
# use those numbers as the categories (sorted by value, equal counts merged into one
# bar), after which the tick labels were overwritten with city names in another order.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=list(orders.index), y=percent, ax=ax)
ax.set_xlabel('City Names', fontsize=20)
ax.set_ylabel('Percentage Frequency', fontsize=20)
# Rotate the existing city labels instead of replacing the tick labels by hand
ax.tick_params(axis='x', labelrotation=90, labelsize=10)
plt.show()

# In Progress