In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#Walk the Kaggle input directory and print the full path of every file found
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, file_name) for file_name in files_in_folder)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#Load the orders CSV (comma separated, UTF-8 encoded) into a DataFrame
orders_csv_path = '/kaggle/input/gufhtugu-publications-dataset-challenge/GP Orders - 5.csv'
df = pd.read_csv(
    orders_csv_path,
    sep=',',
    encoding="utf-8",
)


In [None]:
#Print a concise summary of the DataFrame using pandas' DataFrame.info()
#(column names, non-null counts and dtypes), useful for spotting missing values early
df.info()

In [None]:
#Report how many rows and columns the freshly loaded dataset has
dataset_shape = df.shape
rows = dataset_shape[0]
columns = dataset_shape[1]
print(f'(There are {rows} rows and {columns} columns in gufhtugu publication dataset)')

In [None]:
#Display the first 50 rows (positional slice, same result as head(50))
df.iloc[:50]

In [None]:
#Display the last 50 rows (positional slice, same result as tail(50))
df.iloc[-50:]

In [None]:
#Show the current column labels of the DataFrame (the raw CSV headers at this point)
df.columns

In [None]:
#Replace the raw CSV headers with underscore-separated names that are easier to reference
column_renames = {
    'Order Number': 'Order_Number',
    'Order Status': 'Order_Status',
    'Book Name': 'Book_Name',
    'Order Date & Time': 'Order_Date_Time',
    'City': 'City_Order',
    'Payment Method': 'Payment_Method',
    'Total items': 'Items_Total',
    'Total weight (grams)': 'Total_weight(gm)',
}
df = df.rename(columns=column_renames)

#Show the column labels after renaming
df.columns

In [None]:
#Display the first 5 rows to confirm the new column names
df.iloc[:5]

In [None]:
#Count missing values per column, listing the columns with the most nulls first
null_counts = df.isna().sum()
null_counts.sort_values(ascending=False)

There are ten null values in `Payment_Method`, two null values in `Book_Name` and one null value in `City_Order`. Let's check which rows contain those missing values.

In [None]:
#Show the rows where Payment_Method is missing
payment_missing = df['Payment_Method'].isna()
df.loc[payment_missing]

In [None]:
#Show the rows where Book_Name is missing
book_missing = df['Book_Name'].isna()
df.loc[book_missing]

In [None]:
#Show the rows where City_Order is missing
city_missing = df['City_Order'].isna()
df.loc[city_missing]

In [None]:
#Remove every row that has a missing value in any column
df = df.dropna()

After dropping the missing values, recheck the dataset.

In [None]:
#Confirm that no nulls remain and report the new dimensions of the dataset
remaining_nulls = df.isna().sum().sort_values(ascending=False)
print(remaining_nulls)
rows, columns = df.shape[0], df.shape[1]
print(f'There are {rows} rows and {columns} Columns after drop null values')

After dropping the null values, we see that no null values remain and the number of rows is reduced from 19239 to 19226.

# Separate book names for further processing and find the best selling book

In [None]:
#Add a new column Order_Books_Name that holds, for every order, the list of parts obtained
#by splitting Book_Name on "/" (presumably one part per ordered title - verify against the data).
#
#The frame itself is deliberately NOT exploded here:
# - the former .explode("Book_Name") targeted the plain-string column, and pandas returns
#   scalars unchanged when exploding, so the call had no effect;
# - exploding the list column instead would repeat each order once per title and inflate the
#   per-order counts taken from df in the order-status cells.
#Code that needs one row per title should call .explode() on the Order_Books_Name Series.
df = df.assign(Order_Books_Name=df['Book_Name'].str.split("/"))
df.head(10)

In [None]:
#Count how often each individual title occurs across all orders
#(Order_Books_Name holds a list per order, so it is exploded to one title per row first)
unique_count_books = df['Order_Books_Name'].explode().value_counts()

#value_counts sorts in descending order, so the leading rows are the best sellers
top_ten_books = dict(unique_count_books.head(10))
best_selling_book = dict(unique_count_books.head(1))

print(f'\n \n Top ten selling books are \n \n {top_ten_books}')
print(f'\n \n Best selling book is \n \n{best_selling_book}')

#Bar chart of the ten most ordered titles
names = top_ten_books.keys()
quantity = top_ten_books.values()
bar_colors = ['red', 'blue', 'purple', 'green', 'orange']
plt.figure()
plt.bar(names, quantity, color=bar_colors)
plt.title('Top Selling Books', fontsize=15)
plt.xticks(rotation=90)
plt.show()

# Order Status Insights

In [None]:
#Count the orders in each status (completed, returned and cancelled) and print the totals
order_status = dict(df.value_counts('Order_Status'))
for status_name, order_count in order_status.items():
    print(f'{status_name} orders are {order_count}')

#Bar chart of the number of orders per status
status = order_status.keys()
quantity = order_status.values()
bar_colors = ('Purple', 'Green', 'Red')
plt.figure(figsize=(20, 10))
plt.bar(status, quantity, color=bar_colors)
plt.grid()
plt.xticks(fontsize=25)
plt.yticks(fontsize=25)
plt.title('Order Status Insights', fontsize=40)
plt.xlabel('Status', fontsize=30)
plt.ylabel('Quantity', fontsize=30)
plt.show()

In [None]:
#Select the returned orders and show their book name, city and payment method side by side
is_returned = df['Order_Status'].eq('Returned')
returned_books = df.loc[is_returned]
returned_overview_columns = ['Order_Status', 'Book_Name', 'City_Order', 'Payment_Method']
returned_books[returned_overview_columns]


In [None]:
#Break down the returned orders by payment method
print('The Payment Methods of Returned books are')
returned_books['Payment_Method'].value_counts()


In [None]:
#Break down the returned orders by book name
print("The books which returned are")
returned_books['Book_Name'].value_counts()

In [None]:
#Break down the returned orders by city
print('The cities from which books are returned')
returned_books['City_Order'].value_counts()

In [None]:
#Select the completed orders and show their book name, city and payment method side by side
is_completed = df['Order_Status'].eq('Completed')
completed_orders = df.loc[is_completed]
completed_overview_columns = ['Order_Status', 'Book_Name', 'City_Order', 'Payment_Method']
completed_orders[completed_overview_columns]


In [None]:
#Break down the completed orders by book name
print('The books whose orders are completed are')
completed_orders['Book_Name'].value_counts()


# In Process