**Topics to be covered in this notebook**

• Which one is the best-selling book?
• Visualize order status frequency
• Find a correlation between date and time with order status
• Find a correlation between city and order status
• Find any hidden patterns that are counter-intuitive for a layman
• Can we predict number of orders, or book names in advance?

Importing the necessary libraries.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for dirname, _, filenames in os.walk('/kaggle/input'):
    input_paths = (os.path.join(dirname, filename) for filename in filenames)
    for input_path in input_paths:
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()  # apply seaborn's default theme to the matplotlib figures drawn below
import warnings
# NOTE(review): this hides every warning for the rest of the session, including
# deprecation notices — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

#Loading the Dataset first

In [None]:
# Location of the orders CSV, relative to the notebook's working directory
orders_csv_path = "../input/gufhtugu-publications-dataset-challenge/GP Orders - 5.csv"
data_set = pd.read_csv(orders_csv_path)

#Understanding the Dataset

In [None]:
# Preview the first 10 rows of the freshly loaded frame
data_set.head(10)

#Understanding the data in another way

In [None]:
# (number of rows, number of columns)
data_set.shape

#Seeing the null values in the dataset for better understanding

In [None]:
# Per-column flag: does the column contain at least one missing value?
# (The raw element-wise isnull() mask is a full-size boolean frame, which is
# hard to read; exact counts per column are computed further down.)
data_set.isnull().any()

#also see the last 5 values of the dataset

In [None]:
# Last 5 rows (tail() defaults to n=5)
data_set.tail()

#Summary of the whole dataset

In [None]:
# Summary statistics. By default describe() covers only the numeric columns of a
# mixed-dtype frame; pass include='all' to summarise the text columns as well.
data_set.describe()

#Getting all the columns name in the dataset

In [None]:
# Column labels exactly as they appear in the CSV header
data_set.columns

#Rename the columns for our ease of understanding

In [None]:
# Short snake_case labels. They are assigned positionally, so the list must follow
# the left-to-right order of the existing columns.
renamed_columns = [
    'order_number',
    'order_status',
    'book_name',
    'order_date',
    'city',
    'payment_method',
    'items',
    'weight',
]
data_set.columns = renamed_columns

#Now checking if the change has actually applied or not

In [None]:
# First 5 rows, to confirm the new column labels are in place
data_set.head()

#Lets check the unique values in our dataset

In [None]:
# Number of distinct values in each column (missing values are not counted)
data_set.nunique()

#check unique values for each column separately.
#You can also use nunique() for the count.

In [None]:
# Distinct order_status values, in order of first appearance
data_set['order_status'].unique()

#In the two results above, the dtype "int64" represents integer values and "object" represents string values.

In [None]:
# Column names, non-null counts and dtypes in one overview
data_set.info()

#After understanding the data, our next step is cleaning the data.

#First of all, checking the missing values in the dataset

In [None]:
# Count the missing values per column, then list the columns with the most gaps first
null_counts = data_set.isnull().sum()
null_counts.sort_values(ascending=False)

#From the above result we have 10 null values in payment_method, 2 in book_name and 1 in city. Let's find out where these values are.

#Missing value in payment_method column

In [None]:
# Rows whose payment_method is missing
payment_method_missing = data_set['payment_method'].isna()
data_set.loc[payment_method_missing]

#Missing value in book_name column

In [None]:
# Rows whose book_name is missing
book_name_missing = data_set['book_name'].isna()
data_set.loc[book_name_missing]

#Missing value in city column

In [None]:
# Rows whose city is missing
city_missing = data_set['city'].isna()
data_set.loc[city_missing]

#These missing values are disturbing our dataset so we have to first get rid of these values.

In [None]:
# Remove, in place, every row that has a missing value in any column
data_set.dropna(axis=0, how='any', inplace=True)

#Let's see if the above statement executed successfully or not.

In [None]:
# Per-column count of missing values; every entry should be 0 after the dropna above
data_set.isnull().sum()

#You see, now we have no null values in our dataset. We have cleaned the dataset.

#Now check the total orders.

In [None]:
# Number of rows per order status, most frequent status first
data_set['order_status'].value_counts()

# Task 1: Which one is the best-selling book?

In [None]:
#Split the book names of each order on "/"

from itertools import chain

# flatten a Series of '/'-separated strings into a single flat list of the parts
def chainer(s):
    parts_per_row = s.str.split('/')
    return [part for parts in parts_per_row for part in parts]

# Reshape to one row per (order, book): split every '/'-separated book_name into a
# list and let explode() emit one row per list element, repeating the other columns.
# Row labels are repeated too, so all rows of one order keep that order's index label.
# Only these five columns are carried forward; payment_method, items and weight are
# not needed by the analysis below.
kept_columns = ['order_number', 'order_status', 'book_name', 'order_date', 'city']
data_set = (
    data_set[kept_columns]
    .assign(book_name=lambda frame: frame['book_name'].str.split('/'))
    .explode('book_name')
)

#Notice that the total number of rows has increased from 19187 to 33229, because every book in an order now has its own row.

In [None]:
# (rows, columns) after the one-row-per-book reshaping
data_set.shape

In [None]:
# Ten most frequently ordered books among completed orders, as a horizontal bar chart.
# Uses the notebook-wide `plt` import and an explicit figure/axes pair instead of a
# second, mid-notebook import of `figure`.
completed_books = data_set.loc[data_set["order_status"] == "Completed", "book_name"]
# Sort ascending so that barh(), which draws the first row at the bottom, puts the
# best seller at the top of the chart.
top_books = completed_books.value_counts().head(10).sort_values()

fig, ax = plt.subplots(figsize=(10, 10))
top_books.plot.barh(ax=ax)
ax.set_title("Top 10 purchased books")
ax.set_xlabel("Number of orders")
ax.set_ylabel("Name of books ")
plt.show()

# Task 2: Visualize order status frequency


In [None]:
#Using bar plot

# data_set holds one row per book after the split in Task 1, so counting its rows
# would count an order once for every book it contains. Keep a single row per order
# so the bars show order-level status frequency.
# NOTE(review): assumes order_number uniquely identifies an order — confirm.
orders = data_set.drop_duplicates(subset='order_number')
ax = sns.countplot(data=orders, x='order_status')
ax.set(title='Order status frequency', xlabel='Order status', ylabel='Number of orders');

#Let's use a pie chart to properly view the data.

In [None]:
pal = ['#349d6e', '#faff00', "#ff0000"]
sns.set_palette(pal)  # also becomes the default colour cycle for any later plot

# data_set holds one row per book after the split in Task 1; keep one row per order
# so that multi-book orders are not counted several times.
# NOTE(review): assumes order_number uniquely identifies an order — confirm.
status_counts = (
    data_set.drop_duplicates(subset='order_number')['order_status'].value_counts()
)

plt.figure(figsize=(10, 10))
plt.pie(status_counts)
# Take the legend labels from the same Series that sized the wedges. value_counts()
# orders statuses by frequency whereas unique() uses order of first appearance, so
# mixing the two can attach the wrong label to a wedge.
plt.legend(status_counts.index, bbox_to_anchor=(0.00, 1))
plt.title("Order status share")
plt.show()