# Import required libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datetime
import plotly.express as px
import plotly.graph_objs as go
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly as py
from plotly import tools
from plotly.offline import iplot
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print them one per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Read Data file using pandas

In [None]:
# Location of the orders CSV inside the Kaggle input directory.
orders_csv_path = "../input/gufhtugu-publications-dataset-challenge/GP Orders - 5.csv"
# Load the comma-separated, UTF-8 encoded file into a DataFrame.
data = pd.read_csv(orders_csv_path, sep=",", encoding="utf_8")
# Preview the first rows.
data.head()

# Exploratory Data Analysis

### Understanding Data

In [None]:
# Show the dimensions of the dataset as a (rows, columns) tuple.
data.shape

In [None]:
# List all column names as loaded from the CSV.
data.columns

### Column names with spaces are not appropriate

In [None]:
# Rename the columns so that none of the names contains a space.
column_renames = {
    'Order Number': 'Order_Number',
    'Order Status': 'Order_Status',
    'Book Name': 'Book_Name',
    'Order Date & Time': 'Order_Date_Time',
    'City': 'City_Order',
    'Payment Method': 'Payment_Method',
    'Total items': 'Total_Items',
    'Total weight (grams)': 'Total_weight(gm)',
}
data = data.rename(columns=column_renames)

In [None]:
# Confirm that the column names were changed by the rename.
data.columns

In [None]:
# Count the number of distinct values in each column.
data.nunique()

In [None]:
# Print a summary of the DataFrame, including each column's data type.
data.info()

### Data type of Order Date & time is not correct

In [None]:
# Parse the order timestamp column into pandas datetimes.
parsed_order_times = pd.to_datetime(data["Order_Date_Time"])
data["Order_Date_Time"] = parsed_order_times
# Re-print the summary to verify the new data type of the column.
data.info()

In [None]:
# Show descriptive statistics for the data.
data.describe()

In [None]:
# Count the null values in each column, listing the columns with the most nulls first.
data.isnull().sum().sort_values(ascending = False)

### Dropping null values in data

In [None]:
# Remove, in place, every row that has a null in any column.
data.dropna(how="any", inplace=True)
# Re-count nulls per column to verify that none remain.
data.isna().sum()

In [None]:
# Clean the City_Order column.
# `regex=` is passed explicitly on every str.replace call: pandas 2.0 changed
# the default from regex=True to regex=False, so relying on the default makes
# the result depend on the installed pandas version.
data['City_Order'] = data['City_Order'].str.lower()
data['City_Order'] = data['City_Order'].str.replace(r'\d+', '', regex=True)  # drop digits
data['City_Order'] = data['City_Order'].str.replace('pakistan', '', regex=False)
data['City_Order'] = data['City_Order'].str.replace('city', '', regex=False)
# '?' is a regex metacharacter, so it has to be matched literally.
data['City_Order'] = data['City_Order'].str.replace('?', '', regex=False)
data['City_Order'] = data['City_Order'].str.strip()

# Clean the Book_Name column: map known title variants to a single spelling.
# All patterns are literal text (several contain parentheses, which a regex
# would interpret as groups), hence regex=False throughout.
data['Book_Name'] = data['Book_Name'].str.replace("- مستحقین زکواة", "", regex=False)
data['Book_Name'] = data['Book_Name'].str.lower()
data['Book_Name'] = data['Book_Name'].str.replace("linux - an introduction  (release data - october 3, 2020)", "linux - an introduction", regex=False)
data['Book_Name'] = data['Book_Name'].str.replace("python programming- release date: august 14, 2020", "python programming", regex=False)
data['Book_Name'] = data['Book_Name'].str.replace("ڈیٹا سائنس ۔ ایک تعارف", "ڈیٹا سائنس", regex=False)
# NOTE(review): the names were lower-cased above, so the upper-case "(C++)"
# pattern cannot match any more; the replacement text also contains "/", the
# multi-book separator. Confirm the intent before changing this line.
data['Book_Name'] = data['Book_Name'].str.replace("(C++)", "(C++) ++سی/سی", regex=False)
data['Book_Name'] = data['Book_Name'].str.replace("molo masali - مولو مصلی", "molo masali", regex=False)
# The first replacement also hits names that already contain the full phrase,
# producing a doubled suffix; the second replacement collapses it again.
data['Book_Name'] = data['Book_Name'].str.replace("مشین ل", "مشین لرننگ", regex=False)
data['Book_Name'] = data['Book_Name'].str.replace("مشین لرننگرننگ", "مشین لرننگ", regex=False)
data['Book_Name'] = data['Book_Name'].str.replace("r ka taaruf آر کا تعارف", "r ka taaruf", regex=False)
data['Book_Name'] = data['Book_Name'].str.strip()

# Build a one-row-per-book view without touching `data` (later cells count
# orders from `data`). The original referenced `df` before it was defined and
# exploded the string column "Book_Name", which leaves the rows unchanged;
# the list column created by the split is the one that must be exploded.
df = data.assign(Order_Books_Name=data.Book_Name.str.split("/")).explode("Order_Books_Name")
df['Order_Books_Name'] = df['Order_Books_Name'].str.strip()

# Spot-check a random sample of the cleaned rows (last expression, so the
# notebook displays it).
data.sample(20)

In [None]:
# Derive calendar features from the order timestamp.
order_ts = data['Order_Date_Time'].dt
data['Dateofmonth'] = order_ts.day
data['Month'] = order_ts.month
# Series.dt.week was deprecated in pandas 1.1 and removed in 2.0;
# isocalendar().week is its replacement (ISO week number, nullable UInt32).
data['Week'] = order_ts.isocalendar().week
data['Dayofweek'] = order_ts.dayofweek  # 0 = Monday ... 6 = Sunday
# 1.0 for Monday-Friday, 0.0 for Saturday/Sunday: only day numbers 5 and 6
# floor-divide by 5 to exactly 1.
data['Weekdayflg'] = (data['Dayofweek'] // 5 != 1).astype(float)
data['Quarter'] = order_ts.quarter
data['Dayofyear'] = order_ts.dayofyear
# Preview the rows with the new feature columns.
data.head()

### Visualizing Number of Orders in comparison with Order Status

In [None]:
# Bar chart of the number of orders per order status.
# A histogram is meant for numeric data: on this text column it spreads the
# category positions over 10 numeric bins, so bars need not line up with the
# status labels. Counting each status and drawing one bar per status matches
# the approach used for the payment-method chart.
plt.figure(figsize = (10,4))
visual = data.Order_Status.value_counts().plot(kind = 'bar')
plt.xticks(rotation = 90, horizontalalignment = "center")
plt.title("Number of Orders vs. Order Status")
plt.xlabel("Order Status")
plt.ylabel("Number of Orders")
plt.show()

### Replacing multiple spellings of Cash on Delivery (COD) with a single entry

In [None]:
# Merge the 'Cash on Delivery (COD)' spelling into 'Cash on delivery'.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on a column selection: that is chained assignment,
# which pandas warns about and which does not update `data` under copy-on-write.
data['Payment_Method'] = data['Payment_Method'].replace({'Cash on Delivery (COD)': 'Cash on delivery'})
# Treat a missing payment method as cash on delivery.
data['Payment_Method'] = data['Payment_Method'].fillna('Cash on delivery')

In [None]:
# Count the number of orders for each payment method.
data.Payment_Method.value_counts()

### Visualizing Number of Orders in comparison with Payment Methods

In [None]:
# Count the orders per payment method, then draw one bar per method.
payment_counts = data.Payment_Method.value_counts()
plt.figure(figsize=(10, 4))
payment_counts.plot.bar()
# Rotate the method names so they stay readable.
plt.xticks(rotation=90, horizontalalignment="center")
plt.title("Number of Orders vs. Payment Methods")
plt.xlabel("Payment Method")
plt.ylabel("Number of Deliveries")
plt.show()

### Finding the top 10 best-selling books

In [None]:
# Ten most frequent Book_Name entries. Each entry is counted as a whole
# string, so an entry that lists several books is not split here.
data['Book_Name'].value_counts().head(10)

In [None]:
# A Book_Name entry can list several titles separated by "/", so split every
# entry into its individual titles. Each title is stripped because the
# separator may be surrounded by spaces, which would otherwise make the same
# book count as different names.
book_data = data.Book_Name.apply(lambda x: [title.strip() for title in str(x).split('/')])
# Flatten the per-order lists into one list of titles, skipping empty fragments.
books = [title for titles in book_data for title in titles if title]
# One row per sold title, for ease of counting and plotting.
df = pd.DataFrame(data = books, columns = ['Books_Sold'])
# Keep the 10 most sold titles. The count column is named explicitly and the
# index name cleared so the frame has the same layout (titles as index, counts
# in 'Books_Sold') on every pandas version; pandas 2.0 changed the name that
# value_counts() gives its result.
book_chart = (
    df.Books_Sold.value_counts()
    .nlargest(10)
    .rename('Books_Sold')
    .rename_axis(None)
    .to_frame()
)
# Print each top title with the number of times it was sold.
print(book_chart.head(10))

In [None]:
# Bar chart of the ten best-selling titles: the book names (the index of
# book_chart) on the x-axis and their Books_Sold counts on the y-axis.
px.bar(book_chart, y = book_chart.Books_Sold, x = book_chart.index, title = 'Top 10 Selling Books')

### Top 10 cities by number of orders

In [None]:
# Ten cities with the most orders, with upper-cased names for display.
# The counts are computed once and reused for both the table and the chart.
# The original built the table in a mid-cell expression whose result was
# discarded, and plotted a separately computed lower-case series.
top_cities = data.City_Order.str.upper().value_counts().head(10)
print(top_cities.to_frame())
visual_City_Order = top_cities.plot.bar()

# Next Steps in progress..

## Please Upvote this notebook and comment about my work.