In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px
import plotly.graph_objs as go

import plotly as py
from plotly import tools
from plotly.offline import iplot
from plotly.subplots import make_subplots
import datetime as dtt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, leaf)
    for folder, _, leaves in os.walk('/kaggle/input')
    for leaf in leaves
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import datetime as d
import seaborn as sns

In [None]:
# Load the orders export; the order timestamp column is parsed as datetime on read.
orders_csv = r'../input/gufhtugu-publications-dataset-challenge/GP Orders - 5.csv'
dt = pd.read_csv(orders_csv, parse_dates=['Order Date & Time'])

# Quick sanity check of what was loaded: (rows, columns) and the raw column labels.
print("Data Dimensions are: ", dt.shape)
print("Columns: ", dt.columns)

In [None]:
# DataFrame.info() writes its report (dtypes, non-null counts, memory usage) straight
# to stdout and returns None, so wrapping it in print() only appended a stray "None".
dt.info()

The dataset contains a total of 19239 records, with some null values in the Book Name, City and Payment Method columns. We now rename the columns to lowercase names without spaces to simplify code manipulation.

In [None]:
# Map the raw CSV headers to snake_case names so columns can be used without
# quoting spaces or punctuation.
snake_case_names = {
    'Order Number': 'order_number',
    'Order Status': 'order_status',
    'Book Name': 'book_name',
    'City': 'city',
    'Order Date & Time': 'order_date_time',
    'Payment Method': 'payment_method',
    'Total items': 'total_items',
    'Total weight (grams)': 'total_weight_grams',
}
dt = dt.rename(columns=snake_case_names)
dt.columns

Now we perform data type casting.

In [None]:
# Cast the free-text columns to Python strings.
# NOTE(review): on object-dtype columns astype(str) also stringifies missing values
# (they become the text 'nan'), so later null checks on these columns will not see
# them as missing - confirm that is intended.
for text_column in ('order_status', 'book_name', 'city', 'payment_method'):
    dt[text_column] = dt[text_column].astype(str)

In [None]:
# Normalise the payment-method labels and default missing ones to cash on delivery.
#
# Two fixes over the previous version:
#  * `dt['payment_method'].replace(..., inplace=True)` mutated a column selection rather
#    than the frame; under pandas Copy-on-Write that never reaches `dt`. The result is
#    now assigned back explicitly.
#  * The column was cast with astype(str) beforehand, which renders missing values as
#    the literal text 'nan', so fillna() alone had nothing to fill. 'nan' is mapped
#    explicitly; fillna() is kept for the case where real NaN values are still present.
payment_labels = {
    'Cash on Delivery (COD)': 'Cash on delivery',
    'nan': 'Cash on delivery',
}
dt['payment_method'] = (
    dt['payment_method']
    .replace(payment_labels)
    .fillna('Cash on delivery')
)

In [None]:
# Break the order timestamp into the pieces used for grouping later on.
order_stamp = dt['order_date_time']

dt['date'] = order_stamp.dt.date                                   # calendar date (Python date objects)
dt['month_Year'] = pd.to_datetime(order_stamp).dt.to_period('M')   # monthly period
dt['time'] = order_stamp.dt.strftime('%H:%M')                      # "HH:MM" text

Splitting orders that contain multiple books.

In [None]:
# Turn orders that list several books ('/'-separated in book_name) into one row per book.
# Every column except book_name is moved into the index so it is repeated for each
# exploded row, then restored as ordinary columns.
order_keys = ['order_number', 'order_status', 'order_date_time', 'city',
              'payment_method', 'total_items', 'total_weight_grams',
              'date', 'month_Year', 'time']


def _split_titles(column):
    """Split each cell on '/' and emit one row per resulting piece."""
    return column.str.split('/').explode()


keyed_orders = dt.set_index(order_keys)
db = keyed_orders.apply(_split_titles).reset_index()

In [None]:
# Put the columns in reading order: identifiers first, then time parts, then order details.
column_order = [
    'order_number', 'book_name', 'order_status',
    'date', 'month_Year', 'time',
    'city', 'payment_method', 'total_items', 'total_weight_grams',
    'order_date_time',
]
db = db.loc[:, column_order]

In [None]:
# Remove the raw timestamp now that date / month_Year / time have been derived from it.
# drop() returns a new frame; the previous call discarded that result, so the column
# was never actually removed from `db`. errors='ignore' keeps the cell safe to re-run.
db = db.drop(columns=['order_date_time'], errors='ignore')
db

Now check the summary of the non-numeric data.

In [None]:
# Summary statistics (count / unique / top / freq) restricted to object-dtype columns.
object_summary = db.describe(include=['object'])
object_summary

In [None]:
# Order the rows chronologically by calendar date.
db = db.sort_values(by='date')

Check number of orders by date

In [None]:
# Count rows per calendar date. `db` holds one row per book after the split above,
# so an order with several books contributes several rows to its date.
rows_per_day = db.groupby('date')['order_number'].count()
dts = rows_per_day.reset_index()

In [None]:
# Line chart of the daily counts held in `dts`.
sales_trace = go.Scatter(
    x=dts['date'],
    y=dts['order_number'],
    mode='lines+markers',
    name='sales trend',
)

fig1 = go.Figure()
fig1.add_trace(sales_trace)
fig1.show()

Quantity of free books included in the sales data

In [None]:
# Select the rows whose book name matches the "Free" pattern, then tally them per title.
free_title_mask = db['book_name'].str.contains(r'(?!$)Free(?!$)')
df = db[free_title_mask]
df['book_name'].value_counts()

Sales comparison with and without free books.

In [None]:
# Daily counts again, this time leaving out the 'Lucky Draw - Free Book' rows.
paid_rows = db[db['book_name'].ne('Lucky Draw - Free Book')]
dnfree = paid_rows.groupby('date')['order_number'].count().reset_index()

In [None]:
# Overlay the two daily series (all rows vs. rows without the free book) on one chart.
fig = go.Figure()

series_to_plot = [
    (dts, 'Sales with Free books'),
    (dnfree, 'Sales without Free books'),
]
for daily_counts, label in series_to_plot:
    fig.add_trace(
        go.Scatter(
            x=daily_counts['date'],
            y=daily_counts['order_number'],
            mode='lines+markers',
            name=label,
        )
    )

fig.show()

Simple linear regression on sales data.

In [None]:
# Scatter of the daily counts with an ordinary-least-squares trendline.
#
# `dts['date']` originates from `.dt.date`, i.e. an object-dtype column of Python
# `date` objects. NOTE(review): the trendline fit needs x-values it can turn into
# numbers, which Plotly Express does for datetime64 columns; converting here avoids
# depending on how object-dtype dates are handled. `dts` itself is left untouched.
trend_input = dts.assign(date=pd.to_datetime(dts['date']))

fig = px.scatter(trend_input, x='date', y='order_number', trendline="ols")
fig.show()

# Fitted model results for the trendline, shown as the cell output.
results = px.get_trendline_results(fig)
results

In [None]:
# Tally the non-zero order_number entries for each order status and chart them as bars.
table = db.pivot_table(
    index=['order_status'],
    aggfunc={'order_number': np.count_nonzero},
)
table.plot.bar()

Query the top 35 books to check for duplicates/typos

In [None]:
# The 35 most frequent book names, most common first.
db['book_name'].value_counts().head(35)

In [None]:
# Removing duplications: fold alternate spellings of the same title into one name.
#
# The previous version called `db['book_name'].replace(..., inplace=True)` three times.
# That mutates a column selection rather than the frame, which does not reach `db`
# under pandas Copy-on-Write. A single mapping is applied and assigned back instead;
# the entries do not feed into one another, so one pass gives the same result.
book_name_fixes = {
    'Python Programming- Release Date: August 14, 2020': 'Python Programming',
    'ڈیٹا سائنس': 'Data Science',
    'ڈیٹا سائنس ۔ ایک تعارف': 'Data Science',
}
db['book_name'] = db['book_name'].replace(book_name_fixes)

Top ten books

In [None]:
# Names of the ten most frequently sold books, most common first.
book_counts = db['book_name'].value_counts()
topbooks = book_counts.head(10).index.tolist()
topbooks

Sales trend of top ten books.

In [None]:
# For every top book, build a small frame of (date, number of rows for that book).
# The count lands in a column that is still called 'book_name'.
df_dict = {
    title: (
        db[db['book_name'].isin([title])]
        .groupby('date')['book_name']
        .count()
        .reset_index()
    )
    for title in topbooks
}

In [None]:
# One subplot row per top book, each showing that book's daily sales count.
#
# Changes from the previous version:
#  * the throw-away `go.Figure()` that was immediately overwritten is gone;
#  * the grid is sized from `topbooks` instead of a hard-coded 10 rows
#    (at least one row, since make_subplots needs a positive row count);
#  * manual i/j counters are replaced by enumerate();
#  * `append_trace` (deprecated in plotly) is replaced by `add_trace(..., row=, col=)`.
fig = make_subplots(rows=max(len(topbooks), 1), cols=1)

# Keep the per-book traces addressable by title.
# In df_dict the 'book_name' column holds the per-day count, not a name.
tr_dict = {
    title: go.Scatter(
        x=df_dict[title]['date'],
        y=df_dict[title]['book_name'],
        mode='lines+markers',
        name=title,
    )
    for title in topbooks
}

for row_number, title in enumerate(topbooks, start=1):
    fig.add_trace(tr_dict[title], row=row_number, col=1)

fig.update_layout(height=1000, width=800, title_text="Sales Trend-Top Ten Books")
fig.show()