In [None]:
import numpy as np # linear algebra

import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt
import os
import seaborn as sns

from fuzzywuzzy import process, fuzz

import warnings
# Suppress FutureWarning messages so they do not clutter the cell outputs.
warnings.simplefilter(action='ignore', category=FutureWarning)
# Disable pandas' chained-assignment check for the whole notebook.
# NOTE(review): this also hides genuine "writing into a slice" mistakes in later cells.
pd.set_option('mode.chained_assignment',None)

## Load & View Data Properties

In [None]:
# Read the "GP Orders - 4" CSV from the Kaggle input directory, decoded as UTF-8.
dataset = pd.read_csv('../input/gufhtugu-publications-dataset-challenge/GP Orders - 4.csv',encoding='utf-8')

In [None]:
# Display the first rows of the loaded orders as a quick sanity check.
dataset.head()

In [None]:
# (number of rows, number of columns) of the loaded orders.
dataset.shape

## Cleaning up the Values for Further Analysis
#### Not sure, if that '??' was actually in Urdu Script

In [None]:
# Replace spaces in the column names with underscores so later cells can refer
# to columns such as 'Book_Name' and 'Order_Number'.
dataset.columns = dataset.columns.str.replace(' ', '_', regex=False)

# Remove every literal '?' and every space from the book names.
# regex=False is passed explicitly: the default of `regex` differs between
# pandas versions, and a bare '?' is not a valid regular expression, so the
# literal interpretation must not be left to the default.
dataset['Book_Name'] = (
    dataset['Book_Name']
    .str.replace('?', '', regex=False)
    .str.replace(' ', '', regex=False)
)

### Lets see from which Area we've got most orders

In [None]:
# Count distinct order numbers per billing city, then keep the five largest.
orders_per_city = dataset.groupby('City_(Billing)')['Order_Number'].nunique()
city_wise_orders = orders_per_city.sort_values(ascending=False).head(5)
print(city_wise_orders.index)

# Rows that belong to those five cities, reduced to the two columns used for plotting.
in_top_cities = dataset['City_(Billing)'].isin(city_wise_orders.index)
city_wise_orders_top_5 = dataset[in_top_cities]
city_wise_orders_top_5 = city_wise_orders_top_5[['Order_Number','City_(Billing)']]

#### Picking the top 5 Cities

In [None]:
# All columns for the orders billed to one of the top five cities.
top_city_mask = dataset['City_(Billing)'].isin(city_wise_orders.index)
orders_top_5 = dataset[top_city_mask]

# Horizontal bars, one per order status, sorted from most to least frequent.
status_by_frequency = orders_top_5['Order_Status'].value_counts().index
plt.figure(figsize=(10,7))
sns.countplot(data=orders_top_5, y="Order_Status", order=status_by_frequency)

In [None]:
# Horizontal bars with the row count of each top-five city, largest first.
city_by_frequency = city_wise_orders_top_5['City_(Billing)'].value_counts().index
plt.figure(figsize=(10,7))
sns.countplot(data=city_wise_orders_top_5, y="City_(Billing)", order=city_by_frequency)

The `City_(Billing)` column is not clean. The top counts are not accurate because we've got a lot of typos and full addresses instead of city names. Let's have a quick peek to see if that's true.

In [None]:
# Fuzzy-match the string 'Lahore' against every distinct billing-city value,
# scoring candidates with fuzz.token_sort_ratio, to surface spelling variants.
process.extract('Lahore', dataset['City_(Billing)'].unique(), scorer=fuzz.token_sort_ratio)

So there are a lot of LAHOREs here :). I'll not be cleaning them but this can be achieved via `fuzzywuzzy`

### Lets dive into Karachi and slice the data further;

In [None]:
# Orders whose billing city is exactly 'Karachi' (other spellings are not matched).
# .copy() gives an independent frame: later cells add columns to karachi_orders,
# and without the copy those writes would target a slice of `dataset`.
karachi_orders = dataset[dataset['City_(Billing)'] == 'Karachi'].copy()
print(len(karachi_orders))

In [None]:
# Display the first rows of the Karachi subset.
karachi_orders.head()

In [None]:
# Parse the Order_Date column into datetimes and store it back on the frame,
# then list every column's dtype to confirm the conversion.
parsed_order_dates = pd.to_datetime(karachi_orders['Order_Date'])
karachi_orders['Order_Date'] = parsed_order_dates
print(karachi_orders.dtypes)

In [None]:
# Earliest and latest order date in the Karachi subset, shown as a tuple.
karachi_orders['Order_Date'].min(), karachi_orders['Order_Date'].max()

We've data from Oct 2019, till Jan 2021.

Lets create the Day, Month and Year Columns

In [None]:
# Build the DatetimeIndex once and derive the calendar parts from it.
karachi_order_dates = pd.DatetimeIndex(karachi_orders['Order_Date'])
karachi_orders['year'] = karachi_order_dates.year
karachi_orders['month'] = karachi_order_dates.month
# 'day' holds the day of the week (pandas convention: Monday=0 ... Sunday=6).
karachi_orders['day'] = karachi_order_dates.dayofweek

In [None]:
# Horizontal bars with the number of Karachi rows per year, largest first.
year_by_frequency = karachi_orders['year'].value_counts().index
plt.figure(figsize=(5,3))
sns.countplot(data=karachi_orders, y="year", order=year_by_frequency)

It makes sense: we've got only 2 months of data for 2019, and barely a month's data for 2021.

Lets take only **2020's** data

In [None]:
# Keep only the Karachi orders placed in 2020.
# The 'year' column is derived from DatetimeIndex(...).year and therefore holds
# integers; it must be compared with the integer 2020. Matching against the
# string '2020' selects no rows on current pandas versions.
karachi_orders_2020 = karachi_orders[karachi_orders['year'] == 2020]

# Horizontal bars with the number of rows per month, largest first.
month_by_frequency = karachi_orders_2020['month'].value_counts().index
plt.figure(figsize=(15,7))
sns.countplot(y="month", data=karachi_orders_2020, order=month_by_frequency)

Interesting. It seems people are setting their New Year's targets :). Most orders are from **November** and **December**.

### Trend of top Cities - Month Wise 

In [None]:
# Independent copy of the rows billed to a top-five city, so the columns added
# below are not written into a slice of `dataset`.
top_city_mask = dataset['City_(Billing)'].isin(city_wise_orders.index)
city_wise_data_top_5 = dataset[top_city_mask].copy()

# Parse the order date and derive the calendar month and year.
city_wise_data_top_5['Order_Date'] = pd.to_datetime(city_wise_data_top_5['Order_Date'])
city_wise_data_top_5['month'] = city_wise_data_top_5['Order_Date'].dt.month
city_wise_data_top_5['year'] = city_wise_data_top_5['Order_Date'].dt.year

# Restrict to 2020. 'year' is an integer column, so compare with the integer
# 2020; the string '2020' matches nothing on current pandas versions.
city_wise_data_top_5 = city_wise_data_top_5[city_wise_data_top_5['year'] == 2020]

# Number of order rows per (city, month), as a flat frame with an 'order_count' column.
city_wise_data_top_5_agg = (
    city_wise_data_top_5
    .groupby(['City_(Billing)', 'month'])['Order_Number']
    .count()
    .reset_index()
    .rename(columns={'Order_Number': 'order_count'})
)

# One line per city: monthly order count across 2020.
plt.figure(figsize=(15,7))
sns.lineplot(data=city_wise_data_top_5_agg, x="month", y="order_count", hue="City_(Billing)")

Not sure why, but the top cities seem to follow a common pattern (like the spikes in the 5th and 8th months). It is probably because of some campaign or discounts at Gufhtugu.

### Lets see a day wise breakup now

In [None]:
# Horizontal bars with the number of 2020 Karachi rows per weekday code, largest first.
weekday_by_frequency = karachi_orders_2020['day'].value_counts().index
plt.figure(figsize=(15,7))
sns.countplot(data=karachi_orders_2020, y="day", order=weekday_by_frequency)

I should have encoded these to Names but FYI; Monday is denoted by 0 and ends on Sunday which is denoted by 6

So most orders are placed on **Saturdays**.

### Lets see which book was famous in Karachi

In [None]:
# Distinct order numbers per book title in the 2020 Karachi data; keep the five largest.
orders_per_book = karachi_orders_2020.groupby('Book_Name')['Order_Number'].nunique()
karachi_orders_2020_top_books = orders_per_book.sort_values(ascending=False).head(5)
print(karachi_orders_2020_top_books)

In [None]:
# Row count before removing orders without a usable book name.
print(karachi_orders_2020.shape[0])

# Turn empty book names into NaN and drop only the rows whose Book_Name is
# missing. dropna() is restricted to that column: without `subset` it would
# also discard every row that has a missing value in any other column.
# A new frame is assigned instead of using inplace=True, so the cell does not
# mutate a slice and can be re-run safely.
karachi_orders_2020 = (
    karachi_orders_2020
    .assign(Book_Name=karachi_orders_2020['Book_Name'].replace('', np.nan))
    .dropna(subset=['Book_Name'])
)

# Row count after the removal.
print(karachi_orders_2020.shape[0])

In [None]:
# Five book titles with the most distinct orders in the 2020 Karachi data.
top_book_names = (
    karachi_orders_2020
    .groupby('Book_Name')['Order_Number']
    .nunique()
    .sort_values(ascending=False)
    .head(5)
    .index
)

# Rows for those titles, taken from the Karachi 2020 frame. Filtering the full
# `dataset` here would plot counts from every city and year instead of Karachi.
karachi_orders_2020_top_books = karachi_orders_2020[
    karachi_orders_2020['Book_Name'].isin(top_book_names)
]

# Horizontal bars with the number of rows per top title, largest first.
book_by_frequency = karachi_orders_2020_top_books['Book_Name'].value_counts().index
plt.figure(figsize=(15,7))
sns.countplot(y="Book_Name", data=karachi_orders_2020_top_books, order=book_by_frequency)

Shout Out to Zeeshan ul Hassan on getting the top position for his Book in Karachi.

I feel honored coding this notebook in Python :D - Because it was pretty hot in 2020

Not to forget we've not Cleansed the Book names yet *(Not sure if there are any typos in Book Names)*. If so, the ranking will surely differ after we sort out the Nomenclature

## Let's build a word cloud of all ordered book names - English

In [None]:
# WordCloud builds the image; STOPWORDS is the stop-word collection shipped with the package.
from wordcloud import WordCloud, STOPWORDS
# NOTE(review): comment_words is not referenced by any later cell visible in this notebook.
comment_words = [' ','?']
# Set copy of the stop words; passed to WordCloud(stopwords=...) when the cloud is generated.
stopwords = set(STOPWORDS) 

In [None]:
# Load the older "GP Orders - 2" export under its own name, so the main
# `dataset` frame is not silently replaced by a different file.
dataset_v2 = pd.read_csv('../input/gufhtugu-publications-dataset-challenge/GP Orders - 2.csv',encoding='unicode_escape')
dataset_v2.columns = dataset_v2.columns.str.replace(' ', '_')

# Collect every lower-cased, whitespace-separated word of every book name.
# Missing names are skipped: str(NaN) would otherwise add the word "nan"
# to the word cloud once per order without a book name.
book_words = []
for book_name in dataset_v2['Book_Name'].dropna():
    book_words.extend(str(book_name).lower().split())

# Single space-separated string, the input format WordCloud.generate() receives below.
word_str = ' '.join(book_words)

In [None]:
# Build the word cloud from the joined book-name words.
cloud_options = dict(
    width=1600,
    height=800,
    background_color='white',
    stopwords=stopwords,
    min_font_size=10,
)
wordcloud = WordCloud(**cloud_options).generate(word_str)

# Show the cloud as an image with no axes and no padding around it.
plt.figure(figsize=(15, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

**Note**: The wordcloud is plotted on old CSV. I wasn't able to spend some time on setting it up for Urdu Characters as well.
Leaving it for someone to take up :)

Product Management, Python, AI and Blockchain seem prominent.

It would be interesting to see bigrams plotted not just individual words to get more context.

## Now lets check only the Books that were returned

In [None]:
# Reload the "GP Orders - 4" CSV and normalise its column names again.
dataset = pd.read_csv(
    '../input/gufhtugu-publications-dataset-challenge/GP Orders - 4.csv',
    encoding='utf-8',
)
dataset.columns = dataset.columns.str.replace(' ', '_')

# Orders whose status is 'Returned', then the subset billed to a top-five city.
is_returned = dataset['Order_Status'].isin(['Returned'])
dataset_returned = dataset[is_returned]
returned_in_top_city = dataset_returned['City_(Billing)'].isin(city_wise_orders.index)
dataset_returned_top_5 = dataset_returned[returned_in_top_city]

In [None]:
# Display the first rows of the returned orders from the top-five cities.
dataset_returned_top_5.head()

### Returns by city among the top cities

In [None]:
# Vertical bars with the number of returned-order rows per top-five city.
sns.countplot(x="City_(Billing)", data=dataset_returned_top_5)

In [None]:
# One indicator column per order status (Order_Status_<value>).
dataset_one_hot = pd.get_dummies(dataset, columns=['Order_Status'])

# Parse the order date and derive year, month and day of the week
# (pandas convention: Monday=0 ... Sunday=6).
dataset_one_hot['Order_Date'] = pd.to_datetime(dataset_one_hot['Order_Date'])
dataset_one_hot['year'] = dataset_one_hot['Order_Date'].dt.year
dataset_one_hot['month'] = dataset_one_hot['Order_Date'].dt.month
dataset_one_hot['day'] = dataset_one_hot['Order_Date'].dt.dayofweek

# Restrict to 2020. 'year' is an integer column, so compare with the integer
# 2020; the string '2020' matches nothing on current pandas versions and would
# leave the correlation matrix empty.
dataset_one_hot = dataset_one_hot[dataset_one_hot['year'] == 2020]

# Pairwise correlation between month, weekday and the three status indicators.
corr_cols = ['month','day','Order_Status_Canceled','Order_Status_Returned','Order_Status_Completed']
corr = dataset_one_hot[corr_cols].corr()
sns.heatmap(corr, annot=True)

#### Seems like Returns are likely to increase as we move ahead in a Year - Slight Correlation with Month-IDs. 

#### Don’t mix it with Causation, it might be a trend that doesn’t make any sense.

Let's quickly verify whether that's happening.

In [None]:
# Status indicator columns to total per month and city.
status_cols = ["Order_Status_Returned", "Order_Status_Completed", "Order_Status_Canceled"]

# 2020 orders billed to one of the top-five cities.
dataset_t5 = dataset_one_hot[dataset_one_hot['City_(Billing)'].isin(city_wise_orders.index)]

# Select the indicator columns BEFORE summing: summing every column would also
# try to add up the datetime Order_Date and the text columns, which raises a
# TypeError on pandas 2.x and is wasted work on older versions.
dataset_t5 = (
    dataset_t5
    .groupby(['month', 'City_(Billing)'])[status_cols]
    .sum()
    .reset_index()
)

# One point per (month, city): number of returned orders.
sns.scatterplot(data=dataset_t5, x="month", y="Order_Status_Returned")

You can see that returns follow an increasing pattern as we move through the timeline (the months of 2020).

# Things that I'll try to do;

* Try Fuzzy match on Cities to get an accurate picture
* Try to predict the orders for next month
