# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import fuzzywuzzy
from fuzzywuzzy import process
import sys
#np.set_printoptions(threshold=sys.maxsize)
import warnings
warnings.filterwarnings("ignore")

# Reading Data

In [None]:
# Load the "GP Orders - 4.csv" orders file from the ../input data directory.
df = pd.read_csv('../input/gufhtugu-publications-dataset-challenge/GP Orders - 4.csv')
# Preview the first rows to sanity-check the load.
df.head()

# EDA

In [None]:
# Summary statistics; by default pandas describes only the numeric columns.
df.describe()

In [None]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

In [None]:
# (rows, columns) of the data as loaded.
df.shape

In [None]:
# Column labels as they appear in the CSV header.
df.columns

In [None]:
# Switch the CSV headers to snake_case names that are valid attribute names.
column_renames = {
    'Order Number': 'order_number',
    "Order Status": "order_status",
    "Book Name": "book_name",
    "Order Date": "order_date",
    "City (Billing)": "billing_city",
}
df = df.rename(columns=column_renames)
df.head()

#### Changing data type

In [None]:
# Parse the order_date strings into pandas datetimes so the .dt accessor can be used on them.
df['order_date'] = pd.to_datetime(df['order_date'])
df.head()

In [None]:
# Re-check the dtypes to confirm the order_date conversion took effect.
df.dtypes

In [None]:
# Number of distinct order numbers.
df.order_number.nunique()

In [None]:
# Rows whose book_name is missing.
df[df['book_name'].isna()]

In [None]:
# Rows whose billing_city is missing.
df[df['billing_city'].isna()]

#### Dropping NaN values

In [None]:
# Drop, in place, every row that has a missing value in any column.
df.dropna(inplace=True)

In [None]:
# (rows, columns) after dropping the rows with missing values.
df.shape

In [None]:
# Number of rows for each order status value.
df.order_status.value_counts()

In [None]:
# Plot the distribution of order statuses.
df.order_status.hist()

# Preprocessing Data

In [None]:
# Normalise the free-text columns: lower-case first, then trim surrounding
# whitespace, so spellings that differ only in case or padding compare equal.
for text_column in ('billing_city', 'book_name'):
    df[text_column] = df[text_column].str.lower().str.strip()
df.head()

In [None]:
# Number of distinct billing_city values.
df.billing_city.nunique()

In [None]:
# Split each order's '/'-separated book_name into a list of titles.
# The previous trailing .explode('book_name') targeted the scalar string
# column, so it changed nothing; it is removed to avoid suggesting the frame
# is one row per book. The frame stays one row per order, and the list column
# is exploded where per-book counts are needed.
df = df.assign(order_books_name=df.book_name.str.split('/'))

In [None]:
#Thanks to @hussainsaddam12 & @mnavaidd for this codeblock idea
# Derive calendar features from the order timestamp through one shared
# datetime accessor.
order_ts = df['order_date'].dt
df['date'] = order_ts.date
df['time'] = order_ts.time
df["day_name"] = order_ts.day_name()
df["month_name"] = order_ts.month_name()
df['year'] = order_ts.year
df

#### Top 10 Best Selling Books

In [None]:
# One entry per ordered title, counted, keeping the ten most frequent.
df.order_books_name.explode().value_counts().head(10)

In [None]:
# Bar chart of the ten most frequently ordered titles.
top_10 = df.order_books_name.explode().value_counts().head(10)
ax = sns.barplot(x=top_10.index, y=top_10.values)
ax.set(xlabel='Book Names', ylabel='Sold')
# Book titles are long, so rotate the tick labels to keep them readable.
ax.set_xticklabels(top_10.index, rotation='vertical', fontsize=10)
plt.show()

#### Using fuzzy matching to correct inconsistent Data Entry

In [None]:
# Distinct billing_city values; used as the candidate list for fuzzy matching.
df_city = df['billing_city'].unique()
df_city

In [None]:
# Score every distinct city against "karachi" and keep the ten best
# candidates, to see which spellings the fuzzy matcher considers close.
matches = process.extract(
    "karachi",
    df_city,
    limit=10,
    scorer=fuzzywuzzy.fuzz.token_sort_ratio,
)
matches

In [None]:
# Thanks to Sir ZeeshanUsmani for this function
def replace_matches_in_column(df, column, string_to_match, min_ratio=85, limit=10):
    """Replace near-duplicate spellings in ``df[column]`` with ``string_to_match``.

    The distinct values of the column are scored against ``string_to_match``
    using fuzzywuzzy's ``token_sort_ratio``. Among the ``limit`` best-scoring
    candidates, every value whose score is at least ``min_ratio`` is
    overwritten with ``string_to_match``.

    Args:
        df: DataFrame to clean; it is modified in place.
        column: Name of the text column to clean.
        string_to_match: Canonical spelling written into the matching rows.
        min_ratio: Minimum score (inclusive) a value needs to be replaced.
        limit: Maximum number of best-scoring candidates to consider.

    Returns:
        None. ``df`` is mutated in place.
    """
    # Missing values cannot be scored as strings, so keep them out of the
    # candidate list; they are left untouched in the frame.
    candidates = df[column].dropna().unique()

    # Each result is indexed as (candidate, score, ...), best score first.
    scored = fuzzywuzzy.process.extract(
        string_to_match,
        candidates,
        limit=limit,
        scorer=fuzzywuzzy.fuzz.token_sort_ratio,
    )

    # Keep only the candidates whose score reaches the threshold (>= min_ratio).
    close_matches = [match[0] for match in scored if match[1] >= min_ratio]

    # Overwrite every row holding one of the close spellings.
    rows_with_matches = df[column].isin(close_matches)
    df.loc[rows_with_matches, column] = string_to_match

In [None]:
replace_matches_in_column(df=df, column='billing_city', string_to_match='karachi')
# df_city was captured before the replacement, so its length would not change;
# recompute the distinct values to see the effect of the clean-up.
df_city = df['billing_city'].unique()
len(df_city)

In [None]:
# Run the same fuzzy clean-up for each of these target spellings, in order.
for canonical_city in ("khi", "rawalpindi", "islamabad", "lahore", "jauharabad", "taxila"):
    replace_matches_in_column(df=df, column='billing_city', string_to_match=canonical_city)

### Next steps
* Remove the remaining inconsistencies in the billing_city column
* Predict future sales
