In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Challenges**
1. Find top 10 books.
2. Find correlation between different columns.
3. Probability of an order return based on given variables.
4. Clean list of cities.
5. Prediction on number of orders for given data/day for the year 2021


In [None]:
#importing important Libraries. Numpy and pandas already imported in above code cell.
import matplotlib.pyplot as plt
import fuzzywuzzy
from fuzzywuzzy import process

In [None]:
#loading dataset
df = pd.read_csv('/kaggle/input/gufhtugu-publications-dataset-challenge/GP Orders - 5.csv',parse_dates=['Order Date & Time'])
df.head()

In [None]:
#statistics about numerical columns of the dataset
df.describe()

In [None]:
#Column names
df.columns

In [None]:
#Find unique values in each column
df.nunique()

In [None]:
#Check for null values in each column
df.isnull().sum()

The Book Name column contains only two null values, so we replace them with the best-selling book.
Similarly, the City column has null values; a reasonable choice is to replace them with the most frequently occurring city.

In [None]:
#find best selling book and city.
best_seller = df['Book Name'].value_counts().nlargest(1,keep='all')
top_city = df['City'].value_counts().nlargest(1,keep='all')
print("Best selling book is " + str(best_seller))
print("City with high number of purchases is " + str(top_city))

In [None]:
#fill the null spaces with the most frequent value of each column.
#The fill value is computed from the data instead of being hard-coded, so the cell
#stays correct if the dataset is updated.
for col in ['Book Name', 'City']:
    most_frequent = df[col].mode(dropna=True)
    if not most_frequent.empty:   #mode() is empty only when the whole column is null
        df[col] = df[col].fillna(most_frequent.iloc[0])

In [None]:
#Use most frequent value in Payment Method column to replace the null values
#with that frequently occuring values.
df['Payment Method'].value_counts()

The same payment method is written in two ways, "Cash on delivery" and "Cash on Delivery (COD)". To address this, we map both spellings to a single label.

In [None]:
#Make both spellings same: collapse the "(COD)" variant into the plain label.
#The result is assigned back instead of using replace(..., inplace=True) on the
#column selection, which pandas may apply to a temporary copy.
df["Payment Method"] = df["Payment Method"].replace({"Cash on Delivery (COD)": "Cash on delivery"})

In [None]:
#Check to see the changes.
df['Payment Method'].value_counts()

In [None]:
#find frequently used Payment Method.
df['Payment Method'].value_counts().nlargest(1, keep='all')

In [None]:
#replace the null values in Payment Method with "Cash on delivery" as it is the most
#frequently occurring value.
#fillna() must be called on the column itself: calling it on .isnull() would
#overwrite the whole column with True/False flags.
df['Payment Method'] = df['Payment Method'].fillna("Cash on delivery")

In [None]:
#check if any null value exists in the whole data.
df.isnull().sum()

Before doing any analysis over the data, it is wise to clean the data first. See, there are cities with names written in different formats for example Karachi, karachi, KARACHI, KCH, etc. Python is going to consider each one of these a different city name. So the first thing we should be doing is to clean the city column.

In [None]:
#Some Column names are long. Replacing it with short ones.
df.rename(columns={"Total weight (grams)": "Weight(grams)" }, inplace=True)
df.columns

In [None]:
#City and Book Name columns are important columns so we do some cleaning on these columns.
#regex= is passed explicitly on every str.replace call: its default changed from
#True to False in pandas 2.0, so relying on the default makes the result depend on
#the installed version ('\d+' needs a regex, while '?' and '(...)' must be literal).
df['City'] = (
    df['City']
    .str.lower()                                #makes the letters lower case.
    .str.replace(r'\d+', '', regex=True)        #remove one or more digits.
    .str.replace('pakistan', '', regex=False)   #remove the word pakistan from city names
    .str.replace('city', '', regex=False)       #remove the word city from city names
    .str.replace('?', '', regex=False)          #remove ? sign from city
    .str.strip()                                #remove extra spaces before and after the strings.
)

#preprocess Book_Name
df['Book Name'] = (
    df['Book Name']
    .str.replace("- مستحقین زکواة", "", regex=False)
    .str.lower()
)
#(text to find, replacement) pairs, applied in this order as literal strings.
book_name_fixes = [
    ("linux - an introduction  (release data - october 3, 2020)", "linux - an introduction"),
    ("python programming- release date: august 14, 2020", "python programming"),
    ("ڈیٹا سائنس ۔ ایک تعارف", "ڈیٹا سائنس"),
    ("molo masali - مولو مصلی", "molo masali"),
    ("مشین ل", "مشین لرننگ"),
    #the previous pair also fires inside names that were already complete;
    #this pair repairs the doubled ending it leaves behind.
    ("مشین لرننگرننگ", "مشین لرننگ"),
    ("r ka taaruf آر کا تعارف", "r ka taaruf"),
]
for old, new in book_name_fixes:
    df['Book Name'] = df['Book Name'].str.replace(old, new, regex=False)
df['Book Name'] = df['Book Name'].str.strip()
df.sample(40)

In [None]:
#check how many city names are there.
df.City.nunique()

The number of cities seems too high. This is obviously because some city names are spelled in different ways. For example, Faisalabad appears as "Faisalabad", "faisalabad", "FSD", "faisalabad city", etc. Python treats these as different names, which is why the number of cities is so high.

We try to reduce the number of cities by chopping each city name down to a single, canonical name.

In [None]:
#import cities dataset
df_cities = pd.read_csv('../input/pakistan-cities-and-postal-codes/Pakistan Cities and Zip Codes.csv')

In [None]:
df_cities.columns

In [None]:
#take only cities names column
cities_list = df_cities['Area_Name'].str.lower().tolist()

In [None]:
len(cities_list)

In [None]:
#this is another cities list taken from github
pak_cities = ['islamabad', 'ahmed nager chatha', 'ahmadpur east', 'ali khan abad', 'alipur',
              'arifwala', 'attock', 'bhera', 'bhalwal', 'bahawalnagar', 'bahawalpur', 'bhakkar',
              'burewala', 'chillianwala', 'chakwal', 'chichawatni', 'chiniot', 'chishtian', 
              'daska', 'darya khan', 'dera ghazi khan', 'dhaular', 'dina', 'dinga', 'dipalpur',
              'faisalabad', 'ferozewala', 'fateh jhang', 'ghakhar mandi', 'gojra', 'gujranwala',
              'gujrat', 'gujar khan', 'hafizabad', 'haroonabad', 'hasilpur', 'haveli lakha',
              'jatoi', 'jalalpur', 'jattan', 'jampur', 'jaranwala', 'jhang', 'jhelum', 'kalabagh',
              'karor lal esan', 'kasur', 'kamalia', 'kamoke', 'khanewal', 'khanpur', 'kharian',
              'khushab', 'kot addu', 'jauharabad', 'lahore', 'lalamusa', 'layyah', 'liaquat pur',
              'lodhran', 'malakwal', 'mamoori', 'mailsi', 'mandi bahauddin', 'mian channu',
              'mianwali', 'multan', 'murree', 'muridke', 'mianwali bangla', 'muzaffargarh',
              'narowal', 'nankana sahib', 'okara', 'renala khurd', 'pakpattan', 'pattoki',
              'pir mahal', 'qaimpur', 'qila didar singh', 'rabwah', 'raiwind', 'rajanpur', 
              'rahim yar khan', 'rawalpindi', 'sadiqabad', 'safdarabad', 'sahiwal', 'sangla hill',
              'sarai alamgir', 'sargodha', 'shakargarh', 'sheikhupura', 'sialkot', 'sohawa',
              'soianwala', 'siranwali', 'talagang', 'taxila', 'toba tek singh', 'vehari',
              'wah cantonment', 'wazirabad', 'badin', 'bhirkan', 'rajo khanani', 'chak', 'dadu',
              'digri', 'diplo', 'dokri', 'ghotki', 'haala', 'hyderabad', 'islamkot', 'jacobabad',
              'jamshoro', 'jungshahi', 'kandhkot', 'kandiaro', 'karachi', 'kashmore', 
              'keti bandar', 'khairpur', 'kotri', 'larkana', 'matiari', 'mehar', 'mirpur khas',
              'mithani', 'mithi', 'mehrabpur', 'moro', 'nagarparkar', 'naudero', 'naushahro feroze', 'naushara',
              'nawabshah', 'nazimabad', 'qambar', 'qasimabad', 'ranipur', 'ratodero', 'rohri', 'sakrand',
              'sanghar', 'shahbandar', 'shahdadkot', 'shahdadpur', 'shahpur chakar', 'shikarpaur', 'sukkur',
              'tangwani', 'tando adam khan', 'tando allahyar', 'tando muhammad khan', 'thatta', 'umerkot',
              'warah', 'abbottabad', 'adezai', 'alpuri', 'akora khattak', 'ayubia', 'banda daud shah', 'bannu', 
              'batkhela', 'battagram', 'birote', 'chakdara', 'charsadda', 'chitral', 'daggar', 'dargai',
              'darya khan', 'dera ismail khan', 'doaba', 'dir', 'drosh', 'hangu',
              'haripur', 'karak', 'kohat', 'kulachi', 'lakki marwat', 'latamber', 'madyan', 'mansehra', 'mardan',
              'mastuj', 'mingora', 'nowshera', 'paharpur', 'pabbi', 'peshawar', 'saidu sharif',
              'shorkot', 'shewa adda', 'swabi', 'swat', 'tangi', 'tank', 'thall', 'timergara', 
              'tordher', 'awaran', 'barkhan', 'chagai', 'dera bugti', 'gwadar', 'harnai', 
              'jafarabad', 'jhal magsi', 'kacchi', 'kalat', 'kech', 'kharan', 'khuzdar', 
              'killa abdullah', 'killa saifullah', 'kohlu', 'lasbela', 'lehri', 'loralai', 'mastung',
              'musakhel', 'nasirabad', 'nushki', 'panjgur', 'pishin valley', 'quetta', 'sherani',
              'sibi', 'sohbatpur', 'washuk', 'zhob', 'ziarat']

In [None]:
#Function to chop cities names
def get_nearest_city(city, candidates=None):
    """Return the first known city name that occurs inside ``city``.

    Matching is a plain substring test on ``str(city)``; candidates are tried
    in list order and the first hit wins.

    Parameters
    ----------
    city : object
        Raw city value from the dataset.
    candidates : iterable of str, optional
        Known city names to look for. Defaults to the notebook-level
        ``cities_list``.

    Returns
    -------
    object
        The matching candidate, or ``city`` unchanged when nothing matches.
    """
    if candidates is None:
        candidates = cities_list
    text = str(city)
    for check_city in candidates:
        # Skip non-string entries such as NaN (they would raise TypeError in
        # the ``in`` test) and empty strings (they would match every input).
        if isinstance(check_city, str) and check_city and check_city in text:
            return check_city
    return city

In [None]:
print(f'total unique cities in our dataset before normalization: {df.City.nunique()}')

In [None]:
df['city'] = df['City'].apply(get_nearest_city)

In [None]:
print(f'total unique cities in our dataset after preprocessing: {df.city.nunique()}')

In [None]:
#writing the function again to check with another city list
def get_nearest_city(city, candidates=None):
    """Return the first known city name that occurs inside ``city``.

    Matching is a plain substring test on ``str(city)``; candidates are tried
    in list order and the first hit wins.

    Parameters
    ----------
    city : object
        Raw city value from the dataset.
    candidates : iterable of str, optional
        Known city names to look for. Defaults to the notebook-level
        ``pak_cities`` list.

    Returns
    -------
    object
        The matching candidate, or ``city`` unchanged when nothing matches.
    """
    if candidates is None:
        candidates = pak_cities
    text = str(city)
    for check_city in candidates:
        # Skip non-string entries such as NaN (they would raise TypeError in
        # the ``in`` test) and empty strings (they would match every input).
        if isinstance(check_city, str) and check_city and check_city in text:
            return check_city
    return city

In [None]:
df['city'] = df['City'].apply(get_nearest_city)

In [None]:
print(f'total unique cities in our dataset after preprocessing: {df.city.nunique()}')

The list named pak_cities gets us 1854 unique cities, which is better than its counterpart, cities_list. So we keep the pak_cities output.

Now lets use fuzzy wuzzy to replace incorrect spellings with correct ones. This will bar python from treating same cities as different due to spelling mistakes.
For Example Charsadda, chrsadda, charsada should be one name.

**Fuzzy matching:** The process of automatically finding text strings that are very similar to the target string. In general, a string is considered "closer" to another one the fewer characters you'd need to change if you were transforming one string into another. So "apple" and "snapple" are two changes away from each other (add "s" and "n") while "in" and "on" are one change away (replace "i" with "o"). You won't always be able to rely on fuzzy matching 100%, but it will usually end up saving you at least a little time.

Fuzzywuzzy returns a ratio given two strings. The closer the ratio is to 100, the smaller the edit distance between the two strings. Here, we're going to get the ten strings from our list of cities that have the closest distance to "charsadda".

In [None]:
city_unique = df['city'].unique()
city_unique

In [None]:
matches = fuzzywuzzy.process.extract("charsadda", city_unique, limit=10, scorer=fuzzywuzzy.fuzz.token_sort_ratio)
# take a look at them
matches

So we replace the top 4 matching strings with "charsadda", since they are all spellings of the same city.

In [None]:
#replace() returns a new Series, so the result has to be assigned back
#for the corrected spellings to be stored in the dataframe.
df['city'] = df['city'].replace(['charssadda', 'charsadaa', 'chārsadda', 'charssadsa'], 'charsadda')

Doing this one by one is a tedious process. Let's automate it a bit to fix a few more cities.

In [None]:
# function to replace rows in the provided column of the provided dataframe
# that match the provided string above the provided ratio with the provided string
def replace_matches_in_column(df, column, string_to_match, min_ratio = 81, limit = 10):
    """Overwrite near-duplicate spellings in ``df[column]`` with ``string_to_match``.

    The unique values of the column are scored against ``string_to_match``
    with fuzzywuzzy's ``token_sort_ratio``; every value among the ``limit``
    best matches whose score is at least ``min_ratio`` is replaced in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    column : str
        Name of the column to clean.
    string_to_match : str
        Canonical spelling written into the matching rows.
    min_ratio : int, default 81
        Minimum score (inclusive) for a value to count as a match.
    limit : int, default 10
        How many of the closest candidates are considered.

    Returns
    -------
    None
    """
    # get the unique strings; null values are left out of the comparison
    strings = df[column].dropna().unique()

    # get the closest matches to our input string as (value, score) pairs
    matches = fuzzywuzzy.process.extract(string_to_match, strings,
                                         limit=limit, scorer=fuzzywuzzy.fuzz.token_sort_ratio)

    # only keep matches with a ratio >= min_ratio
    close_matches = [match[0] for match in matches if match[1] >= min_ratio]

    # get the rows of all the close matches in our dataframe
    rows_with_matches = df[column].isin(close_matches)

    # replace all rows with close matches with the input string
    df.loc[rows_with_matches, column] = string_to_match

In [None]:
replace_matches_in_column(df=df, column='city', string_to_match="charsadda")

In [None]:
#Run the same fuzzy clean-up for a handful of other cities, one after another
#in the listed order.
cities_to_normalize = [
    "karachi", "rawalpindi", "peshawar", "shikarpur", "kohat", "faisalabad",
    "islamabad", "sialkot", "quetta", "lahore", "okara",
]
for canonical_city in cities_to_normalize:
    replace_matches_in_column(df=df, column='city', string_to_match=canonical_city)

In [None]:
print(f'total unique cities in our dataset after preprocessing: {df.city.nunique()}')

The number of unique cities is reduced from 1854 to 1780.

In [None]:
#Add a column holding the list of individual titles of each order ("/" separates titles).
#The frame keeps one row per order. The former .explode("Book Name") is dropped:
#"Book Name" holds plain strings, so exploding it changed nothing.
df = df.assign(Books_names_ordered=df['Book Name'].str.split("/"))


In [None]:
#top selling books
top_seller_books = df.Books_names_ordered.explode().value_counts().reset_index()
top_seller_books.head(10)

In [None]:
#Plot the top selling books
fig, ax = plt.subplots(figsize=(12, 6))
top10_book_counts = df.Books_names_ordered.explode().value_counts()[:10]
#The Axes gets its own name so that top_seller_books keeps holding the counts table.
top_books_ax = top10_book_counts.plot.barh(ax=ax, color='green', rot=0)
top_books_ax.set_title('Top 10 selling books')
top_books_ax.set_xlabel('Number of orders containing the book')
plt.show()

In [None]:
#Bar chart of the ten cities with the most orders
fig = plt.figure(figsize=(12, 6))
orders_per_city = df.city.value_counts()
top_cities = orders_per_city[:10].plot.barh(color='red', rot=0)

In [None]:
df.columns

In [None]:
#Turn the text labels into integer category codes so that Series.corr can be used on them.
#NOTE(review): this encodes (and overwrites) the raw 'City' column, not the
#normalized 'city' column built earlier - confirm that this is intended.
for label_column in ('Order Status', 'City'):
    df[label_column] = df[label_column].astype('category').cat.codes

In [None]:
df["City"].corr(df['Order Status'])

In [None]:
df.columns

In [None]:
df['Payment Method']=df['Payment Method'].astype('category').cat.codes

In [None]:
df['Order Status'].corr(df['Payment Method'])

In [None]:
df["City"].corr(df['Payment Method'])

# Association Rule Mining
**In this section, we implement association rule mining using the Apriori algorithm.
The algorithm looks for association rules between different items, a task also known as market basket analysis. Simply put, if a customer buys a certain product, it suggests which other product that customer is likely to buy next.**

In [None]:
pip install apyori #install apyori

In [None]:
from apyori import apriori #import apriori

In [None]:
#Books name cell is filled with many books.Each book is seperated by "/" character
#we split the book name by this character to and expand the cell via books.
df_books=df['Book Name'].str.split("/", n = 300, expand = True) #we make expansion limited to 300 
                                                                #to avoid burdon of plethora of rules mining
df_books

In [None]:
df_books =df_books.fillna(0)
df_books

In [None]:
#apriori algorithm takes its input as a list of transactions (lists within a list),
#so we convert the dataframe to that shape.
#The rows and columns are taken from df_books itself instead of hard-coded counts,
#and the padding (null, or the 0 used to fill it) is left out so that it is not
#mined as if it were a book.
records = [
    [str(item) for item in row if pd.notna(item) and item != 0]
    for row in df_books.values
]

In [None]:
association_rules =  apriori(records, min_support=0.0045, min_confidence=0.2, min_lift=3, min_length=2)
association_results = list(association_rules)

In [None]:
print('Total rules mined ' + str(len(association_results)))

In [None]:
type(association_results[0])

In [None]:
#apriori returns its results as RelationRecord objects; take the first field of
#each record and turn it into a plain list so that it prints readably.
list_rules = [list(record[0]) for record in association_results]

In [None]:
list_rules

In [None]:
#print one mined rule in full; guarded because fewer than three rules may have been mined
if len(association_results) > 2:
    print(association_results[2])
else:
    print('Fewer than three rules were mined.')

Three tasks done so far:
1. Top ten books found.
2. City column cleaned, although there is still room for improvement.
3. Correlation has been computed between different columns, which might come in handy when doing prediction.

Two tasks to go:
1. Probability of an order return based on given variables.
2. Prediction on number of books to be sold on a specific date.

**Feedback and correction is highly appreciated**

**Upvote only if you have found it useful**  

**Happy coding**

# In progress