Importing libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from numpy import mean
from numpy import std
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import StackingClassifier
import re
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

Importing the dataset

In [None]:
# Read the raw Zomato Bangalore restaurants CSV and print a column / dtype summary.
ZOMATO_CSV_PATH = "../input/zomato-bangalore-restaurants/zomato.csv"
dataset = pd.read_csv(ZOMATO_CSV_PATH)
dataset.info()

This data needs cleaning, and the testing dataset will be isolated from it!

In [None]:
# ---------------------------------------------------------------- Data Cleaning
# Drop identifier / free-text columns that are not used as model features.
dataset = dataset.drop(['url', 'address', 'phone', 'name', 'menu_item', 'dish_liked'], axis=1)
# Keep only the rows in which every remaining column is populated.
dataset = dataset.dropna(axis=0, how='any')
dataset = dataset.rename(columns={
    'approx_cost(for two people)': 'cost',
    'listed_in(type)': 'type',
    'listed_in(city)': 'city',
})

# 'cost' arrives as text that may contain commas.  The comma is presumably a
# thousands separator ("1,200"), so it is removed; turning it into a decimal
# point would parse "1,200" as 1.2 instead of 1200.
dataset['cost'] = dataset['cost'].astype(str).str.replace(',', '', regex=False).astype(float)

# Rows whose rating is the placeholder 'NEW' have no label: they become the
# prediction (test) set.  .copy() makes it an independent frame, so the column
# assignments further down change it instead of a temporary view of `dataset`.
dataset_test = dataset.loc[dataset.rate == 'NEW'].copy()
# The training set is everything that carries a real rating ('-' is also a placeholder).
dataset = dataset.loc[~dataset.rate.isin(['NEW', '-'])].reset_index(drop=True)


def remove_slash(x):
    """Strip the '/5' suffix from a rating string; non-strings pass through unchanged."""
    # isinstance(..., str) replaces the `np.str` alias, which newer NumPy no longer provides.
    return x.replace('/5', '') if isinstance(x, str) else x


# Parse the rating text and round it to the nearest whole number, so that the
# target is a small set of classes.
dataset['rate'] = dataset['rate'].apply(remove_slash).str.strip().astype('float').round()

# Encode the Yes/No flags as 1/0 in both frames.  The result is assigned back
# to the column (rather than using an in-place replace on a column selection)
# so that the frame itself is guaranteed to be updated.
# NOTE: any value other than 'Yes' / 'No' becomes NaN.
YES_NO_CODES = {'Yes': 1, 'No': 0}
for frame in (dataset, dataset_test):
    for column in ('online_order', 'book_table'):
        frame[column] = frame[column].map(YES_NO_CODES)

dataset.info()
dataset_test.info()

The data was studied thoroughly and cleaned in such a way as to retain everything
meaningful while eliminating as much of the rest as possible.

The columns dropped completely are ['url', 'address', 'phone',
'name', 'menu_item', 'dish_liked'], and the columns whose names have been changed are
{'approx_cost(for two people)' : 'cost', 'listed_in(type)' : 'type',
'listed_in(city)' : 'city'}.

The data in the dataset['cost'] column is converted into a numeric value. The
online_order and book_table columns are also converted to numerical values (1/0) rather
than a simple Yes or No. The categorical data in the columns
['type', 'cuisines', 'location', 'city', 'rest_type'] is also converted to numerical values.
As part of the cleaning, rows of the dataset that contain null values are removed.

The dataset is divided into a test set and a train set based on dataset['rate']: if
the rate is NEW, the row is placed in the test set.

In [None]:
# ---------------------------------------------------------------- NLP for cleaning and rating predictions
def cleaning(s):
    """Normalise a raw review value into lower-case text made of letters and single spaces.

    The value is converted with ``str()``, lower-cased, stripped of punctuation,
    digits and underscores, has runs of whitespace collapsed to one space, and
    finally has a few fixed substrings removed.

    s       -- any value; it is stringified first.
    returns -- the cleaned string (a review that contained no words comes back as ' ').
    """
    s = str(s).lower()
    # All patterns are raw strings so that the backslashes reach the regex
    # engine as written instead of being treated as string escapes.
    s = re.sub(r'\s\W', ' ', s)       # whitespace followed by a non-word character
    s = re.sub(r'\W,\s', ' ', s)      # non-word character, comma, whitespace
    s = re.sub(r'[^\w]', ' ', s)      # every remaining non-word character
    s = re.sub(r'\d+', '', s)         # digits
    s = re.sub(r'\s+', ' ', s)        # collapse whitespace runs
    s = re.sub(r'[!@#$_]', '', s)     # only '_' can still be present at this point
    # NOTE(review): this removes the letters "co" everywhere, including inside
    # ordinary words ("coffee" -> "ffee").  Kept as-is because the features
    # built downstream depend on it -- confirm whether that is intended.
    s = s.replace("co", "")
    s = s.replace("https", "")
    s = s.replace(",", "")
    # str.replace is literal, not a regex: this only matches the four characters
    # '[', '\', 'w', '*', which cannot occur after the substitutions above.
    s = s.replace(r"[\w*", " ")
    s = s.replace("rated rated n", "")
    return s

# Clean the review text of both frames, then discard the rows whose cleaned
# text is exactly one space.
dataset['reviews_list'] = dataset['reviews_list'].apply(cleaning)
dataset_test['reviews_list'] = dataset_test['reviews_list'].apply(cleaning)
dataset = dataset.loc[dataset['reviews_list'].ne(' ')].reset_index(drop=True)
dataset_test = dataset_test.loc[dataset_test['reviews_list'].ne(' ')].reset_index(drop=True)

The above function cleans the text in the dataset.

In [None]:
# ---------------------------------------------------------------- Data Encoding
def catagory_encoding(data):
    """Replace the categories of column ``data`` by integer codes in both frames.

    Operates on the module-level ``dataset`` and ``dataset_test`` frames and
    stores the codes back into the same column.

    One code table is shared by the two frames, so a given category always maps
    to the same integer.  Codes follow the order of first appearance in
    ``dataset``; categories that occur only in ``dataset_test`` are numbered
    after those.

    data -- name of the column to encode.
    """
    codes = {}
    for frame in (dataset, dataset_test):
        # unique() is evaluated once per frame and keeps first-appearance order.
        for category in frame[data].unique():
            codes.setdefault(category, len(codes))
    # Assign the mapped column back instead of replacing in place on a column
    # selection, so the frames themselves are guaranteed to be updated.
    dataset[data] = dataset[data].map(codes)
    dataset_test[data] = dataset_test[data].map(codes)
    return

# Integer-encode every categorical feature column.
for column_name in ('type', 'cuisines', 'location', 'city', 'rest_type'):
    catagory_encoding(column_name)

# The test frame was built from the rows rated 'NEW', so its 'rate' column
# carries no label; remove it.
dataset_test = dataset_test.drop(columns=['rate'])

The above function converts categorical data into numeric codes and stores them in the same column.

In [None]:
# ---------------------------------------------------------------- Data Correlation
# Kendall rank correlation between the numeric columns.  The frame still holds
# the text column 'reviews_list' at this point, which cannot be correlated, so
# the numeric columns are selected explicitly.
corr = dataset.select_dtypes(include='number').corr(method='kendall')
sns.heatmap(corr, annot=True, annot_kws={"size": 7})
print("Most corellation is found between votes and rates")
plt.show()

In [None]:
# ---------------------------------------------------------------- Data Visualisation
def plotting_sns(data, title):
    """Show a count plot (one bar per distinct value) of ``data``.

    data  -- the values to count (a pandas Series or a plain sequence).
    title -- text placed above the plot.
    """
    fig, ax = plt.subplots(figsize=(15, 15))
    # Draw the bars exactly once, and pass the values as `x` explicitly so they
    # cannot be mistaken for the `data=` (table) argument.
    sns.countplot(x=data, ax=ax)
    # Vertical tick labels stay readable when there are many categories.
    ax.tick_params(axis='x', labelrotation=90)
    ax.set_title(title)
    plt.show()
    return

The above function is used for plotting the count plots (bar charts of how often each value occurs).

In [None]:
# Number of listings per encoded 'city' value.
plotting_sns(data=dataset['city'], title='Location')
print("Most famous city is Bannerghatta Road")

In [None]:
# Number of listings per encoded 'type' value.
plotting_sns(data=dataset['type'], title='Type of Service')
print("Most famous type of service is Delivery and Dine-Out")

In [None]:
# Number of listings per distinct 'cost' value.
plotting_sns(data=dataset['cost'], title='Cost')
print("Most 2 person cost is between 200 and 400")

In [None]:
# Number of listings per encoded 'location' value.
plotting_sns(data=dataset['location'], title='No of resturant in a location')
print("Most famous location is bhanashankari")

In [None]:
# Number of listings per encoded 'rest_type' value.
plotting_sns(data=dataset['rest_type'], title='Type of resturants')
print("Most famous type of resturant is quick bytes and causal dining")

In [None]:
# ---------------------------------------------------------------- Functions for emotional analysis per rating
# Snapshot of the cleaned training frame.
# NOTE(review): nothing in the visible notebook reads `dataset_temp` -- confirm whether it is still needed.
dataset_temp = dataset.copy()

def emotional_analysis(text):
    """Return the emotions of the lexicon words that occur in ``text``.

    ``text`` is tokenised, English stop words are dropped, and every entry of
    ``../input/emotion/emotion.txt`` whose word is among the remaining tokens
    contributes its emotion once (per lexicon entry, not per occurrence in the
    text).  Emotions are returned in lexicon-file order.

    text    -- the string to analyse.
    returns -- list of emotion labels.
    raises  -- ValueError if a non-blank lexicon line does not contain exactly one ':'.
    """
    # Build the stop-word set once; looking the list up again for every token
    # is what made the original loop slow.
    stop_words = set(stopwords.words("english"))
    # A set gives constant-time membership tests for the lexicon scan below.
    clean_words = {token for token in word_tokenize(text) if token not in stop_words}

    emotions = []
    with open("../input/emotion/emotion.txt", "r") as file:
        for line in file:
            # Lexicon lines presumably look like  'word': 'emotion',  -- spaces,
            # commas and quotes are removed before splitting on the colon.
            entry = line.strip().replace(" ", "").replace(",", "").replace("'", "")
            if not entry:
                continue  # blank line: nothing to unpack
            word, emotion = entry.split(":")
            if word in clean_words:
                emotions.append(emotion)
    return emotions
        
def emotional_plotting(i):
    """Plot the emotions found in the reviews collected for rating ``i``.

    Reads the module-level ``text`` list, in which ``text[i - 1]`` holds the
    reviews for rating ``i``.

    i      -- rating, from 1 to ``len(text)``.
    raises -- IndexError when ``i`` is outside that range (a value <= 0 would
              otherwise silently wrap around to another rating's reviews).
    """
    if not 1 <= i <= len(text):
        raise IndexError('rating out of range: ' + str(i))
    # join() builds the combined text in one pass instead of re-copying the
    # growing string for every review.
    emotion = emotional_analysis(''.join(text[i - 1]))
    title = 'Emotion in review rated - ' + str(i)
    plotting_sns(emotion, title)
    return

# text[r - 1] holds the cleaned reviews of the restaurants whose rate equals r, for r = 1..5.
text = []
for rating in range(1, 6):
    reviews_for_rating = dataset['reviews_list'].loc[dataset['rate'] == rating]
    text.append(list(reviews_for_rating))

The above functions provide the type and quantity of each emotion used in the reviews of restaurants, sorted by their rating.

In [None]:
# Emotions found in the reviews of restaurants rated 2.
emotional_plotting(i=2)
print("Most used emotion in 2 rated places is :-> happy and attracted")

In [None]:
# Emotions found in the reviews of restaurants rated 3.
emotional_plotting(i=3)
print("Most used emotion in 3 rated places is :-> happy and angry")


In [None]:
# Emotions found in the reviews of restaurants rated 4.
emotional_plotting(i=4)
print("Most used emotion in 4 rated places is :-> happy and attracted")

In [None]:
# Emotions found in the reviews of restaurants rated 5.
emotional_plotting(i=5)
print("Most used emotion in 5 rated places is :-> happy and sad")

The amount of emotion used increases as we move
from lower-rated reviews to higher-rated reviews, but
the type of emotion remains the same.

This indicates that restaurants which have been established
for a long time do not get very different reviews from
new restaurants. Time is the only major factor that seems
to influence the review system.

Except for some badly reviewed restaurants, the type of emotion
is the same for restaurants of any rating. Only the number of
emotional words used is different.

In [None]:
# ---------------------------------------------------------------- Feature Engineering
count_vectorizer = CountVectorizer(max_features=1000, stop_words="english")

# Learn the vocabulary from the training reviews only and apply that same
# vocabulary to the test reviews.  Fitting a second time on the test reviews
# would give the test frame a different word list, so column k would count a
# different word in each frame.
sparce_matrix = count_vectorizer.fit_transform(dataset['reviews_list']).toarray()
test_sparce_matrix = count_vectorizer.transform(dataset_test['reviews_list']).toarray()

# String names keep every column label a string (the other feature columns are
# named by strings), and the 'word_' prefix cannot clash with an existing
# column such as 'rate' or 'cost'.
word_columns = ['word_%d' % k for k in range(sparce_matrix.shape[1])]

# Each count frame reuses the index of the frame it is joined to, so the
# column-wise concat lines the rows up whatever that index looks like.
dataset_sparce_matrix = pd.DataFrame(data=test_sparce_matrix, index=dataset_test.index, columns=word_columns)
dataset_test = dataset_test.drop(['reviews_list'], axis=1)
dataset_test = pd.concat([dataset_test, dataset_sparce_matrix], axis=1)

dataset_sparce_matrix = pd.DataFrame(data=sparce_matrix, index=dataset.index, columns=word_columns)
dataset = dataset.drop(['reviews_list'], axis=1)
dataset = pd.concat([dataset, dataset_sparce_matrix], axis=1)

Here the reviews get converted into a matrix of numbers to be fed into our super learner model.

In [None]:
# ---------------------------------------------------------------- get the dataset
def get_dataset(dataset_y, dataset_x):
    """Return ``(X, y)``: ``dataset_x`` unchanged and the underlying values of ``dataset_y``."""
    return dataset_x, dataset_y.values

# ---------------- get a stacking ensemble of models
def get_stacking():
    """Build the stacking ensemble.

    Nine base classifiers (level 0) are combined by a logistic-regression meta
    learner (level 1), using 5-fold cross-validation inside the stack.
    """
    # Level 0: the base models.
    base_learners = [
        ('Logistic Regression', LogisticRegression()),
        ('K Nearest Neighbour', KNeighborsClassifier()),
        ('Decision Tree Classifier', DecisionTreeClassifier()),
        ('Support Vector Classifier', SVC()),
        ('Gaussian Navy Bayse', GaussianNB()),
        ('ADA boost', AdaBoostClassifier()),
        ('Bagging Classifier', BaggingClassifier(n_estimators=10)),
        ('Random Forest Classifier', RandomForestClassifier(n_estimators=10)),
        ('Extra Trees Classifier', ExtraTreesClassifier(n_estimators=10)),
    ]
    # Level 1: the meta learner that combines the base predictions.
    meta_learner = LogisticRegression()
    return StackingClassifier(estimators=base_learners, final_estimator=meta_learner, cv=5)

# ---------------- get a list of models to evaluate
def get_models():
    """Return a name -> estimator dict: each base classifier on its own, plus the stacking ensemble."""
    return {
        'Logistic Regression': LogisticRegression(),
        'K Nearest Neighbour': KNeighborsClassifier(),
        'Decision Tree Classifier': DecisionTreeClassifier(),
        'Support Vector Classifier': SVC(),
        'Gaussian Navy Bayse': GaussianNB(),
        'ADA boost': AdaBoostClassifier(),
        'Bagging Classifier': BaggingClassifier(),
        'Random Forest Classifier': RandomForestClassifier(),
        'Extra Trees Classifier': ExtraTreesClassifier(),
        'Stacking': get_stacking(),
    }

# ---------------- evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    """Return the accuracy scores of ``model`` on ``(X, y)``.

    Uses stratified 10-fold cross-validation repeated 3 times with a fixed
    seed; a failing fit raises instead of being scored.
    """
    splitter = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    return cross_val_score(
        model, X, y,
        scoring='accuracy',
        cv=splitter,
        n_jobs=-1,
        error_score='raise',
    )

In [None]:
# ---------------- evaluate all models, plot the results, and predict on the unrated restaurants
# NOTE(review): the original read from `dataset_train_refined`, a name that is
# never assigned anywhere in the visible notebook; the cleaned training frame
# `dataset` is assumed to be what was meant -- confirm.
X, y = get_dataset(dataset['rate'], pd.get_dummies(dataset.drop(columns=['rate'])))
models = get_models()
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, X, y)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

# One box per model; boxes are drawn at positions 1..N, so the model names are
# attached through xticks.
plt.boxplot(results, showmeans=True)
plt.xticks(range(1, len(names) + 1), names, rotation=90)
plt.show()

model_test = get_stacking()
model_test.fit(X, y)
# Present the test features in exactly the column layout the model was fitted
# on (same columns, same order; a column missing from the test frame is 0).
test_features = pd.get_dummies(dataset_test).reindex(columns=X.columns, fill_value=0)
yhat = model_test.predict(test_features)
dataset_test['rate'] = yhat

The above is the structure of our super learner model used for prediction.

The number of K-folds used is 10 due to system limitations,
and the vocabulary is capped at 1000 words (max_features = 1000) out of
all the words used, also due to system limitations.
The scores are reported as (name, mean(scores),
std(scores)). The mean accuracies and standard deviations
are written out to better understand what each model is
doing.

Logistic regression is chosen as level 1 (the meta learner) in the stacking,
to determine the best combination of the base algorithms for the train set
provided.

The stacking super learner provided the best score,
94.4% accuracy with a standard deviation of just 0.7%,
which may look like a case of overfitting the data. In
this problem, however, it is appropriate, because we are predicting the rating of
restaurants which are not yet rated. Generating ratings as
close as possible to the trained model is good, not overfitting.
Ideally the number of K-folds should be near 50 to 100 and the
number of words used in training should be 15000 to 20000, but
due to hardware limitations the script with 10 K-folds
and 1000 words took almost 4 hours to run on my
system.

This super learner will now be used to predict the
rating of the restaurants in the test dataset, i.e. the Zomato
restaurants whose rate is still NEW.

The prediction algorithm also shows that time is a
major factor in the rating and review system. The deviation
is close to zero and the accuracy is about 90%, which is a very good way
to understand the two major aspects of a feature,
QUALITY and QUANTITY. Here the standard deviation of
emotions at any rating is minimal, which contributes to the
QUALITY of the prediction, while the 90% accuracy is
directly related to the QUANTITY of emotions used in
differently rated restaurants.

The HIGH QUALITY and GOOD ACCURACY are an
indication that the reviews of restaurants of any rating are of the same kind:
the greater the quantity, the higher the rating of that
restaurant, which directly relates the rating system to the time since the
establishment of the restaurant.