#### Please note that some code is commented out because of its high computational requirements.

## Data Wrangling


#### Standard Imports

In [None]:
#Necessary Imports
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
pd.options.display.float_format = '{:.2f}'.format #Turning off scientific notations
# import sweetviz

In [None]:
# Text Analysis related imports
import nltk 
from nltk.corpus import stopwords
from collections import Counter
from wordcloud import WordCloud, STOPWORDS 

In [None]:
# Model building Imports
import statsmodels.api as sm
from sklearn import preprocessing
from sklearn import tree, metrics
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
# from sklearn.externals.six import StringIO  
from IPython.display import Image  
# import pydotplus

#### Data Ingestion

In [None]:
# Data ingestion: read the raw Zomato restaurant listing from the Kaggle input folder.
raw_csv = "../input/zomato-restaurants-in-india/zomato_restaurants_in_India.csv"
df = pd.read_csv(raw_csv)

# Report (rows, columns), then show the first record transposed so that every
# column name is listed next to a sample value.
print(df.shape)
df.iloc[:1].T

#### Checking EDA Report through Sweetviz Package on the whole data

In [None]:
# report = sweetviz.analyze(df)
# report.show_html('full_data.html')

#### Check for duplicates

In [None]:
df.duplicated().sum() 

In [None]:
df['res_id'].duplicated().sum()

In [None]:
df=df.drop_duplicates(subset='res_id') 
#Dropped duplicates on the basis of res_id as res_id is unique for every restaurant and for each branch, after this zipcode will be removed
df.shape

#### Missing value treatment

In [None]:
# Fraction of missing values per column, worst first
df.isnull().mean().sort_values(ascending=False)

In [None]:
# Zip code has to be dropped: about 80% of its values are missing and it contributes little to the analysis.
# Locality can very well be used in place of zip code.

# Also, res_id now is just an identifier, and isn't of much use, so dropping this as well.

# country_id is redundant as all restaurants pertain to India only, so dropping it.

# url isn't much help here either, customers will order from Zomato, and Zomato already has all info, so dropping it.

# Address and locality carry extra detail that isn't needed, because the locality_verbose variable is already available.

# Dropping city_id, city name is available here

# Currency field has to go, currency is INR only!

# opentable_support has all 0 values, so this should be dropped too

In [None]:
# Identifier, constant-valued and redundant columns that are not used in the analysis
redundant_columns = [
    'res_id', 'url', 'country_id', 'currency', 'address',
    'locality', 'city_id', 'zipcode', 'opentable_support',
]
df = df.drop(columns=redundant_columns)

In [None]:
df.describe(include='all').T

#### Data Prep : Fixing Incorrect Data Types

In [None]:
df.info()

In [None]:
#delivery, takeaway and price_range columns are categorical and are stored as int, so this needs to be fixed.
df['delivery'] = df['delivery'].astype(object)
df['takeaway'] = df['takeaway'].astype(object)
df['price_range'] = df['price_range'].astype(object)

In [None]:
df['delivery'].value_counts(dropna=False)

In [None]:
df['takeaway'].value_counts(dropna=False)

In [None]:
# Assuming -1 is a data entry error, so -1 is to be encoded as 1 for both delivery and takeaway

df['delivery'] = df['delivery'].replace(-1, 1)

In [None]:
df['takeaway'] = df['takeaway'].replace(-1,1)

In [None]:
print(df['delivery'].value_counts(normalize=True)) #99% offer delivery, 1% don't
print('*****************************************')
print(df['takeaway'].value_counts(normalize=True)) #100% offer takeaway

#### Checking correlations

In [None]:
# Correlation heatmap of the numeric columns.
# The frame still holds text columns (name, cuisines, ...); pandas >= 2.0 no
# longer drops those silently inside corr() and raises instead, so the
# numeric/bool columns are selected explicitly. This works on every pandas version.
plt.figure(figsize=(7,7))
numeric_df = df.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_df.corr(), cmap='coolwarm', annot=True)
plt.show()

In [None]:
# Photo_count and votes are highly correlated
# Rest all seem to be in acceptable range

#### Visualising the Dependent Variable (Aggregate Rating)

In [None]:
df['aggregate_rating'].describe()

In [None]:
df['aggregate_rating'] = df['aggregate_rating'].replace(0, np.nan)
df['average_cost_for_two'] = df['average_cost_for_two'].replace(0, np.nan)

In [None]:
df[df['aggregate_rating'] == 0]

In [None]:
# Extracting Not Null values only
df_nn = df[pd.notnull(df['aggregate_rating'])]

In [None]:
# Distribution of the ratings that are present (rows without a rating are excluded)
fig, ax = plt.subplots()
ax.hist(df_nn['aggregate_rating'])
ax.set_title('Histogram of aggregate_rating (not null)')
ax.set_xlabel('aggregate_rating')
ax.set_ylabel('Counts')
plt.show()

In [None]:
# Zomato has a rating scale of 1-5 only, so restaurants rated 0 seem incorrect.
# 0 is missing value here (could be unrated as rating_text for these values says not rated)

In [None]:
# Imputing missing values for average_cost_for_two with the column median.
# The result is assigned back through assign() instead of calling
# fillna(inplace=True) on a selected column: the in-place form acts on a
# temporary object and may leave df_nn unchanged (it is a no-op under
# pandas Copy-on-Write).
cost_median = df_nn['average_cost_for_two'].median()
df_nn = df_nn.assign(
    average_cost_for_two=df_nn['average_cost_for_two'].fillna(cost_median)
)

In [None]:
# Scatter of cost against rating: marker size follows the cost, colour follows the rating
sns.relplot(
    data=df_nn,
    x='aggregate_rating',
    y='average_cost_for_two',
    hue='aggregate_rating',
    size='average_cost_for_two',
    sizes=(15, 200),
)
plt.title("Average Cost for Two vs. AggregateRating")

plt.show()

In [None]:
# # # Checking Sweetviz report again on this trimmed and cleaned version of the data

# report_trim = sweetviz.analyze(df)
# report_trim.show_html('data_trim.html')

#### Exploring the text columns

In [None]:
# Establishment
# Preview of the clean-up on the first remaining row. The lookup is positional
# (.iloc[0]): row label 0 is not guaranteed to survive the earlier filtering,
# so a label lookup with [0] can raise KeyError.
df_nn['establishment'].iloc[0].replace('[','').replace(']','').replace("'",'')

# Remove every '[', ']' and "'" character from the column
# (presumably the raw values look like "['Quick Bites']" - confirm against the data).
# The vectorised .str.replace leaves missing values as NaN instead of failing,
# and assign() writes the column explicitly rather than through attribute access.
df_nn = df_nn.assign(
    establishment=df_nn['establishment'].str.replace(r"[\[\]']", "", regex=True)
)

In [None]:
est_wc = ' '.join(df_nn['establishment'])

In [None]:
# Word Cloud 
wordcloud = WordCloud(width = 3000, height = 3000, 
                background_color ='black', 
                min_font_size = 10, random_state=100).generate(est_wc) 
  
# plot the WordCloud image                        
plt.figure(figsize = (8, 8), facecolor = None) 
plt.imshow(wordcloud) 
plt.axis("off")
plt.xlabel('Word Cloud')
plt.tight_layout(pad = 0) 

print("Word Cloud of Establishment!!")
plt.show()

In [None]:
df_nn['establishment'].value_counts(dropna=False).head()

In [None]:
# Quick Bites, Casual Dining, Cafe are the dominant establishment types

In [None]:
# Highlights
df_nn.highlights[0].replace('[','').replace(']','').replace("'",'')

df_nn.highlights=df_nn.highlights.apply(lambda x:x.replace('[','').replace(']','').replace("'",''))

In [None]:
df_nn['highlights'].value_counts().head()

In [None]:
# Since I want to do some analysis on the highlights vs ratings,
# it is better tosplit the values of each record to extract different words, 
# Else, the whole data frame will become really cluttered
subset = df_nn[['highlights', 'aggregate_rating']]

In [None]:
high_split = subset['highlights'].str.get_dummies(sep = ",")

high_split

In [None]:
subset = pd.concat([subset, high_split], axis=1).drop('highlights', axis = 1)
subset

In [None]:
subset.columns

In [None]:
# Declare an explanatory variable, called X,and assign it the result of dropping 'Name' and 'AdultWeekend' from the df
X = subset.drop(['aggregate_rating'], axis=1)

# Declare a response variable, called y, and assign it the AdultWeekend column of the df 
y = subset['aggregate_rating'] 

# Here we use the StandardScaler() method of the preprocessing package, and then call the fit() method with parameter X 
scaler = preprocessing.StandardScaler().fit(X)

# Declare a variable called X_scaled, and assign it the result of calling the transform() method with parameter X 
X_scaled=scaler.transform(X)

In [None]:
y = y.ravel()

In [None]:
gini_model = tree.DecisionTreeRegressor(criterion = 'mse', random_state=5)

gini_model.fit(X, y)

In [None]:
feature_imp=pd.Series(gini_model.feature_importances_,index=X.columns)
a = feature_imp.sort_values(ascending=False).head(20)
a

In [None]:
# Table booking recommended, Credit Card, Digital Payments Accepted, Outdoor seating, etc are the top highlights
# affecting Ratings (out of all highlights)

#### Checking if cuisine has an effect on rating 

In [None]:
subset2 = df_nn[['cuisines', 'aggregate_rating']]

In [None]:
cuisines = subset2['cuisines'].str.get_dummies(sep = ',')
cuisines

In [None]:
subset2 = pd.concat([subset2, cuisines], axis=1).drop('cuisines', axis = 1)
subset2

In [None]:
#Declare an explanatory variable, called X,and assign it the result of dropping 'aggregate_rating' from the df
X2 = subset2.drop(['aggregate_rating'], axis=1)

# Declare a response variable, called y, and assign it the aggregate_rating column of the df 
y2 = subset2['aggregate_rating'] 

# Here we use the StandardScaler() method of the preprocessing package, and then call the fit() method with parameter X 
scaler2 = preprocessing.StandardScaler().fit(X2)


In [None]:
# Declare a variable called X_scaled, and assign it the result of calling the transform() method with parameter X 
X_scaled2=scaler2.transform(X2)

y2 = y2.ravel()

In [None]:
dt2 = tree.DecisionTreeRegressor(criterion = 'mse', random_state=5)

dt2.fit(X2, y2)

In [None]:
feature_imp=pd.Series(dt2.feature_importances_,index=X2.columns)
b = feature_imp.sort_values(ascending=False).head(20)
b

In [None]:
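# Italian, Asian and Chinese cuisines are the most important cuisine features for predicting the rating.
# Note: feature importance shows relevance only, not whether these cuisines are rated higher or lower.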

#### Developing train test datasets

In [None]:
df_nn.columns # Using the subset without NULL Values for Aggregate_Rating

In [None]:
X = df_nn.drop(['aggregate_rating', 'name', 'establishment', 'locality_verbose', 'cuisines', 'timings', 'highlights',
               'rating_text'], axis=1)
y = df_nn['aggregate_rating']

In [None]:
X = pd.get_dummies(X)
X

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=5)

In [None]:
# Decision Tree Regressor
# Squared error is the default split criterion. It is left implicit because
# the spelling 'mse' was removed in scikit-learn 1.2, while its replacement
# 'squared_error' does not exist before 1.0.
dt_model = tree.DecisionTreeRegressor(random_state=5, max_features='sqrt')

dt_model.fit(X_train, y_train)

In [None]:
# dot_data = StringIO()


# tree.export_graphviz(dt_model, out_file=dot_data,  
#                 filled=True, rounded=True,
#                 special_characters=True, feature_names=X_train.columns) 


# graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
# Image(graph.create_png())

In [None]:
y_pred_dt = pd.Series(dt_model.predict(X_test))

from sklearn.model_selection import cross_val_score

np.mean(cross_val_score(dt_model, X_test, y_test, cv=10))

In [None]:
# RMSE - DT Model
from sklearn.metrics import mean_squared_error
mse_dt = mean_squared_error(y_test, y_pred_dt)
rmse_dt = np.sqrt(mse_dt)
rmse_dt

In [None]:
# RandomForestRegressor

reg_model = RandomForestRegressor(criterion = 'mse', random_state=5, max_features='sqrt')

reg_model.fit(X_train, y_train)

In [None]:
y_pred_reg = pd.Series(reg_model.predict(X_test))

np.mean(cross_val_score(reg_model, X_test, y_test, cv=10))

In [None]:
# RMSE - RF Model
mse_rf = mean_squared_error(y_test, y_pred_reg)
rmse_rf = np.sqrt(mse_rf)
rmse_rf

In [None]:

# from lazypredict.Supervised import LazyRegressor

# reg = LazyRegressor(verbose=0, predictions=False, custom_metric='None')
# models,predictions = reg.fit(X_train, X_test, y_train, y_test)

# models_c

In [None]:
# GB Regressor

from sklearn.ensemble import GradientBoostingRegressor
gbr = GradientBoostingRegressor(criterion = 'mse', random_state=5, max_features='sqrt')

gbr.fit(X_train, y_train)

In [None]:
y_pred_gbr = pd.Series(gbr.predict(X_test))

np.mean(cross_val_score(gbr, X_test, y_test, cv=10))

In [None]:
# RMSE - GBR Model
mse_gbr = mean_squared_error(y_test, y_pred_gbr)
rmse_gbr = np.sqrt(mse_gbr)
rmse_gbr

In [None]:
# AdaBoost Regressor
from sklearn.ensemble import AdaBoostRegressor

adr = AdaBoostRegressor(random_state=5)
adr.fit(X_train, y_train)

In [None]:
y_pred_adr = pd.Series(adr.predict(X_test))

np.mean(cross_val_score(adr, X_test, y_test, cv=10))

In [None]:
mse_adr = mean_squared_error(y_test, y_pred_adr)
rmse_adr = np.sqrt(mse_adr)
rmse_adr

In [None]:
from scipy.stats import uniform, truncnorm, randint

# Search space for the randomized search below. Only n_estimators is active;
# the other candidate distributions are kept commented out.
model_params = {
    # randomly sample the number of estimators from 1 to 109 (randint excludes its upper bound, 110)
    'n_estimators': randint(1,110),
#     # normally distributed max_features, with mean .25 stddev 0.1, bounded between 0 and 1
#     'max_features': truncnorm(a=0, b=1, loc=0.25, scale=0.1),
#     # uniform distribution from 0.01 to 0.2 (0.01 + 0.199)
#     'min_samples_split': uniform(0.01, 0.199)
}

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized search over model_params for the random forest (reg_model):
# 100 sampled settings, each scored with 5-fold cross-validation on the training data.
mod=RandomizedSearchCV(reg_model, model_params, n_iter=100, cv=5, random_state=5, n_jobs=-1) 

mod.fit(X_train,y_train)

In [None]:
# Show the full parameter set of the best estimator found by the search
from pprint import pprint
pprint(mod.best_estimator_.get_params())

In [None]:
# Building a tuned model with Best Parameters
rf_t = RandomForestRegressor(criterion = 'mse', random_state=5, 
                             max_features='sqrt', 
                             min_samples_split=2,
                            n_estimators=109, verbose=0,
                            min_samples_leaf=1)

rf_t.fit(X_train, y_train)

In [None]:
y_pred_rft = pd.Series(rf_t.predict(X_test))

np.mean(cross_val_score(rf_t, X_test, y_test, cv=5))

In [None]:
mse_rft = mean_squared_error(y_test, y_pred_rft)
rmse_rft = np.sqrt(mse_rft)
rmse_rft

#### Model Metrics and Comparison

In [None]:
# Test-set RMSE of every fitted model, one line per model
print("MODEL_METRICS_RMSE")
rmse_by_label = (
    ("RMSE for Decision Tree Regressor : ", rmse_dt),
    ("RMSE for Random Forest Regressor : ", rmse_rf),
    ("RMSE for Gradient Boosting Regressor : ", rmse_gbr),
    ("RMSE for AdaBoost Regressor : ", rmse_adr),
    ("RMSE for RandomForest_pruned Model : ", rmse_rft),
)
for rmse_label, rmse_value in rmse_by_label:
    print(rmse_label + str(rmse_value))

In [None]:
# Test-set R2 of every fitted model.
# The scores are computed from the held-out predictions made by the fitted
# models (the same predictions used for the RMSE figures). Calling
# cross_val_score on X_test/y_test instead would re-fit fresh clones on
# test-set folds and would not measure the models trained above.
print("MODEL_METRICS_R2")
print("R2 for Decision Tree Regressor : " +str(metrics.r2_score(y_test, y_pred_dt)))
print("R2 for RandomForest Regressor : " +str(metrics.r2_score(y_test, y_pred_reg)))
print("R2 for Gradient Boosting Regressor : " + str(metrics.r2_score(y_test, y_pred_gbr)))
print("R2 for Adaboost Regressor : " +str(metrics.r2_score(y_test, y_pred_adr)))
print("R2 for RandomForest Pruned Model : " + str(metrics.r2_score(y_test, y_pred_rft)))

In [None]:
# RMSE is the least for RandomForest (Pruned) Model
# R2 is maximum for RandomForest (Pruned) Model

#So, finalising this model!

#### Feature Importances

In [None]:
# Feature importances of the tuned random forest, labelled with the training
# column names; the 20 largest are shown.

fi_rft = pd.Series(data=rf_t.feature_importances_, index=X_train.columns)
ranked_importances = fi_rft.sort_values(ascending=False)
d = ranked_importances.iloc[:20]
d

# Votes and photo_count contribute max to aggregate_rating