In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')
import re
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Collect every file path below the Kaggle input directory, then print them one per line.
input_paths = [
    os.path.join(root, fname)
    for root, _, fnames in os.walk('/kaggle/input')
    for fname in fnames
]
for path in input_paths:
    print(path)

print("Done")

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Parse the CSV once and hand the recommender an independent copy,
# instead of reading and parsing the same file twice.
ZOMATO_CSV = '/kaggle/input/zomato-bangalore-restaurants/zomato.csv'
df = pd.read_csv(ZOMATO_CSV) #for regressors
df1 = df.copy() #for recommendation

In [None]:
df.head(5)

In [None]:
df1.tail(5)

In [None]:
# RESTAURANT RECOMMENDATION
# This code uses the zomato dataset to recommend restaurants similar to one you
# give as input (your liking). It applies content-based filtering on the review
# text and then sorts the candidates by rating (highest to lowest).

# Preprocessing of the second copy of the dataset (df1)

# The recommendation algorithm does not need these features
zomato = df1.drop(['url', 'dish_liked', 'phone'], axis=1)

# Remove duplicate rows and any row with a missing value
zomato = zomato.drop_duplicates().dropna(how='any')

# Shorter column names
zomato = zomato.rename(columns={'approx_cost(for two people)': 'price',
                                'listed_in(type)': 'type',
                                'listed_in(city)': 'city'})

# Price arrives as text with thousands separators (e.g. "1,600") -> float
zomato['price'] = (
    zomato['price'].astype(str).str.replace(',', '', regex=False).astype(float)
)

# Drop the non-numeric rating placeholders, then turn "4.1/5" into 4.1
zomato = zomato.loc[~zomato['rate'].isin(['NEW', '-'])].reset_index(drop=True)
zomato['rate'] = (
    zomato['rate'].str.replace('/5', '', regex=False).str.strip().astype('float')
)

zomato['name'] = zomato['name'].str.title()

# Assign the result back explicitly: an in-place replace on an attribute-accessed
# column is not guaranteed to write through to the frame.
zomato['online_order'] = zomato['online_order'].replace({'Yes': True, 'No': False})
zomato['book_table'] = zomato['book_table'].replace({'Yes': True, 'No': False})

# Mean Rating: average `rate` over all rows that share a restaurant name.
# groupby/transform replaces the per-restaurant loop with chained assignment,
# which was quadratic and relied on writing through a temporary.
zomato['Mean Rating'] = zomato.groupby('name')['rate'].transform('mean')

# Rescale the mean rating to the range 1-5, rounded to two decimals
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range = (1,5))
zomato[['Mean Rating']] = scaler.fit_transform(zomato[['Mean Rating']]).round(2)

In [None]:
# Lower Casing
zomato["reviews_list"] = zomato["reviews_list"].str.lower()

## Removal of URLS
# This has to run BEFORE punctuation is stripped: once "://" and "." are gone,
# the pattern below can no longer recognise a URL.
URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
def remove_urls(text):
    """Return `text` with http(s):// and www. URLs removed."""
    return URL_PATTERN.sub(r'', text)

zomato["reviews_list"] = zomato["reviews_list"].apply(remove_urls)

# Removing of punctuation
import string
punct = string.punctuation
PUNCT_TABLE = str.maketrans('', '', punct)  # built once, reused for every review
def remove_punctuation(text):
    """Return `text` with every character of string.punctuation deleted."""
    return text.translate(PUNCT_TABLE)

zomato["reviews_list"] = zomato["reviews_list"].apply(remove_punctuation)

# Removing commonly used words(stop words)
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """Return `text` without whitespace-separated tokens found in STOPWORDS."""
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

zomato["reviews_list"] = zomato["reviews_list"].apply(remove_stopwords)

zomato[['reviews_list', 'cuisines']].sample(5)

In [None]:
 #RESTAURANT NAMES:
restaurant_names = zomato['name'].unique().tolist()


def get_top_words(column, top_nu_of_words, nu_of_word):
    """Return the most frequent n-grams of a text column.

    column          -- iterable of documents passed to CountVectorizer.fit_transform
    top_nu_of_words -- how many (term, count) pairs to return
    nu_of_word      -- n-gram range handed to CountVectorizer as `ngram_range`

    Returns a list of (term, total count) tuples, highest count first.
    English stop words are excluded by the vectorizer.
    """
    vectorizer = CountVectorizer(ngram_range=nu_of_word, stop_words='english')
    # Column-wise sum gives one total per vocabulary entry (a 1 x n_terms matrix).
    totals = vectorizer.fit_transform(column).sum(axis=0)
    ranked = sorted(
        ((term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:top_nu_of_words]
    
# Columns not used by the recommender; errors='ignore' keeps the cell re-runnable
# after the columns have already been removed.
zomato = zomato.drop(columns=['address', 'rest_type', 'type', 'menu_item', 'votes'],
                     errors='ignore')
import pandas

# Draw a random sample (half of the data). The seed is fixed so that the sample,
# and therefore the recommendations built on it, are the same on every run.
df_percent = zomato.sample(frac=0.5, random_state=42)

In [None]:
# Index the sample by restaurant name; the guard lets the cell be re-run after
# 'name' has already been moved into the index.
if 'name' in df_percent.columns:
    df_percent = df_percent.set_index('name')
# Position -> restaurant name lookup used by recommend()
indices = pd.Series(df_percent.index)

# Creating tf-idf matrix
# Term Frequency-Inverse Document Frequency (TF-IDF) vectorization turns the review
# text into a numeric matrix that weights each term by how distinctive it is,
# so it can be used by a machine learning algorithm.
# min_df=1 (the default) keeps every term, as the former min_df=0 did, but is also
# accepted by scikit-learn releases that validate the parameter range.
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tfidf.fit_transform(df_percent['reviews_list'])

# Pairwise similarity between all reviews (dense rows x rows matrix)
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
#Recommendation algorithm
def recommend(name, cosine_similarities = cosine_similarities):
    """Recommend up to 10 restaurants whose reviews resemble those of `name`.

    name                -- restaurant name as it appears in the index of df_percent
    cosine_similarities -- square similarity matrix aligned with the rows of df_percent

    Returns a DataFrame (indexed by restaurant name) with the columns
    'cuisines', 'Mean Rating' and 'price', sorted by 'Mean Rating' descending.

    Raises IndexError if `name` is not present in the sampled data.
    """
    # Row position of the first entry carrying the requested name
    matches = indices[indices == name].index
    if len(matches) == 0:
        raise IndexError('%s is not in the sampled restaurant data' % name)
    idx = matches[0]

    # Similarity of every row to the requested one, largest first
    score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending=False)

    # Positions of the 31 best matches (the requested row itself is among them)
    top30_indexes = list(score_series.iloc[0:31].index)

    # Take exactly the matched rows in one positional lookup. This replaces the
    # row-by-row DataFrame.append (removed in pandas 2.0) and the unseeded
    # .sample() that picked an arbitrary same-named row.
    df_new = df_percent.iloc[top30_indexes][['cuisines', 'Mean Rating', 'price']]

    # Rows sharing cuisines, rating and price are all discarded (keep=False),
    # then the 10 highest rated remain
    df_new = df_new.drop_duplicates(subset=['cuisines','Mean Rating', 'price'], keep=False)
    df_new = df_new.sort_values(by='Mean Rating', ascending=False).head(10)

    print('We recommend these %s restaurants because you liked %s : ' % (str(len(df_new)), name))

    return df_new
recommend('Pizza Hut')

In [None]:
df.info()
''' Column description

1. url : contains the url of the restaurant in the zomato website
2. address : contains the address of the restaurant in Bengaluru
3. name : contains the name of the restaurant
4. online_order : whether online ordering is available in the restaurant or not
5. book_table : table book option available or not
6. rate : contains the overall rating of the restaurant out of 5
7. votes : contains total number of rating for the restaurant as of the above mentioned date
8. phone : contains the phone number of the restaurant
9. location : contains the neighborhood in which the restaurant is located
10. rest_type : restaurant type
11. dish_liked : dishes people liked in the restaurant
12. cuisines : food styles, separated by comma
13. cost_two : contains the approximate cost for meal for two people
14. reviews_list : list of tuples containing reviews for the restaurant, each tuple holding a rating and the customer's review text
15. menu_item : contains list of menus available in the restaurant
16. service_type : type of meal
17. serve_to : contains the neighborhood in which the restaurant is listed
'''

In [None]:
#PREPROCESSING
#null values, cleaning, data type conversions(object to analysable format), renaming, etc

# Percentage of missing values per column, rounded to three decimals
missing_pct = (df.isnull().sum() / df.shape[0] * 100).round(3)
pd.DataFrame(missing_pct, columns = ['Missing'])
# Author's reading of this table: dish_liked is missing in more than 50% of the rows,
# so dropping rows with a missing dish_liked would lose over half of the data.

In [None]:
# Dropping some columns - url, address, phone.
# errors='ignore' plus reassignment keeps the cell re-runnable: the in-place drop
# raised KeyError the second time it was executed.
df = df.drop(columns=['url', 'address', 'phone'], errors='ignore')

# Renaming a few columns
df = df.rename(columns = {"approx_cost(for two people)" : "cost_two",
                          "listed_in(type)" : "service_type",
                          "listed_in(city)" : "serve_to"})

In [None]:
# Converting the cost_two variable to a number.
# The "," thousands separator (example 1,600) must be removed before conversion;
# missing values become the text 'nan', which float() turns back into NaN.
df['cost_two'] = (
    df['cost_two'].astype(str).str.replace(',', '', regex=False).astype(float)
)

# "NEW" and "-" are placeholders for "no rating" and the '/5' suffix must be
# removed before conversion. np.nan is used because the np.NaN alias no longer
# exists in NumPy 2.
df['rate'] = (
    df['rate']
    .replace(['NEW', '-'], np.nan)
    .astype(str)
    .str.replace('/5', '', regex=False)
    .astype(float)
)

In [None]:
#VISUALIZATION

plt.rcParams['figure.figsize'] = 16,8
# Pass the ratings as x= (as the later countplot cells do); a bare positional
# Series is not interpreted as the x variable by newer seaborn releases.
sns.countplot(x=df['rate'], palette='Set2')
plt.title("Count plot of the ratings")
plt.show()
# Author's reading: ratings look roughly bell-shaped, with 3.4-4.2 stars being the most common

In [None]:
#box plot comparisons

# Keep only rows whose book_table value is among the 10 most frequent values
frequent_booking_values = df['book_table'].value_counts().head(10).index
booking_rows = df.loc[df['book_table'].isin(frequent_booking_values)]

plt.figure(figsize=(16, 8))
plt.subplot(1, 2, 1)
sns.boxplot(x='book_table', y='cost_two', data=booking_rows);
plt.title('Online booking(0-No 1-Yes) vs Cost');
# Author's reading: cost for two people is higher in restaurants with a booking facility
# (note that the box plot's centre line is the median, not the mean)

In [None]:
# Cost for two, split by online-ordering availability.
# Keep only rows whose online_order value is among the 10 most frequent values.
frequent_order_values = df['online_order'].value_counts().head(10).index
order_rows = df.loc[df['online_order'].isin(frequent_order_values)]

plt.figure(figsize=(16, 8))
plt.subplot(1, 2, 1)
sns.boxplot(x='online_order', y='cost_two', data=order_rows);
plt.title('Online ordering(0-No 1-Yes) vs Cost');

In [None]:
# Rating spread for the 10 most common cost_two values
common_costs = df['cost_two'].value_counts().head(10).index
common_cost_rows = df.loc[df['cost_two'].isin(common_costs)]

plt.figure(figsize=(16, 8))
plt.subplot(1, 2, 1)
sns.boxplot(x='cost_two', y='rate', data=common_cost_rows);
plt.title('Cost vs Rating');

In [None]:
# Share of restaurants that accept online orders

# value_counts() orders its result by frequency, so the labels are derived from
# the counted values instead of being hard-coded in an assumed order.
ORDER_LABELS = {'Yes': 'Online_orders', 'No': 'No_online_orders'}
order_counts = df['online_order'].value_counts()
order_labels = [ORDER_LABELS.get(value, str(value)) for value in order_counts.index]

trace = go.Pie(labels = order_labels, values = order_counts.values,
               textfont=dict(size=15), opacity = 0.8,
               marker=dict(colors=['pink','teal'],
                           line=dict(color='#000000', width=1.5)))


layout = dict(title =  'Distribution of order variable')

fig = dict(data = [trace], layout=layout)
py.iplot(fig)
# The chart shows the share of restaurant listings offering online ordering;
# it does not measure how often customers order online versus dining out.

In [None]:
# Finding the locations with the largest number of restaurants

top_locations = df['location'].value_counts().nlargest(10)
top_locations.plot(kind='barh', color='teal')
plt.title("Number of restaurants by location")
plt.xlabel("Restaurant counts")
plt.show()
# Author's reading of the bar chart: BTM has the largest number of restaurants

In [None]:
# Restaurants serving in

top_serve_to = df['serve_to'].value_counts().nlargest(10)
top_serve_to.plot(kind = 'barh', color = 'teal')
plt.title("Number of restaurants listed in a location")
plt.xlabel("Count")
plt.legend()
plt.show()
# Author's reading: most restaurants are listed for BTM and different blocks of Koramangala

In [None]:
# Rating counts, split by whether the restaurant accepts online orders
sns.countplot(data=df, x='rate', hue='online_order', palette='Set2')
plt.title("Distribution of restaurant rating over online order facility")
plt.show()
# Author's reading: restaurants with online ordering tend to have a higher rating

In [None]:
# Count individual dishes rather than whole dish_liked strings.
# assumes dish_liked holds comma-separated dish names (like cuisines) — TODO confirm
dish_counts = (
    df['dish_liked']
    .dropna()
    .str.split(',')
    .explode()
    .str.strip()
    .value_counts()
)
dish_counts.nlargest(20).plot(kind = 'barh',color='teal')
plt.title("Most liked dishes")
plt.xlabel("Count")
plt.show()
# NOTE(review): the earlier conclusion ("Biriyani is the most liked dish") was drawn
# from counts of whole dish lists; re-check it against this per-dish chart.

In [None]:
# The 20 most frequent restaurant types
top_rest_types = df['rest_type'].value_counts().nlargest(20)
top_rest_types.plot(kind = 'barh',color='teal')
plt.title("Restaurant type")
plt.xlabel("Count")
plt.legend()
plt.show()
# Author's reading: quick bites are the most common type

In [None]:
# The 20 restaurant names that occur most often in the data
top_names = df['name'].value_counts().nlargest(20)
top_names.plot(kind = 'barh',color='teal')
plt.legend()
plt.show()
# Author's reading: Cafe Coffee Day has the most entries in Bangalore
# (this counts rows per name, which is not the same as popularity)

In [None]:
plt.figure(figsize=(20,10))
# The title and conclusion are about table booking, so group by book_table
# (the cell previously plotted online_order, copied from the earlier chart).
sns.countplot(x = df['book_table'], hue = df['rate'], palette= 'Set2')
plt.title("Distribution of restaurant rating over table booking facility")
plt.show()
# Author's reading: ratings depend on the table booking facility; restaurants that
# offer reservations have a higher rating

In [None]:
# Plotting a pie chart for table booking

# value_counts() orders its result by frequency, so the labels are derived from
# the counted values; a fixed ['available', 'not available'] order mislabels the
# slices whenever 'No' is the more frequent value.
BOOKING_LABELS = {'Yes': 'Table_booking_available', 'No': 'No_table_booking_available'}
booking_counts = df['book_table'].value_counts()
booking_labels = [BOOKING_LABELS.get(value, str(value)) for value in booking_counts.index]

trace = go.Pie(labels = booking_labels, values = booking_counts.values,
               textfont=dict(size=15), opacity = 0.8,
               marker=dict(colors=['pink','teal'],
                           line=dict(color='#000000', width=1.5)))


layout = dict(title =  'Distribution of table booking variable')

fig = dict(data = [trace], layout=layout)
py.iplot(fig)
# NOTE(review): the earlier remark that 87.5% of restaurants offer table booking was
# read off a chart with a hard-coded label order; re-read the shares from this chart.

In [None]:
plt.rcParams['figure.figsize'] = 14,7

# Left panel: the five names with the most rows in the data
plt.subplot(1,2,1)
outlet_counts = df['name'].value_counts()
outlet_counts.head().plot(kind = 'barh', color = sns.color_palette("hls", 5))
plt.xlabel("Number Of Restaurants")
plt.title("Biggest Restaurant Chain (Top 5)")

# Right panel: the same count restricted to rows rated 4.5 or higher
plt.subplot(1,2,2)
highly_rated = df[df['rate'] >= 4.5]
highly_rated['name'].value_counts().nlargest(5).plot(kind = 'barh', color = sns.color_palette("Paired"))
plt.xlabel("Number Of Restaurants")
plt.title("Biggest Restaurant Chain (Top 5) - Rating more than 4.5")
plt.tight_layout()
'''
The bigger chained restaurants in Bangalore do not necessarily have the highest rating. Cafe coffee day has almost 
100 cafes while truffles has just over 40. Truffles has a higher rating than cafe coffee day
Therefore, quality over quantity
'''

In [None]:
# Fill missing ratings with the mean of the known ratings.
# NOTE(review): 'rate' is later used as the regression target, so the imputed rows
# become training labels — confirm this is intended rather than dropping them.
mean_rate = df['rate'].mean()
df['rate'] = df['rate'].fillna(mean_rate)

# Distribution of ratings after imputation
sns.distplot(df['rate'], color = 'teal')
plt.title('Rating Distribution')
plt.show()

In [None]:
# Replacing the NaN values of the cost_two feature with the mean value
df['cost_two'] = df['cost_two'].fillna(df['cost_two'].mean())

# Distribution of the cost for two
# NOTE(review): the author describes this as normally distributed — check the
# plot for skew before relying on that.
sns.distplot(df['cost_two'], color = 'teal')
plt.title('Cost for Two Distribution')
plt.show()

In [None]:
#Categorical to numeric type for analysis

# drop_first=True leaves a single indicator column per flag, which replaces the
# original column.
for flag_column in ('online_order', 'book_table'):
    df[flag_column] = pd.get_dummies(df[flag_column], drop_first=True)

In [None]:
#One Hot Encoding on rest_type

# The prefix keeps these indicator columns distinguishable from the service_type
# and location indicators once all of them are concatenated into one frame.
get_dummies_rest_type = pd.get_dummies(df.rest_type, prefix='rest_type')
get_dummies_rest_type.head(5)

In [None]:
#One Hot Encoding on location

# The prefix keeps these indicator columns distinguishable from the rest_type
# and service_type indicators once all of them are concatenated into one frame.
get_dummies_location = pd.get_dummies(df.location, prefix='location')
get_dummies_location.head(5)

In [None]:
#One Hot Encoding on service_type

# The prefix keeps these indicator columns distinguishable from the rest_type
# and location indicators once all of them are concatenated into one frame.
get_dummies_service_type = pd.get_dummies(df.service_type, prefix='service_type')
get_dummies_service_type.head(5)

In [None]:
# Concatenating the cleaned frame with the three one-hot encoded frames, column-wise
# (axis=1); rows are matched on the index they all share with df.
final_df = pd.concat([df,get_dummies_rest_type,get_dummies_service_type, get_dummies_location], axis = 1)
final_df.head(5)

In [None]:
# Remove the raw text / categorical columns: the one-hot encoded versions (where
# they exist) are already part of final_df.
raw_columns = [
    "name", "rest_type", "location", 'cuisines', 'dish_liked', 'reviews_list',
    "menu_item", "service_type", "serve_to",
]
final_df = final_df.drop(columns=raw_columns)
final_df.head()

In [None]:
# df still contains text columns, and DataFrame.corr() raises on those in pandas 2.x,
# so restrict the correlation to numeric and boolean columns explicitly.
numeric_df = df.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_df.corr(), annot=True, cmap="RdYlGn", annot_kws={"size":15})

In [None]:
# Separate the target from the predictors

y = final_df['rate'] #dependent

x = final_df.drop(columns='rate') #independent


In [None]:
x.head()

In [None]:
y.head()

In [None]:
from sklearn.ensemble import ExtraTreesRegressor

# Extra Trees Regressor is used to obtain an importance score (relation to the
# output) for each feature (column). The seed is fixed so that the scores and the
# top-10 chart built from them are the same on every run.
model = ExtraTreesRegressor(random_state=42)
model.fit(x,y)
print(model.feature_importances_)

In [None]:
# Bar chart of the 10 largest feature importances
# The higher the value, the higher the relevance
feat_importances = pd.Series(model.feature_importances_, index=x.columns)
top_features = feat_importances.nlargest(10)
top_features.plot(kind='barh',color='teal')
plt.show()


In [None]:
# The data is split into two sets - one for training the model and one for testing it.
# We use 2 models - Linear Regression and Decision Tree Regressor

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

# 30% of the rows are held out for testing; the seed is fixed so that both models
# are scored on the same split on every run.
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.30, random_state=42)

In [None]:
#LINEAR REGRESSION

from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and training can be chained
lr = LinearRegression().fit(x_train, y_train)

# Predicted ratings for the held-out rows
lr_pred = lr.predict(x_test)

In [None]:
# R-squared (R2) is a statistical measure that represents the proportion of the
# variance of a dependent variable that is explained by the independent variable
# or variables in a regression model.
print('LINEAR REGRESSION\n')

r2 = r2_score(y_test,lr_pred)
print('R-Squared Score: ',r2*100)
#ERRORS
#mean absolute error(MAE) , mean absolute percentage error(MAPE), accuracy

lr_errors = abs(lr_pred - y_test)
# MAE is the mean of the absolute errors (the mean of the predictions was printed
# before); the value is in rating points, so the 'degrees' unit was dropped.
print('Mean Absolute Error:', round(np.mean(lr_errors), 2))
mape = 100 * (lr_errors / y_test)
# "Accuracy" here is defined as 100 minus the mean absolute percentage error
lr_accuracy = 100 - np.mean(mape)
print('Accuracy :', round(lr_accuracy, 2), '%')

In [None]:
# Distribution of the linear-regression residuals (true rating minus predicted rating).
# NOTE(review): the author describes these as normally distributed — confirm from the plot.
sns.distplot(y_test-lr_pred,color='y')

In [None]:
# True and linear-regression-predicted ratings, each plotted against the third
# feature column of the test set

feature_name = x_test.columns[2]

plt.figure(figsize=(12,7))

plt.scatter(y_test,x_test.iloc[:,2],color="black",label="True rate")
plt.scatter(lr_pred,x_test.iloc[:,2],color="yellow",label="Predicted rate")
plt.title("True rate vs Predicted rate, Linear regression",size=20,pad=15)
plt.xlabel('Rating',size = 15)
# The y axis shows a feature's values, not a frequency
plt.ylabel(feature_name,size = 15)
plt.legend()

In [None]:
#DECISION TREE REGRESSOR

from sklearn.tree import DecisionTreeRegressor

# The default split criterion is already mean squared error; the explicit 'mse'
# spelling is rejected by newer scikit-learn releases, so it is left at its default.
# The seed makes tie-breaking between equally good splits reproducible.
dtree = DecisionTreeRegressor(random_state=42)
dtree.fit(x_train, y_train)
dtree_pred = dtree.predict(x_test)

In [None]:
print('DECISION TREE REGRESSOR \n')

r2 = r2_score(y_test,dtree_pred)
print('R-Square Score: ',r2*100)

#ERRORS
#absolute errors, accuracy
dtree_errors = abs(dtree_pred - y_test)
# MAE is the mean of the absolute errors (the mean of the predictions was printed
# before); the value is in rating points, so the 'degrees' unit was dropped.
print('Mean Absolute Error:', round(np.mean(dtree_errors), 2))
mape = 100 * (dtree_errors / y_test)
# "Accuracy" here is defined as 100 minus the mean absolute percentage error
dtree_accuracy = 100 - np.mean(mape)
print('Accuracy:', round(dtree_accuracy, 2), '%.')

In [None]:
# Distribution of the decision-tree residuals (true rating minus predicted rating)
sns.distplot(y_test-dtree_pred,color='y')

In [None]:
# True and decision-tree-predicted ratings, each plotted against the third
# feature column of the test set

feature_name = x_test.columns[2]

plt.figure(figsize=(12,7))

# Both series carry a label so that plt.legend() has entries to show
plt.scatter(y_test,x_test.iloc[:,2],color="black",label="True rate")
plt.scatter(dtree_pred,x_test.iloc[:,2],color="yellow",label="Predicted rate")
plt.title("True rate vs Predicted rate, Decision Tree Regressor",size=20,pad=15)
plt.xlabel('Rating',size = 15)
# The y axis shows a feature's values, not a frequency
plt.ylabel(feature_name,size = 15)
plt.legend()