In [27]:
import numpy as np
import pandas as pd
from sklearn import preprocessing as sk
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.utils.random import sample_without_replacement
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support as score
import matplotlib.pyplot as plt
import seaborn as sns
import re

# Hide all warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
def clean_rate(dataframe):
    dataframe = re.sub('/5', '',str(dataframe)) #Replacing all ratings to a single value
    dataframe = re.sub('-', '0', str(dataframe)) #Replacing all values containing '-' with 0 rating since the number of votes are also 0
    dataframe = re.sub('NEW', '0', str(dataframe)) #Replacing all values containing 'NEW' with 0 rating since the number of votes are also 0 and the restaurant is new
    dataframe = re.sub('nan', '0', str(dataframe)) #Replacing all values containing 'nan' with 0 rating
    return float(dataframe)

In [9]:
def clean_approx_cost(dataframe):
     dataframe = re.sub(',','', str(dataframe))
     return int(dataframe)

In [None]:
def scale_to_zero_mean_and_unit_variance(column):
    scaled_data = np.array(sk.scale(column))
    scaled_data = np.reshape(scaled_data,(scaled_data.shape[0],1))
    return scaled_data

In [None]:
def one_hot_encoding_of_column(column):
    # First, use LabelEncoder to convert Strings to numeric values as OHE does not accept Strings
    lab_enc = sk.LabelEncoder()
    lab_enc.fit(column)
    label_encoded = lab_enc.transform(column)
    
    # Reshape the label_encoded array into a Nx1 matrix as OHE requires a 2-D matrix as input
    label_encoded = np.reshape(label_encoded, (label_encoded.shape[0],1))
    
    # Perform One-Hot-Encoding
    OHE_encoder = sk.OneHotEncoder(sparse=False)
    OHE_encoder.fit(label_encoded)
    OHE_result = OHE_encoder.transform(label_encoded)

    return OHE_result

In [28]:
def multi_label_binarize_column(column):
    mlb = MultiLabelBinarizer()
    mlb_binarized = mlb.fit_transform(column)
    return mlb_binarized

## 1. Introduction

a.

## 2. Data pre-processing

In [13]:
# Load the data
df = pd.read_csv('zomato.csv')

# Read test dataset from file and name the columns
# df_test = pd.read_csv('dataset1_test.csv', header=None)
# df_test.columns = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country"]

# Drop rows from train and test which are Nan (i.e remove dirty data)
# df = df.dropna()
# df_test.dropna()
print(df.shape)
df = df.drop(columns=['url','phone','reviews_list','menu_item','listed_in(city)', 'votes', 'dish_liked'])
print(df.shape)
print(df.columns)
# fig, ax = plt.subplots()
# print(df['location'].value_counts().plot(ax=ax, kind='bar'))
# df['location'].value_counts().plot(ax=ax, kind='bar')


# sns.set(style="whitegrid")
# ax = sns.countplot(x="location", data=df)



(51717, 17)
(51717, 10)
Index(['address', 'name', 'online_order', 'book_table', 'rate', 'location',
       'rest_type', 'cuisines', 'approx_cost(for two people)',
       'listed_in(type)'],
      dtype='object')


In [18]:
df['rate'] = df['rate'].apply(clean_rate)
print(df.shape)
df.dropna(subset=['location', 'cuisines','rest_type','online_order','book_table','listed_in(type)','approx_cost(for two people)'], inplace=True)
df['approx_cost(for two people)'] = df['approx_cost(for two people)'].apply(clean_approx_cost)
df = df.drop_duplicates(subset=['name','address'], keep='first')
df.to_csv('out.csv', encoding='utf-8', index=False)

(12428, 10)


In [19]:
print(df.isna().sum())
print(df.shape)

address                        0
name                           0
online_order                   0
book_table                     0
rate                           0
location                       0
rest_type                      0
cuisines                       0
approx_cost(for two people)    0
listed_in(type)                0
dtype: int64
(12378, 10)


In [7]:
df_grouped_locations=df.groupby(['location'], as_index=False)['rate'].mean().max() # Get the location with the maximum rating
print(df_grouped_locations)
print(df.loc[df['location'] == 'Yeshwantpur'])

location    Yeshwantpur
rate            4.15556
dtype: object
                                                 address  \
39465  333, MSR Road, Gokul Extension, Mathikere, Yes...   
39490  6, 1st Cross Modal Colony, Yeshwantpur, Bangalore   
39491  40/7, Sri Sai Complex, MSRCE College Road, Mat...   
39533  2/1, 1st Cross, 14th Main, MSR Road, Mathikere...   
39543  45/1, Below Corporation Bank, Tumkur Road, RMC...   
39566  26/1, Triveni Road, Mathikere Extension, Mathi...   
39601  1052/20, Triveni Road, Gokul, Near Ramaiah Bus...   
39635  4/3, 12th Main, 4th Cross, M.S. Ramaiah Colleg...   
39669  1065, Triveni Road, Gokul Extention, MS Ramaia...   
39681  2/26, 11th Main Road, 1st stage, 2nd Phase, Ma...   
39839  Comfort INN Insys, 46,1st Main Road, Gokula 1s...   
39842  40/4, 15th Main, MSR Industrial Estate Road, M...   
40345  765, 1st Main Road, 5th Cross, Near RTO Office...   

                             name online_order book_table  rate     location  \
39465          Ne

In [26]:
df_test = df.drop(columns=['address','name','online_order','book_table','rate', 'location','rest_type','cuisines','listed_in(type)'])
df_train = df.drop(columns=['approx_cost(for two people)','address','online_order','book_table','listed_in(type)','name'])
print(df_train)

       rate                    location            rest_type  \
0       4.1                Banashankari        Casual Dining   
1       4.1                Banashankari        Casual Dining   
2       3.8                Banashankari  Cafe, Casual Dining   
3       3.7                Banashankari          Quick Bites   
4       3.8                Basavanagudi        Casual Dining   
5       3.8                Basavanagudi        Casual Dining   
6       3.6                 Mysore Road        Casual Dining   
7       4.6                Banashankari  Casual Dining, Cafe   
8       4.0                Banashankari                 Cafe   
9       4.2                Banashankari                 Cafe   
10      4.1                Banashankari                 Cafe   
11      4.2                Banashankari                 Cafe   
12      4.2                Banashankari                 Cafe   
13      4.0                Banashankari                 Cafe   
15      3.8                Banashankari 