In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings("ignore")

In [2]:
data_train=pd.read_excel('Rest_Data_Train.xlsx')
data_test=pd.read_excel('Rest_Data_Test.xlsx')

In [3]:
data=pd.concat([data_train, data_test], ignore_index=True)
data

Unnamed: 0,CITY,COST,CUISINES,Dataset,LOCALITY,RATING,RESTAURANT_ID,TIME,TITLE,VOTES
0,Thane,1200.0,"Malwani, Goan, North Indian",Train,Dombivali East,3.6,9438,"11am – 4pm, 7:30pm – 11:30pm (Mon-Sun)",CASUAL DINING,49 votes
1,Chennai,1500.0,"Asian, Modern Indian, Japanese",Train,Ramapuram,4.2,13198,6pm – 11pm (Mon-Sun),"CASUAL DINING,BAR",30 votes
2,Chennai,800.0,"North Indian, Chinese, Biryani, Hyderabadi",Train,Saligramam,3.8,10915,"11am – 3:30pm, 7pm – 11pm (Mon-Sun)",CASUAL DINING,221 votes
3,Mumbai,800.0,"Tibetan, Chinese",Train,Bandra West,4.1,6346,11:30am – 1am (Mon-Sun),QUICK BITES,24 votes
4,Mumbai,300.0,Desserts,Train,Lower Parel,3.8,15387,11am – 1am (Mon-Sun),DESSERT PARLOR,165 votes
...,...,...,...,...,...,...,...,...,...,...
16916,New Delhi,,"North Indian, Mughlai, Chinese",Test,Punjabi Bagh,3.9,9057,11:30am – 11:30pm (Mon-Sun),CASUAL DINING,287 votes
16917,Bangalore,,"Biryani, North Indian, Sandwich, Salad, Wraps",Test,HSR Layout,4.3,1247,11am – 1am (Mon-Sun),,469 votes
16918,Faridabad,,"Continental, North Indian",Test,Sector 86,3.7,8617,9:30am – 10:30pm (Mon-Sun),QUICK BITES,53 votes
16919,Kochi,,"Rolls, Beverages",Test,Kochi,-,6485,"11am – 11:30pm (Mon, Tue, Wed, Thu, Sat, Sun),...",QUICK BITES,


In [4]:
data.reset_index(drop=True,inplace=True)

In [5]:
data.dtypes

CITY              object
COST             float64
CUISINES          object
Dataset           object
LOCALITY          object
RATING            object
RESTAURANT_ID      int64
TIME              object
TITLE             object
VOTES             object
dtype: object

In [6]:
for col in data.columns:
    if data[col].dtype=="object":
        print(data[col].value_counts())
        print()

Chennai                         2855
Bangalore                       2835
Hyderabad                       2458
Mumbai                          2311
New Delhi                       1754
                                ... 
Opposite Barathi Gas Company       1
Malad East                         1
Kurla (W)                          1
Shihab Thangal Road                1
Ashok Vihar Phase 1                1
Name: CITY, Length: 450, dtype: int64

North Indian                                        716
North Indian, Chinese                               714
South Indian                                        706
Fast Food                                           364
Chinese                                             226
                                                   ... 
North Indian, Beverages, Chinese                      1
Biryani, Fast Food, Indian                            1
Cafe, European, Italian, Pizza, Beverages, Salad      1
Cafe, Pizza, Salad, Desserts                      

In [7]:
data.isnull().sum()

CITY              147
COST             4231
CUISINES            0
Dataset             0
LOCALITY          128
RATING              4
RESTAURANT_ID       0
TIME                0
TITLE               0
VOTES            1606
dtype: int64

In [8]:
data.loc[data['VOTES'].isnull()]
# This indicates that new restaurants do not have any ratings
# Hence, NaN values can be replaced by 0

Unnamed: 0,CITY,COST,CUISINES,Dataset,LOCALITY,RATING,RESTAURANT_ID,TIME,TITLE,VOTES
13,Kochi,400.0,South Indian,Train,Edappally,-,4319,11am – 11:30pm (Mon-Sun),CASUAL DINING,
62,Kochi,200.0,"North Indian, Beverages",Train,Kalamassery,NEW,7841,11am – 11pm (Mon-Sun),QUICK BITES,
69,New Delhi,200.0,"Pizza, Fast Food",Train,Kalkaji,NEW,11096,11am – 11pm (Mon-Sun),,
85,Bangalore,300.0,"North Indian, South Indian",Train,Vijay Nagar,-,12657,"Closed (Mon),7:30am – 10:30pm (Tue-Sun)",QUICK BITES,
111,Mumbai,800.0,"Gujarati, Rajasthani, North Indian",Train,Borivali West,NEW,7095,"11:30am – 3:30pm, 7pm – 11pm (Mon-Sun)",CASUAL DINING,
...,...,...,...,...,...,...,...,...,...,...
16873,Noida,,"Cafe, North Indian",Test,Sector 63,NEW,7007,11am – 11pm (Mon-Sun),CAFÉ,
16875,New Delhi,,Fast Food,Test,Shalimar Bagh,NEW,6387,12:30pm – 10:30pm (Mon-Sun),QUICK BITES,
16888,Bangalore,,Continental,Test,Indiranagar,NEW,13381,11am – 12midnight (Mon-Sun),QUICK BITES,
16913,Mumbai,,"Sindhi, Street Food",Test,Lower Parel,NEW,6887,"11:30am – 4pm, 7pm – 11:30pm (Mon-Sun)",CASUAL DINING,


In [9]:
data['VOTES'].fillna(0, inplace = True)
data

Unnamed: 0,CITY,COST,CUISINES,Dataset,LOCALITY,RATING,RESTAURANT_ID,TIME,TITLE,VOTES
0,Thane,1200.0,"Malwani, Goan, North Indian",Train,Dombivali East,3.6,9438,"11am – 4pm, 7:30pm – 11:30pm (Mon-Sun)",CASUAL DINING,49 votes
1,Chennai,1500.0,"Asian, Modern Indian, Japanese",Train,Ramapuram,4.2,13198,6pm – 11pm (Mon-Sun),"CASUAL DINING,BAR",30 votes
2,Chennai,800.0,"North Indian, Chinese, Biryani, Hyderabadi",Train,Saligramam,3.8,10915,"11am – 3:30pm, 7pm – 11pm (Mon-Sun)",CASUAL DINING,221 votes
3,Mumbai,800.0,"Tibetan, Chinese",Train,Bandra West,4.1,6346,11:30am – 1am (Mon-Sun),QUICK BITES,24 votes
4,Mumbai,300.0,Desserts,Train,Lower Parel,3.8,15387,11am – 1am (Mon-Sun),DESSERT PARLOR,165 votes
...,...,...,...,...,...,...,...,...,...,...
16916,New Delhi,,"North Indian, Mughlai, Chinese",Test,Punjabi Bagh,3.9,9057,11:30am – 11:30pm (Mon-Sun),CASUAL DINING,287 votes
16917,Bangalore,,"Biryani, North Indian, Sandwich, Salad, Wraps",Test,HSR Layout,4.3,1247,11am – 1am (Mon-Sun),,469 votes
16918,Faridabad,,"Continental, North Indian",Test,Sector 86,3.7,8617,9:30am – 10:30pm (Mon-Sun),QUICK BITES,53 votes
16919,Kochi,,"Rolls, Beverages",Test,Kochi,-,6485,"11am – 11:30pm (Mon, Tue, Wed, Thu, Sat, Sun),...",QUICK BITES,0


In [10]:
# Converting Votes values into integers by stripping votes
data['VOTES']=data['VOTES'].str.strip('votes')

In [11]:
data['VOTES'].fillna(0,inplace=True)

In [12]:
data['VOTES']

0         49 
1         30 
2        221 
3         24 
4        165 
         ... 
16916    287 
16917    469 
16918     53 
16919       0
16920     63 
Name: VOTES, Length: 16921, dtype: object

In [13]:
data['VOTES']=data['VOTES'].astype(int)

In [14]:
data.dtypes

CITY              object
COST             float64
CUISINES          object
Dataset           object
LOCALITY          object
RATING            object
RESTAURANT_ID      int64
TIME              object
TITLE             object
VOTES              int32
dtype: object

In [15]:
data['RATING'].value_counts()

3.9    1643
3.8    1604
4.0    1498
3.7    1437
3.6    1261
4.1    1217
3.5     988
4.2     968
NEW     930
3.4     777
4.3     740
-       672
3.3     485
4.4     484
3.2     349
4.5     274
3.1     263
2.9     242
3.0     229
2.8     191
4.6     184
2.7     129
4.7      90
2.6      85
2.5      51
4.8      49
4.9      28
2.4      24
2.3      15
2.2       4
2.1       4
2.0       2
Name: RATING, dtype: int64

In [16]:
# For restaurants that are new or have '-' ratings, following imputation can be done
data["RATING"].replace("NEW",0,inplace=True)
data["RATING"].replace("-",0,inplace=True)
data["RATING"].fillna(0,inplace=True)

In [17]:
data['RATING']=data['RATING'].astype(float)

In [18]:
data.dtypes

CITY              object
COST             float64
CUISINES          object
Dataset           object
LOCALITY          object
RATING           float64
RESTAURANT_ID      int64
TIME              object
TITLE             object
VOTES              int32
dtype: object

In [19]:
# Analyzing the missing values for City column
data.loc[data["CITY"].isnull()]

Unnamed: 0,CITY,COST,CUISINES,Dataset,LOCALITY,RATING,RESTAURANT_ID,TIME,TITLE,VOTES
129,,500.0,"North Indian, South Indian, Chinese, Continental",Train,,3.6,9101,"12noon – 3pm, 7pm – 2am (Mon-Sun)",,728
246,,300.0,Biryani,Train,Palarivattom Kochi,3.4,14502,12noon – 11pm (Mon-Sun),QUICK BITES,82
411,,600.0,"North Indian, Chinese, Pizza, Burger, Fast Food",Train,,4.0,6313,7pm – 4am (Mon-Sun),,29
466,,200.0,North Indian,Train,,4.2,1133,4pm – 9pm (Mon-Sun),,55
481,,250.0,Street Food,Train,,3.7,4788,"11am – 11pm (Mon),9:30am – 12midnight (Tue-Sun)",,616
...,...,...,...,...,...,...,...,...,...,...
16311,,,"Continental, Mexican, American",Test,Maharashtra 400013,4.0,9779,5pm – 1:30am (Mon-Sun),"PUB,CASUAL DINING",1142
16313,,,"North Indian, South Indian, Chinese, Biryani",Test,,3.5,5922,6am – 11:30pm (Mon-Sun),CASUAL DINING,102
16803,,,"Fast Food, Italian, Pizza, Sandwich",Test,,3.7,4748,1pm – 8am (Mon-Sun),,27
16815,,,"Chinese, North Indian",Test,,3.5,8587,"11:45am – 3:30pm, 7:30pm – 11:40pm (Mon, Sat)...",QUICK BITES,25


In [20]:
data['CITY'].fillna('missing', inplace=True)

In [21]:
data['LOCALITY'].fillna('missing', inplace=True)

In [22]:
data

Unnamed: 0,CITY,COST,CUISINES,Dataset,LOCALITY,RATING,RESTAURANT_ID,TIME,TITLE,VOTES
0,Thane,1200.0,"Malwani, Goan, North Indian",Train,Dombivali East,3.6,9438,"11am – 4pm, 7:30pm – 11:30pm (Mon-Sun)",CASUAL DINING,49
1,Chennai,1500.0,"Asian, Modern Indian, Japanese",Train,Ramapuram,4.2,13198,6pm – 11pm (Mon-Sun),"CASUAL DINING,BAR",30
2,Chennai,800.0,"North Indian, Chinese, Biryani, Hyderabadi",Train,Saligramam,3.8,10915,"11am – 3:30pm, 7pm – 11pm (Mon-Sun)",CASUAL DINING,221
3,Mumbai,800.0,"Tibetan, Chinese",Train,Bandra West,4.1,6346,11:30am – 1am (Mon-Sun),QUICK BITES,24
4,Mumbai,300.0,Desserts,Train,Lower Parel,3.8,15387,11am – 1am (Mon-Sun),DESSERT PARLOR,165
...,...,...,...,...,...,...,...,...,...,...
16916,New Delhi,,"North Indian, Mughlai, Chinese",Test,Punjabi Bagh,3.9,9057,11:30am – 11:30pm (Mon-Sun),CASUAL DINING,287
16917,Bangalore,,"Biryani, North Indian, Sandwich, Salad, Wraps",Test,HSR Layout,4.3,1247,11am – 1am (Mon-Sun),,469
16918,Faridabad,,"Continental, North Indian",Test,Sector 86,3.7,8617,9:30am – 10:30pm (Mon-Sun),QUICK BITES,53
16919,Kochi,,"Rolls, Beverages",Test,Kochi,0.0,6485,"11am – 11:30pm (Mon, Tue, Wed, Thu, Sat, Sun),...",QUICK BITES,0


In [23]:
#Dropping "RESTAURANT_ID" and "TIME" columns from the dataset
data.drop(columns=["RESTAURANT_ID","TIME"],inplace=True)

In [24]:
# Separating the CUISINES column into multiple columns
cuisines=-1
for i in range(len(data["CUISINES"])):
    c=len(data["CUISINES"].iloc[i].split(","))
    if c>cuisines:
        cuisines=c
print("Maximum number of titles are :{}".format(cuisines))

Maximum number of titles are :8


In [25]:
C1=list()
C2=list()
C3=list()
C4=list()
C5=list()
C6=list()
C7=list()
C8=list()
for i in range(len(data['CUISINES'])):
    temp=data['CUISINES'].iloc[i].split(',')
    try :
        C1.append(temp[0].strip().upper())
    except :
        C1.append('None')
    try :
        C2.append(temp[1].strip().upper())
    except :
        C2.append('None')
    try :
        C3.append(temp[2].strip().upper())
    except :
        C3.append('None')
    try :
        C4.append(temp[3].strip().upper())
    except :
        C4.append('None')
    try :
        C5.append(temp[4].strip().upper())
    except :
        C5.append('None')
    try :
        C6.append(temp[5].strip().upper())
    except :
        C6.append('None')
    try :
        C7.append(temp[6].strip().upper())
    except :
        C7.append('None')
    try :
        C8.append(temp[7].strip().upper())
    except :
        C8.append('None')

In [26]:
data["C1"]=C1
data["C2"]=C2
data["C3"]=C3
data["C4"]=C4
data["C5"]=C5
data["C6"]=C6
data["C7"]=C7
data["C8"]=C8

In [27]:
# Checking the maximum length of string values to separate values in different columns
max_qual_length=-1
for i in range(len(data['TITLE'])):
    temp=len(data['TITLE'].iloc[i].strip().upper().split(","))
    if temp>max_qual_length:
        max_qual_length=temp
print('Max length is :{}'.format(max_qual_length))

Max length is :2


In [28]:
T1=list()
T2=list()
for i in range(len(data['TITLE'])):
    temp=data['TITLE'].iloc[i].split(',')
    try :
        T1.append(temp[0].strip().upper())
    except :
        T1.append('None')
    try :
        T2.append(temp[1].strip().upper())
    except :
        T2.append('None')

In [29]:
data['T1']=T1
data['T2']=T2

In [30]:
data

Unnamed: 0,CITY,COST,CUISINES,Dataset,LOCALITY,RATING,TITLE,VOTES,C1,C2,C3,C4,C5,C6,C7,C8,T1,T2
0,Thane,1200.0,"Malwani, Goan, North Indian",Train,Dombivali East,3.6,CASUAL DINING,49,MALWANI,GOAN,NORTH INDIAN,,,,,,CASUAL DINING,
1,Chennai,1500.0,"Asian, Modern Indian, Japanese",Train,Ramapuram,4.2,"CASUAL DINING,BAR",30,ASIAN,MODERN INDIAN,JAPANESE,,,,,,CASUAL DINING,BAR
2,Chennai,800.0,"North Indian, Chinese, Biryani, Hyderabadi",Train,Saligramam,3.8,CASUAL DINING,221,NORTH INDIAN,CHINESE,BIRYANI,HYDERABADI,,,,,CASUAL DINING,
3,Mumbai,800.0,"Tibetan, Chinese",Train,Bandra West,4.1,QUICK BITES,24,TIBETAN,CHINESE,,,,,,,QUICK BITES,
4,Mumbai,300.0,Desserts,Train,Lower Parel,3.8,DESSERT PARLOR,165,DESSERTS,,,,,,,,DESSERT PARLOR,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16916,New Delhi,,"North Indian, Mughlai, Chinese",Test,Punjabi Bagh,3.9,CASUAL DINING,287,NORTH INDIAN,MUGHLAI,CHINESE,,,,,,CASUAL DINING,
16917,Bangalore,,"Biryani, North Indian, Sandwich, Salad, Wraps",Test,HSR Layout,4.3,,469,BIRYANI,NORTH INDIAN,SANDWICH,SALAD,WRAPS,,,,NONE,
16918,Faridabad,,"Continental, North Indian",Test,Sector 86,3.7,QUICK BITES,53,CONTINENTAL,NORTH INDIAN,,,,,,,QUICK BITES,
16919,Kochi,,"Rolls, Beverages",Test,Kochi,0.0,QUICK BITES,0,ROLLS,BEVERAGES,,,,,,,QUICK BITES,


In [31]:
data.drop(columns=['TITLE', 'CUISINES'], inplace = True)

In [32]:
data

Unnamed: 0,CITY,COST,Dataset,LOCALITY,RATING,VOTES,C1,C2,C3,C4,C5,C6,C7,C8,T1,T2
0,Thane,1200.0,Train,Dombivali East,3.6,49,MALWANI,GOAN,NORTH INDIAN,,,,,,CASUAL DINING,
1,Chennai,1500.0,Train,Ramapuram,4.2,30,ASIAN,MODERN INDIAN,JAPANESE,,,,,,CASUAL DINING,BAR
2,Chennai,800.0,Train,Saligramam,3.8,221,NORTH INDIAN,CHINESE,BIRYANI,HYDERABADI,,,,,CASUAL DINING,
3,Mumbai,800.0,Train,Bandra West,4.1,24,TIBETAN,CHINESE,,,,,,,QUICK BITES,
4,Mumbai,300.0,Train,Lower Parel,3.8,165,DESSERTS,,,,,,,,DESSERT PARLOR,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16916,New Delhi,,Test,Punjabi Bagh,3.9,287,NORTH INDIAN,MUGHLAI,CHINESE,,,,,,CASUAL DINING,
16917,Bangalore,,Test,HSR Layout,4.3,469,BIRYANI,NORTH INDIAN,SANDWICH,SALAD,WRAPS,,,,NONE,
16918,Faridabad,,Test,Sector 86,3.7,53,CONTINENTAL,NORTH INDIAN,,,,,,,QUICK BITES,
16919,Kochi,,Test,Kochi,0.0,0,ROLLS,BEVERAGES,,,,,,,QUICK BITES,


In [33]:
from sklearn.preprocessing import LabelEncoder
for col in data.columns:
    if data[col].dtype=="object":
        data[col]=LabelEncoder().fit_transform(data[col])

In [34]:
data

Unnamed: 0,CITY,COST,Dataset,LOCALITY,RATING,VOTES,C1,C2,C3,C4,C5,C6,C7,C8,T1,T2
0,400,1200.0,1,328,3.6,49,61,32,71,65,63,51,42,19,5,16
1,75,1500.0,1,1214,4.2,30,6,62,45,65,63,51,42,19,5,1
2,75,800.0,1,1272,3.8,221,74,23,14,34,63,51,42,19,5,16
3,277,800.0,1,161,4.1,24,102,23,72,65,63,51,42,19,23,16
4,277,300.0,1,709,3.8,165,30,69,72,65,63,51,42,19,9,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16916,316,,0,1184,3.9,287,74,64,23,65,63,51,42,19,5,16
16917,42,,0,461,4.3,469,15,68,83,73,83,51,42,19,20,16
16918,128,,0,1370,3.7,53,29,68,72,65,63,51,42,19,23,16
16919,216,,0,642,0.0,0,84,12,72,65,63,51,42,19,23,16


In [35]:
df_train=data.loc[data['Dataset']==1]
df_test=data.loc[data['Dataset']==0]

In [36]:
df_train.drop(columns=['Dataset'], inplace=True)
df_test.drop(columns=['Dataset', 'COST'], inplace=True)

In [37]:
df_test.reset_index(drop=True,inplace=True)

In [38]:
df_x=df_train.drop(columns=['COST'])
y=df_train[['COST']]

In [39]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x = sc.fit_transform(df_x)
x=pd.DataFrame(x,columns=df_x.columns)

In [40]:
x

Unnamed: 0,CITY,LOCALITY,RATING,VOTES,C1,C2,C3,C4,C5,C6,C7,C8,T1,T2
0,2.020329,-0.935521,0.147237,-0.403533,0.349159,-0.684916,0.521965,0.337892,0.206285,0.104873,0.087065,0.073201,-0.902725,0.272430
1,-0.936909,0.972996,0.655819,-0.426964,-1.545521,0.488275,-0.601427,0.337892,0.206285,0.104873,0.087065,0.073201,-0.902725,-3.737861
2,-0.936909,1.097933,0.316765,-0.191427,0.796993,-1.036873,-1.940857,-1.663120,0.206285,0.104873,0.087065,0.073201,-0.902725,0.272430
3,0.901128,-1.295253,0.571055,-0.434363,1.761558,-1.036873,0.565173,0.337892,0.206285,0.104873,0.087065,0.073201,1.140765,0.272430
4,0.901128,-0.114816,0.316765,-0.260485,-0.718752,0.762019,0.565173,0.337892,0.206285,0.104873,0.087065,0.073201,-0.448616,0.272430
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12685,-0.163477,-0.791197,0.316765,0.209355,0.796993,-1.271511,-0.471805,0.337892,0.206285,0.104873,0.087065,0.073201,1.140765,0.272430
12686,0.901128,-1.301715,0.740582,1.033116,-0.408713,-0.958660,0.565173,0.337892,0.206285,0.104873,0.087065,0.073201,-0.902725,-3.737861
12687,1.110410,1.692459,0.486292,0.285812,-0.512059,-0.958660,-2.329724,-2.631352,0.206285,0.104873,0.087065,0.073201,0.346075,0.272430
12688,-0.936909,-0.091121,0.062474,-0.424497,0.796993,1.465933,-1.551990,1.370672,0.206285,0.104873,0.087065,0.073201,-0.902725,0.272430


In [41]:
from sklearn.metrics import r2_score
def maxr2score(cm,x,y):
    maxr2=0
    rs=0
    for r_state in range(0,150):
        x_train,x_test,y_train,y_test=train_test_split(x, y,random_state = r_state,test_size=0.2)
        cm.fit(x_train,y_train)
        y_pred=cm.predict(x_test)
        rsc=r2_score(y_test,y_pred)
        if rsc>maxr2:
            maxr2=rsc
            rs=r_state
    print("Maximum r2_score is at random state :",rs," and it is :",maxr2)

In [42]:
# Using Linear regression model
from sklearn.linear_model import LinearRegression
x_train, x_test, y_train, y_test = train_test_split(x,y,random_state=42,test_size=.20)
lr=LinearRegression()
lr.fit(x_train,y_train)
predicty=lr.predict(x_test)
print('The mean absolute error for linear regression model is :', mean_absolute_error(y_test, predicty))
print('The mean square error for linear regression model is :', mean_squared_error(y_test, predicty))
print('The root mean square error for linear regression model is :', np.sqrt(mean_squared_error(y_test, predicty)))
print('The r2_score is :', r2_score(y_test,predicty))

The mean absolute error for linear regression model is : 306.45372752628265
The mean square error for linear regression model is : 284217.74234231445
The root mean square error for linear regression model is : 533.1207577484809
The r2_score is : 0.21028662646816643


In [43]:
# Using Ridge regression
from sklearn.linear_model import Ridge
rr=Ridge()
hyperparametersrr={'alpha':[0.0001, 0.001, 0.01, 0.1, 1, 10, 100]}
gridrr = GridSearchCV(rr, hyperparametersrr, cv=10)
gridrr.fit(x,y)
gridrr.best_params_

{'alpha': 100}

In [44]:
from sklearn.linear_model import Ridge
rr=Ridge(alpha=100)
maxr2score(rr,x,y)

Maximum r2_score is at random state : 67  and it is : 0.24645603783911119


In [45]:
print("Mean r2 score for Ridge regression after cross validation: ", cross_val_score(rr,x,y,cv=5,scoring="r2").mean())
print("Standard deviation for Ridge regression from mean r2 score is : ", cross_val_score(rr,x,y,cv=5,scoring="r2").std())

Mean r2 score for Ridge regression after cross validation:  0.19316799332421747
Standard deviation for Ridge regression from mean r2 score is :  0.02561409790490468


In [46]:
# Using Lasso regression
from sklearn.linear_model import Lasso
lar=Lasso()
hyperparameterlar={'alpha':[0.0001, 0.001, 0.01, 0.1, 1, 10, 100]}
gridlar = GridSearchCV(lar, hyperparameterlar, cv=5)
gridlar.fit(x,y)
gridlar.best_params_

{'alpha': 1}

In [47]:
from sklearn.linear_model import Lasso
lar=Lasso(alpha=1)
maxr2score(lar,x,y)

Maximum r2_score is at random state : 67  and it is : 0.24655954886586007


In [48]:
print("Mean r2 score for Lasso regression after cross validation: ", cross_val_score(lar,x,y,cv=5,scoring="r2").mean())
print("Standard deviation for Lasso regression from mean r2 score is : ", cross_val_score(lar,x,y,cv=5,scoring="r2").std())

Mean r2 score for Lasso regression after cross validation:  0.19316783876583146
Standard deviation for Lasso regression from mean r2 score is :  0.025669083048822698


In [49]:
# Using KNN regressor
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
knr=KNeighborsRegressor()
hyperparameter={"n_neighbors":range(1,12)}
gridknr = GridSearchCV(knr, hyperparameter, cv=5, n_jobs=-1)
gridknr.fit(x,y)
gridknr.best_params_

{'n_neighbors': 11}

In [50]:
from sklearn.neighbors import KNeighborsRegressor
knr=KNeighborsRegressor(n_neighbors=11)
maxr2score(knr,x,y)

Maximum r2_score is at random state : 133  and it is : 0.41671063442746714


In [51]:
print("Mean r2 score for KNeighborsRegressor after cross validation: ", cross_val_score(knr,x,y,cv=5,scoring="r2").mean())
print("Standard deviation for KNeighborsRegressor from mean r2 score is : ", cross_val_score(knr,x,y,cv=5,scoring="r2").std())

Mean r2 score for KNeighborsRegressor after cross validation:  0.35206305356915657
Standard deviation for KNeighborsRegressor from mean r2 score is :  0.01762828130689847


In [52]:
# Using Decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor
dtr=DecisionTreeRegressor()
hyperparametersdtr={'criterion':['mse','mae'],'max_depth':range(1,2)}
griddtr=GridSearchCV(dtr, hyperparametersdtr, cv=5, scoring='r2', n_jobs=-1)
griddtr.fit(x,y)
print(griddtr.best_params_)

{'criterion': 'mse', 'max_depth': 1}


In [53]:
from sklearn.tree import DecisionTreeRegressor
dtr=DecisionTreeRegressor(criterion='mse', max_depth=1)
maxr2score(dtr,x,y)

Maximum r2_score is at random state : 67  and it is : 0.17204964330091255


In [54]:
print("Mean r2 score for Decision Tree Regressor after cross validation: ", cross_val_score(dtr,x,y,cv=5,scoring="r2").mean())
print("Standard deviation for Decision Tree Regressor from mean r2 score is : ", cross_val_score(dtr,x,y,cv=5,scoring="r2").std())

Mean r2 score for Decision Tree Regressor after cross validation:  0.1506626190113985
Standard deviation for Decision Tree Regressor from mean r2 score is :  0.016716458346719407


In [55]:
from sklearn.ensemble import RandomForestRegressor
rfr=RandomForestRegressor()
hyperparametersrfr={'criterion':['mse','mae'],'max_depth':range(1,10)}
gridrfr=GridSearchCV(rfr, hyperparametersrfr, cv=5, scoring='r2', n_jobs=-1)
gridrfr.fit(x,y)
print(gridrfr.best_params_)

{'criterion': 'mse', 'max_depth': 9}


In [57]:
from sklearn.ensemble import RandomForestRegressor
rfr=RandomForestRegressor(criterion='mse', max_depth=9)
maxr2score(rfr,x,y)

Maximum r2_score is at random state : 139  and it is : 0.7409121982837132


In [58]:
print("Mean r2 score for Random Forest Regressor after cross validation: ", cross_val_score(rfr,x,y,cv=5,scoring="r2").mean())
print("Standard deviation for Random Forest Regressor from mean r2 score is : ", cross_val_score(rfr,x,y,cv=5,scoring="r2").std())

Mean r2 score for Random Forest Regressor after cross validation:  0.6709257119062539
Standard deviation for Random Forest Regressor from mean r2 score is :  0.041125417503890566


In [59]:
# Among all the models, Random Forest Regressor is the best performing model
x_train,x_test,y_train,y_test=train_test_split(x, y,random_state = 139,test_size=0.20)
rfr=RandomForestRegressor(criterion='mse', max_depth=9)
rfr.fit(x_train,y_train)
predictrfry=rfr.predict(x_test)
print('The mean absolute error for Random Forest Regressor model is :', mean_absolute_error(y_test, predictrfry))
print('The mean square error for Random Forest Regressor model is :', mean_squared_error(y_test, predictrfry))
print('The root mean square error for Random Forest Regressor model is :', np.sqrt(mean_squared_error(y_test, predictrfry)))
print("The r2_score for Random Forest Regressor is: ",r2_score(y_test,predictrfry))

The mean absolute error for Random Forest Regressor model is : 192.1030902544909
The mean square error for Random Forest Regressor model is : 101908.36892925552
The root mean square error for Random Forest Regressor model is : 319.2309022153957
The r2_score for Random Forest Regressor is:  0.7401686003196383


In [60]:
Cost_pred=rfr.predict(df_test)
Cost_pred

array([432.9245283 , 432.9245283 , 432.9245283 , ..., 432.9245283 ,
       390.83333333, 432.9245283 ])

In [61]:
# Cost_pred dataframe
Cost_prediction=pd.DataFrame(Cost_pred,columns=["COST"])

In [62]:
Cost_prediction.to_csv("Restaurant_Cost.csv",index=False)