In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
pd.pandas.set_option('display.max_columns',None)

In [3]:
data = pd.read_csv("./data/Dataset.csv")

In [4]:
data.head()

Unnamed: 0,Restaurant ID,Restaurant Name,Country Code,City,Address,Locality,Locality Verbose,Longitude,Latitude,Cuisines,Average Cost for two,Currency,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Rating color,Rating text,Votes,Aggregate rating
0,6317637,Le Petit Souffle,162,Makati City,"Third Floor, Century City Mall, Kalayaan Avenu...","Century City Mall, Poblacion, Makati City","Century City Mall, Poblacion, Makati City, Mak...",121.027535,14.565443,"French, Japanese, Desserts",1100,Botswana Pula(P),Yes,No,No,No,3,Dark Green,Excellent,314,4.8
1,6304287,Izakaya Kikufuji,162,Makati City,"Little Tokyo, 2277 Chino Roces Avenue, Legaspi...","Little Tokyo, Legaspi Village, Makati City","Little Tokyo, Legaspi Village, Makati City, Ma...",121.014101,14.553708,Japanese,1200,Botswana Pula(P),Yes,No,No,No,3,Dark Green,Excellent,591,4.5
2,6300002,Heat - Edsa Shangri-La,162,Mandaluyong City,"Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...","Edsa Shangri-La, Ortigas, Mandaluyong City","Edsa Shangri-La, Ortigas, Mandaluyong City, Ma...",121.056831,14.581404,"Seafood, Asian, Filipino, Indian",4000,Botswana Pula(P),Yes,No,No,No,4,Green,Very Good,270,4.4
3,6318506,Ooma,162,Mandaluyong City,"Third Floor, Mega Fashion Hall, SM Megamall, O...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.056475,14.585318,"Japanese, Sushi",1500,Botswana Pula(P),No,No,No,No,4,Dark Green,Excellent,365,4.9
4,6314302,Sambo Kojin,162,Mandaluyong City,"Third Floor, Mega Atrium, SM Megamall, Ortigas...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.057508,14.58445,"Japanese, Korean",1500,Botswana Pula(P),Yes,No,No,No,4,Dark Green,Excellent,229,4.8


In [5]:
data.columns

Index(['Restaurant ID', 'Restaurant Name', 'Country Code', 'City', 'Address',
       'Locality', 'Locality Verbose', 'Longitude', 'Latitude', 'Cuisines',
       'Average Cost for two', 'Currency', 'Has Table booking',
       'Has Online delivery', 'Is delivering now', 'Switch to order menu',
       'Price range', 'Rating color', 'Rating text', 'Votes',
       'Aggregate rating'],
      dtype='object')

In [6]:
data = data.drop(labels=['Restaurant ID','Restaurant Name','Country Code','City','Address','Locality','Locality Verbose','Longitude','Latitude','Cuisines','Currency','Switch to order menu','Rating color'],axis = 1)

In [7]:
data.head(2)

Unnamed: 0,Average Cost for two,Has Table booking,Has Online delivery,Is delivering now,Price range,Rating text,Votes,Aggregate rating
0,1100,Yes,No,No,3,Excellent,314,4.8
1,1200,Yes,No,No,3,Excellent,591,4.5


In [8]:
# Seperating Independent and dependent feature:

X = data.drop(labels=['Aggregate rating'],axis=1)
y = data[["Aggregate rating"]]

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,StandardScaler

In [10]:
numeric_features = ['Average Cost for two', 'Votes']
numeric_transformer = Pipeline(
    steps=[
    ('scaler', StandardScaler())
])

categorical_features = ['Has Table booking', 'Has Online delivery', 'Is delivering now', 'Price range']
categorical_transformer = Pipeline(
    steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

ordinal_features = ['Rating text']
ordinal_transformer = Pipeline(
    steps=[
    ('ordinal', OrdinalEncoder(categories=[['Poor', 'Average', 'Good', 'Very Good', 'Excellent', 'Not rated']]))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ('ord', ordinal_transformer, ordinal_features)
    ])

In [11]:
preprocessor

In [12]:
# Train test split
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.30,random_state=42)

In [13]:
X_train = pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test = pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [14]:
X_train.head()

Unnamed: 0,num__Average Cost for two,num__Votes,cat__Has Table booking_No,cat__Has Table booking_Yes,cat__Has Online delivery_No,cat__Has Online delivery_Yes,cat__Is delivering now_No,cat__Is delivering now_Yes,cat__Price range_1,cat__Price range_2,cat__Price range_3,cat__Price range_4,ord__Rating text
0,-0.045529,-0.258704,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2.0
1,-0.045529,-0.339507,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
2,-0.039274,-0.363273,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,5.0
3,-0.026764,0.173834,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,2.0
4,-0.064294,-0.363273,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,5.0


In [15]:
# model training
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [16]:
regression = LinearRegression()
regression.fit(X_train,y_train)

In [17]:
regression.intercept_

array([-1.57717563e+13])

In [18]:
import numpy as np
def evaluate_model(true,predicted):
    mae = mean_absolute_error(true,predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true,predicted))
    r2_square = r2_score(true,predicted)
    return mae, rmse, r2_square

In [19]:
# Basic Import
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge,Lasso,ElasticNet

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV


In [20]:
models = {
                "Linear Regression": LinearRegression(),
                "Lasso": Lasso(),
                "Ridge": Ridge(),
                "K-Neighbors Regressor": KNeighborsRegressor(),
                "Decision Tree": DecisionTreeRegressor(),
                "Random Forest Regressor": RandomForestRegressor(),
                "AdaBoost Regressor": AdaBoostRegressor()
            }

trained_model_list = []
model_list = []
r2_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train)


    #Model Prediction
    y_pred = model.predict(X_test)

    mae, rmse, r2_square = evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])



    print("Model Training Performance")
    print("RMSE: ",rmse)
    print("MAE: ",mae)
    print("R2: ",r2_square*100)

    r2_list.append(r2_square)

    print("="*35)
    print("\n")

Linear Regression
Model Training Performance
RMSE:  0.9012134377162363
MAE:  0.6893618065247732
R2:  64.10679511700583


Lasso
Model Training Performance
RMSE:  1.2689207563788525
MAE:  1.0112228128755798
R2:  28.841641135755246


Ridge
Model Training Performance
RMSE:  0.9011034457682171
MAE:  0.6892019201234705
R2:  64.11555602188935


K-Neighbors Regressor
Model Training Performance
RMSE:  0.2136937958411962
MAE:  0.12992323796231683
R2:  97.98190899116186


Decision Tree
Model Training Performance
RMSE:  0.2295274959391812
MAE:  0.146230426935242
R2:  97.67176744298133


Random Forest Regressor
Model Training Performance
RMSE:  0.18848296052029534
MAE:  0.12738977966132592
R2:  98.42999473132397


AdaBoost Regressor
Model Training Performance
RMSE:  0.1924034170471732
MAE:  0.14028490565106527
R2:  98.36400307888604




In [21]:
def evaluate_model(X_train,y_train,X_test,y_test,models):
    try:
        report = {}
        for i in range(len(models)):
            model = list(models.values())[i]
            # Train model
            model.fit(X_train,y_train)

            

            # Predict Testing data
            y_test_pred =model.predict(X_test)

            # Get R2 scores for train and test data
            #train_model_score = r2_score(ytrain,y_train_pred)
            test_model_score = r2_score(y_test,y_test_pred)

            report[list(models.keys())[i]] =  test_model_score

        return report
    
    except Exception as e:
        logging.info('Exception occured during model training')
        raise CustomException(e,sys)


In [22]:
model_report:dict=evaluate_model(X_train,y_train,X_test,y_test,models)
print('\n====================================================================================\n')
print(model_report)




{'Linear Regression': 0.6410679511700583, 'Lasso': 0.28841641135755247, 'Ridge': 0.6411555602188934, 'K-Neighbors Regressor': 0.9798190899116186, 'Decision Tree': 0.976736690557129, 'Random Forest Regressor': 0.9844892681763524, 'AdaBoost Regressor': 0.9836295526349561}


In [23]:
best_model_score = max(sorted(model_report.values()))

best_model_name = list(model_report.keys())[
    list(model_report.values()).index(best_model_score)
]

best_model = models[best_model_name]

print(f'Best Model Found , Model Name : {best_model_name} , R2 Score : {best_model_score}')

Best Model Found , Model Name : Random Forest Regressor , R2 Score : 0.9844892681763524
