# Selecting the best model with Best hyperparameters


In [21]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# train test split the data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# import regression algorithms
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

#import grid search cv for cross validation
from sklearn.model_selection import GridSearchCV

# import preprocessors
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [22]:
df = sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [23]:
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

In [24]:
encode = ['sex','smoker','day','time']

save = {}
for col in encode:
    le = LabelEncoder()
    df[col]=le.fit_transform(df[col])
    save[col]=le
    
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,0,0,2,0,2
1,10.34,1.66,1,0,2,0,3
2,21.01,3.5,1,0,2,0,3
3,23.68,3.31,1,0,2,0,2
4,24.59,3.61,0,0,2,0,4


In [25]:
df.isnull().sum().sort_values(ascending=False)

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [26]:
x = df.drop('tip', axis=1)
y = df['tip']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

models = { 
    'LinearRegression': LinearRegression(),
    'SVR': SVR(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'RandomForestRegressor': RandomForestRegressor(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'GradientBoostingRegressor': GradientBoostingRegressor(),
    'XGBRegressor': XGBRegressor()          
}

model_score = []

for name, model in models.items():
    model.fit(x_train, y_train)
    y_pre = model.predict(x_test)
    metric = mean_absolute_error(y_test, y_pre)
    model_score.append((name, metric))

sorted_model = sorted(model_score, key=lambda x: x[1], reverse=False)

for model in sorted_model:
    print(f"{model[0]} : Mean Absolute Error = {model[1]:.2f}")


SVR : Mean Absolute Error = 0.57
LinearRegression : Mean Absolute Error = 0.67
XGBRegressor : Mean Absolute Error = 0.67
KNeighborsRegressor : Mean Absolute Error = 0.73
GradientBoostingRegressor : Mean Absolute Error = 0.73
RandomForestRegressor : Mean Absolute Error = 0.77
DecisionTreeRegressor : Mean Absolute Error = 0.93


In [27]:
x = df.drop('tip', axis=1)
y = df['tip']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

models = { 
    'LinearRegression': LinearRegression(),
    'SVR': SVR(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'RandomForestRegressor': RandomForestRegressor(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'GradientBoostingRegressor': GradientBoostingRegressor(),
    'XGBRegressor': XGBRegressor()          
}

model_score = []

for name, model in models.items():
    model.fit(x_train, y_train)
    y_pre = model.predict(x_test)
    metric = r2_score(y_test, y_pre)
    model_score.append((name, metric))

sorted_model = sorted(model_score, key=lambda x: x[1], reverse=False)

for model in sorted_model:
    print(f"{model[0]} : R_2 square error = {model[1]:.2f}")


DecisionTreeRegressor : R_2 square error = -0.09
RandomForestRegressor : R_2 square error = 0.22
KNeighborsRegressor : R_2 square error = 0.33
GradientBoostingRegressor : R_2 square error = 0.36
XGBRegressor : R_2 square error = 0.41
LinearRegression : R_2 square error = 0.44
SVR : R_2 square error = 0.57


In [29]:
%%time
# Note, ensure to give more time to run this cell
# Create a dictionaries of list of models to evaluate performance with hyperparameters
from sklearn.model_selection import RandomizedSearchCV
models = { 
          'LinearRegression' : (LinearRegression(), {}),
          'SVR' : (SVR(), {'kernel': ['rbf', 'poly', 'sigmoid'], 'C': [0.1, 1, 10], 'gamma': [1, 0.1, 0.01], 'epsilon': [0.1, 0.01, 0.001]}),
          'DecisionTreeRegressor' : (DecisionTreeRegressor(), {'max_depth': [None, 5, 10], 'splitter': ['best', 'random']}),
          'RandomForestRegressor' : (RandomForestRegressor(), {'n_estimators': [10, 100, 1000], 'max_depth': [None, 5, 10]}),
          'KNeighborsRegressor' : (KNeighborsRegressor(), {'n_neighbors': np.arange(3, 100, 2), 'weights': ['uniform', 'distance']}),
          'GradientBoostingRegressor' : (GradientBoostingRegressor(), {'loss': ['ls', 'lad', 'huber', 'quantile'], 'n_estimators': [10, 100, 1000]}),
          'XGBRegressor' : (XGBRegressor(), {'n_estimators': [10, 100, 1000], 'learning_rate': [0.1, 0.01, 0.001]}),          
          }

# train and predict each model with evaluation metrics as well making a for loop to iterate over the models

for name, (model, params) in models.items():
    # create a pipline
    pipeline = RandomizedSearchCV(model, params, cv=5)
    
    # fit the pipeline
    pipeline.fit(x_train, y_train)
    
    # make prediction from each model
    y_pre = pipeline.predict(x_test)
    
      
    # print the performing metric
    print(name, 'MSE: ', mean_squared_error(y_test, y_pre))
    print(name, 'R2: ', r2_score(y_test, y_pre))
    print(name, 'MAE: ', mean_absolute_error(y_test, y_pre))
    print('\n')



LinearRegression MSE:  0.6948129686287711
LinearRegression R2:  0.4441368826121931
LinearRegression MAE:  0.6703807496461158


SVR MSE:  0.6654921450889252
SVR R2:  0.46759407917172435
SVR MAE:  0.6271844423788759


DecisionTreeRegressor MSE:  0.8774153020453993
DecisionTreeRegressor R2:  0.298051667053291
DecisionTreeRegressor MAE:  0.718948162948163






RandomForestRegressor MSE:  1.0395937344321824
RandomForestRegressor R2:  0.1683059468812914
RandomForestRegressor MAE:  0.8371064561427674


KNeighborsRegressor MSE:  0.611582775695312
KNeighborsRegressor R2:  0.5107225633545738
KNeighborsRegressor MAE:  0.6117006802721088




25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "c:\ProgramData\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\ProgramData\anaconda3\Lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
    ~~~~~~~~~~~~~~~~~~~~~~~~~~^^
  File "c:\ProgramData\anaconda3\Lib\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter_constraints(
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^
        self._parameter_constraints,
 

GradientBoostingRegressor MSE:  0.8024435902322148
GradientBoostingRegressor R2:  0.35803041144348513
GradientBoostingRegressor MAE:  0.7330355055532771






XGBRegressor MSE:  0.7601696611425505
XGBRegressor R2:  0.3918503299956485
XGBRegressor MAE:  0.7351689690959697


CPU times: total: 3min
Wall time: 1min 32s
