# Selecting the best model with best hyperparameters

In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# train test split the data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# import regression algorithms
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# import grid search for cross validation
from sklearn.model_selection import GridSearchCV
# import preprocessors
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# fetch seaborn's example "tips" dataset and preview its first five rows
df = sns.load_dataset(name='tips')
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
# list the column names to see which fields can serve as features and which is the target
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

In [4]:
# split the frame into the target (tip) and the remaining predictor columns
y = df['tip']
X = df.drop(columns='tip')

# integer-encode each categorical predictor; the single encoder is re-fitted
# for every column, so afterwards it only remembers the classes of 'time'
le = LabelEncoder()
for column in ['sex', 'smoker', 'day', 'time']:
    X[column] = le.fit_transform(X[column])

In [5]:
%%time
# train test split the data
X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create a dictionaries of list of model to evaluate the performance
models = {
    'LinearRegression' : LinearRegression(),
    'SVR' : SVR(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'RandomForestRegressor': RandomForestRegressor(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'GradientBoostingRegressor': GradientBoostingRegressor(),
    'XGBRegressor': XGBRegressor()
}

# train and predict each model with evaluation metrics
model_scores = []
for name, model in models.items():
    # fit each model from models on training data
    model.fit(X_train, y_train)

    # make prediction on each model
    y_pred = model.predict(X_test)
    metric = mean_absolute_error(y_test, y_pred)
    model_scores.append((name, metric))

# selecting the best model from all above models with evalution metrics and sorting the value
sorted_models = sorted(model_scores, key=lambda x: x[1], reverse=False)
for model in sorted_models:
    print('Mean Absolute error for', f"{model[0]} is {model[1]: .2f}")

Mean Absolute error for SVR is  0.57
Mean Absolute error for LinearRegression is  0.67
Mean Absolute error for XGBRegressor is  0.67
Mean Absolute error for GradientBoostingRegressor is  0.72
Mean Absolute error for KNeighborsRegressor is  0.73
Mean Absolute error for RandomForestRegressor is  0.78
Mean Absolute error for DecisionTreeRegressor is  0.97
CPU times: total: 1.28 s
Wall time: 1.81 s


In [6]:
%%time
# train test split the data
X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create a dictionaries of list of model to evaluate the performance
models = {
    'LinearRegression' : LinearRegression(),
    'SVR' : SVR(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'RandomForestRegressor': RandomForestRegressor(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'GradientBoostingRegressor': GradientBoostingRegressor(),
    'XGBRegressor': XGBRegressor()
}

# train and predict each model with evaluation metrics
model_scores = []
for name, model in models.items():
    # fit each model from models on training data
    model.fit(X_train, y_train)

    # make prediction on each model
    y_pred = model.predict(X_test)
    metric = r2_score(y_test, y_pred)
    model_scores.append((name, metric))

# selecting the best model from all above models with evalution metrics and sorting the value
sorted_models = sorted(model_scores, key=lambda x: x[1], reverse=True)
for model in sorted_models:
    print('r2 score for', f"{model[0]} is {model[1]: .2f}")

r2 score for SVR is  0.57
r2 score for LinearRegression is  0.44
r2 score for XGBRegressor is  0.41
r2 score for GradientBoostingRegressor is  0.35
r2 score for KNeighborsRegressor is  0.33
r2 score for RandomForestRegressor is  0.24
r2 score for DecisionTreeRegressor is  0.07
CPU times: total: 1.11 s
Wall time: 464 ms


In [7]:
%%time
# train test split the data
X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create a dictionaries of list of model to evaluate the performance
models = {
    'LinearRegression' : LinearRegression(),
    'SVR' : SVR(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'RandomForestRegressor': RandomForestRegressor(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'GradientBoostingRegressor': GradientBoostingRegressor(),
    'XGBRegressor': XGBRegressor()
}

# train and predict each model with evaluation metrics
model_scores = []
for name, model in models.items():
    # fit each model from models on training data
    model.fit(X_train, y_train)

    # make prediction on each model
    y_pred = model.predict(X_test)
    metric = mean_squared_error(y_test, y_pred)
    model_scores.append((name, metric))

# selecting the best model from all above models with evalution metrics and sorting the value
sorted_models = sorted(model_scores, key=lambda x: x[1], reverse=False)
for model in sorted_models:
    print('mean squared error for', f"{model[0]} is {model[1]: .2f}")

mean squared error for SVR is  0.54
mean squared error for LinearRegression is  0.69
mean squared error for XGBRegressor is  0.74
mean squared error for GradientBoostingRegressor is  0.81
mean squared error for KNeighborsRegressor is  0.84
mean squared error for RandomForestRegressor is  0.98
mean squared error for DecisionTreeRegressor is  1.13
CPU times: total: 1.23 s
Wall time: 531 ms


# With `diamonds` dataset

In [8]:
# fetch seaborn's example "diamonds" dataset and preview its first five rows
diamonds = sns.load_dataset(name='diamonds')
diamonds.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [9]:
# inspect the column dtypes: cut, color and clarity are categorical and get encoded in the next cell
diamonds.dtypes

carat       float64
cut        category
color      category
clarity    category
depth       float64
table       float64
price         int64
x           float64
y           float64
z           float64
dtype: object

In [10]:
# separate the target (price) from the predictor columns
y = diamonds['price']
X = diamonds.drop(columns='price')

# integer-encode the three categorical predictors; the single encoder is
# re-fitted for every column, so afterwards it only remembers 'clarity'
# NOTE(review): the codes follow sorted label order (Good=1, Ideal=2, Premium=3
# in the preview below), which does not look like the grading order — confirm
# this is acceptable for the linear and distance-based models.
le = LabelEncoder()
for column in ['cut', 'color', 'clarity']:
    X[column] = le.fit_transform(X[column])
X.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.23,2,1,3,61.5,55.0,3.95,3.98,2.43
1,0.21,3,1,2,59.8,61.0,3.89,3.84,2.31
2,0.23,1,1,4,56.9,65.0,4.05,4.07,2.31
3,0.29,3,5,5,62.4,58.0,4.2,4.23,2.63
4,0.31,1,6,3,63.3,58.0,4.34,4.35,2.75


In [11]:
%%time
# train test split the data
X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create a dictionaries of list of model to evaluate the performance
models = {
    'LinearRegression' : LinearRegression(),
    'SVR' : SVR(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'RandomForestRegressor': RandomForestRegressor(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'GradientBoostingRegressor': GradientBoostingRegressor(),
    'XGBRegressor': XGBRegressor()
}

# train and predict each model with evaluation metrics
model_scores = []
for name, model in models.items():
    # fit each model from models on training data
    model.fit(X_train, y_train)

    # make prediction on each model
    y_pred = model.predict(X_test)
    metric = r2_score(y_test, y_pred)
    model_scores.append((name, metric))

# selecting the best model from all above models with evalution metrics and sorting the value
sorted_models = sorted(model_scores, key=lambda x: x[1], reverse=False)
for model in sorted_models:
    print('r2 score for', f"{model[0]} is {model[1]: .2f}")

r2 score for SVR is -0.12
r2 score for LinearRegression is  0.89
r2 score for KNeighborsRegressor is  0.95
r2 score for DecisionTreeRegressor is  0.97
r2 score for GradientBoostingRegressor is  0.97
r2 score for XGBRegressor is  0.98
r2 score for RandomForestRegressor is  0.98
CPU times: total: 3min 51s
Wall time: 3min 54s
