<a href="https://colab.research.google.com/github/sb2356-iiitr/ML_Projects/blob/main/HyperparameterTuning/HPTuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Hyperparameter Tuning
Compare multiple classifiers for different train and test values

In [2]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving hpt_small.csv to hpt_small.csv
User uploaded file "hpt_small.csv" with length 29709 bytes


### Part 1: Import data and prepare the models

In [5]:
# Import Libraries
import pandas as pd

In [6]:
# Read dataset
data = pd.read_csv('hpt_small.csv')

In [7]:
# Create dumy variables
data_prep = pd.get_dummies(data, drop_first=True)

In [8]:
# Create X and Y variables
X = data_prep.iloc[:, :-1]
Y = data_prep.iloc[:, -1]

In [9]:
# Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(random_state = 1234)

In [10]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state = 1234)

In [11]:
# Support vector
from sklearn.svm import SVC
svc = SVC(kernel='rbf', gamma=0.5)

In [12]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
lrc = LogisticRegression(random_state=1234)

In [13]:
# Cross Validation
from sklearn.model_selection import cross_validate
cv_result_dtc = cross_validate(dtc, X, Y, cv=10, return_train_score=True)
cv_result_rfc = cross_validate(rfc, X, Y, cv=10, return_train_score=True)
cv_result_svc = cross_validate(svc, X, Y, cv=10, return_train_score=True)
cv_result_lrc = cross_validate(lrc, X, Y, cv=10, return_train_score=True)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [14]:
# Get average of all results
import numpy as np
dtc_test_avg = np.average(cv_result_dtc['test_score'])
rfc_test_avg = np.average(cv_result_rfc['test_score'])
svc_test_avg = np.average(cv_result_svc['test_score'])
lrc_test_avg = np.average(cv_result_lrc['test_score'])

dtc_train_avg = np.average(cv_result_dtc['train_score'])
rfc_train_avg = np.average(cv_result_rfc['train_score'])
svc_train_avg = np.average(cv_result_svc['train_score'])
lrc_train_avg = np.average(cv_result_lrc['train_score'])


In [15]:
# Print and analyse the results
print('         ', 'Decision Tree  ', 'Random Forest  ', 'Support Vector Machines',
      'Logistic Regression  ')
print('         ', '---------------', '---------------', '-----------------------',
      '---------------------')
print(' Test  : ',
      round(dtc_test_avg, 4), '         ',
      round(rfc_test_avg, 4), '         ',
      round(svc_test_avg, 4), '         ',
      round(lrc_test_avg, 4))
print(' Train : ',
      round(dtc_train_avg, 4), '         ',
      round(rfc_train_avg, 4), '         ',
      round(svc_train_avg, 4), '         ',
      round(lrc_train_avg, 4))


          Decision Tree   Random Forest   Support Vector Machines Logistic Regression  
          --------------- --------------- ----------------------- ---------------------
 Test  :  0.75           0.776           0.734           0.814
 Train :  0.984           0.9836           0.9549           0.8267


In [16]:
svc = SVC(random_state=1234)

### Part 2: Create GridSearchCV object

In [17]:
# GridSearchCV
from sklearn.model_selection import GridSearchCV

In [18]:
# Create parameter grid for RandomForest
rfc_param = {'n_estimators': [10, 15, 20],
             'min_samples_split': [8, 16],
             'min_samples_leaf': [1, 2, 3, 4, 5]}

Diferent possible combos of parameters:
3 x 2 x 5 = 30 models

In [19]:
rfc_grid = GridSearchCV(estimator=rfc,
                        param_grid=rfc_param,
                        scoring='accuracy',
                        cv=10,
                        return_train_score=True)

No. of jobs = 30 * 10 = 300

### Part 3: Fit the data into GridSearchCV object

In [20]:
rfc_grid_fit = rfc_grid.fit(X, Y)

In [21]:
# Get the result of the gridsearchcv
cv_result_rfc = pd.DataFrame.from_dict(rfc_grid_fit.cv_results_)
print(cv_result_rfc)

    mean_fit_time  std_fit_time  ...  mean_train_score  std_train_score
0        0.018706      0.001781  ...          0.909111         0.006242
1        0.026949      0.002767  ...          0.909556         0.006049
2        0.033527      0.000865  ...          0.913778         0.004848
3        0.018467      0.002039  ...          0.867111         0.008766
4        0.027347      0.002724  ...          0.869778         0.006061
5        0.034048      0.002388  ...          0.876667         0.007252
6        0.018071      0.001153  ...          0.878000         0.005917
7        0.025841      0.001366  ...          0.879333         0.007374
8        0.034480      0.001994  ...          0.883111         0.004683
9        0.017300      0.000365  ...          0.853778         0.007749
10       0.026597      0.002948  ...          0.853556         0.010997
11       0.033072      0.000703  ...          0.859333         0.008494
12       0.018517      0.002253  ...          0.866000         0

### Part 4: GridSearchCV using Logistic Regression

In [22]:
# Implement the GridSearch for Logistic Regression
lrc_param = {'C': [0.01, 0.1, 0.5, 1, 2, 5, 10],
             'penalty': ['l2'],
             'solver': ['liblinear', 'lbfgs', 'saga']}

Number of combinations = 7 X 1 X 3 = 21  
Number of jobs = 21 X 10 folds = 210

In [23]:
# Create the grid search object
lrc_grid = GridSearchCV(estimator=lrc,
                        param_grid=lrc_param,
                        scoring='accuracy',
                        cv=10,
                        return_train_score=True)

In [24]:
# Fit the data
lrc_grid_test = lrc_grid.fit(X, Y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [25]:
# Get the result of the gridsearchcv
cv_result_lrc = pd.DataFrame.from_dict(rfc_grid_fit.cv_results_)

In [26]:
cv_result_lrc.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,mean_train_score,std_train_score
0,0.018706,0.001781,0.002483,8e-05,1,8,10,"{'min_samples_leaf': 1, 'min_samples_split': 8...",0.74,0.82,0.76,0.78,0.78,0.78,0.86,0.76,0.8,0.74,0.782,0.035157,28,0.911111,0.904444,0.924444,0.906667,0.913333,0.908889,0.902222,0.902222,0.911111,0.906667,0.909111,0.006242
1,0.026949,0.002767,0.003014,0.000146,1,8,15,"{'min_samples_leaf': 1, 'min_samples_split': 8...",0.74,0.84,0.76,0.78,0.76,0.76,0.82,0.8,0.8,0.74,0.78,0.032249,29,0.913333,0.897778,0.911111,0.906667,0.917778,0.908889,0.904444,0.913333,0.904444,0.917778,0.909556,0.006049
2,0.033527,0.000865,0.003728,0.000644,1,8,20,"{'min_samples_leaf': 1, 'min_samples_split': 8...",0.72,0.82,0.76,0.78,0.78,0.74,0.82,0.76,0.8,0.76,0.774,0.031048,30,0.908889,0.904444,0.917778,0.917778,0.911111,0.911111,0.917778,0.92,0.911111,0.917778,0.913778,0.004848
3,0.018467,0.002039,0.002574,0.000434,1,16,10,"{'min_samples_leaf': 1, 'min_samples_split': 1...",0.76,0.84,0.76,0.8,0.8,0.82,0.8,0.78,0.74,0.76,0.786,0.029732,27,0.875556,0.855556,0.871111,0.882222,0.857778,0.857778,0.864444,0.871111,0.875556,0.86,0.867111,0.008766
4,0.027347,0.002724,0.003573,0.000854,1,16,15,"{'min_samples_leaf': 1, 'min_samples_split': 1...",0.78,0.84,0.76,0.82,0.8,0.8,0.8,0.78,0.76,0.76,0.79,0.02569,23,0.88,0.86,0.873333,0.871111,0.864444,0.864444,0.864444,0.871111,0.871111,0.877778,0.869778,0.006061


### Part 4: GridSearchCV using Support Vector

In [27]:
# Define parameters for Support Vector Clasifiers
svc_param = {'C': [0.01, 0.1, 0.5, 1, 2, 5, 10],
             'kernel': ['rbf', 'linear'],
             'gamma': [0.1, 0.25, 0.5, 1, 5]}

Number of combinations = 7 X 2 X 5 = 70
Number of jobs = 70 X 10 folds = 700

In [28]:
svc_grid = GridSearchCV(estimator=svc,
                        param_grid=svc_param,
                        scoring='accuracy',
                        cv=10,
                        n_jobs=-1,
                        return_train_score=True)

In [29]:
# Fit the data to do Grid Search for Support Vector
svc_grid_fit = svc_grid.fit(X, Y)

In [30]:
# Get the result of the gridsearchcv
cv_result_svc = pd.DataFrame.from_dict(svc_grid_fit.cv_results_)
cv_result_svc.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,mean_train_score,std_train_score
0,0.018723,0.004846,0.003003,0.000762,0.01,0.1,rbf,"{'C': 0.01, 'gamma': 0.1, 'kernel': 'rbf'}",0.76,0.76,0.76,0.76,0.76,0.76,0.76,0.76,0.74,0.74,0.756,0.008,35,0.755556,0.755556,0.755556,0.755556,0.755556,0.755556,0.755556,0.755556,0.757778,0.757778,0.756,0.000889
1,0.01392,0.001998,0.00215,0.00012,0.01,0.1,linear,"{'C': 0.01, 'gamma': 0.1, 'kernel': 'linear'}",0.76,0.76,0.76,0.76,0.76,0.76,0.76,0.76,0.74,0.74,0.756,0.008,35,0.755556,0.755556,0.755556,0.755556,0.755556,0.755556,0.755556,0.755556,0.757778,0.757778,0.756,0.000889
2,0.018211,0.001544,0.002823,0.000149,0.01,0.25,rbf,"{'C': 0.01, 'gamma': 0.25, 'kernel': 'rbf'}",0.76,0.76,0.76,0.76,0.76,0.76,0.76,0.76,0.74,0.74,0.756,0.008,35,0.755556,0.755556,0.755556,0.755556,0.755556,0.755556,0.755556,0.755556,0.757778,0.757778,0.756,0.000889
3,0.012189,0.001261,0.001972,7.3e-05,0.01,0.25,linear,"{'C': 0.01, 'gamma': 0.25, 'kernel': 'linear'}",0.76,0.76,0.76,0.76,0.76,0.76,0.76,0.76,0.74,0.74,0.756,0.008,35,0.755556,0.755556,0.755556,0.755556,0.755556,0.755556,0.755556,0.755556,0.757778,0.757778,0.756,0.000889
4,0.020544,0.002358,0.004002,0.002089,0.01,0.5,rbf,"{'C': 0.01, 'gamma': 0.5, 'kernel': 'rbf'}",0.76,0.76,0.76,0.76,0.76,0.76,0.76,0.76,0.74,0.74,0.756,0.008,35,0.755556,0.755556,0.755556,0.755556,0.755556,0.755556,0.755556,0.755556,0.757778,0.757778,0.756,0.000889


In [32]:
# Get the top ranked test score for all the three classifiers
rfc_top_rank = cv_result_rfc[cv_result_rfc['rank_test_score']==1].iloc[0]
lrc_top_rank = cv_result_lrc[cv_result_lrc['rank_test_score']==1].iloc[0]
svc_top_rank = cv_result_svc[cv_result_svc['rank_test_score']==1].iloc[0]


In [34]:
# Print the train and test score for 3 classifiers
print('                   ',
      ' Random Forest     ',
      ' Logistic Regression   ',
      ' Support Vector    ')

print('                   ',
      ' ----------------  ',
      ' -------------------   ',
      ' ----------------- ')

print(' Mean Test Score  : ',
      str('%.4f' %rfc_top_rank['mean_test_score']),
      '                   ',
      str('%.4f' %lrc_top_rank['mean_test_score']),
      '                       ',
      str('%.4f' %svc_top_rank['mean_test_score']))

print(' Mean Train Score : ',
      str('%.4f' %rfc_top_rank['mean_train_score']),
      '                   ',
      str('%.4f' %lrc_top_rank['mean_train_score']),
      '                       ',
      str('%.4f' %svc_top_rank['mean_train_score']))

                     Random Forest       Logistic Regression     Support Vector    
                     ----------------    -------------------     ----------------- 
 Mean Test Score  :  0.8160                     0.8160                         0.8200
 Mean Train Score :  0.8564                     0.8564                         0.8324


In [35]:
# Import RnadomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [37]:
# Create the RandomizedSearchCV object
rfc_rs = RandomizedSearchCV(estimator=rfc,
                        param_distributions=rfc_param,
                        scoring='accuracy',
                        cv=10,
                        n_iter=10,
                        return_train_score=True,
                        random_state=1234)

In [38]:
# n_iter selects 10 combinations out of 30 possible
# Now 10 X 10 = 100 jobs will be created

In [39]:
# Fit the data into RandomizedSearchCV object
rfc_rs_fit = rfc_rs.fit(X, Y)

In [40]:
# Get the results of Randomized search
cv_result_rfc_rs = pd.DataFrame.from_dict(rfc_grid_fit.cv_results_)

In [41]:
# Print the best parameters of Randomized search
print('The best parameters are: ')
print(rfc_rs_fit.best_params_)

The best parameters are: 
{'n_estimators': 15, 'min_samples_split': 16, 'min_samples_leaf': 5}
