#  Hyperparameter Tuning Techniques

In [4]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [5]:
df = pd.read_csv('penguins.csv')

In [6]:
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


### Dropping rows with missing values

In [8]:
df_cleaned = df.dropna()

In [10]:
df_cleaned

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,MALE
...,...,...,...,...,...,...,...
338,Gentoo,Biscoe,47.2,13.7,214.0,4925.0,FEMALE
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,FEMALE
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,MALE
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,FEMALE


### Encoding categorical variables

In [11]:
le_species = LabelEncoder()
le_island = LabelEncoder()
le_sex = LabelEncoder()

In [13]:
df_cleaned['species_encoded'] = le_species.fit_transform(df_cleaned['species'])
df_cleaned['island_encoded'] = le_island.fit_transform(df_cleaned['island'])
df_cleaned['sex_encoded'] = le_sex.fit_transform(df_cleaned['sex'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['species_encoded'] = le_species.fit_transform(df_cleaned['species'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['island_encoded'] = le_island.fit_transform(df_cleaned['island'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['sex_encoded'] = le_sex.fit_transform

### Features and target

In [14]:
features = df_cleaned[['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g', 'island_encoded', 'sex_encoded']]
target = df_cleaned['species_encoded']

### Standardizing the features

In [15]:
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

### Splitting the data

In [16]:
X_train, X_test, y_train, y_test = train_test_split(features_scaled, target, test_size=0.2, random_state=42, stratify=target)

In [17]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((266, 6), (67, 6), (266,), (67,))

### Define the model

In [18]:
rf = RandomForestClassifier(random_state=42)

In [19]:
rf

### Define the hyperparameters grid

In [20]:
param_grid = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

### Apply Grid Search

In [21]:
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


### Best parameters and best score

In [22]:
best_params_grid_search = grid_search.best_params_
best_score_grid_search = grid_search.best_score_

In [23]:
best_params_grid_search

{'max_depth': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 50}

In [24]:
best_score_grid_search

0.9811320754716981

# Mini Project 

In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [40]:
df = pd.read_csv('heart.csv')

In [41]:
df.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [42]:
df.shape

(303, 14)

In [43]:
X = df.iloc[:, 0: -1]
y = df.iloc[:, -1]

In [44]:
X

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3


In [45]:
y

0      1
1      1
2      1
3      1
4      1
      ..
298    0
299    0
300    0
301    0
302    0
Name: output, Length: 303, dtype: int64

### Split the data

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [47]:
X_train.shape

(242, 13)

In [48]:
X_test.shape

(61, 13)

### Define the models

In [50]:
rf = RandomForestClassifier()
gb = GradientBoostingClassifier()
svc = SVC()
lr = LogisticRegression()

In [52]:
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_test, y_pred)

0.8524590163934426

In [53]:
gb.fit(X_train, y_train)
y_pred = gb.predict(X_test)
accuracy_score(y_test, y_pred)

0.7704918032786885

In [54]:
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
accuracy_score(y_test, y_pred)

0.7049180327868853

In [55]:
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
accuracy_score(y_test, y_pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8852459016393442

In [57]:
rf = RandomForestClassifier(max_samples = 0.75, random_state = 42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_test, y_pred)

0.9016393442622951

### Cross Validation Score

In [63]:
from sklearn .model_selection import cross_val_score
np.mean(cross_val_score(RandomForestClassifier(), X, y, cv = 10, scoring = 'accuracy'))

0.8180645161290323

In [64]:
np.mean(cross_val_score(RandomForestClassifier(max_samples = 0.75), X, y, cv = 10, scoring = 'accuracy'))

0.821505376344086

### Hyper Parameter Tunning

#### GridSearchCV (One of the methods to tun hperparamters)

#### No of trees in Random Forest

In [65]:
n_estimators = [20, 60, 100, 120]

#### No of features to consider at every split

In [66]:
max_features = [0, 2, 0.6, 1.0]

#### MAx No of levels in tree

In [68]:
max_depth = [2, 8, None]

#### No of samples

In [70]:
max_samples = [0.5, 0.75, 1.0]

In [71]:
param_grid = {'n_estimators': n_estimators,
              'max_features': max_features,
              'max_depth': max_depth,
              'max_samples': max_samples
             }



In [72]:
param_grid

{'n_estimators': [20, 60, 100, 120],
 'max_features': [0, 2, 0.6, 1.0],
 'max_depth': [2, 8, None],
 'max_samples': [0.5, 0.75, 1.0]}

In [73]:
rf = RandomForestClassifier()

In [77]:
from sklearn.model_selection import GridSearchCV

In [78]:
rf_grid = GridSearchCV (estimator = rf,
                        param_grid = param_grid,
                        cv = 5,
                        verbose = 2,
                        n_jobs = -1)

In [79]:
rf_grid.fit(X_train, y_train)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


180 fits failed out of a total of 720.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
63 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Fahan\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Fahan\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py", line 340, in fit
    self._validate_params()
  File "C:\Users\Fahan\anaconda3\Lib\site-packages\sklearn\base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\Fahan\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 97, in validate_parameter_constraints
    raise InvalidParam

### Best parameters

In [81]:
rf_grid.best_params_

{'max_depth': 2, 'max_features': 2, 'max_samples': 0.75, 'n_estimators': 60}

### Best Score

In [82]:
rf_grid.best_score_

0.8387755102040817

### RandomSearchCV ( Another Method) 

In [92]:
n_estimators = [20, 60, 100, 120]
max_features = [0, 2, 0.6, 1.0]
max_depth = [2, 8, None]
max_samples = [0.5, 0.75, 1.0]
bootstrap = [True, False]
min_samples_split = [2, 5]
min_samples_leaf = [1, 2]

In [93]:
param_grid = {'n_estimators': n_estimators,
              'max_features': max_features,
              'max_depth': max_depth,
              'max_samples': max_samples,
              'bootstrap': bootstrap,
              'min_samples_split': min_samples_split,
              'min_samples_leaf': min_samples_leaf
             }

In [94]:
param_grid

{'n_estimators': [20, 60, 100, 120],
 'max_features': [0, 2, 0.6, 1.0],
 'max_depth': [2, 8, None],
 'max_samples': [0.5, 0.75, 1.0],
 'bootstrap': [True, False],
 'min_samples_split': [2, 5],
 'min_samples_leaf': [1, 2]}

In [95]:
from sklearn.model_selection import RandomizedSearchCV

In [96]:
rf_grid = GridSearchCV (estimator = rf,
                        param_grid = param_grid,
                        cv = 5,
                        verbose = 2,
                        n_jobs = -1)

In [97]:
rf_grid.fit(X_train, y_train)

Fitting 5 folds for each of 1152 candidates, totalling 5760 fits


3600 fits failed out of a total of 5760.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
680 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Fahan\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Fahan\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py", line 340, in fit
    self._validate_params()
  File "C:\Users\Fahan\anaconda3\Lib\site-packages\sklearn\base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\Fahan\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 97, in validate_parameter_constraints
    raise InvalidPa

In [98]:
rf_grid.best_params_

{'bootstrap': True,
 'max_depth': None,
 'max_features': 2,
 'max_samples': 1.0,
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 20}

In [99]:
rf_grid.best_score_

0.8427721088435375