# Tuning Hyperparameters

## Hyperparameter tuning means changing a model's parameters (called hyperparameters) to enhance the overall model performance and get the best possible scores

### Now, instead of only the train and test datasets, we also need a third one, called the validation dataset, for tuning hyperparameters

# Hyperparameters can be changed:
## 1. Manually
## 2. Randomly with RandomizedSearchCV
## 3. Exhaustively with GridSearchCV

In [1]:
import pandas as pd
import numpy as np

In [2]:
dataset = pd.read_csv("./../heart.csv")

# Now, for Manual Hyperparameter Tuning, we will first create a helper function that evaluates a model's predictions!

In [3]:
from sklearn.metrics import accuracy_score, precision_score,recall_score ,f1_score
def evaluate_metrics(Y_actual,Y_pred):
    """Compute, print and return the classification scores of a prediction.

    Accuracy, precision, recall and F1 are computed with scikit-learn's
    default averaging mode, so the labels are presumably binary (confirm
    against the ``target`` column of the dataset).

    Args:
        Y_actual: Ground-truth labels.
        Y_pred: Labels predicted by the model, aligned with ``Y_actual``.

    Returns:
        dict with the unrounded scores under the keys ``"accuracy"``,
        ``"precision"``, ``"recall"`` and ``"f1"``.
    """
    accuracy = accuracy_score(Y_actual, Y_pred)
    # zero_division=0: report 0 instead of warning when a denominator is
    # zero (e.g. the model predicts no positive sample at all).
    precision = precision_score(Y_actual, Y_pred, zero_division=0)
    recall = recall_score(Y_actual, Y_pred, zero_division=0)
    # F1 is built from precision and recall, so it gets the same
    # zero-division policy as the two calls above.
    f1 = f1_score(Y_actual, Y_pred, zero_division=0)

    scores = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

    # Human-readable summary, rounded to two decimals (the returned
    # dictionary keeps full precision).
    print(f"Accuracy:{round(accuracy,2)}")
    print(f"precision:{round(precision,2)}")
    print(f"recall:{round(recall,2)}")
    print(f"f1_score:{round(f1,2)}")

    return scores
    

## Step 1 of Hyperparameter Tuning (split the dataset)

In [4]:
from sklearn.model_selection import train_test_split

# Shuffled copy of the data. It is not used for the split below
# (train_test_split shuffles on its own); the RandomizedSearchCV section
# further down builds its X/Y from it.
mixed_dataset = dataset.sample(frac=1)

# Features (X) and labels (Y)
X = dataset.drop("target", axis=1)
Y = dataset["target"]

# Target proportions: train 70%, valid 15%, test 15%.
# Step 1: hold out the 15% test set. It is bound to X_test/Y_test so that it
# is not overwritten by the train/valid split in step 2.
X_temp, X_test, Y_temp, Y_test = train_test_split(
    X, Y, test_size=0.15, random_state=42, stratify=Y
)

# Step 2: split the remaining 85% into train and valid.
# 0.1765 * 0.85 ~= 0.15, which leaves ~70% of the full data for training.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    X_temp, Y_temp, test_size=0.1765, random_state=42, stratify=Y_temp
)

# Baseline/Initial Results without Tuning

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Seed NumPy's global RNG before building the forest, which is created
# without a random_state of its own.
np.random.seed(5)

# Untuned reference model: every hyperparameter is left at its default.
rfc_model = RandomForestClassifier().fit(X_train, Y_train)

# Baseline scores, measured on the validation split.
y_predicted = rfc_model.predict(X_valid)
result = evaluate_metrics(Y_valid, y_predicted)

Accuracy:0.76
precision:0.75
recall:0.84
f1_score:0.79


# 1-Manually Tuning Hyperparameter

In [7]:
# Same seed as the baseline cell.
np.random.seed(5)

# Hand-picked hyperparameters (n_estimators defaults to 100).
rfc_model2 = RandomForestClassifier(n_estimators=7, max_depth=9).fit(X_train, Y_train)

# Scores of the manually tuned model on the validation split.
y_predicted = rfc_model2.predict(X_valid)
result = evaluate_metrics(Y_valid, y_predicted)

Accuracy:0.8
precision:0.77
recall:0.92
f1_score:0.84


# 2- Tuning Using RandomizedSearchCV (RSCV)

In [15]:
# First, create a dictionary with the candidate values of every
# hyperparameter the search is allowed to change.
h_params = {
    "max_depth": [None, 5, 10, 20, 30],
    "n_estimators": [10, 100, 200, 500, 1000],
    "min_samples_split": [2, 4, 6],
}

# Features and labels, taken from the shuffled copy of the dataset.
X = mixed_dataset.drop(columns="target")
Y = mixed_dataset["target"]

# 75% train / 25% test. NOTE: no random_state is given, so this split
# differs from run to run.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25)

# Estimator whose hyperparameters will be searched.
clf = RandomForestClassifier(n_jobs=1)

In [16]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized search over h_params: 5 sampled combinations (n_iter),
# each scored with 3-fold cross-validation (cv), with per-fit logging.
rscv = RandomizedSearchCV(
    estimator=clf,
    param_distributions=h_params,
    n_iter=5,
    cv=3,
    verbose=2,
)

In [17]:
# Run the search on the training split: 5 sampled candidates x 3 folds = 15 fits.
# The best candidate is then refit on all of x_train (refit=True, see the output below).
rscv.fit(x_train,y_train)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] END .max_depth=5, min_samples_split=6, n_estimators=100; total time=   0.3s
[CV] END .max_depth=5, min_samples_split=6, n_estimators=100; total time=   0.2s
[CV] END .max_depth=5, min_samples_split=6, n_estimators=100; total time=   0.2s
[CV] END max_depth=20, min_samples_split=4, n_estimators=200; total time=   0.6s
[CV] END max_depth=20, min_samples_split=4, n_estimators=200; total time=   0.5s
[CV] END max_depth=20, min_samples_split=4, n_estimators=200; total time=   0.6s
[CV] END max_depth=30, min_samples_split=6, n_estimators=1000; total time=   3.4s
[CV] END max_depth=30, min_samples_split=6, n_estimators=1000; total time=   3.2s
[CV] END max_depth=30, min_samples_split=6, n_estimators=1000; total time=   3.0s
[CV] END max_depth=20, min_samples_split=6, n_estimators=1000; total time=   3.0s
[CV] END max_depth=20, min_samples_split=6, n_estimators=1000; total time=   3.0s
[CV] END max_depth=20, min_samples_split=6, 

0,1,2
,estimator,RandomForestC...fier(n_jobs=1)
,param_distributions,"{'max_depth': [None, 5, ...], 'min_samples_split': [2, 4, ...], 'n_estimators': [10, 100, ...]}"
,n_iter,5
,scoring,
,n_jobs,
,refit,True
,cv,3
,verbose,2
,pre_dispatch,'2*n_jobs'
,random_state,

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,5
,min_samples_split,6
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [19]:
# Hyperparameter combination that scored best among the sampled candidates
rscv.best_params_

{'n_estimators': 100, 'min_samples_split': 6, 'max_depth': 5}

In [20]:
# Now check the scores obtained with best_params_ on the held-out test split.
# 1. Predict with the fitted search object (it was refit with the best
#    parameters, refit=True).
rscv_pred_y_labels = rscv.predict(x_test)
# 2. Score those predictions with the evaluate_metrics helper.
rscv_result = evaluate_metrics(y_test,rscv_pred_y_labels)

Accuracy:0.83
precision:0.8
recall:0.87
f1_score:0.84


# Compare the baseline (untuned) results vs the RSCV-tuned results
## Without RSCV (baseline model, default hyperparameters)
* Accuracy:0.76
* precision:0.75
* recall:0.84
* f1_score:0.79
## With RSCV
* Accuracy:0.83
* precision:0.8
* recall:0.87
* f1_score:0.84

### REMEMBER:
### The given hyperparameter values allow 5 × 5 × 3 = 75 possible combinations, but RandomizedSearchCV tried only 5 randomly sampled combinations (n_iter=5), each with 3-fold cross-validation, i.e. fifteen fits in total
### so in order to check the model for all of the possible combinations we have another concept/technique called:
# Grid Search CV