## Hyperparameter tuning techniques employed in machine learning

1. Randomized search CV
2. Grid search CV
3. Bayesian optimization --> Automated hyperparameter tuning (Hyperopt)
4. Sequential model-based optimization.
5. Optuna-Automated hyper parameter tuning
6. Genetic algorithms (TPOT Classifier)

In [None]:
# importing libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the diabetes CSV into a DataFrame and preview its first rows
dataset_path = '../input/pima-indians-diabetes-database/diabetes.csv'
data = pd.read_csv(dataset_path)
data.head(n=5)

In [None]:
# Shorten the long pedigree column name to 'DPF' (modifies `data` in place)
column_aliases = {'DiabetesPedigreeFunction': 'DPF'}
data.rename(columns=column_aliases, inplace=True)

In [None]:
# Preview the first five rows to confirm the column rename
data.head(n=5)

In [None]:
# check how many columns and rows are inside data
# (`shape` is a (rows, columns) tuple)
data.shape

In [None]:
# Count the missing (NaN) entries in every column
data.isna().sum()

In [None]:
# The dataset contains zero values that are later replaced with a median;
# print how many rows hold a zero in each of the affected columns.
for zero_column in ('Insulin', 'Glucose', 'SkinThickness'):
    zero_rows = data[data[zero_column] == 0]
    print(len(zero_rows))

In [None]:
# Histogram of SkinThickness before the zero values are replaced
data['SkinThickness'].plot(kind='hist')

In [None]:
# Histogram of Glucose before the zero values are replaced
data['Glucose'].plot(kind='hist')

In [None]:
# Histogram of Insulin before the zero values are replaced
data['Insulin'].plot(kind='hist')

In [None]:
# Fill zero values with the median.
# The zeros are the placeholders being replaced, so they are excluded when the
# median is computed; otherwise they would drag the fill value towards zero.
for zero_filled_column in ('SkinThickness', 'Insulin', 'Glucose'):
    is_zero = data[zero_filled_column] == 0
    observed_median = data.loc[~is_zero, zero_filled_column].median()
    data[zero_filled_column] = np.where(is_zero, observed_median, data[zero_filled_column])


In [None]:
# Preview the first five rows after the zero replacement
data.head(n=5)

In [None]:
# Replace zero entries in Pregnancies with the column median.
# NOTE(review): a pregnancy count of zero looks like a legitimate value rather
# than a missing-data placeholder - confirm this replacement is intended.
pregnancies = data['Pregnancies']
pregnancies_median = pregnancies.median()
data['Pregnancies'] = np.where(pregnancies == 0, pregnancies_median, pregnancies)

In [None]:
# Preview the first five rows after the Pregnancies replacement
data.head(n=5)

In [None]:
# Histogram of SkinThickness after the zero values were replaced
data['SkinThickness'].plot(kind='hist')

In [None]:
# Histogram of Insulin after the zero values were replaced
data['Insulin'].plot(kind='hist')

In [None]:
# Target: the 'Outcome' column; features: every column except the last one
y = data.loc[:, 'Outcome']
x = data.iloc[:, 0:-1]

In [None]:
# Preview the first five rows of the feature matrix
x.head(n=5)

In [None]:
# Split into train and test sets: 20% of the rows are held out, with a fixed seed
from sklearn.model_selection import train_test_split

split_parts = train_test_split(x, y, test_size=0.20, random_state=0)
x_train, x_test, y_train, y_test = split_parts

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline forest with 10 trees.
# Fit on the training split only: fitting on the full (x, y) would let the model
# see the held-out rows, so any score computed on x_test would be inflated.
rf = RandomForestClassifier(n_estimators=10)
rf.fit(x_train, y_train)

In [None]:
# Predict labels for the held-out rows and show the first five.
# Despite its name, `model` holds the predicted labels, not an estimator.
model = rf.predict(x_test)
model[0:5]

In [None]:
# Number of rows per target value in y
y.value_counts()

In [None]:
# Score the baseline predictions against the held-out labels
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix

for metric in (classification_report, confusion_matrix, accuracy_score):
    print(metric(y_test, model))

#### The main parameters used by a Random Forest Classifier are:

1. criterion = the function used to evaluate the quality of a split.
2. max_depth = maximum number of levels allowed in each tree.
3. max_features = maximum number of features considered when splitting a node.
4. min_samples_leaf = minimum number of samples which can be stored in a tree leaf.
5. min_samples_split = minimum number of samples necessary in a node to cause node splitting.
6. n_estimators = number of trees in the ensemble.

In [None]:
# Manual hyper parameter tuning: hand-picked settings instead of a search
manual_settings = {
    'n_estimators': 100,
    'criterion': 'entropy',
    'max_features': 'sqrt',
    'min_samples_leaf': 10,
    'random_state': 10,
}
rf_model = RandomForestClassifier(**manual_settings)
# fit() returns the estimator itself, so prediction can be chained onto it
pred = rf_model.fit(x_train, y_train).predict(x_test)

In [None]:
# Score the manually tuned model's predictions against the held-out labels
for metric in (classification_report, confusion_matrix, accuracy_score):
    print(metric(y_test, pred))

In [None]:
## Randomized Search CV
from sklearn.model_selection import RandomizedSearchCV
# Search space sampled by the randomized search over random-forest settings.

# Number of trees in the forest
n_estimators = [int(n) for n in np.linspace(start=300, stop=2000, num=10)]
# Number of features to consider at each split.
# 'auto' is omitted: for classifiers it was an alias of 'sqrt' and recent
# scikit-learn releases reject it.
max_features = ['sqrt', 'log2']
# Maximum number of levels in a tree
max_depth = [int(d) for d in np.linspace(10, 1000, num=10)]
# Minimum number of samples required to split a node (an integer must be >= 2)
min_samples_split = [2, 3, 5, 7, 9, 10, 12, 15]
# Minimum number of samples required at a leaf:
# integers are absolute counts, floats are fractions of the training rows
min_samples_leaf = [1, 3, 5, 7, 0.1, 0.5, 0.2, 10]

random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'criterion': ['entropy', 'gini'],
}


In [None]:
# Try 100 random combinations from random_grid, each scored with 5-fold CV
search_options = dict(cv=5, n_iter=100, verbose=2, n_jobs=-1, random_state=0)
rf = RandomForestClassifier()
rf_1 = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, **search_options)
rf_1.fit(x_train, y_train)

In [None]:
# Estimator refitted with the best parameter combination found by the search
rf_1.best_estimator_

In [None]:
# Parameter combination that scored best in the randomized search
rf_1.best_params_

In [None]:
# Keep a handle on the best estimator from the randomized search and display it
rf_best=rf_1.best_estimator_
rf_best

In [None]:
# Predict the held-out labels with the best randomized-search estimator
y_pred=rf_best.predict(x_test)

In [None]:
# Score the randomized-search model's predictions against the held-out labels
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, y_pred))

## GridSearchCV

In [None]:
# Best randomized-search parameters; the grid built next is centred on these
rf_1.best_params_

In [None]:
from sklearn.model_selection import GridSearchCV

# Build a narrow grid centred on the best randomized-search parameters.
best = rf_1.best_params_


def _around(center, offsets, lowest):
    """Return the candidate values ``center + offset`` that are valid.

    Values below ``lowest`` are dropped and duplicates are removed. A float
    ``center`` is a fraction of the samples rather than a count, so integer
    offsets make no sense for it; it is returned as the only candidate.
    """
    if isinstance(center, float):
        return [center]
    return sorted({center + step for step in offsets if center + step >= lowest})


params = {
    'criterion': [best['criterion']],
    'max_depth': [best['max_depth']],
    'max_features': [best['max_features']],
    # an integer min_samples_split must be at least 2
    'min_samples_split': _around(best['min_samples_split'], (-1, 0, 1, 2), lowest=2),
    # neighbours of the best min_samples_leaf (not of min_samples_split);
    # an integer min_samples_leaf must be at least 1
    'min_samples_leaf': _around(best['min_samples_leaf'], (-2, -1, 0, 1, 2), lowest=1),
    # a forest needs at least one tree
    'n_estimators': _around(best['n_estimators'], (-200, -100, 0, 100, 200), lowest=1),
}

print(params)
        

In [None]:
# Evaluate every combination in params with 5-fold cross-validation
rf = RandomForestClassifier()
grid_options = {'cv': 5, 'n_jobs': -1, 'verbose': 2}
grid_model = GridSearchCV(estimator=rf, param_grid=params, **grid_options)
grid_model.fit(x_train, y_train)

In [None]:
# Keep a handle on the best estimator from the grid search and display it
grid_best=grid_model.best_estimator_
grid_best

In [None]:
# Parameter combination that scored best in the grid search
grid_model.best_params_


In [None]:
# Predict the held-out labels with the best grid-search estimator
# and show the first five predictions
y_pred=grid_best.predict(x_test)
y_pred[:5]

In [None]:
# Score the grid-search model's predictions against the held-out labels
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(y_test, y_pred))