In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report , confusion_matrix , accuracy_score
from sklearn.model_selection import RandomizedSearchCV , GridSearchCV

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv("diabetes.csv")

In [4]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
df["Glucose"] = np.where(df["Glucose"] == 0 , df["Glucose"].median(), df["Glucose"])
df["Insulin"] = np.where(df["Insulin"] == 0 , df["Insulin"].median(), df["Insulin"])
df["SkinThickness"] = np.where(df["SkinThickness"] == 0 , df["SkinThickness"].median(), df["SkinThickness"])

In [6]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72,35.0,30.5,33.6,0.627,50,1
1,1,85.0,66,29.0,30.5,26.6,0.351,31,0
2,8,183.0,64,23.0,30.5,23.3,0.672,32,1
3,1,89.0,66,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40,35.0,168.0,43.1,2.288,33,1


In [7]:
x = df.drop("Outcome", axis=1)
y = df["Outcome"]

In [8]:
pd.DataFrame(x,columns=df.columns[:-1])

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148.0,72,35.0,30.5,33.6,0.627,50
1,1,85.0,66,29.0,30.5,26.6,0.351,31
2,8,183.0,64,23.0,30.5,23.3,0.672,32
3,1,89.0,66,23.0,94.0,28.1,0.167,21
4,0,137.0,40,35.0,168.0,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101.0,76,48.0,180.0,32.9,0.171,63
764,2,122.0,70,27.0,30.5,36.8,0.340,27
765,5,121.0,72,23.0,112.0,26.2,0.245,30
766,1,126.0,60,23.0,30.5,30.1,0.349,47


In [9]:
x_train , x_test , y_train , y_test = train_test_split(x, y, random_state=33, test_size=0.2)

In [10]:
rn = RandomForestClassifier(n_estimators=100).fit(x_train , y_train)

In [11]:
prediction = rn.predict(x_test)

In [12]:
print(confusion_matrix(y_test , prediction))
print(accuracy_score(y_test , prediction))
print(classification_report(y_test , prediction))

[[88 11]
 [25 30]]
0.7662337662337663
              precision    recall  f1-score   support

           0       0.78      0.89      0.83        99
           1       0.73      0.55      0.62        55

    accuracy                           0.77       154
   macro avg       0.76      0.72      0.73       154
weighted avg       0.76      0.77      0.76       154



The main parameters used by a Random Forest Classifier are:

- criterion = the function used to evaluate the quality of a split.
- max_depth = maximum number of levels allowed in each tree.
- max_features = maximum number of features considered when splitting a node.
- min_samples_leaf = minimum number of samples which can be stored in a tree leaf.
- min_samples_split = minimum number of samples necessary in a node to cause node splitting.
- n_estimators = number of trees in the ensamble.

In [13]:
model = RandomForestClassifier(n_estimators=500 , criterion="gini", max_features="sqrt", max_depth=15 , min_samples_leaf=10 , random_state=100).fit(x_train , y_train)

In [14]:
pred = model.predict(x_test) 

In [15]:
print(confusion_matrix(y_test, pred))
print(accuracy_score(y_test , pred))
print(classification_report(y_test ,pred))

[[87 12]
 [28 27]]
0.7402597402597403
              precision    recall  f1-score   support

           0       0.76      0.88      0.81        99
           1       0.69      0.49      0.57        55

    accuracy                           0.74       154
   macro avg       0.72      0.68      0.69       154
weighted avg       0.73      0.74      0.73       154



In [16]:
n_estimators = [int(x) for x in np.linspace(start=200 , stop=2000 , num=10)]

criterion = ["gini", "entropy", "log_loss"]

max_features = ["sqrt", "log2"]

max_depth = [int(x) for x in np.linspace(10 , 1000 , 10)]

min_samples_leaf = [1, 2, 4,6,8]

min_samples_split = [2, 5, 10,14]

random_grid = {
    "n_estimators": n_estimators, 
    "criterion": criterion,
    "max_features": max_features,
    "max_depth": max_depth,
    "min_samples_leaf": min_samples_leaf,
    "min_samples_split": min_samples_split
}

In [17]:
random_grid

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
 'criterion': ['gini', 'entropy', 'log_loss'],
 'max_features': ['sqrt', 'log2'],
 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000],
 'min_samples_leaf': [1, 2, 4, 6, 8],
 'min_samples_split': [2, 5, 10, 14]}

In [18]:
rf = RandomForestClassifier()
rf_randomcv = RandomizedSearchCV(estimator=rf, param_distributions=random_grid ,n_iter=100 , cv=3 , verbose=2 , random_state=100 , n_jobs=1 )

In [19]:
rf_randomcv.fit(x_train , y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END criterion=entropy, max_depth=450, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END criterion=entropy, max_depth=450, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END criterion=entropy, max_depth=450, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END criterion=entropy, max_depth=780, max_features=sqrt, min_samples_leaf=6, min_samples_split=5, n_estimators=1400; total time=   2.8s
[CV] END criterion=entropy, max_depth=780, max_features=sqrt, min_samples_leaf=6, min_samples_split=5, n_estimators=1400; total time=   2.8s
[CV] END criterion=entropy, max_depth=780, max_features=sqrt, min_samples_leaf=6, min_samples_split=5, n_estimators=1400; total time=   2.8s
[CV] END criterion=log_loss, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_

[CV] END criterion=entropy, max_depth=450, max_features=sqrt, min_samples_leaf=8, min_samples_split=2, n_estimators=2000; total time=   3.8s
[CV] END criterion=entropy, max_depth=450, max_features=sqrt, min_samples_leaf=8, min_samples_split=2, n_estimators=2000; total time=   3.8s
[CV] END criterion=entropy, max_depth=560, max_features=log2, min_samples_leaf=1, min_samples_split=14, n_estimators=200; total time=   0.4s
[CV] END criterion=entropy, max_depth=560, max_features=log2, min_samples_leaf=1, min_samples_split=14, n_estimators=200; total time=   0.4s
[CV] END criterion=entropy, max_depth=560, max_features=log2, min_samples_leaf=1, min_samples_split=14, n_estimators=200; total time=   0.4s
[CV] END criterion=gini, max_depth=670, max_features=sqrt, min_samples_leaf=1, min_samples_split=14, n_estimators=1000; total time=   1.8s
[CV] END criterion=gini, max_depth=670, max_features=sqrt, min_samples_leaf=1, min_samples_split=14, n_estimators=1000; total time=   1.8s
[CV] END criterio

[CV] END criterion=gini, max_depth=890, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=1800; total time=   4.3s
[CV] END criterion=gini, max_depth=890, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=1800; total time=   3.7s
[CV] END criterion=gini, max_depth=890, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=1800; total time=   3.7s
[CV] END criterion=gini, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=600; total time=   1.1s
[CV] END criterion=gini, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=600; total time=   1.1s
[CV] END criterion=gini, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=600; total time=   1.1s
[CV] END criterion=gini, max_depth=230, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=2000; total time=   3.7s
[CV] END criterion=gini, max_depth=230, 

[CV] END criterion=entropy, max_depth=120, max_features=sqrt, min_samples_leaf=6, min_samples_split=5, n_estimators=800; total time=   1.5s
[CV] END criterion=gini, max_depth=450, max_features=sqrt, min_samples_leaf=6, min_samples_split=5, n_estimators=1800; total time=   3.3s
[CV] END criterion=gini, max_depth=450, max_features=sqrt, min_samples_leaf=6, min_samples_split=5, n_estimators=1800; total time=   3.3s
[CV] END criterion=gini, max_depth=450, max_features=sqrt, min_samples_leaf=6, min_samples_split=5, n_estimators=1800; total time=   3.3s
[CV] END criterion=entropy, max_depth=10, max_features=log2, min_samples_leaf=8, min_samples_split=2, n_estimators=2000; total time=   4.0s
[CV] END criterion=entropy, max_depth=10, max_features=log2, min_samples_leaf=8, min_samples_split=2, n_estimators=2000; total time=   4.0s
[CV] END criterion=entropy, max_depth=10, max_features=log2, min_samples_leaf=8, min_samples_split=2, n_estimators=2000; total time=   4.0s
[CV] END criterion=log_los

[CV] END criterion=entropy, max_depth=120, max_features=sqrt, min_samples_leaf=8, min_samples_split=14, n_estimators=400; total time=   0.7s
[CV] END criterion=entropy, max_depth=120, max_features=sqrt, min_samples_leaf=8, min_samples_split=14, n_estimators=400; total time=   0.8s
[CV] END criterion=log_loss, max_depth=230, max_features=sqrt, min_samples_leaf=1, min_samples_split=14, n_estimators=1200; total time=   3.0s
[CV] END criterion=log_loss, max_depth=230, max_features=sqrt, min_samples_leaf=1, min_samples_split=14, n_estimators=1200; total time=   2.8s
[CV] END criterion=log_loss, max_depth=230, max_features=sqrt, min_samples_leaf=1, min_samples_split=14, n_estimators=1200; total time=   3.0s
[CV] END criterion=gini, max_depth=780, max_features=sqrt, min_samples_leaf=6, min_samples_split=2, n_estimators=400; total time=   0.7s
[CV] END criterion=gini, max_depth=780, max_features=sqrt, min_samples_leaf=6, min_samples_split=2, n_estimators=400; total time=   0.7s
[CV] END criter

[CV] END criterion=log_loss, max_depth=560, max_features=sqrt, min_samples_leaf=1, min_samples_split=14, n_estimators=1600; total time=   3.2s
[CV] END criterion=log_loss, max_depth=560, max_features=sqrt, min_samples_leaf=1, min_samples_split=14, n_estimators=1600; total time=   3.4s
[CV] END criterion=log_loss, max_depth=560, max_features=sqrt, min_samples_leaf=1, min_samples_split=14, n_estimators=1600; total time=   4.2s
[CV] END criterion=entropy, max_depth=230, max_features=sqrt, min_samples_leaf=4, min_samples_split=14, n_estimators=1000; total time=   2.6s
[CV] END criterion=entropy, max_depth=230, max_features=sqrt, min_samples_leaf=4, min_samples_split=14, n_estimators=1000; total time=   2.2s
[CV] END criterion=entropy, max_depth=230, max_features=sqrt, min_samples_leaf=4, min_samples_split=14, n_estimators=1000; total time=   1.9s


In [20]:
rf_randomcv.best_params_

{'n_estimators': 800,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 890,
 'criterion': 'gini'}

In [21]:
rf_randomcv.best_estimator_

In [22]:
best_random_grid = rf_randomcv.best_estimator_

In [23]:
y_pred = best_random_grid.predict(x_test)

In [24]:
print(confusion_matrix(y_test ,y_pred))
print("accurcay scorce is:" , accuracy_score(y_test , y_pred))
print(classification_report(y_test , y_pred))

[[83 16]
 [22 33]]
accurcay scorce is: 0.7532467532467533
              precision    recall  f1-score   support

           0       0.79      0.84      0.81        99
           1       0.67      0.60      0.63        55

    accuracy                           0.75       154
   macro avg       0.73      0.72      0.72       154
weighted avg       0.75      0.75      0.75       154



### GridSearchCV

In [25]:
rf_randomcv.best_params_

{'n_estimators': 800,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 890,
 'criterion': 'gini'}

In [26]:
param_grid = {
    "criterion": [rf_randomcv.best_params_["criterion"]],
    "max_depth": [rf_randomcv.best_params_["max_depth"]],
    "min_samples_leaf": [rf_randomcv.best_params_["min_samples_leaf"],
                        rf_randomcv.best_params_["min_samples_leaf"]+2,
                        rf_randomcv.best_params_["min_samples_leaf"]+4],
    "min_samples_split": [rf_randomcv.best_params_["min_samples_split"]-2,
                         rf_randomcv.best_params_["min_samples_split"]-1,
                         rf_randomcv.best_params_["min_samples_split"],
                         rf_randomcv.best_params_["min_samples_split"]+2,
                         rf_randomcv.best_params_["min_samples_split"]+1],
    "max_features": [rf_randomcv.best_params_["max_features"]],
    "n_estimators": [rf_randomcv.best_params_["n_estimators"]-200,
                    rf_randomcv.best_params_["n_estimators"]-100,
                    rf_randomcv.best_params_["n_estimators"],
                    rf_randomcv.best_params_["n_estimators"]+200,
                    rf_randomcv.best_params_["n_estimators"]+100]
}

In [27]:
param_grid

{'criterion': ['gini'],
 'max_depth': [890],
 'min_samples_leaf': [1, 3, 5],
 'min_samples_split': [3, 4, 5, 7, 6],
 'max_features': ['sqrt'],
 'n_estimators': [600, 700, 800, 1000, 900]}

In [28]:
rf = RandomForestClassifier()
grid_search = GridSearchCV(estimator=rf , param_grid=param_grid , cv=10 , n_jobs=-1, verbose=2)

In [29]:
grid_search.fit(x_train , y_train)

Fitting 10 folds for each of 75 candidates, totalling 750 fits


In [30]:
grid_search.best_estimator_

In [31]:
best_grid = grid_search.best_estimator_

In [32]:
best_grid

In [33]:
y_pred = best_grid.predict(x_test)

In [34]:
print(confusion_matrix(y_test , y_pred))
print(accuracy_score(y_test , y_pred ))
print(classification_report(y_test , y_pred))

[[86 13]
 [24 31]]
0.7597402597402597
              precision    recall  f1-score   support

           0       0.78      0.87      0.82        99
           1       0.70      0.56      0.63        55

    accuracy                           0.76       154
   macro avg       0.74      0.72      0.72       154
weighted avg       0.75      0.76      0.75       154

