In [1]:
import warnings

In [2]:
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd
import numpy as np
data=pd.read_csv('diabetes.csv')

In [4]:
data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [5]:
data['Glucose']=np.where(data['Glucose']==0,data['Glucose'].median(),data['Glucose'])
data['Insulin']=np.where(data['Insulin']==0,data['Insulin'].median(),data['Insulin'])
data['SkinThickness']=np.where(data['SkinThickness']==0,data['SkinThickness'].median(),data['SkinThickness'])

In [6]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72,35.0,30.5,33.6,0.627,50,1
1,1,85.0,66,29.0,30.5,26.6,0.351,31,0
2,8,183.0,64,23.0,30.5,23.3,0.672,32,1
3,1,89.0,66,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40,35.0,168.0,43.1,2.288,33,1


In [8]:
#Independent & Dependent features

X=data.drop("Outcome",axis=1)
Y=data['Outcome']

In [9]:
print(X.head())
print(Y.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6    148.0             72           35.0     30.5  33.6   
1            1     85.0             66           29.0     30.5  26.6   
2            8    183.0             64           23.0     30.5  23.3   
3            1     89.0             66           23.0     94.0  28.1   
4            0    137.0             40           35.0    168.0  43.1   

   DiabetesPedigreeFunction  Age  
0                     0.627   50  
1                     0.351   31  
2                     0.672   32  
3                     0.167   21  
4                     2.288   33  
0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64


In [10]:
# Train Test split

from sklearn.model_selection import train_test_split

In [11]:
X_train, X_test, Y_train,Y_test= train_test_split(X,Y,test_size=0.2, random_state=33)

In [12]:
from sklearn.ensemble import RandomForestClassifier
rf_classifier=RandomForestClassifier(n_estimators=10).fit(X_train,Y_train)
prediction=rf_classifier.predict(X_test)

In [13]:
Y.value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [14]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
print(confusion_matrix(Y_test,prediction))
print(accuracy_score(Y_test,prediction))
print(classification_report(Y_test,prediction))

[[87 12]
 [27 28]]
0.7467532467532467
              precision    recall  f1-score   support

           0       0.76      0.88      0.82        99
           1       0.70      0.51      0.59        55

    accuracy                           0.75       154
   macro avg       0.73      0.69      0.70       154
weighted avg       0.74      0.75      0.74       154



    ##Randomized CV

In [18]:
from sklearn.model_selection import RandomizedSearchCV
#No of trees in the Random Forest
n_estimators=[int(x) for x in np.linspace(start=200,stop=2000,num=10)]
#No of features to consider at every split
max_features=['auto','sqrt','log2']
#Maximum number of levels in tree
max_depth=[int(x) for x in np.linspace(10,1000,10)]
min_samples_split=[1,3,4,5,7,9]
#Minimum number of samples required at each leaf node
min_samples_leaf=[1,2,4,6,8]
#Create the random grid
random_grid={'n_estimators':n_estimators,
            'max_features':max_features,
             'max_depth':max_depth,
             'min_samples_split':min_samples_split,
             'min_samples_leaf':min_samples_leaf,
             'criterion':['entropy','gini']}
print(random_grid)            

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [1, 3, 4, 5, 7, 9], 'min_samples_leaf': [1, 2, 4, 6, 8], 'criterion': ['entropy', 'gini']}


In [19]:
rf=RandomForestClassifier()
rf_randomcv=RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=100,cv=3,verbose=2,random_state=100,n_jobs=-1 )

rf_randomcv.fit(X_train,Y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'criterion': ['entropy', 'gini'],
                                        'max_depth': [10, 120, 230, 340, 450,
                                                      560, 670, 780, 890,
                                                      1000],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 4, 6, 8],
                                        'min_samples_split': [1, 3, 4, 5, 7, 9],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=100, verbose=2)

In [20]:
best_random_grid=rf_randomcv.best_estimator_

In [21]:
from sklearn.metrics import accuracy_score
Y_pred=best_random_grid.predict(X_test)
print(confusion_matrix(Y_test,Y_pred))
print("Accuracy Score{}".format(accuracy_score(Y_test,Y_pred)))
print("Classification report: {}".format(classification_report(Y_test,Y_pred)))

[[86 13]
 [24 31]]
Accuracy Score0.7597402597402597
Classification report:               precision    recall  f1-score   support

           0       0.78      0.87      0.82        99
           1       0.70      0.56      0.63        55

    accuracy                           0.76       154
   macro avg       0.74      0.72      0.72       154
weighted avg       0.75      0.76      0.75       154



In [22]:
rf_randomcv.best_estimator_

RandomForestClassifier(criterion='entropy', max_depth=560, max_features='log2',
                       min_samples_leaf=6, min_samples_split=3,
                       n_estimators=200)

In [23]:
rf_randomcv.best_params_

{'n_estimators': 200,
 'min_samples_split': 3,
 'min_samples_leaf': 6,
 'max_features': 'log2',
 'max_depth': 560,
 'criterion': 'entropy'}

##GridSearch CV

In [24]:
rf_randomcv.best_params_

{'n_estimators': 200,
 'min_samples_split': 3,
 'min_samples_leaf': 6,
 'max_features': 'log2',
 'max_depth': 560,
 'criterion': 'entropy'}

In [26]:
[rf_randomcv.best_params_['min_samples_split']-2,
                          rf_randomcv.best_params_['min_samples_split']-1,
                          rf_randomcv.best_params_['min_samples_split'],
                          rf_randomcv.best_params_['min_samples_split']+1,
                          rf_randomcv.best_params_['min_samples_split']+2]      

[1, 2, 3, 4, 5]

In [30]:
from sklearn.model_selection import GridSearchCV

param_grid={
    'criterion':[rf_randomcv.best_params_['criterion']],
    'max_depth':[rf_randomcv.best_params_['max_depth']],
    'max_features':[rf_randomcv.best_params_['max_features']],
    'min_sample_leaf':[rf_randomcv.best_params_['min_samples_leaf'],
                      rf_randomcv.best_params_['min_samples_leaf']+2,
                      rf_randomcv.best_params_['min_samples_leaf']+4],
    'min_samples_split':[rf_randomcv.best_params_['min_samples_split']-2,
                          rf_randomcv.best_params_['min_samples_split']-1,
                          rf_randomcv.best_params_['min_samples_split'],
                          rf_randomcv.best_params_['min_samples_split']+1,
                          rf_randomcv.best_params_['min_samples_split']+2], 
     'n_estimator':[rf_randomcv.best_params_['n_estimators']-200 ,
                    rf_randomcv.best_params_['n_estimators']-100,
                    rf_randomcv.best_params_['n_estimators'],
                    rf_randomcv.best_params_['n_estimators']+100,
                    rf_randomcv.best_params_['n_estimators']+200 ] 
}

print(param_grid)

{'criterion': ['entropy'], 'max_depth': [560], 'max_features': ['log2'], 'min_sample_leaf': [6, 8, 10], 'min_samples_split': [1, 2, 3, 4, 5], 'n_estimator': [0, 100, 200, 300, 400]}


In [34]:
####Fit the grid_search to the data

rf=RandomForestClassifier()
grid_search=GridSearchCV(estimator=rf,param_grid=param_grid,cv=10,verbose=2,n_jobs=-1 )

grid_search.fit(X_train,Y_train)

Fitting 10 folds for each of 75 candidates, totalling 750 fits


ValueError: Invalid parameter min_sample_leaf for estimator RandomForestClassifier(criterion='entropy', max_depth=560, max_features='log2'). Check the list of available parameters with `estimator.get_params().keys()`.