In [1]:
import numpy as np

In [2]:
import pandas as pd

In [15]:
df = pd.read_csv("diabetes.csv")

In [16]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [17]:
df['Glucose'] = np.where(df['Glucose'] == 0, df['Glucose'].median(),df["Glucose"])
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72,35,0,33.6,0.627,50,1
1,1,85.0,66,29,0,26.6,0.351,31,0
2,8,183.0,64,0,0,23.3,0.672,32,1
3,1,89.0,66,23,94,28.1,0.167,21,0
4,0,137.0,40,35,168,43.1,2.288,33,1


In [18]:
X = df.drop('Outcome', axis = 1)
y = df['Outcome']

In [19]:
df['Insulin'] = np.where(df['Insulin'] == 0, df['Insulin'].median(), df['Insulin'])
df['SkinThickness'] = np.where(df['SkinThickness'] == 0, df['SkinThickness'].median(), df['Insulin'])

In [20]:
X

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148.0,72,35,0,33.6,0.627,50
1,1,85.0,66,29,0,26.6,0.351,31
2,8,183.0,64,0,0,23.3,0.672,32
3,1,89.0,66,23,94,28.1,0.167,21
4,0,137.0,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101.0,76,48,180,32.9,0.171,63
764,2,122.0,70,27,0,36.8,0.340,27
765,5,121.0,72,23,112,26.2,0.245,30
766,1,126.0,60,0,0,30.1,0.349,47


In [22]:
from sklearn.model_selection import train_test_split

In [23]:
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size = 0.25, random_state =4)

In [24]:
from sklearn.ensemble import RandomForestClassifier

In [25]:
rf = RandomForestClassifier(n_estimators = 10).fit(X_train,y_train)
pred = rf.predict(X_test)

In [27]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [28]:
print(accuracy_score(pred,y_test))
print(classification_report(pred,y_test))
print(confusion_matrix(pred,y_test))

0.765625
              precision    recall  f1-score   support

           0       0.85      0.80      0.83       133
           1       0.61      0.68      0.64        59

    accuracy                           0.77       192
   macro avg       0.73      0.74      0.73       192
weighted avg       0.77      0.77      0.77       192

[[107  26]
 [ 19  40]]


#  Randomized search CV

In [31]:
from sklearn.model_selection import RandomizedSearchCV

n_estimators = [ int(x) for x in np.linspace(start = 200, stop = 2000, num= 10)]

max_features = ['auto','sqrt', 'log2']

max_depth = [int(x) for x in np.linspace(10,1000,10)]

min_samples_split = [1,3,4,5,10,14]

min_samples_leaf = [1,2,4,6,8]

random_grid = {
    
    'n_estimators':n_estimators,
    'max_features':max_features,
    'max_depth':max_depth,
    'min_samples_split':min_samples_split,
    'min_samples_leaf' :min_samples_leaf,
    'criterion':['entropy', 'gini']
}

print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [1, 3, 4, 5, 10, 14], 'min_samples_leaf': [1, 2, 4, 6, 8], 'criterion': ['entropy', 'gini']}


In [35]:
rf = RandomForestClassifier()

rf_randomcv = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv= 3,verbose = 2, random_state = 100, n_jobs =-1)
rf_randomcv.fit(X_train,y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 12.8min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [50]:
best_paras = rf_randomcv.best_estimator_

In [51]:
y_pred = best_paras.predict(X_test)

In [52]:
accuracy_score(y_test,y_pred)

0.796875

In [53]:
rf_randomcv.best_params_

{'n_estimators': 1200,
 'min_samples_split': 14,
 'min_samples_leaf': 8,
 'max_features': 'log2',
 'max_depth': 230,
 'criterion': 'gini'}

In [None]:
from sklearn.model_selection import GridSearchCV
para_grid ={
    'criterion': [rf_randomcv.best_params_['criterion']],
    'max_depth':[rf_randomcv.best_params_['max_depth']],
    'max_features':[rf_randomcv.best_params_['max_features']],
    'min_samples_leaf':[rf_randomcv.best_params_['min_samples_leaf'],
                        rf_randomcv.best_params_['min_samples_leaf']+2,
                         rf_randomcv.best_params_['min_samples_leaf']+4],
    'min_samples_split':[rf_randomcv.best_params_['min_samples_split']-2,
                         rf_randomcv.best_params_['min_samples_split']-1,
                         rf_randomcv.best_params_['min_samples_split'],
                         rf_randomcv.best_params_['min_samples_split']+2,
                         rf_randomcv.best_params_['min_samples_split']+1],
    'n_estimators':[rf_randomcv.best_params_['n_estimators']-200,
                   rf_randomcv.best_params_['n_estimators'],
                   rf_randomcv.best_params_['n_estimators']+100,
                   rf_randomcv.best_params_['n_estimators']+200,
                   rf_randomcv.best_params_['n_estimators']-100,]

                         
    
}
print(para_grid)

In [None]:
rf = RandomForestClassifier()
gcv = GridSearchCV(estimator= rf,param_grid =para_grid,verbose= 2,cv= 10,verbose=2)
gcv.fit(X_train,y_train)