#### Use diabetes dataset. The datasets consist of several medical predictor (independent) variables and one target (dependent) variable, Outcome. Independent variables include the number of pregnancies the patient has had, their BMI, insulin level, age, and so on. Use random forest ensemble method to build the classification model. Evaluate your model performance. Use “gridsearchcv( )” to find the best value of decision trees for creating forest.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [2]:
data = pd.read_csv('https://raw.githubusercontent.com/rahul96rajan/sample_datasets/master/diabetes.csv')
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [4]:
X = data.drop('Outcome', axis=1)
y = data['Outcome']

In [5]:
y.value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
X_train.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
count,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0
mean,3.742671,120.855049,69.415309,20.399023,81.438111,31.983388,0.469168,32.907166
std,3.313264,32.035057,18.512599,15.433974,116.234835,7.740625,0.336847,11.503437
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0
25%,1.0,100.0,64.0,0.0,0.0,27.1,0.2415,24.0
50%,3.0,117.0,72.0,23.0,42.5,32.0,0.3725,29.0
75%,6.0,139.0,80.0,32.0,129.75,36.375,0.61375,40.0
max,17.0,199.0,122.0,63.0,846.0,67.1,2.42,81.0


In [20]:
X_train = X_train.copy()
false_zeroes_features = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
X_train.loc[:, false_zeroes_features] = X_train[false_zeroes_features].replace(0, np.nan)

In [21]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 614 entries, 60 to 102
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               614 non-null    int64  
 1   Glucose                   609 non-null    float64
 2   BloodPressure             590 non-null    float64
 3   SkinThickness             438 non-null    float64
 4   Insulin                   324 non-null    float64
 5   BMI                       607 non-null    float64
 6   DiabetesPedigreeFunction  614 non-null    float64
 7   Age                       614 non-null    int64  
dtypes: float64(6), int64(2)
memory usage: 43.2 KB


In [22]:
preprop = ColumnTransformer([("imp_med", SimpleImputer(strategy='median'),
                              false_zeroes_features)], remainder='passthrough')

In [23]:
X_train = preprop.fit_transform(X_train)

In [24]:
rfr = RandomForestClassifier(random_state=0, n_jobs=-1)
params = {'n_estimators':[500, 750, 1000],'max_features': [4, 8],
          'max_leaf_nodes': [3,5,7]}

gs = GridSearchCV(estimator=rfr, param_grid=params, scoring='f1_macro', cv=5)

In [25]:
gs.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(n_jobs=-1, random_state=0),
             param_grid={'max_features': [4, 8], 'max_leaf_nodes': [3, 5, 7],
                         'n_estimators': [500, 750, 1000]},
             scoring='f1_macro')

In [26]:
print(gs.best_params_)
print(gs.best_estimator_)

{'max_features': 8, 'max_leaf_nodes': 7, 'n_estimators': 1000}
RandomForestClassifier(max_features=8, max_leaf_nodes=7, n_estimators=1000,
                       n_jobs=-1, random_state=0)


In [27]:
X_test = preprop.fit_transform(X_test)
y_pred_test = gs.predict(X_test)
y_pred_train = gs.predict(X_train)

In [28]:
print('F1 Score(Train): {:.4f}'.format(f1_score(y_train, y_pred_train,
                                                average='macro')))
print('F1 Score(Test): {:.4f}'.format(f1_score(y_test, y_pred_test,
                                               average='macro')))

print('\nAccuracy Score(Train): {:.4f}'.format(accuracy_score(y_train,
                                                              y_pred_train)))
print('Accuracy Score(Test): {:.4f}'.format(accuracy_score(y_test,
                                                           y_pred_test)))

F1 Score(Train): 0.7913
F1 Score(Test): 0.7471

Accuracy Score(Train): 0.8143
Accuracy Score(Test): 0.7727
