# **Import Libraries**

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

In [2]:
df=pd.read_csv('diabetes.csv')

In [3]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [5]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Pregnancies,768.0,3.845052,3.369578,0.0,1.0,3.0,6.0,17.0
Glucose,768.0,120.894531,31.972618,0.0,99.0,117.0,140.25,199.0
BloodPressure,768.0,69.105469,19.355807,0.0,62.0,72.0,80.0,122.0
SkinThickness,768.0,20.536458,15.952218,0.0,0.0,23.0,32.0,99.0
Insulin,768.0,79.799479,115.244002,0.0,0.0,30.5,127.25,846.0
BMI,768.0,31.992578,7.88416,0.0,27.3,32.0,36.6,67.1
DiabetesPedigreeFunction,768.0,0.471876,0.331329,0.078,0.24375,0.3725,0.62625,2.42
Age,768.0,33.240885,11.760232,21.0,24.0,29.0,41.0,81.0
Outcome,768.0,0.348958,0.476951,0.0,0.0,0.0,1.0,1.0


In [6]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [7]:
df['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [8]:
df['Outcome'].value_counts(normalize=True)

0    0.651042
1    0.348958
Name: Outcome, dtype: float64

# **Machine Learning**

In [9]:
X=df.drop('Outcome', axis=1)
Y=df['Outcome']

In [10]:
def build_model(algo, X, Y, r_s=42, t_s=0.33):
    X_train, X_test, Y_train, Y_test=train_test_split(X, Y, random_state=r_s, test_size=t_s, stratify=Y)
    
    model=algo()
    model.fit(X_train, Y_train)
    
    Y_pred=model.predict(X_test)
    
    print(type(model).__name__)
    
    print(classification_report(Y_test, Y_pred))

In [11]:
models=[LogisticRegression, DecisionTreeClassifier, KNeighborsClassifier, RandomForestClassifier, SVC, MLPClassifier]

In [12]:
for m in models:
    build_model(m, X, Y)

LogisticRegression
              precision    recall  f1-score   support

           0       0.77      0.85      0.81       165
           1       0.67      0.54      0.60        89

    accuracy                           0.74       254
   macro avg       0.72      0.70      0.70       254
weighted avg       0.74      0.74      0.74       254

DecisionTreeClassifier
              precision    recall  f1-score   support

           0       0.74      0.79      0.76       165
           1       0.55      0.47      0.51        89

    accuracy                           0.68       254
   macro avg       0.64      0.63      0.64       254
weighted avg       0.67      0.68      0.67       254

KNeighborsClassifier
              precision    recall  f1-score   support

           0       0.76      0.84      0.80       165
           1       0.63      0.51      0.56        89

    accuracy                           0.72       254
   macro avg       0.70      0.67      0.68       254
weighted av

# **Model Tuning RandomForestClassifier**

In [13]:
X_train, X_test, Y_train, Y_test=train_test_split(X, Y, random_state=42, test_size=0.33)

In [14]:
rf_param_grid = {
    'min_samples_split': [3, 5, 10], 
    'n_estimators' : [100, 300],
    'max_depth': [3, 5, 15, 25],
    'max_features': [3, 5, 10, 20]
}

model=RandomForestClassifier()

clf = GridSearchCV(model, param_grid=rf_param_grid, 
                       cv = StratifiedKFold(n_splits = 5), scoring = "accuracy", n_jobs = -1,verbose = 1)

clf.fit(X_train, Y_train)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'max_depth': [3, 5, 15, 25],
                         'max_features': [3, 5, 10, 20],
                         'min_samples_split': [3, 5, 10],
                         'n_estimators': [100, 300]},
             scoring='accuracy', verbose=1)

In [15]:
clf.best_estimator_

RandomForestClassifier(max_depth=25, max_features=5, min_samples_split=10,
                       n_estimators=300)

In [16]:
final_model=clf.best_estimator_

final_model.fit(X_train, Y_train)
Y_pred=final_model.predict(X_test)

print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

           0       0.82      0.80      0.81       168
           1       0.63      0.65      0.64        86

    accuracy                           0.75       254
   macro avg       0.72      0.73      0.73       254
weighted avg       0.75      0.75      0.75       254

