In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import sklearn
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV


from google.colab import drive
drive.mount('/content/drive')

df = pd.read_csv("/content/drive/MyDrive/DSS_Pipeline/brfss2020.csv")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
completely_empty = [feature for feature in df.columns if df[feature].isnull().sum() == df.shape[0]]

exp1 = df.fillna(-1)
exp1 = exp1.drop(columns=completely_empty)

exp1["DIABETE4"] = exp1["DIABETE4"].apply(lambda x: 1 if x in [1] else 0)
exp1 = exp1.drop(columns=["DIABAGE3", "RENTHOM1", "PREDIAB1", "_AGE80","GENHLTH"])

x_train, x_test, y_train, y_test = train_test_split(exp1.drop(columns=["DIABETE4"]),
                                                    exp1["DIABETE4"],
                                                    test_size=0.2,
                                                    random_state=42,
                                                    stratify=exp1["DIABETE4"])

In [None]:
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)
print(exp1.shape)
print(df.shape)

(321566, 270) (80392, 270) (321566,) (80392,)
(401958, 271)
(401958, 279)


In [None]:
## exp1 - Diabetes prediction with decision tree

#Gridcv on Decision Tree Classifier with to find best parameters for RECALL 

clf = DecisionTreeClassifier(random_state = 42, class_weight = "balanced")

parameters = {"max_depth" : range(5,31,2)}
grid_dtree = GridSearchCV(estimator=clf, param_grid = parameters,scoring ="recall", cv=3, n_jobs=1)

x_train_grid = grid_dtree.fit(x_train, y_train)

print(x_train_grid.best_params_)

train_pred = x_train_grid.predict(x_train)
test_pred = x_train_grid.predict(x_test)

print("Accuracy on train data", metrics.accuracy_score(y_train, train_pred))
print("Accuracy on test data", metrics.accuracy_score(y_test, test_pred))

train_matrix = confusion_matrix(y_train, train_pred)
test_matrix = confusion_matrix(y_test, test_pred)

print()
print("\t\t\tno diabetes\tdiabetes")
print("class-wise accuracy for train", train_matrix.diagonal()/train_matrix.sum(axis=1))
print("class-wise accuracy for test", test_matrix.diagonal()/test_matrix.sum(axis=1))

precision_train = precision_score(y_train, train_pred)
recall_train = recall_score(y_train, train_pred)
f1_train = f1_score(y_train, train_pred)

precision_test = precision_score(y_test, test_pred)
recall_test = recall_score(y_test, test_pred)
f1_test = f1_score(y_test, test_pred)

print()
print("precision train and test", precision_train, precision_test)
print("recall train and test", recall_train, recall_test)
print("f1 train and test", f1_train, f1_test)

print()

{'max_depth': 5}
Accuracy on train data 0.7797124074062557
Accuracy on test data 0.7799034729823863

			no diabetes	diabetes
class-wise accuracy for train [0.75310746 0.95839232]
class-wise accuracy for test [0.75297615 0.96074479]

precision train and test 0.3662833351674553 0.36673383403553766
recall train and test 0.9583923215356929 0.9607447931663308
f1 train and test 0.5300061704231053 0.5308373548284457



In [None]:
#Gridcv on Decision Tree Classifier with to find best parameters for Precision 

clf = DecisionTreeClassifier(random_state = 42, class_weight = "balanced")
parameters = {"max_depth" : range(5,31,2)}
grid_dtree = GridSearchCV(estimator=clf, param_grid = parameters,scoring ="precision", cv=3, n_jobs=1)

x_train_grid = grid_dtree.fit(x_train, y_train)

print(x_train_grid.best_params_)

train_pred = x_train_grid.predict(x_train)
test_pred = x_train_grid.predict(x_test)

print("Accuracy on train data", metrics.accuracy_score(y_train, train_pred))
print("Accuracy on test data", metrics.accuracy_score(y_test, test_pred))

train_matrix = confusion_matrix(y_train, train_pred)
test_matrix = confusion_matrix(y_test, test_pred)

print()
print("\t\t\tno diabetes\tdiabetes")
print("class-wise accuracy for train", train_matrix.diagonal()/train_matrix.sum(axis=1))
print("class-wise accuracy for test", test_matrix.diagonal()/test_matrix.sum(axis=1))

precision_train = precision_score(y_train, train_pred)
recall_train = recall_score(y_train, train_pred)
f1_train = f1_score(y_train, train_pred)

precision_test = precision_score(y_test, test_pred)
recall_test = recall_score(y_test, test_pred)
f1_test = f1_score(y_test, test_pred)

print()
print("precision train and test", precision_train, precision_test)
print("recall train and test", recall_train, recall_test)
print("f1 train and test", f1_train, f1_test)

print()

{'max_depth': 29}
Accuracy on train data 0.9790494019890162
Accuracy on test data 0.9241715593591402

			no diabetes	diabetes
class-wise accuracy for train [0.97613357 0.99863227]
class-wise accuracy for test [0.94336387 0.79527786]

precision train and test 0.8616919955277651 0.6764633847661033
recall train and test 0.9986322735452909 0.7952778577598618
f1 train and test 0.925121981038756 0.7310746426680783



In [None]:
clf = DecisionTreeClassifier(max_depth=15, random_state = 42, class_weight = "balanced")
clf.fit(x_train, y_train)

test_pred = clf.predict(x_test)

importances = clf.feature_importances_

feature_importances = pd.Series(importances, index=x_train.columns)

print(feature_importances.sort_values(ascending=False)[:25])

predictions_percentage = clf.predict_proba(x_test)

print(predictions_percentage[:50])

PDIABTST    0.582219
_AGEG5YR    0.088949
DIABEYE     0.049702
_STSTR      0.043375
_RFHLTH     0.032953
ECIGARET    0.029205
_BMI5       0.020305
CAREGIV1    0.014199
CHECKUP1    0.014163
PNEUVAC4    0.012104
_STATE      0.011205
LCSCTSCN    0.009634
_RFBMI5     0.005998
_CHISPNC    0.005791
_DUALCOR    0.004008
DROCDY3_    0.003484
TRNSGNDR    0.003129
ACEPRISN    0.002747
_DRNKWK1    0.002705
WEIGHT2     0.002595
MARIJAN1    0.002290
CSTATE1     0.002269
WTKG3       0.002232
DRNKDRI2    0.002024
_LLCPWT     0.002010
dtype: float64
[[1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]
 [0.53694994 0.46305006]
 [0.79471719 0.20528281]
 [0.61803148 0.38196852]
 [1.         0.        ]
 [0.         1.        ]
 [0.53694994 0.46305006]
 [1.         0.        ]
 [0.79400167 0.20599833]
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]
 [0.14749793 0.85250207]
 [1.         0.        ]
 [0.82217959 0.17782041]
 [1.         0.        ]
 [0.973853

In [None]:
# exp2 - Diabetes prediction with mlp
#Recall

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
sc = StandardScaler()

x_train_small = pd.concat([x_train[y_train == 0].sample(2*18170), x_train[y_train == 1].sample(18170)])
y_train_small = [0]*2*18170 + [1]*18170


x_train_scale = sc.fit_transform(x_train_small)
x_test_scale = sc.fit_transform(x_test)

mlp_clf = MLPClassifier(random_state=42)


parameters = {"hidden_layer_sizes": [(7,)],"max_iter":[40] }

grid_mlp = GridSearchCV(estimator=mlp_clf, param_grid = parameters, scoring="recall",cv=3, n_jobs=1)

x_train_grid = grid_mlp.fit(x_train_scale,y_train_small)

print(x_train_grid.best_params_)

train_pred = x_train_grid.predict(x_train_scale)
test_pred = x_train_grid.predict(x_test_scale)

print("Accuracy on train data", metrics.accuracy_score(y_train_small, train_pred))
print("Accuracy on test data", metrics.accuracy_score(y_test, test_pred))

train_matrix = confusion_matrix(y_train_small, train_pred)
test_matrix = confusion_matrix(y_test, test_pred)

print()
print("\t\t\tno diabetes\tdiabetes")
print("class-wise accuracy for train", train_matrix.diagonal()/train_matrix.sum(axis=1))
print("class-wise accuracy for test", test_matrix.diagonal()/test_matrix.sum(axis=1))

precision_train = precision_score(y_train_small, train_pred)
recall_train = recall_score(y_train_small, train_pred)
f1_train = f1_score(y_train_small, train_pred)

precision_test = precision_score(y_test, test_pred)
recall_test = recall_score(y_test, test_pred)
f1_test = f1_score(y_test, test_pred)

print()
print("precision train and test", precision_train, precision_test)
print("recall train and test", recall_train, recall_test)
print("f1 train and test", f1_train, f1_test)




{'hidden_layer_sizes': (7,), 'max_iter': 40}
Accuracy on train data 0.9168409466152999
Accuracy on test data 0.7956139914419346

			no diabetes	diabetes
class-wise accuracy for train [0.93847001 0.87358283]
class-wise accuracy for test [0.76888228 0.97514157]

precision train and test 0.8765254845656856 0.3858423211301838
recall train and test 0.8735828288387452 0.9751415682887034
f1 train and test 0.8750516827916975 0.5529101248945607


In [None]:
# exp2 - Diabetes prediction with mlp
#precision

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
sc = StandardScaler()

x_train_small = pd.concat([x_train[y_train == 0].sample(2*18170), x_train[y_train == 1].sample(18170)])
y_train_small = [0]*2*18170 + [1]*18170


x_train_scale = sc.fit_transform(x_train_small)
x_test_scale = sc.fit_transform(x_test)

mlp_clf = MLPClassifier(random_state=42)


parameters = {"hidden_layer_sizes": [(7,)],"max_iter":[40] }

grid_mlp = GridSearchCV(estimator=mlp_clf, param_grid = parameters, scoring="precision",cv=3, n_jobs=1)

x_train_grid = grid_mlp.fit(x_train_scale,y_train_small)

print(x_train_grid.best_params_)

train_pred = x_train_grid.predict(x_train_scale)
test_pred = x_train_grid.predict(x_test_scale)


print("Accuracy on train data", metrics.accuracy_score(y_train_small, train_pred))
print("Accuracy on test data", metrics.accuracy_score(y_test, test_pred))

train_matrix = confusion_matrix(y_train_small, train_pred)
test_matrix = confusion_matrix(y_test, test_pred)

print()
print("\t\t\tno diabetes\tdiabetes")
print("class-wise accuracy for train", train_matrix.diagonal()/train_matrix.sum(axis=1))
print("class-wise accuracy for test", test_matrix.diagonal()/test_matrix.sum(axis=1))

precision_train = precision_score(y_train_small, train_pred)
recall_train = recall_score(y_train_small, train_pred)
f1_train = f1_score(y_train_small, train_pred)

precision_test = precision_score(y_test, test_pred)
recall_test = recall_score(y_test, test_pred)
f1_test = f1_score(y_test, test_pred)

print()
print("precision train and test", precision_train, precision_test)
print("recall train and test", recall_train, recall_test)
print("f1 train and test", f1_train, f1_test)



{'hidden_layer_sizes': (7,), 'max_iter': 40}
Accuracy on train data 0.9168409466152999
Accuracy on test data 0.8124067071350383

			no diabetes	diabetes
class-wise accuracy for train [0.94433132 0.86186021]
class-wise accuracy for test [0.78826119 0.9745657 ]

precision train and test 0.8855963354634394 0.4066479775730877
recall train and test 0.8618602091359384 0.9745656972838084
f1 train and test 0.8735670655175299 0.5738506315521773


In [None]:
# exp3 - diabetes prediction using ensemble - random forest


rf_clf = RandomForestClassifier(random_state = 42, class_weight = "balanced")
    
parameters = {"max_depth": range(10,35,2 )}
grid_rf = GridSearchCV(estimator=rf_clf, param_grid = parameters, scoring= "recall", cv=3, n_jobs=1)

x_train_grid = grid_rf.fit(x_train,y_train)

print(x_train_grid.best_params_)

train_pred = x_train_grid.predict(x_train)
test_pred = x_train_grid.predict(x_test)

print("Accuracy on train data", metrics.accuracy_score(y_train, train_pred))
print("Accuracy on test data", metrics.accuracy_score(y_test, test_pred))

train_matrix = confusion_matrix(y_train, train_pred)
test_matrix = confusion_matrix(y_test, test_pred)

print()
print("\t\t\tno diabetes\tdiabetes")
print("class-wise accuracy for train", train_matrix.diagonal()/train_matrix.sum(axis=1))
print("class-wise accuracy for test", test_matrix.diagonal()/test_matrix.sum(axis=1))

precision_train = precision_score(y_train, train_pred)
recall_train = recall_score(y_train, train_pred)
f1_train = f1_score(y_train, train_pred)

precision_test = precision_score(y_test, test_pred)
recall_test = recall_score(y_test, test_pred)
f1_test = f1_score(y_test, test_pred)

print()
print("precision train and test", precision_train, precision_test)
print("recall train and test", recall_train, recall_test)
print("f1 train and test", f1_train, f1_test)

print()

{'max_depth': 10}
Accuracy on train data 0.8809264661064914
Accuracy on test data 0.8807717185789631

			no diabetes	diabetes
class-wise accuracy for train [0.87803466 0.90034793]
class-wise accuracy for test [0.87951067 0.88924081]

precision train and test 0.5236188057327062 0.5235646473779385
recall train and test 0.9003479304139173 0.8892408100585469
f1 train and test 0.6621490461820813 0.6590787835674906



In [None]:
# exp3 - diabetes prediction using ensemble - random forest

#precision
rf_clf = RandomForestClassifier(random_state = 42, class_weight = "balanced")
    
parameters = {"max_depth": range(10,35,2 )}
grid_rf = GridSearchCV(estimator=rf_clf, param_grid = parameters, scoring= "precision", cv=3, n_jobs=1)

x_train_grid = grid_rf.fit(x_train,y_train)

print(x_train_grid.best_params_)

train_pred = x_train_grid.predict(x_train)
test_pred = x_train_grid.predict(x_test)

print("Accuracy on train data", metrics.accuracy_score(y_train, train_pred))
print("Accuracy on test data", metrics.accuracy_score(y_test, test_pred))

train_matrix = confusion_matrix(y_train, train_pred)
test_matrix = confusion_matrix(y_test, test_pred)

print()
print("\t\t\tno diabetes\tdiabetes")
print("class-wise accuracy for train", train_matrix.diagonal()/train_matrix.sum(axis=1))
print("class-wise accuracy for test", test_matrix.diagonal()/test_matrix.sum(axis=1))

precision_train = precision_score(y_train, train_pred)
recall_train = recall_score(y_train, train_pred)
f1_train = f1_score(y_train, train_pred)

precision_test = precision_score(y_test, test_pred)
recall_test = recall_score(y_test, test_pred)
f1_test = f1_score(y_test, test_pred)

print()
print("precision train and test", precision_train, precision_test)
print("recall train and test", recall_train, recall_test)
print("f1 train and test", f1_train, f1_test)

print()

{'max_depth': 34}
Accuracy on train data 0.999962682621919
Accuracy on test data 0.9359762165389591

			no diabetes	diabetes
class-wise accuracy for train [0.99995713 1.        ]
class-wise accuracy for test [0.9931545  0.55197236]

precision train and test 0.999712140475448 0.9231139646869984
recall train and test 1.0 0.551972358191765
f1 train and test 0.9998560495189656 0.690852303441648



In [25]:

clf = DecisionTreeClassifier(max_depth=5, random_state = 42, class_weight = "balanced")
clf.fit(x_train,y_train)
dtree_train_pred = clf.predict_proba(x_train)
dtree_test_pred = clf.predict_proba(x_test)


mlp_clf = MLPClassifier(hidden_layer_sizes =(7,) , max_iter = 40 ,random_state=42)
mlp_clf.fit(x_train_scale, y_train_small)
mlp_train_pred = mlp_clf.predict_proba(sc.fit_transform(x_train))
mlp_test_pred = mlp_clf.predict_proba(sc.fit_transform(x_test))

rf_clf = RandomForestClassifier(max_depth=10, random_state = 42, class_weight = "balanced")
rf_clf.fit(x_train, y_train)
rf_train_pred = rf_clf.predict_proba(x_train)
rf_test_pred = rf_clf.predict_proba(x_test)


final_pred_train = []
final_pred_test = []

for i in range(len(dtree_train_pred)):
    val = (dtree_train_pred[i][0] + mlp_train_pred[i][0] + rf_train_pred[i][0])/3
    if val <= 0.5:
        final_pred_train.append(1)
    else:
        final_pred_train.append(0)

for i in range(len(dtree_test_pred)):
    val = (dtree_test_pred[i][0] + mlp_test_pred[i][0] + rf_test_pred[i][0])/3
    if val <= 0.5:
        final_pred_test.append(1)
    else:
        final_pred_test.append(0)

print("Accuracy on train data", metrics.accuracy_score(y_train, final_pred_train))
print("Accuracy on test data", metrics.accuracy_score(y_test, final_pred_test))

train_matrix = confusion_matrix(y_train, final_pred_train)
test_matrix = confusion_matrix(y_test, final_pred_test)

print()
print("\t\t\tno diabetes\t diabetes")
print("class-wise accuracy for train", train_matrix.diagonal()/train_matrix.sum(axis=1))
print("class-wise accuracy for test", test_matrix.diagonal()/test_matrix.sum(axis=1))

print()

precision_train = precision_score(y_train, final_pred_train)
recall_train = recall_score(y_train, final_pred_train)
f1_train = f1_score(y_train, final_pred_train)

precision_test = precision_score(y_test, final_pred_test)
recall_test = recall_score(y_test, final_pred_test)
f1_test = f1_score(y_test, final_pred_test)

print()
print("precision train and test", precision_train, precision_test)
print("recall train and test", recall_train, recall_test)
print("f1 train and test", f1_train, f1_test)

print()



Accuracy on train data 0.8281907913150022
Accuracy on test data 0.8286645437356951

			no diabetes	 diabetes
class-wise accuracy for train [0.80738216 0.96794241]
class-wise accuracy for test [0.80816887 0.96631155]


precision train and test 0.4279954589341227 0.4285896726405858
recall train and test 0.9679424115176964 0.9663115462136481
f1 train and test 0.5935435457528362 0.5938071365378945



In [None]:
#Experimenting with PCA

In [None]:
for depth in range(5, 40, 2):
    print("max-depth", depth)
    clf = DecisionTreeClassifier(max_depth=depth, random_state = 42, class_weight = "balanced")
    pca = PCA(n_components = 100)
    x_train_pca = pca.fit_transform(x_train)
    x_test_pca = pca.fit_transform(x_test)
    clf.fit(x_train_pca, y_train)

    train_pred = clf.predict(x_train_pca)
    test_pred = clf.predict(x_test_pca)

    print("Accuracy on train data", metrics.accuracy_score(y_train, train_pred))
    print("Accuracy on test data", metrics.accuracy_score(y_test, test_pred))

    train_matrix = confusion_matrix(y_train, train_pred)
    test_matrix = confusion_matrix(y_test, test_pred)

    print()
    print("\t\t\tno diabetes\tdiabetes")
    print("class-wise accuracy for train", train_matrix.diagonal()/train_matrix.sum(axis=1))
    print("class-wise accuracy for test", test_matrix.diagonal()/test_matrix.sum(axis=1))

    precision_train = precision_score(y_train, train_pred)
    recall_train = recall_score(y_train, train_pred)
    f1_train = f1_score(y_train, train_pred)

    precision_test = precision_score(y_test, test_pred)
    recall_test = recall_score(y_test, test_pred)
    f1_test = f1_score(y_test, test_pred)

    print()
    print("precision train and test", precision_train, precision_test)
    print("recall train and test", recall_train, recall_test)
    print("f1 train and test", f1_train, f1_test)

max-depth 5
Accuracy on train data 0.7675656008408849
Accuracy on test data 0.6557120111453876

			no diabetes	diabetes
class-wise accuracy for train [0.77764558 0.69986803]
class-wise accuracy for test [0.67153045 0.54947692]

precision train and test 0.31910680291459703 0.19941481765300081
recall train and test 0.699868026394721 0.5494769171705538
f1 train and test 0.4383477234984256 0.29262931915763646
max-depth 7
Accuracy on train data 0.74032702462325
Accuracy on test data 0.6050228878495373

			no diabetes	diabetes
class-wise accuracy for train [0.72950541 0.8130054 ]
class-wise accuracy for test [0.61698084 0.52471446]

precision train and test 0.30916772362694017 0.16942481715631585
recall train and test 0.813005398920216 0.5247144639600729
f1 train and test 0.4479790567609377 0.2561435565863143
max-depth 9
Accuracy on train data 0.7642723422252352
Accuracy on test data 0.6859637774902976

			no diabetes	diabetes
class-wise accuracy for train [0.75207849 0.84616677]
class-wise 

In [None]:
# exp2 - Diabetes prediction with mlp

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

x_train_small = pd.concat([x_train[y_train == 0].sample(2*18170), x_train[y_train == 1].sample(18170)])
y_train_small = [0]*2*18170 + [1]*18170

print(x_train_small.shape, len(y_train_small))
print(x_train_small[y_train == 1].shape)
print(x_test[y_test == 1].shape)

pca = PCA(n_components= 40)

mlp_clf = MLPClassifier(hidden_layer_sizes=(7,), random_state=42, max_iter=40)
x_train_pca = pca.fit_transform(x_train_small)
x_test_pca = pca.fit_transform(x_test)

mlp_clf.fit(sc.fit_transform(x_train_pca), y_train_small)

train_pred = mlp_clf.predict(sc.fit_transform(x_train_pca))
test_pred = mlp_clf.predict(sc.fit_transform(x_test_pca))

print("Accuracy on train data", metrics.accuracy_score(y_train_small, train_pred))
print("Accuracy on test data", metrics.accuracy_score(y_test, test_pred))

train_matrix = confusion_matrix(y_train_small, train_pred)
test_matrix = confusion_matrix(y_test, test_pred)

print()
print("\t\t\tno diabetes\tdiabetes")
print("class-wise accuracy for train", train_matrix.diagonal()/train_matrix.sum(axis=1))
print("class-wise accuracy for test", test_matrix.diagonal()/test_matrix.sum(axis=1))

precision_train = precision_score(y_train_small, train_pred)
recall_train = recall_score(y_train_small, train_pred)
f1_train = f1_score(y_train_small, train_pred)

precision_test = precision_score(y_test, test_pred)
recall_test = recall_score(y_test, test_pred)
f1_test = f1_score(y_test, test_pred)

print()
print("precision train and test", precision_train, precision_test)
print("recall train and test", recall_train, recall_test)
print("f1 train and test", f1_train, f1_test)


(54510, 270) 54510
(18170, 270)
(10419, 270)


  # This is added back by InteractiveShellApp.init_path()


Accuracy on train data 0.7727572922399559
Accuracy on test data 0.4586774803463031

			no diabetes	diabetes
class-wise accuracy for train [0.92410567 0.47006054]
class-wise accuracy for test [0.43391022 0.625012  ]

precision train and test 0.7559076024426941 0.14118769377533985
recall train and test 0.4700605393505779 0.6250119973126019
f1 train and test 0.5796599816756592 0.23034204662021152


In [20]:
# exp3 - cancer prediction using ensemble - random forest

for depth in range(10, 35, 2):
    print("max-depth", depth)
    rf_clf = RandomForestClassifier(max_depth=depth, random_state = 42, class_weight = "balanced")
    pca = PCA(n_components= 100)
    
    x_train_pca = pca.fit_transform(x_train)
    x_test_pca = pca.fit_transform(x_test)
    
    rf_clf.fit(x_train_pca, y_train)


    train_pred = rf_clf.predict(x_train_pca)
    test_pred = rf_clf.predict(x_test_pca)

    print("Accuracy on train data", metrics.accuracy_score(y_train, train_pred))
    print("Accuracy on test data", metrics.accuracy_score(y_test, test_pred))

    train_matrix = confusion_matrix(y_train, train_pred)
    test_matrix = confusion_matrix(y_test, test_pred)

    print()
    print("\t\t\tno diabetes\tdiabetes")
    print("class-wise accuracy for train", train_matrix.diagonal()/train_matrix.sum(axis=1))
    print("class-wise accuracy for test", test_matrix.diagonal()/test_matrix.sum(axis=1))

    precision_train = precision_score(y_train, train_pred)
    recall_train = recall_score(y_train, train_pred)
    f1_train = f1_score(y_train, train_pred)

    precision_test = precision_score(y_test, test_pred)
    recall_test = recall_score(y_test, test_pred)
    f1_test = f1_score(y_test, test_pred)

    print()
    print("precision train and test", precision_train, precision_test)
    print("recall train and test", recall_train, recall_test)
    print("f1 train and test", f1_train, f1_test)

max-depth 10
Accuracy on train data 0.8571428571428571
Accuracy on test data 0.8320230868743158

			no diabetes	diabetes
class-wise accuracy for train [0.86218564 0.82327534]
class-wise accuracy for test [0.88251183 0.49294558]

precision train and test 0.4707544969334413 0.38451748147039005
recall train and test 0.8232753449310138 0.4929455801900374
f1 train and test 0.5989978875329528 0.4320323014804845
max-depth 12
Accuracy on train data 0.8887662252850115
Accuracy on test data 0.8544880087570903

			no diabetes	diabetes
class-wise accuracy for train [0.89155778 0.870018  ]
class-wise accuracy for test [0.91512441 0.44725981]

precision train and test 0.5443326827803633 0.43966411925653365
recall train and test 0.8700179964007199 0.44725981380170843
f1 train and test 0.6696772406150436 0.4434294414311542
max-depth 14
Accuracy on train data 0.9275141028591332
Accuracy on test data 0.8803114737784854

			no diabetes	diabetes
class-wise accuracy for train [0.92856148 0.9204799 ]
class-

In [26]:
clf = DecisionTreeClassifier(max_depth=13, random_state = 42, class_weight = "balanced")
pca = PCA(n_components = 100)
x_train_pca = pca.fit_transform(x_train)
x_test_pca = pca.fit_transform(x_test)
clf.fit(x_train_pca, y_train)
dtree_train_pred = clf.predict_proba(x_train_pca)
dtree_test_pred = clf.predict_proba(x_test_pca)

In [32]:
pca = PCA(n_components= 40)

mlp_clf = MLPClassifier(hidden_layer_sizes=(7,), random_state=42, max_iter=40)
x_train_small_pca = pca.fit_transform(x_train_small)
x_test_pca = pca.transform(x_test)
x_train_pca = pca.transform(x_train)

mlp_clf.fit(sc.fit_transform(x_train_small_pca), y_train_small)

mlp_train_pred = mlp_clf.predict_proba(sc.fit_transform(x_train_pca))
mlp_test_pred = mlp_clf.predict_proba(sc.fit_transform(x_test_pca))



In [28]:
rf_clf = RandomForestClassifier(max_depth=10, random_state = 42, class_weight = "balanced")
pca = PCA(n_components= 100)

x_train_pca = pca.fit_transform(x_train)
x_test_pca = pca.fit_transform(x_test)

rf_clf.fit(x_train_pca, y_train)

rf_train_pred = rf_clf.predict_proba(x_train_pca)
rf_test_pred = rf_clf.predict_proba(x_test_pca)


In [33]:
final_pred_train = []
final_pred_test = []

for i in range(len(dtree_train_pred)):
    val = (dtree_train_pred[i][0] + mlp_train_pred[i][0] + rf_train_pred[i][0])/3
    if val <= 0.5:
        final_pred_train.append(1)
    else:
        final_pred_train.append(0)

for i in range(len(dtree_test_pred)):
    val = (dtree_test_pred[i][0] + mlp_test_pred[i][0] + rf_test_pred[i][0])/3
    if val <= 0.5:
        final_pred_test.append(1)
    else:
        final_pred_test.append(0)

print("Accuracy on train data", metrics.accuracy_score(y_train, final_pred_train))
print("Accuracy on test data", metrics.accuracy_score(y_test, final_pred_test))

train_matrix = confusion_matrix(y_train, final_pred_train)
test_matrix = confusion_matrix(y_test, final_pred_test)

print()
print("\t\t\tno diabetes\t diabetes")
print("class-wise accuracy for train", train_matrix.diagonal()/train_matrix.sum(axis=1))
print("class-wise accuracy for test", test_matrix.diagonal()/test_matrix.sum(axis=1))

print()

precision_train = precision_score(y_train, final_pred_train)
recall_train = recall_score(y_train, final_pred_train)
f1_train = f1_score(y_train, final_pred_train)

precision_test = precision_score(y_test, final_pred_test)
recall_test = recall_score(y_test, final_pred_test)
f1_test = f1_score(y_test, final_pred_test)

print()
print("precision train and test", precision_train, precision_test)
print("recall train and test", recall_train, recall_test)
print("f1 train and test", f1_train, f1_test)

print()

Accuracy on train data 0.8810228693332006
Accuracy on test data 0.8248830729425813

			no diabetes	 diabetes
class-wise accuracy for train [0.88104655 0.88086383]
class-wise accuracy for test [0.87548054 0.48507534]


precision train and test 0.5243986057939546 0.3671097552117382
recall train and test 0.8808638272345531 0.48507534312314043
f1 train and test 0.6574199267543585 0.41792772678408996

