In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import sklearn
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV


from google.colab import drive
drive.mount('/content/drive')

df = pd.read_csv("/content/drive/MyDrive/DSS_Pipeline/brfss2020.csv")

Mounted at /content/drive


In [None]:
# exp1 - Asthma prediction with decision tree
completely_empty = [feature for feature in df.columns if df[feature].isnull().sum() == df.shape[0]]

exp1 = df.fillna(-1)
exp1 = exp1.drop(columns=completely_empty)

exp1["ASTHMA3"] = exp1["ASTHMA3"].apply(lambda x: 1 if x == 1 else 0)
exp1 = exp1.drop(columns=['ASTHNOW', '_LTASTH1', '_ASTHMS1', '_CASTHM1'])
x_train, x_test, y_train, y_test = train_test_split(exp1.drop(columns=["ASTHMA3"]),
                                                    exp1["ASTHMA3"],
                                                    test_size=0.2,
                                                    random_state=42,
                                                    stratify=exp1["ASTHMA3"])

In [None]:
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)
print(exp1.shape)
print(df.shape)

(321566, 271) (80392, 271) (321566,) (80392,)
(401958, 272)
(401958, 279)


In [None]:
## exp1 - asthma prediction with decision tree

#Gridcv on Decision Tree Classifier with to find best parameters for RECALL 

clf = DecisionTreeClassifier(random_state = 42, class_weight = "balanced")

parameters = {"max_depth" : range(5,31,2)}
grid_dtree = GridSearchCV(estimator=clf, param_grid = parameters,scoring ="recall", cv=3, n_jobs=1)

x_train_grid = grid_dtree.fit(x_train, y_train)

print(x_train_grid.best_params_)

train_pred = x_train_grid.predict(x_train)
test_pred = x_train_grid.predict(x_test)

print("Accuracy on train data", metrics.accuracy_score(y_train, train_pred))
print("Accuracy on test data", metrics.accuracy_score(y_test, test_pred))

train_matrix = confusion_matrix(y_train, train_pred)
test_matrix = confusion_matrix(y_test, test_pred)

print()
print("\t\t\tno asthma\tasthma")
print("class-wise accuracy for train", train_matrix.diagonal()/train_matrix.sum(axis=1))
print("class-wise accuracy for test", test_matrix.diagonal()/test_matrix.sum(axis=1))

precision_train = precision_score(y_train, train_pred)
recall_train = recall_score(y_train, train_pred)
f1_train = f1_score(y_train, train_pred)

precision_test = precision_score(y_test, test_pred)
recall_test = recall_score(y_test, test_pred)
f1_test = f1_score(y_test, test_pred)

print()
print("precision train and test", precision_train, precision_test)
print("recall train and test", recall_train, recall_test)
print("f1 train and test", f1_train, f1_test)

print()

{'max_depth': 7}
Accuracy on train data 0.6457243614063676
Accuracy on test data 0.6443302816200617

			no asthma	asthma
class-wise accuracy for train [0.64677343 0.63898509]
class-wise accuracy for test [0.64759071 0.62338382]

precision train and test 0.2197232520660829 0.21589636974252358
recall train and test 0.6389850856535993 0.6233838197266347
f1 train and test 0.3270024870478565 0.32071840923669015



In [None]:
#Gridcv on Decision Tree Classifier with to find best parameters for Precision 

clf = DecisionTreeClassifier(random_state = 42, class_weight = "balanced")
parameters = {"max_depth" : range(5,31,2)}
grid_dtree = GridSearchCV(estimator=clf, param_grid = parameters,scoring ="precision", cv=3, n_jobs=1)

x_train_grid = grid_dtree.fit(x_train, y_train)

print(x_train_grid.best_params_)

train_pred = x_train_grid.predict(x_train)
test_pred = x_train_grid.predict(x_test)

print("Accuracy on train data", metrics.accuracy_score(y_train, train_pred))
print("Accuracy on test data", metrics.accuracy_score(y_test, test_pred))

train_matrix = confusion_matrix(y_train, train_pred)
test_matrix = confusion_matrix(y_test, test_pred)

print()
print("\t\t\tno asthma\tasthma")
print("class-wise accuracy for train", train_matrix.diagonal()/train_matrix.sum(axis=1))
print("class-wise accuracy for test", test_matrix.diagonal()/test_matrix.sum(axis=1))

precision_train = precision_score(y_train, train_pred)
recall_train = recall_score(y_train, train_pred)
f1_train = f1_score(y_train, train_pred)

precision_test = precision_score(y_test, test_pred)
recall_test = recall_score(y_test, test_pred)
f1_test = f1_score(y_test, test_pred)

print()
print("precision train and test", precision_train, precision_test)
print("recall train and test", recall_train, recall_test)
print("f1 train and test", f1_train, f1_test)

print()

{'max_depth': 5}
Accuracy on train data 0.7379667004596258
Accuracy on test data 0.7404592496765847

			no asthma	asthma
class-wise accuracy for train [0.77599083 0.49369719]
class-wise accuracy for test [0.7797999  0.48771703]

precision train and test 0.25543809353162517 0.2563716685275984
recall train and test 0.493697187976174 0.48771702992242333
f1 train and test 0.3366790260491699 0.33608044038565565



In [None]:
clf = DecisionTreeClassifier(max_depth=15, random_state = 42, class_weight = "balanced")
clf.fit(x_train, y_train)

test_pred = clf.predict(x_test)

importances = clf.feature_importances_

feature_importances = pd.Series(importances, index=x_train.columns)

print(feature_importances.sort_values(ascending=False)[:25])

predictions_percentage = clf.predict_proba(x_test)

print(predictions_percentage[:50])

CHCCOPD2    0.165574
ADDEPEV3    0.075588
_AGE80      0.039296
_BMI5       0.038150
POORHLTH    0.037376
_STSTR      0.018983
IDATE       0.018974
_LLCPWT     0.018558
PNEUVAC4    0.017653
_LLCPWT2    0.016538
IDAY        0.016197
GENHLTH     0.014498
WEIGHT2     0.013047
_WT2RAKE    0.012774
HIVTSTD3    0.012742
_RFPSA23    0.012015
SEQNO       0.011651
_STRWT      0.010394
_DUALCOR    0.009978
FLSHTMY3    0.009878
SLEPTIM1    0.009741
_PSU        0.009654
WTKG3       0.009432
PHYSHLTH    0.008377
HEIGHT3     0.008340
dtype: float64
[[1.         0.        ]
 [0.64895589 0.35104411]
 [0.59479376 0.40520624]
 [0.48293368 0.51706632]
 [1.         0.        ]
 [0.45092602 0.54907398]
 [0.69104667 0.30895333]
 [0.54248087 0.45751913]
 [0.35797118 0.64202882]
 [0.3789057  0.6210943 ]
 [1.         0.        ]
 [0.75029961 0.24970039]
 [1.         0.        ]
 [0.43767254 0.56232746]
 [0.96581563 0.03418437]
 [0.21713908 0.78286092]
 [0.69540811 0.30459189]
 [0.73312148 0.26687852]
 [0.596968

In [None]:
# exp2 - asthma prediction with mlp
#Recall

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
sc = StandardScaler()

x_train_small = pd.concat([x_train[y_train == 0].sample(2*18170), x_train[y_train == 1].sample(18170)])
y_train_small = [0]*2*18170 + [1]*18170


x_train_scale = sc.fit_transform(x_train_small)
x_test_scale = sc.fit_transform(x_test)

mlp_clf = MLPClassifier(random_state=42)


parameters = {"hidden_layer_sizes": [(7,)],"max_iter":[40] }

grid_mlp = GridSearchCV(estimator=mlp_clf, param_grid = parameters, scoring="recall",cv=3, n_jobs=1)

x_train_grid = grid_mlp.fit(x_train_scale,y_train_small)

print(x_train_grid.best_params_)

train_pred = x_train_grid.predict(x_train_scale)
test_pred = x_train_grid.predict(x_test_scale)

print("Accuracy on train data", metrics.accuracy_score(y_train_small, train_pred))
print("Accuracy on test data", metrics.accuracy_score(y_test, test_pred))

train_matrix = confusion_matrix(y_train_small, train_pred)
test_matrix = confusion_matrix(y_test, test_pred)

print()
print("\t\t\tno asthma\tasthma")
print("class-wise accuracy for train", train_matrix.diagonal()/train_matrix.sum(axis=1))
print("class-wise accuracy for test", test_matrix.diagonal()/test_matrix.sum(axis=1))

precision_train = precision_score(y_train_small, train_pred)
recall_train = recall_score(y_train_small, train_pred)
f1_train = f1_score(y_train_small, train_pred)

precision_test = precision_score(y_test, test_pred)
recall_test = recall_score(y_test, test_pred)
f1_test = f1_score(y_test, test_pred)

print()
print("precision train and test", precision_train, precision_test)
print("recall train and test", recall_train, recall_test)
print("f1 train and test", f1_train, f1_test)




{'hidden_layer_sizes': (7,), 'max_iter': 40}
Accuracy on train data 0.7295725554944047
Accuracy on test data 0.8044830331376256

			no asthma	asthma
class-wise accuracy for train [0.90698954 0.37473858]
class-wise accuracy for test [0.86863895 0.39231622]

precision train and test 0.668269702620473 0.31734648139847604
recall train and test 0.3747385800770501 0.3923162172146287
f1 train and test 0.48020028914982893 0.3508713967126456


In [None]:
# exp2 - asthma prediction with mlp
#precision

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
sc = StandardScaler()

x_train_small = pd.concat([x_train[y_train == 0].sample(2*18170), x_train[y_train == 1].sample(18170)])
y_train_small = [0]*2*18170 + [1]*18170


x_train_scale = sc.fit_transform(x_train_small)
x_test_scale = sc.fit_transform(x_test)

mlp_clf = MLPClassifier(random_state=42)


parameters = {"hidden_layer_sizes": [(7,)],"max_iter":[40] }

grid_mlp = GridSearchCV(estimator=mlp_clf, param_grid = parameters, scoring="precision",cv=3, n_jobs=1)

x_train_grid = grid_mlp.fit(x_train_scale,y_train_small)

print(x_train_grid.best_params_)

train_pred = x_train_grid.predict(x_train_scale)
test_pred = x_train_grid.predict(x_test_scale)


print("Accuracy on train data", metrics.accuracy_score(y_train_small, train_pred))
print("Accuracy on test data", metrics.accuracy_score(y_test, test_pred))

train_matrix = confusion_matrix(y_train_small, train_pred)
test_matrix = confusion_matrix(y_test, test_pred)

print()
print("\t\t\tno asthma\tasthma")
print("class-wise accuracy for train", train_matrix.diagonal()/train_matrix.sum(axis=1))
print("class-wise accuracy for test", test_matrix.diagonal()/test_matrix.sum(axis=1))

precision_train = precision_score(y_train_small, train_pred)
recall_train = recall_score(y_train_small, train_pred)
f1_train = f1_score(y_train_small, train_pred)

precision_test = precision_score(y_test, test_pred)
recall_test = recall_score(y_test, test_pred)
f1_test = f1_score(y_test, test_pred)

print()
print("precision train and test", precision_train, precision_test)
print("recall train and test", recall_train, recall_test)
print("f1 train and test", f1_train, f1_test)



{'hidden_layer_sizes': (7,), 'max_iter': 40}
Accuracy on train data 0.7272610530177949
Accuracy on test data 0.8063364513881979

			no asthma	asthma
class-wise accuracy for train [0.90855806 0.36466703]
class-wise accuracy for test [0.87254902 0.38095678]

precision train and test 0.6659965825711127 0.31752751905165116
recall train and test 0.3646670335718217 0.3809567787218323
f1 train and test 0.47128276254489854 0.3463621478651497


In [None]:
# exp3 - asthma prediction using ensemble - random forest


rf_clf = RandomForestClassifier(random_state = 42, class_weight = "balanced")
    
parameters = {"max_depth": range(10,35,2 )}
grid_rf = GridSearchCV(estimator=rf_clf, param_grid = parameters, scoring= "recall", cv=3, n_jobs=1)

x_train_grid = grid_rf.fit(x_train,y_train)

print(x_train_grid.best_params_)

train_pred = x_train_grid.predict(x_train)
test_pred = x_train_grid.predict(x_test)

print("Accuracy on train data", metrics.accuracy_score(y_train, train_pred))
print("Accuracy on test data", metrics.accuracy_score(y_test, test_pred))

train_matrix = confusion_matrix(y_train, train_pred)
test_matrix = confusion_matrix(y_test, test_pred)

print()
print("\t\t\tno asthma\tasthma")
print("class-wise accuracy for train", train_matrix.diagonal()/train_matrix.sum(axis=1))
print("class-wise accuracy for test", test_matrix.diagonal()/test_matrix.sum(axis=1))

precision_train = precision_score(y_train, train_pred)
recall_train = recall_score(y_train, train_pred)
f1_train = f1_score(y_train, train_pred)

precision_test = precision_score(y_test, test_pred)
recall_test = recall_score(y_test, test_pred)
f1_test = f1_score(y_test, test_pred)

print()
print("precision train and test", precision_train, precision_test)
print("recall train and test", recall_train, recall_test)
print("f1 train and test", f1_train, f1_test)

print()

{'max_depth': 10}
Accuracy on train data 0.717728242413688
Accuracy on test data 0.7062269877599762

			no asthma	asthma
class-wise accuracy for train [0.73143769 0.62965785]
class-wise accuracy for test [0.72679834 0.57406723]

precision train and test 0.2673797315712591 0.24646128226477934
recall train and test 0.629657847347278 0.574067233099372
f1 train and test 0.37536386470770394 0.344863935199312



In [None]:
# exp3 - asthma prediction using ensemble - random forest

#precision
rf_clf = RandomForestClassifier(random_state = 42, class_weight = "balanced")
    
parameters = {"max_depth": range(10,35,2 )}
grid_rf = GridSearchCV(estimator=rf_clf, param_grid = parameters, scoring= "precision", cv=3, n_jobs=1)

x_train_grid = grid_rf.fit(x_train,y_train)

print(x_train_grid.best_params_)

train_pred = x_train_grid.predict(x_train)
test_pred = x_train_grid.predict(x_test)

print("Accuracy on train data", metrics.accuracy_score(y_train, train_pred))
print("Accuracy on test data", metrics.accuracy_score(y_test, test_pred))

train_matrix = confusion_matrix(y_train, train_pred)
test_matrix = confusion_matrix(y_test, test_pred)

print()
print("\t\t\tno asthma\tasthma")
print("class-wise accuracy for train", train_matrix.diagonal()/train_matrix.sum(axis=1))
print("class-wise accuracy for test", test_matrix.diagonal()/test_matrix.sum(axis=1))

precision_train = precision_score(y_train, train_pred)
recall_train = recall_score(y_train, train_pred)
f1_train = f1_score(y_train, train_pred)

precision_test = precision_score(y_test, test_pred)
recall_test = recall_score(y_test, test_pred)
f1_test = f1_score(y_test, test_pred)

print()
print("precision train and test", precision_train, precision_test)
print("recall train and test", recall_train, recall_test)
print("f1 train and test", f1_train, f1_test)

print()

{'max_depth': 34}
Accuracy on train data 0.9999689021849325
Accuracy on test data 0.8667529107373868

			no asthma	asthma
class-wise accuracy for train [1.         0.99976913]
class-wise accuracy for test [0.99712495 0.0291836 ]

precision train and test 1.0 0.6124031007751938
recall train and test 0.9997691277646951 0.029183598079054303
f1 train and test 0.9998845505553119 0.055712270803949214



In [None]:
clf = DecisionTreeClassifier(max_depth=5, random_state = 42, class_weight = "balanced")
clf.fit(x_train,y_train)
dtree_train_pred = clf.predict_proba(x_train)
dtree_test_pred = clf.predict_proba(x_test)

mlp_clf = MLPClassifier(hidden_layer_sizes =(7,) , max_iter = 40 ,random_state=42)
mlp_clf.fit(x_train_scale, y_train_small)
mlp_train_pred = mlp_clf.predict_proba(sc.fit_transform(x_train))
mlp_test_pred = mlp_clf.predict_proba(sc.fit_transform(x_test))

rf_clf = RandomForestClassifier(max_depth=10, random_state = 42, class_weight = "balanced")
rf_clf.fit(x_train, y_train)
rf_train_pred = rf_clf.predict_proba(x_train)
rf_test_pred = rf_clf.predict_proba(x_test)


final_pred_train = []
final_pred_test = []

for i in range(len(dtree_train_pred)):
    val = (dtree_train_pred[i][0] + mlp_train_pred[i][0] + rf_train_pred[i][0])/3
    if val <= 0.5:
        final_pred_train.append(1)
    else:
        final_pred_train.append(0)

for i in range(len(dtree_test_pred)):
    val = (dtree_test_pred[i][0] + mlp_test_pred[i][0] + rf_test_pred[i][0])/3
    if val <= 0.5:
        final_pred_test.append(1)
    else:
        final_pred_test.append(0)

print("Accuracy on train data", metrics.accuracy_score(y_train, final_pred_train))
print("Accuracy on test data", metrics.accuracy_score(y_test, final_pred_test))

train_matrix = confusion_matrix(y_train, final_pred_train)
test_matrix = confusion_matrix(y_test, final_pred_test)

print()
print("\t\t\tno asthma\t asthma")
print("class-wise accuracy for train", train_matrix.diagonal()/train_matrix.sum(axis=1))
print("class-wise accuracy for test", test_matrix.diagonal()/test_matrix.sum(axis=1))

print()

precision_train = precision_score(y_train, final_pred_train)
recall_train = recall_score(y_train, final_pred_train)
f1_train = f1_score(y_train, final_pred_train)

precision_test = precision_score(y_test, final_pred_test)
recall_test = recall_score(y_test, final_pred_test)
f1_test = f1_score(y_test, final_pred_test)

print()
print("precision train and test", precision_train, precision_test)
print("recall train and test", recall_train, recall_test)
print("f1 train and test", f1_train, f1_test)

print()



Accuracy on train data 0.774105471349583
Accuracy on test data 0.7711836998706338

			no asthma	 asthma
class-wise accuracy for train [0.81608398 0.50443275]
class-wise accuracy for test [0.81595365 0.48356114]


precision train and test 0.29920300175284836 0.29025999223903765
recall train and test 0.5044327469178557 0.4835611377909125
f1 train and test 0.3756124396156028 0.36276717358921956



In [None]:
#Experimenting with PCA

In [None]:
for depth in range(5, 40, 2):
    print("max-depth", depth)
    clf = DecisionTreeClassifier(max_depth=depth, random_state = 42, class_weight = "balanced")
    pca = PCA(n_components = 100)
    x_train_pca = pca.fit_transform(x_train)
    x_test_pca = pca.fit_transform(x_test)
    clf.fit(x_train_pca, y_train)

    train_pred = clf.predict(x_train_pca)
    test_pred = clf.predict(x_test_pca)

    print("Accuracy on train data", metrics.accuracy_score(y_train, train_pred))
    print("Accuracy on test data", metrics.accuracy_score(y_test, test_pred))

    train_matrix = confusion_matrix(y_train, train_pred)
    test_matrix = confusion_matrix(y_test, test_pred)

    print()
    print("\t\t\tno asthma\tasthma")
    print("class-wise accuracy for train", train_matrix.diagonal()/train_matrix.sum(axis=1))
    print("class-wise accuracy for test", test_matrix.diagonal()/test_matrix.sum(axis=1))

    precision_train = precision_score(y_train, train_pred)
    recall_train = recall_score(y_train, train_pred)
    f1_train = f1_score(y_train, train_pred)

    precision_test = precision_score(y_test, test_pred)
    recall_test = recall_score(y_test, test_pred)
    f1_test = f1_score(y_test, test_pred)

    print()
    print("precision train and test", precision_train, precision_test)
    print("recall train and test", recall_train, recall_test)
    print("f1 train and test", f1_train, f1_test)

max-depth 5
Accuracy on train data 0.643056168873575
Accuracy on test data 0.6388446611603145

			no asthma	asthma
class-wise accuracy for train [0.6590321  0.54042573]
class-wise accuracy for test [0.66166408 0.49224233]

precision train and test 0.19789826095043245 0.18464629668121665
recall train and test 0.5404257284019024 0.49224233468784634
f1 train and test 0.2897083485460745 0.2685544414773014
max-depth 7
Accuracy on train data 0.610726880329388
Accuracy on test data 0.6005945865260225

			no asthma	asthma
class-wise accuracy for train [0.61142058 0.60627049]
class-wise accuracy for test [0.61173883 0.52899889]

precision train and test 0.19541162200575965 0.17497021718544767
recall train and test 0.6062704899108833 0.5289988917620982
f1 train and test 0.2955592947545541 0.2629633880408585
max-depth 9
Accuracy on train data 0.6167816249230329
Accuracy on test data 0.5781296646432481

			no asthma	asthma
class-wise accuracy for train [0.6124664  0.64450293]
class-wise accuracy f

In [None]:
# exp2 - asthma prediction with mlp

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

x_train_small = pd.concat([x_train[y_train == 0].sample(2*18170), x_train[y_train == 1].sample(18170)])
y_train_small = [0]*2*18170 + [1]*18170

print(x_train_small.shape, len(y_train_small))
print(x_train_small[y_train == 1].shape)
print(x_test[y_test == 1].shape)

pca = PCA(n_components= 40)

mlp_clf = MLPClassifier(hidden_layer_sizes=(7,), random_state=42, max_iter=40)
x_train_pca = pca.fit_transform(x_train_small)
x_test_pca = pca.fit_transform(x_test)

mlp_clf.fit(sc.fit_transform(x_train_pca), y_train_small)

train_pred = mlp_clf.predict(sc.fit_transform(x_train_pca))
test_pred = mlp_clf.predict(sc.fit_transform(x_test_pca))

print("Accuracy on train data", metrics.accuracy_score(y_train_small, train_pred))
print("Accuracy on test data", metrics.accuracy_score(y_test, test_pred))

train_matrix = confusion_matrix(y_train_small, train_pred)
test_matrix = confusion_matrix(y_test, test_pred)

print()
print("\t\t\tno asthma\tasthma")
print("class-wise accuracy for train", train_matrix.diagonal()/train_matrix.sum(axis=1))
print("class-wise accuracy for test", test_matrix.diagonal()/test_matrix.sum(axis=1))

precision_train = precision_score(y_train_small, train_pred)
recall_train = recall_score(y_train_small, train_pred)
f1_train = f1_score(y_train_small, train_pred)

precision_test = precision_score(y_test, test_pred)
recall_test = recall_score(y_test, test_pred)
f1_test = f1_score(y_test, test_pred)

print()
print("precision train and test", precision_train, precision_test)
print("recall train and test", recall_train, recall_test)
print("f1 train and test", f1_train, f1_test)


(54510, 271) 54510
(18170, 271)
(10828, 271)


  # This is added back by InteractiveShellApp.init_path()


Accuracy on train data 0.6806824435883324
Accuracy on test data 0.8038486416558862

			no asthma	asthma
class-wise accuracy for train [0.93087507 0.18029719]
class-wise accuracy for test [0.90657524 0.14388622]

precision train and test 0.5659986178299931 0.19337222291175377
recall train and test 0.1802971931755641 0.14388622090875508
f1 train and test 0.2734785875281743 0.1649986761980408


In [None]:
# exp3 - asthma prediction using ensemble - random forest

for depth in range(10, 35, 2):
    print("max-depth", depth)
    rf_clf = RandomForestClassifier(max_depth=depth, random_state = 42, class_weight = "balanced")
    pca = PCA(n_components= 100)
    
    x_train_pca = pca.fit_transform(x_train)
    x_test_pca = pca.fit_transform(x_test)
    
    rf_clf.fit(x_train_pca, y_train)


    train_pred = rf_clf.predict(x_train_pca)
    test_pred = rf_clf.predict(x_test_pca)

    print("Accuracy on train data", metrics.accuracy_score(y_train, train_pred))
    print("Accuracy on test data", metrics.accuracy_score(y_test, test_pred))

    train_matrix = confusion_matrix(y_train, train_pred)
    test_matrix = confusion_matrix(y_test, test_pred)

    print()
    print("\t\t\tno asthma\tasthma")
    print("class-wise accuracy for train", train_matrix.diagonal()/train_matrix.sum(axis=1))
    print("class-wise accuracy for test", test_matrix.diagonal()/test_matrix.sum(axis=1))

    precision_train = precision_score(y_train, train_pred)
    recall_train = recall_score(y_train, train_pred)
    f1_train = f1_score(y_train, train_pred)

    precision_test = precision_score(y_test, test_pred)
    recall_test = recall_score(y_test, test_pred)
    f1_test = f1_score(y_test, test_pred)

    print()
    print("precision train and test", precision_train, precision_test)
    print("recall train and test", recall_train, recall_test)
    print("f1 train and test", f1_train, f1_test)

max-depth 10
Accuracy on train data 0.7206700957190748
Accuracy on test data 0.6994974624340731

			no asthma	asthma
class-wise accuracy for train [0.73371261 0.63688415]
class-wise accuracy for test [0.73962107 0.44172516]

precision train and test 0.2712994561422488 0.20890111809923131
recall train and test 0.636884148312324 0.4417251570003694
f1 train and test 0.3805096727473361 0.28365555687344324
max-depth 12
Accuracy on train data 0.8140941517448984
Accuracy on test data 0.7859612896805652

			no asthma	asthma
class-wise accuracy for train [0.83536147 0.67747149]
class-wise accuracy for test [0.86912771 0.25166236]

precision train and test 0.39044641075111436 0.2303660495392679
recall train and test 0.6774714872789398 0.25166235685260435
f1 train and test 0.49538697887211003 0.2405437613099704
max-depth 14
Accuracy on train data 0.9041596437434306
Accuracy on test data 0.8435789630809035

			no asthma	asthma
class-wise accuracy for train [0.92953869 0.74112296]
class-wise accura

  _warn_prf(average, modifier, msg_start, len(result))



precision train and test 0.9994460089561885 0.0
recall train and test 0.999630604423512 0.0
f1 train and test 0.9995382981670438 0.0
max-depth 30
Accuracy on train data 0.9999284750253448
Accuracy on test data 0.8652975420439845

			no asthma	asthma
class-wise accuracy for train [0.99996047 0.99972295]
class-wise accuracy for test [0.99998562 0.        ]

precision train and test 0.9997460346778103 0.0
recall train and test 0.999722953317634 0.0
f1 train and test 0.9997344938644995 0.0
max-depth 32
Accuracy on train data 0.9999657924034258
Accuracy on test data 0.865309981092646

			no asthma	asthma
class-wise accuracy for train [0.99998562 0.99983839]
class-wise accuracy for test [1. 0.]


  _warn_prf(average, modifier, msg_start, len(result))



precision train and test 0.9999076447091962 0.0
recall train and test 0.9998383894352865 0.0
f1 train and test 0.9998730158730157 0.0
max-depth 34
Accuracy on train data 0.9999689021849325
Accuracy on test data 0.865309981092646

			no asthma	asthma
class-wise accuracy for train [0.99999281 0.9998153 ]
class-wise accuracy for test [1. 0.]

precision train and test 0.9999538191558142 0.0
recall train and test 0.999815302211756 0.0
f1 train and test 0.9998845558864953 0.0


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
clf = DecisionTreeClassifier(max_depth=13, random_state = 42, class_weight = "balanced")
pca = PCA(n_components = 100)
x_train_pca = pca.fit_transform(x_train)
x_test_pca = pca.fit_transform(x_test)
clf.fit(x_train_pca, y_train)
dtree_train_pred = clf.predict_proba(x_train_pca)
dtree_test_pred = clf.predict_proba(x_test_pca)

In [None]:
pca = PCA(n_components= 40)

mlp_clf = MLPClassifier(hidden_layer_sizes=(7,), random_state=42, max_iter=40)
x_train_small_pca = pca.fit_transform(x_train_small)
x_test_pca = pca.transform(x_test)
x_train_pca = pca.transform(x_train)

mlp_clf.fit(sc.fit_transform(x_train_small_pca), y_train_small)

mlp_train_pred = mlp_clf.predict_proba(sc.fit_transform(x_train_pca))
mlp_test_pred = mlp_clf.predict_proba(sc.fit_transform(x_test_pca))



In [None]:
rf_clf = RandomForestClassifier(max_depth=10, random_state = 42, class_weight = "balanced")
pca = PCA(n_components= 100)

x_train_pca = pca.fit_transform(x_train)
x_test_pca = pca.fit_transform(x_test)

rf_clf.fit(x_train_pca, y_train)

rf_train_pred = rf_clf.predict_proba(x_train_pca)
rf_test_pred = rf_clf.predict_proba(x_test_pca)


In [None]:
final_pred_train = []
final_pred_test = []

for i in range(len(dtree_train_pred)):
    val = (dtree_train_pred[i][0] + mlp_train_pred[i][0] + rf_train_pred[i][0])/3
    if val <= 0.5:
        final_pred_train.append(1)
    else:
        final_pred_train.append(0)

for i in range(len(dtree_test_pred)):
    val = (dtree_test_pred[i][0] + mlp_test_pred[i][0] + rf_test_pred[i][0])/3
    if val <= 0.5:
        final_pred_test.append(1)
    else:
        final_pred_test.append(0)

print("Accuracy on train data", metrics.accuracy_score(y_train, final_pred_train))
print("Accuracy on test data", metrics.accuracy_score(y_test, final_pred_test))

train_matrix = confusion_matrix(y_train, final_pred_train)
test_matrix = confusion_matrix(y_test, final_pred_test)

print()
print("\t\t\tno asthma\t asthma")
print("class-wise accuracy for train", train_matrix.diagonal()/train_matrix.sum(axis=1))
print("class-wise accuracy for test", test_matrix.diagonal()/test_matrix.sum(axis=1))

print()

precision_train = precision_score(y_train, final_pred_train)
recall_train = recall_score(y_train, final_pred_train)
f1_train = f1_score(y_train, final_pred_train)

precision_test = precision_score(y_test, final_pred_test)
recall_test = recall_score(y_test, final_pred_test)
f1_test = f1_score(y_test, final_pred_test)

print()
print("precision train and test", precision_train, precision_test)
print("recall train and test", recall_train, recall_test)
print("f1 train and test", f1_train, f1_test)

print()

Accuracy on train data 0.7811895536219625
Accuracy on test data 0.7427231565329884

			no asthma	 asthma
class-wise accuracy for train [0.81439487 0.56787644]
class-wise accuracy for test [0.80537347 0.34022904]


precision train and test 0.32261745494609273 0.2139000174185682
recall train and test 0.5678764371796647 0.3402290358330255
f1 train and test 0.4114724480578139 0.2626644326405476

