In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import sklearn
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV


from google.colab import drive
drive.mount('/content/drive')

df = pd.read_csv("/content/drive/MyDrive/DSS_Pipeline/brfss2020.csv")

Mounted at /content/drive


In [None]:
# exp1 - cancer prediction with decision tree
completely_empty = [feature for feature in df.columns if df[feature].isnull().sum() == df.shape[0]]

exp1 = df.fillna(-1)
exp1 = exp1.drop(columns=completely_empty)

exp1["CNCRDIFF"] = exp1["CNCRDIFF"].apply(lambda x: 1 if x in [1,2,3] else 0)
exp1 = exp1.drop(columns=["CNCRAGE", "CNCRTYP1", "CSRVTRT3", "CSRVDOC1", "CSRVSUM", "CSRVRTRN", "CSRVINST", "CSRVINSR", "CSRVDEIN", "CSRVCLIN", 
                   "CSRVPAIN", "CSRVCTL2", "CHCSCNCR", "CHCOCNCR"])
x_train, x_test, y_train, y_test = train_test_split(exp1.drop(columns=["CNCRDIFF"]),
                                                    exp1["CNCRDIFF"],
                                                    test_size=0.2,
                                                    random_state=42,
                                                    stratify=exp1["CNCRDIFF"])

In [None]:
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)
print(exp1.shape)
print(df.shape)

(321566, 261) (80392, 261) (321566,) (80392,)
(401958, 262)
(401958, 279)


In [None]:
## exp1 - cancerrediction with decision tree

#Gridcv on Decision Tree Classifier with to find best parameters for RECALL 

clf = DecisionTreeClassifier(random_state = 42, class_weight = "balanced")

parameters = {"max_depth" : range(5,31,2)}
grid_dtree = GridSearchCV(estimator=clf, param_grid = parameters,scoring ="recall", cv=3, n_jobs=1)

x_train_grid = grid_dtree.fit(x_train, y_train)

print(x_train_grid.best_params_)

train_pred = x_train_grid.predict(x_train)
test_pred = x_train_grid.predict(x_test)

print("Accuracy on train data", metrics.accuracy_score(y_train, train_pred))
print("Accuracy on test data", metrics.accuracy_score(y_test, test_pred))

train_matrix = confusion_matrix(y_train, train_pred)
test_matrix = confusion_matrix(y_test, test_pred)

print()
print("\t\t\tno cancer\tcancer")
print("class-wise accuracy for train", train_matrix.diagonal()/train_matrix.sum(axis=1))
print("class-wise accuracy for test", test_matrix.diagonal()/test_matrix.sum(axis=1))

precision_train = precision_score(y_train, train_pred)
recall_train = recall_score(y_train, train_pred)
f1_train = f1_score(y_train, train_pred)

precision_test = precision_score(y_test, test_pred)
recall_test = recall_score(y_test, test_pred)
f1_test = f1_score(y_test, test_pred)

print()
print("precision train and test", precision_train, precision_test)
print("recall train and test", recall_train, recall_test)
print("f1 train and test", f1_train, f1_test)

print()

{'max_depth': 9}
Accuracy on train data 0.7374691354185455
Accuracy on test data 0.737100706537964

			no cancer	cancer
class-wise accuracy for train [0.72532927 0.94017611]
class-wise accuracy for test [0.72615328 0.91987673]

precision train and test 0.1701205971100511 0.16749498997995993
recall train and test 0.9401761144744084 0.9198767334360555
f1 train and test 0.28810915193065006 0.2833892788119215



In [None]:
#Gridcv on Decision Tree Classifier with to find best parameters for Precision 

clf = DecisionTreeClassifier(random_state = 42, class_weight = "balanced")
parameters = {"max_depth" : range(5,31,2)}
grid_dtree = GridSearchCV(estimator=clf, param_grid = parameters,scoring ="precision", cv=3, n_jobs=1)

x_train_grid = grid_dtree.fit(x_train, y_train)

print(x_train_grid.best_params_)

train_pred = x_train_grid.predict(x_train)
test_pred = x_train_grid.predict(x_test)

print("Accuracy on train data", metrics.accuracy_score(y_train, train_pred))
print("Accuracy on test data", metrics.accuracy_score(y_test, test_pred))

train_matrix = confusion_matrix(y_train, train_pred)
test_matrix = confusion_matrix(y_test, test_pred)

print()
print("\t\t\tno cancer\tcancer")
print("class-wise accuracy for train", train_matrix.diagonal()/train_matrix.sum(axis=1))
print("class-wise accuracy for test", test_matrix.diagonal()/test_matrix.sum(axis=1))

precision_train = precision_score(y_train, train_pred)
recall_train = recall_score(y_train, train_pred)
f1_train = f1_score(y_train, train_pred)

precision_test = precision_score(y_test, test_pred)
recall_test = recall_score(y_test, test_pred)
f1_test = f1_score(y_test, test_pred)

print()
print("precision train and test", precision_train, precision_test)
print("recall train and test", recall_train, recall_test)
print("f1 train and test", f1_train, f1_test)

print()

{'max_depth': 29}
Accuracy on train data 0.9216863723154811
Accuracy on test data 0.8824136730022888

			no cancer	cancer
class-wise accuracy for train [0.91700616 0.99983489]
class-wise accuracy for test [0.8960039  0.65551398]

precision train and test 0.4191062818649503 0.27406589361310507
recall train and test 0.9998348926802422 0.6555139775478759
f1 train and test 0.5906334834273453 0.3865273541436822



In [None]:
clf = DecisionTreeClassifier(max_depth=15, random_state = 42, class_weight = "balanced")
clf.fit(x_train, y_train)

test_pred = clf.predict(x_test)

importances = clf.feature_importances_

feature_importances = pd.Series(importances, index=x_train.columns)

print(feature_importances.sort_values(ascending=False)[:25])

predictions_percentage = clf.predict_proba(x_test)

print(predictions_percentage[:50])

VIRCOLON    0.226194
_STATE      0.143479
_STSTR      0.129810
TRNSGNDR    0.081119
_AGE80      0.064340
_CHISPNC    0.030886
ACEDEPRS    0.028886
CIMEMLOS    0.022854
ACEPRISN    0.017956
QSTVER      0.016246
HAVEHEPB    0.015510
MARIJAN1    0.015494
HLTHCVR1    0.014898
ECIGARET    0.009838
SEQNO       0.009719
COLNTEST    0.007936
COLNSCPY    0.006960
_WT2RAKE    0.006817
HOWLONG     0.006111
IDATE       0.004453
_BMI5       0.003914
_LLCPWT     0.003883
_PRACE1     0.003633
IDAY        0.003533
_LLCPWT2    0.003383
dtype: float64
[[1.         0.        ]
 [0.94341658 0.05658342]
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]
 [0.4605958  0.5394042 ]
 [0.64297694 0.35702306]
 [0.29467949 0.70532051]
 [1.         0.        ]
 [0.16159933 0.83840067]
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]
 [0.11880505 0.88119495]
 [1.      

In [None]:
# exp2 - cancer prediction with mlp
#Recall

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
sc = StandardScaler()

x_train_small = pd.concat([x_train[y_train == 0].sample(2*18170), x_train[y_train == 1].sample(18170)])
y_train_small = [0]*2*18170 + [1]*18170


x_train_scale = sc.fit_transform(x_train_small)
x_test_scale = sc.fit_transform(x_test)

mlp_clf = MLPClassifier(random_state=42)


parameters = {"hidden_layer_sizes": [(7,)],"max_iter":[40] }

grid_mlp = GridSearchCV(estimator=mlp_clf, param_grid = parameters, scoring="recall",cv=3, n_jobs=1)

x_train_grid = grid_mlp.fit(x_train_scale,y_train_small)

print(x_train_grid.best_params_)

train_pred = x_train_grid.predict(x_train_scale)
test_pred = x_train_grid.predict(x_test_scale)

print("Accuracy on train data", metrics.accuracy_score(y_train_small, train_pred))
print("Accuracy on test data", metrics.accuracy_score(y_test, test_pred))

train_matrix = confusion_matrix(y_train_small, train_pred)
test_matrix = confusion_matrix(y_test, test_pred)

print()
print("\t\t\tno cancer\tcancer")
print("class-wise accuracy for train", train_matrix.diagonal()/train_matrix.sum(axis=1))
print("class-wise accuracy for test", test_matrix.diagonal()/test_matrix.sum(axis=1))

precision_train = precision_score(y_train_small, train_pred)
recall_train = recall_score(y_train_small, train_pred)
f1_train = f1_score(y_train_small, train_pred)

precision_test = precision_score(y_test, test_pred)
recall_test = recall_score(y_test, test_pred)
f1_test = f1_score(y_test, test_pred)

print()
print("precision train and test", precision_train, precision_test)
print("recall train and test", recall_train, recall_test)
print("f1 train and test", f1_train, f1_test)




{'hidden_layer_sizes': (7,), 'max_iter': 40}
Accuracy on train data 0.8665749403779123
Accuracy on test data 0.8185515971738482

			no cancer	cancer
class-wise accuracy for train [0.8684645  0.86279582]
class-wise accuracy for test [0.81353742 0.90226722]

precision train and test 0.7663391504130616 0.2247012388992435
recall train and test 0.8627958172812328 0.9022672243011226
f1 train and test 0.8117120149118492 0.35979811279350454


In [None]:
# exp2 - cancer prediction with mlp
#precision

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
sc = StandardScaler()

x_train_small = pd.concat([x_train[y_train == 0].sample(2*18170), x_train[y_train == 1].sample(18170)])
y_train_small = [0]*2*18170 + [1]*18170


x_train_scale = sc.fit_transform(x_train_small)
x_test_scale = sc.fit_transform(x_test)

mlp_clf = MLPClassifier(random_state=42)


parameters = {"hidden_layer_sizes": [(7,)],"max_iter":[40] }

grid_mlp = GridSearchCV(estimator=mlp_clf, param_grid = parameters, scoring="precision",cv=3, n_jobs=1)

x_train_grid = grid_mlp.fit(x_train_scale,y_train_small)

print(x_train_grid.best_params_)

train_pred = x_train_grid.predict(x_train_scale)
test_pred = x_train_grid.predict(x_test_scale)


print("Accuracy on train data", metrics.accuracy_score(y_train_small, train_pred))
print("Accuracy on test data", metrics.accuracy_score(y_test, test_pred))

train_matrix = confusion_matrix(y_train_small, train_pred)
test_matrix = confusion_matrix(y_test, test_pred)

print()
print("\t\t\tno cancer\tcancer")
print("class-wise accuracy for train", train_matrix.diagonal()/train_matrix.sum(axis=1))
print("class-wise accuracy for test", test_matrix.diagonal()/test_matrix.sum(axis=1))

precision_train = precision_score(y_train_small, train_pred)
recall_train = recall_score(y_train_small, train_pred)
f1_train = f1_score(y_train_small, train_pred)

precision_test = precision_score(y_test, test_pred)
recall_test = recall_score(y_test, test_pred)
f1_test = f1_score(y_test, test_pred)

print()
print("precision train and test", precision_train, precision_test)
print("recall train and test", recall_train, recall_test)
print("f1 train and test", f1_train, f1_test)



{'hidden_layer_sizes': (7,), 'max_iter': 40}
Accuracy on train data 0.8659878921298845
Accuracy on test data 0.8180540352273858

			no cancer	cancer
class-wise accuracy for train [0.87930655 0.83935058]
class-wise accuracy for test [0.8130628  0.90138675]

precision train and test 0.7766461272088404 0.22408886943197986
recall train and test 0.8393505778756192 0.901386748844376
f1 train and test 0.8067818128917925 0.35894289345663316


In [None]:
# exp3 - cancer prediction using ensemble - random forest


rf_clf = RandomForestClassifier(random_state = 42, class_weight = "balanced")
    
parameters = {"max_depth": range(10,35,2)}
grid_rf = GridSearchCV(estimator=rf_clf, param_grid = parameters, scoring= "recall", cv=3, n_jobs=1)

x_train_grid = grid_rf.fit(x_train,y_train)

print(x_train_grid.best_params_)

train_pred = x_train_grid.predict(x_train)
test_pred = x_train_grid.predict(x_test)

print("Accuracy on train data", metrics.accuracy_score(y_train, train_pred))
print("Accuracy on test data", metrics.accuracy_score(y_test, test_pred))

train_matrix = confusion_matrix(y_train, train_pred)
test_matrix = confusion_matrix(y_test, test_pred)

print()
print("\t\t\tno cancer\tcancer")
print("class-wise accuracy for train", train_matrix.diagonal()/train_matrix.sum(axis=1))
print("class-wise accuracy for test", test_matrix.diagonal()/test_matrix.sum(axis=1))

precision_train = precision_score(y_train, train_pred)
recall_train = recall_score(y_train, train_pred)
f1_train = f1_score(y_train, train_pred)

precision_test = precision_score(y_test, test_pred)
recall_test = recall_score(y_test, test_pred)
f1_test = f1_score(y_test, test_pred)

print()
print("precision train and test", precision_train, precision_test)
print("recall train and test", recall_train, recall_test)
print("f1 train and test", f1_train, f1_test)

print()

{'max_depth': 10}
Accuracy on train data 0.7820945000404271
Accuracy on test data 0.779206886257339

			no cancer	cancer
class-wise accuracy for train [0.77450263 0.90886076]
class-wise accuracy for test [0.77364237 0.87211094]

precision train and test 0.1944447715150302 0.18749704226018646
recall train and test 0.9088607594936708 0.8721109399075501
f1 train and test 0.3203522827573498 0.3086390901300928



In [None]:
# exp3 - cancer prediction using ensemble - random forest

#precision
rf_clf = RandomForestClassifier(random_state = 42, class_weight = "balanced")
    
parameters = {"max_depth": range(10,35,2 )}
grid_rf = GridSearchCV(estimator=rf_clf, param_grid = parameters, scoring= "precision", cv=3, n_jobs=1)

x_train_grid = grid_rf.fit(x_train,y_train)

print(x_train_grid.best_params_)

train_pred = x_train_grid.predict(x_train)
test_pred = x_train_grid.predict(x_test)

print("Accuracy on train data", metrics.accuracy_score(y_train, train_pred))
print("Accuracy on test data", metrics.accuracy_score(y_test, test_pred))

train_matrix = confusion_matrix(y_train, train_pred)
test_matrix = confusion_matrix(y_test, test_pred)

print()
print("\t\t\tno cancer\tcancer")
print("class-wise accuracy for train", train_matrix.diagonal()/train_matrix.sum(axis=1))
print("class-wise accuracy for test", test_matrix.diagonal()/test_matrix.sum(axis=1))

precision_train = precision_score(y_train, train_pred)
recall_train = recall_score(y_train, train_pred)
f1_train = f1_score(y_train, train_pred)

precision_test = precision_score(y_test, test_pred)
recall_test = recall_score(y_test, test_pred)
f1_test = f1_score(y_test, test_pred)

print()
print("precision train and test", precision_train, precision_test)
print("recall train and test", recall_train, recall_test)
print("f1 train and test", f1_train, f1_test)

print()

{'max_depth': 34}
Accuracy on train data 0.9872094686627317
Accuracy on test data 0.9348069459647727

			no cancer	cancer
class-wise accuracy for train [0.98645005 0.99988993]
class-wise accuracy for test [0.97284078 0.29980189]

precision train and test 0.8154764576507024 0.3980128579777908
recall train and test 0.9998899284534948 0.299801893022232
f1 train and test 0.8983163984276497 0.34199623352165726



In [None]:
clf = DecisionTreeClassifier(max_depth=9, random_state=42, class_weight="balanced")
clf.fit(x_train, y_train)
dtree_train_pred = clf.predict_proba(x_train)
dtree_test_pred = clf.predict_proba(x_test)

mlp_clf = MLPClassifier(hidden_layer_sizes =(7,) , max_iter = 40 ,random_state=42)
mlp_clf.fit(x_train_scale, y_train_small)
mlp_train_pred = mlp_clf.predict_proba(sc.fit_transform(x_train))
mlp_test_pred = mlp_clf.predict_proba(sc.fit_transform(x_test))

rf_clf = RandomForestClassifier(max_depth=10, random_state = 42, class_weight = "balanced")
rf_clf.fit(x_train, y_train)
rf_train_pred = rf_clf.predict_proba(x_train)
rf_test_pred = rf_clf.predict_proba(x_test)


final_pred_train = []
final_pred_test = []

for i in range(len(dtree_train_pred)):
    val = (dtree_train_pred[i][0] + mlp_train_pred[i][0] + rf_train_pred[i][0])/3
    if val <= 0.5:
        final_pred_train.append(1)
    else:
        final_pred_train.append(0)

for i in range(len(dtree_test_pred)):
    val = (dtree_test_pred[i][0] + mlp_test_pred[i][0] + rf_test_pred[i][0])/3
    if val <= 0.5:
        final_pred_test.append(1)
    else:
        final_pred_test.append(0)

print("Accuracy on train data", metrics.accuracy_score(y_train, final_pred_train))
print("Accuracy on test data", metrics.accuracy_score(y_test, final_pred_test))

train_matrix = confusion_matrix(y_train, final_pred_train)
test_matrix = confusion_matrix(y_test, final_pred_test)

print()
print("\t\t\tno cancer\t cancer")
print("class-wise accuracy for train", train_matrix.diagonal()/train_matrix.sum(axis=1))
print("class-wise accuracy for test", test_matrix.diagonal()/test_matrix.sum(axis=1))

print()

precision_train = precision_score(y_train, final_pred_train)
recall_train = recall_score(y_train, final_pred_train)
f1_train = f1_score(y_train, final_pred_train)

precision_test = precision_score(y_test, final_pred_test)
recall_test = recall_score(y_test, final_pred_test)
f1_test = f1_score(y_test, final_pred_test)

print()
print("precision train and test", precision_train, precision_test)
print("recall train and test", recall_train, recall_test)
print("f1 train and test", f1_train, f1_test)

print()



Accuracy on train data 0.8104183900039184
Accuracy on test data 0.8080157229575082

			no cancer	 cancer
class-wise accuracy for train [0.80234084 0.94529444]
class-wise accuracy for test [0.80126304 0.92075721]


precision train and test 0.2226456672499838 0.2172197123124059
recall train and test 0.9452944413869014 0.9207572088928021
f1 train and test 0.3604049729843152 0.3515126050420168



In [None]:
#Experimenting with PCA

In [None]:
for depth in range(5, 40, 2):
    print("max-depth", depth)
    clf = DecisionTreeClassifier(max_depth=depth, random_state = 42, class_weight = "balanced")
    pca = PCA(n_components = 100)
    x_train_pca = pca.fit_transform(x_train)
    x_test_pca = pca.fit_transform(x_test)
    clf.fit(x_train_pca, y_train)

    train_pred = clf.predict(x_train_pca)
    test_pred = clf.predict(x_test_pca)

    print("Accuracy on train data", metrics.accuracy_score(y_train, train_pred))
    print("Accuracy on test data", metrics.accuracy_score(y_test, test_pred))

    train_matrix = confusion_matrix(y_train, train_pred)
    test_matrix = confusion_matrix(y_test, test_pred)

    print()
    print("\t\t\tno cancer\tcancer")
    print("class-wise accuracy for train", train_matrix.diagonal()/train_matrix.sum(axis=1))
    print("class-wise accuracy for test", test_matrix.diagonal()/test_matrix.sum(axis=1))

    precision_train = precision_score(y_train, train_pred)
    recall_train = recall_score(y_train, train_pred)
    f1_train = f1_score(y_train, train_pred)

    precision_test = precision_score(y_test, test_pred)
    recall_test = recall_score(y_test, test_pred)
    f1_test = f1_score(y_test, test_pred)

    print()
    print("precision train and test", precision_train, precision_test)
    print("recall train and test", recall_train, recall_test)
    print("f1 train and test", f1_train, f1_test)

max-depth 5
Accuracy on train data 0.5814202994097635
Accuracy on test data 0.5498308289382028

			no cancer	cancer
class-wise accuracy for train [0.56661591 0.8286186 ]
class-wise accuracy for test [0.5353795 0.7911072]

precision train and test 0.10274117494523792 0.0925453843182696
recall train and test 0.8286186020913594 0.7911071978868589
f1 train and test 0.18281495692507577 0.16570611830882012
max-depth 7
Accuracy on train data 0.672658179036341
Accuracy on test data 0.6568190864762663

			no cancer	cancer
class-wise accuracy for train [0.66294216 0.83489268]
class-wise accuracy for test [0.65091168 0.75544794]

precision train and test 0.129181143129641 0.11474423269809428
recall train and test 0.8348926802421573 0.7554479418886199
f1 train and test 0.22374301263993157 0.19922793370678893
max-depth 9
Accuracy on train data 0.683299851352444
Accuracy on test data 0.6384714897004677

			no cancer	cancer
class-wise accuracy for train [0.67069441 0.89378096]
class-wise accuracy for

In [None]:
# exp2 - cancer prediction with mlp

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

x_train_small = pd.concat([x_train[y_train == 0].sample(2*18170), x_train[y_train == 1].sample(18170)])
y_train_small = [0]*2*18170 + [1]*18170

print(x_train_small.shape, len(y_train_small))
print(x_train_small[y_train == 1].shape)
print(x_test[y_test == 1].shape)

pca = PCA(n_components= 40)

mlp_clf = MLPClassifier(hidden_layer_sizes=(7,), random_state=42, max_iter=40)
x_train_pca = pca.fit_transform(x_train_small)
x_test_pca = pca.fit_transform(x_test)

mlp_clf.fit(sc.fit_transform(x_train_pca), y_train_small)

train_pred = mlp_clf.predict(sc.fit_transform(x_train_pca))
test_pred = mlp_clf.predict(sc.fit_transform(x_test_pca))

print("Accuracy on train data", metrics.accuracy_score(y_train_small, train_pred))
print("Accuracy on test data", metrics.accuracy_score(y_test, test_pred))

train_matrix = confusion_matrix(y_train_small, train_pred)
test_matrix = confusion_matrix(y_test, test_pred)

print()
print("\t\t\tno cancer\tcancer")
print("class-wise accuracy for train", train_matrix.diagonal()/train_matrix.sum(axis=1))
print("class-wise accuracy for test", test_matrix.diagonal()/test_matrix.sum(axis=1))

precision_train = precision_score(y_train_small, train_pred)
recall_train = recall_score(y_train_small, train_pred)
f1_train = f1_score(y_train_small, train_pred)

precision_test = precision_score(y_test, test_pred)
recall_test = recall_score(y_test, test_pred)
f1_test = f1_score(y_test, test_pred)

print()
print("precision train and test", precision_train, precision_test)
print("recall train and test", recall_train, recall_test)
print("f1 train and test", f1_train, f1_test)


(54510, 261) 54510
(18170, 261)
(4543, 261)


  # This is added back by InteractiveShellApp.init_path()


Accuracy on train data 0.7491469455145845
Accuracy on test data 0.7769927355955817

			no cancer	cancer
class-wise accuracy for train [0.830765   0.58591084]
class-wise accuracy for test [0.80786826 0.26150121]

precision train and test 0.633841390807335 0.07537592792335512
recall train and test 0.5859108420473308 0.26150121065375304
f1 train and test 0.6089343934107417 0.11702127659574468


In [None]:
# exp3 - cancer prediction using ensemble - random forest

for depth in range(10, 35, 2):
    print("max-depth", depth)
    rf_clf = RandomForestClassifier(max_depth=depth, random_state = 42, class_weight = "balanced")
    pca = PCA(n_components= 100)
    
    x_train_pca = pca.fit_transform(x_train)
    x_test_pca = pca.fit_transform(x_test)
    
    rf_clf.fit(x_train_pca, y_train)


    train_pred = rf_clf.predict(x_train_pca)
    test_pred = rf_clf.predict(x_test_pca)

    print("Accuracy on train data", metrics.accuracy_score(y_train, train_pred))
    print("Accuracy on test data", metrics.accuracy_score(y_test, test_pred))

    train_matrix = confusion_matrix(y_train, train_pred)
    test_matrix = confusion_matrix(y_test, test_pred)

    print()
    print("\t\t\tno cancer\tcancer")
    print("class-wise accuracy for train", train_matrix.diagonal()/train_matrix.sum(axis=1))
    print("class-wise accuracy for test", test_matrix.diagonal()/test_matrix.sum(axis=1))

    precision_train = precision_score(y_train, train_pred)
    recall_train = recall_score(y_train, train_pred)
    f1_train = f1_score(y_train, train_pred)

    precision_test = precision_score(y_test, test_pred)
    recall_test = recall_score(y_test, test_pred)
    f1_test = f1_score(y_test, test_pred)

    print()
    print("precision train and test", precision_train, precision_test)
    print("recall train and test", recall_train, recall_test)
    print("f1 train and test", f1_train, f1_test)

max-depth 10
Accuracy on train data 0.799754949217268
Accuracy on test data 0.8141108568016718

			no cancer	cancer
class-wise accuracy for train [0.79380084 0.89917446]
class-wise accuracy for test [0.82537674 0.62601805]

precision train and test 0.20707749245861745 0.17676673503636026
recall train and test 0.8991744634012108 0.6260180497468633
f1 train and test 0.3366299913462727 0.2756882512601784
max-depth 12
Accuracy on train data 0.840172779460515
Accuracy on test data 0.8576724052144492

			no cancer	cancer
class-wise accuracy for train [0.83457923 0.93357182]
class-wise accuracy for test [0.87650463 0.54325336]

precision train and test 0.25260978987654686 0.20853400929446556
recall train and test 0.9335718216840947 0.5432533568126788
f1 train and test 0.3976277821403874 0.3013798998656735
max-depth 14
Accuracy on train data 0.8816790332311252
Accuracy on test data 0.896407602746542

			no cancer	cancer
class-wise accuracy for train [0.87674854 0.9640066 ]
class-wise accuracy 

In [None]:
clf = DecisionTreeClassifier(max_depth=10, random_state = 42, class_weight = "balanced")
pca = PCA(n_components = 100)
x_train_pca = pca.fit_transform(x_train)
x_test_pca = pca.fit_transform(x_test)
clf.fit(x_train_pca, y_train)
dtree_train_pred = clf.predict_proba(x_train_pca)
dtree_test_pred = clf.predict_proba(x_test_pca)

In [None]:
pca = PCA(n_components= 40)

mlp_clf = MLPClassifier(hidden_layer_sizes=(7,), random_state=42, max_iter=40)
x_train_small_pca = pca.fit_transform(x_train_small)
x_test_pca = pca.transform(x_test)
x_train_pca = pca.transform(x_train)

mlp_clf.fit(sc.fit_transform(x_train_small_pca), y_train_small)

mlp_train_pred = mlp_clf.predict_proba(sc.fit_transform(x_train_pca))
mlp_test_pred = mlp_clf.predict_proba(sc.fit_transform(x_test_pca))



In [None]:
rf_clf = RandomForestClassifier(max_depth=10, random_state = 42, class_weight = "balanced")
pca = PCA(n_components= 100)

x_train_pca = pca.fit_transform(x_train)
x_test_pca = pca.fit_transform(x_test)

rf_clf.fit(x_train_pca, y_train)

rf_train_pred = rf_clf.predict_proba(x_train_pca)
rf_test_pred = rf_clf.predict_proba(x_test_pca)


In [None]:
final_pred_train = []
final_pred_test = []

for i in range(len(dtree_train_pred)):
    val = (dtree_train_pred[i][0] + mlp_train_pred[i][0] + rf_train_pred[i][0])/3
    if val <= 0.5:
        final_pred_train.append(1)
    else:
        final_pred_train.append(0)

for i in range(len(dtree_test_pred)):
    val = (dtree_test_pred[i][0] + mlp_test_pred[i][0] + rf_test_pred[i][0])/3
    if val <= 0.5:
        final_pred_test.append(1)
    else:
        final_pred_test.append(0)

print("Accuracy on train data", metrics.accuracy_score(y_train, final_pred_train))
print("Accuracy on test data", metrics.accuracy_score(y_test, final_pred_test))

train_matrix = confusion_matrix(y_train, final_pred_train)
test_matrix = confusion_matrix(y_test, final_pred_test)

print()
print("\t\t\tno cancer\t cancer")
print("class-wise accuracy for train", train_matrix.diagonal()/train_matrix.sum(axis=1))
print("class-wise accuracy for test", test_matrix.diagonal()/test_matrix.sum(axis=1))

print()

precision_train = precision_score(y_train, final_pred_train)
recall_train = recall_score(y_train, final_pred_train)
f1_train = f1_score(y_train, final_pred_train)

precision_test = precision_score(y_test, final_pred_test)
recall_test = recall_score(y_test, final_pred_test)
f1_test = f1_score(y_test, final_pred_test)

print()
print("precision train and test", precision_train, precision_test)
print("recall train and test", recall_train, recall_test)
print("f1 train and test", f1_train, f1_test)

print()

Accuracy on train data 0.7842184808095384
Accuracy on test data 0.7903025176634491

			no cancer	 cancer
class-wise accuracy for train [0.77831283 0.88282884]
class-wise accuracy for test [0.79438094 0.72220999]


precision train and test 0.1925690276110444 0.17380939767971607
recall train and test 0.8828288387451844 0.7222099933964341
f1 train and test 0.3161722676653198 0.2801878736122972

