In [1]:
import pandas as pd

In [2]:
# Imports for analysis

import seaborn as sb
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier, AdaBoostClassifier

In [3]:
df = pd.read_excel("posculture.xlsx")
df.head()

Unnamed: 0,Gender,Age,Onset of pneumonia (after admission to hospital),Positive Culture,Blood Cultures,Sputum Cultures,MDR,History of recent admission (within 90 days),Recent diagnosis of COVID,Mechanical Ventilation,CHF,DM,HTN,CAD,CKD,BMI,Antimicrobial used,Duration of stay after pneumonia (Days),Outcomes
0,Female,65,0,Yes,enterococcus faecium Vancomycin resistant,negative,Yes,Yes,yes,No,Yes,No,No,No,Yes,26.0,Gentamycin / meropenem / linezolid,54,Death
1,Male,67,22,Yes,Staphylococci aureus,Negative,No,No,yes,Yes,No,Yes,No,No,No,20.0,Teicoplanin / levofloxacin / remdesivir,10,Death
2,Female,69,0,Yes,Acinetobacter baumanii,Negative,Yes,No,yes,Yes,No,Yes,No,No,No,26.7,Remdesivir / levofloxacin / meropenem / anidu...,17,Death
3,Female,76,0,yes,Negative,staphylococcus aureus Methicillin resistant,yes,yes,yes,yes,No,Yes,Yes,Yes,yes,44.0,Tazocin / teicoplanin / levofloxacin / vancomycin,6,death
4,Male,80,0,yes,Klebsiella pneumonia carbamanase positive,Acinetobacter baumanii MDR,yes,yes,yes,yes,yes,no,yes,no,no,24.0,Tazocin / levofloxacin / teicoplanin / meropen...,48,death


In [4]:
df.rename(columns = lambda x: x.strip(), inplace=True) # Cleanup whitespace
for x in df: print(x)

Gender
Age
Onset of pneumonia (after admission to hospital)
Positive Culture
Blood Cultures
Sputum Cultures
MDR
History of recent admission (within 90 days)
Recent diagnosis of COVID
Mechanical Ventilation
CHF
DM
HTN
CAD
CKD
BMI
Antimicrobial used
Duration of stay after pneumonia (Days)
Outcomes


In [5]:
# Rename long columns

col_dict = {
    "Onset of pneumonia (after admission to hospital)": "Onset",
    "History of recent admission (within 90 days)": "Admission",
    "Mechanical Ventilation": "Ventilation",
    "Recent diagnosis of COVID": "COVID",
    "Antimicrobial used": "Antimicrobial",
    "Duration of stay after pneumonia (Days)": "Stay Duration",
}

df.rename(columns = col_dict, inplace=True)

for x in df: print(x)

Gender
Age
Onset
Positive Culture
Blood Cultures
Sputum Cultures
MDR
Admission
COVID
Ventilation
CHF
DM
HTN
CAD
CKD
BMI
Antimicrobial
Stay Duration
Outcomes


In [6]:
# Lowercase all values
df = df[1:].applymap(lambda s: s.lower() if type(s) == str else s)

# Strip whitespace in values
df = df[1:].applymap(lambda s: s.strip() if type(s) == str else s)

In [7]:
# Check for unique values
print("Gender: ", df["Gender"].unique(), "\n")
print("Outcomes: ", df["Outcomes"].unique(), "\n")
print("Age: ", df["Age"].unique(), "\n")
print("Onset", df["Onset"].unique(), "\n")
print("Positive Culture", df["Positive Culture"].unique(), "\n")

Gender:  ['female' 'male'] 

Outcomes:  ['death' 'discharge' 'discharged'] 

Age:  [69 76 80 81 88 64 67 52 77 55 63 85 70 27 19 21 24 29 35 39 40 41 43 46
 47 48 50 51 54 56 57 58 59 60 61 62 65 66 68 73 74 75 78 79 82 83 84 86
 90 16 18 20 25 28 30 33 36 87 44 72 26] 

Onset [ 0  3  4  7 30  2  5 28 16 22  8 14 18 15 13  1 10  9  6 19 11 17] 

Positive Culture ['yes'] 



In [8]:
# Codify/adjust values
df["Gender"] = df["Gender"].map({"male": 0, "female": 1}, na_action="ignore") # Male: 0, Female: 1
df["Outcomes"] = df["Outcomes"].map({"discharge": 0, "discharged": 0, "death": 1}, na_action="ignore") # Discharge: 0, Death: 1
df.drop(["Positive Culture"], axis=1, inplace=True) # Remove positive culture, since all values are yes

# Change yes/no values to numeric
yes_no_map = {"yes": 1, "no": 0}
yes_no_cols = ("MDR", "Admission", "COVID", "Ventilation", "CHF", "DM", "HTN", "CAD", "CKD")

for col_name in yes_no_cols:
    unique_vals_in_col = df[col_name].unique()
    print(col_name + ":", unique_vals_in_col) # Should just be "yes" and "no"
    
    # If you have values that aren't yes and no, then you either have typos that should be manually cleaned
    # or you're using a column that doesn't split cleanly into yes and no
    if (set(unique_vals_in_col) == set(["yes", "no"])): df[col_name] = df[col_name].map(yes_no_map)
    else: print("No change")
        
    print(col_name + ":", df[col_name].unique(), "\n") # Should just be "yes" and "no"

MDR: ['yes' 'no']
MDR: [1 0] 

Admission: ['no' 'yes']
Admission: [0 1] 

COVID: ['yes' 'no']
COVID: [1 0] 

Ventilation: ['yes' 'no']
Ventilation: [1 0] 

CHF: ['no' 'yes']
CHF: [0 1] 

DM: ['yes' 'no']
DM: [1 0] 

HTN: ['no' 'yes']
HTN: [0 1] 

CAD: ['no' 'yes']
CAD: [0 1] 

CKD: ['no' 'yes']
CKD: [0 1] 



In [9]:
df.head()

Unnamed: 0,Gender,Age,Onset,Blood Cultures,Sputum Cultures,MDR,Admission,COVID,Ventilation,CHF,DM,HTN,CAD,CKD,BMI,Antimicrobial,Stay Duration,Outcomes
2,1,69,0,acinetobacter baumanii,negative,1,0,1,1,0,1,0,0,0,26.7,remdesivir / levofloxacin / meropenem / anidul...,17,1
3,1,76,0,negative,staphylococcus aureus methicillin resistant,1,1,1,1,0,1,1,1,1,44.0,tazocin / teicoplanin / levofloxacin / vancomycin,6,1
4,0,80,0,klebsiella pneumonia carbamanase positive,acinetobacter baumanii mdr,1,1,1,1,1,0,1,0,0,24.0,tazocin / levofloxacin / teicoplanin / meropen...,48,1
5,0,81,0,negative,klebsiella pneumonia carbapenamase positive,1,0,1,1,0,1,1,0,1,27.7,teicoplanin / meropenem,16,1
6,0,81,0,acinetobacter,acinetobacter baumanii,1,0,1,1,0,1,0,0,0,25.7,levofloxacin / teicoplanin / meropenem,16,1


In [10]:
df = df.dropna()

In [11]:
target_df = df["MDR"]
target_df.head()

2    1
3    1
4    1
5    1
6    1
Name: MDR, dtype: int64

In [12]:
def standardClassificationAndScoring(name, classifier, cx, cy, scoring="f1"):
    scores = cross_val_score(classifier, cx, cy, cv=10, scoring=scoring)
    print(f"{name} Results: {scores}")
    print(f"{name} Mean: {scores.mean()}")

In [13]:
classifiers = {
    "GDT": DecisionTreeClassifier(criterion="gini", class_weight="balanced"),
    "EDT": DecisionTreeClassifier(criterion="entropy", class_weight="balanced"),
    "GNB": GaussianNB(),
    "BNB": BernoulliNB(),
    "KNN": KNeighborsClassifier(n_neighbors=7),
    "LSVM": SVC(kernel="linear", class_weight={1:1, 0:1}),
    "PSVM": SVC(kernel="poly", degree=10, class_weight={1:1, 0:1}),
    "RBFSVM": SVC(kernel="rbf", class_weight={1:1, 0:1}),
    "SSVM": SVC(kernel="sigmoid", class_weight={1:1, 0:1}),
    "RF": RandomForestClassifier(max_depth=3, random_state=0),
}

In [14]:
columns = ["Age", "Gender", "BMI"]
active_df = df[columns]
active_df.head()

Unnamed: 0,Age,Gender,BMI
2,69,1,26.7
3,76,1,44.0
4,80,0,24.0
5,81,0,27.7
6,81,0,25.7


In [15]:
crossX = active_df.values
crossY = target_df

In [16]:
for name, classifier in classifiers.items():
    standardClassificationAndScoring(name, classifier, crossX, crossY)

GDT Results: [0.82352941 0.77419355 0.6        0.55172414 0.46153846 0.88888889
 0.57142857 0.88235294 0.84848485 0.82352941]
GDT Mean: 0.7225670221364784
EDT Results: [0.82352941 0.8        0.64516129 0.55172414 0.55172414 0.88888889
 0.57142857 0.88235294 0.82352941 0.85714286]
EDT Mean: 0.7395481648350849
GNB Results: [0.89473684 0.89473684 0.86486486 0.89473684 0.89473684 0.89473684
 0.91891892 0.91891892 0.88888889 0.88888889]
GNB Mean: 0.8954164691006797
BNB Results: [0.89473684 0.89473684 0.89473684 0.89473684 0.89473684 0.89473684
 0.91891892 0.91891892 0.88888889 0.88888889]
BNB Mean: 0.8984036668247196
KNN Results: [0.89473684 0.89473684 0.64516129 0.6        0.76470588 0.86486486
 0.88888889 0.94444444 0.88888889 0.88888889]
KNN Mean: 0.8275316832862025
LSVM Results: [0.89473684 0.89473684 0.89473684 0.89473684 0.89473684 0.89473684
 0.91891892 0.91891892 0.88888889 0.88888889]
LSVM Mean: 0.8984036668247196
PSVM Results: [0.89473684 0.89473684 0.86486486 0.89473684 0.8947368

In [17]:
from sklearn.metrics import make_scorer
from imblearn.metrics import geometric_mean_score

gm_scorer = make_scorer(geometric_mean_score, average='binary')

for name, classifier in classifiers.items():
    standardClassificationAndScoring(name, classifier, crossX, crossY, gm_scorer)

GDT Results: [0.         0.59408853 0.         0.         0.29704426 0.48507125
 0.         0.54232614 0.46770717 0.        ]
GDT Mean: 0.23862373566450562
EDT Results: [0.45374261 0.72760688 0.         0.         0.         0.48507125
 0.         0.54232614 0.         0.        ]
EDT Mean: 0.22087468762148205
GNB Results: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
GNB Mean: 0.0
BNB Results: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
BNB Mean: 0.0
KNN Results: [0.         0.         0.         0.         0.         0.
 0.         0.57735027 0.         0.        ]
KNN Mean: 0.057735026918962574
LSVM Results: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
LSVM Mean: 0.0
PSVM Results: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
PSVM Mean: 0.0
RBFSVM Results: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
RBFSVM Mean: 0.0
SSVM Results: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
SSVM Mean: 0.0
RF Results: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
RF Mean: 0.0


In [18]:
for name, classifier in classifiers.items():
    standardClassificationAndScoring(name, classifier, crossX, crossY, "f1_macro")

GDT Results: [0.53676471 0.56891496 0.32258065 0.27586207 0.22222222 0.61111111
 0.28571429 0.65714286 0.53125    0.56709957]
GDT Mean: 0.4578662419310934
EDT Results: [0.53676471 0.69208211 0.3        0.27586207 0.27586207 0.61111111
 0.28571429 0.65714286 0.41176471 0.60784314]
EDT Mean: 0.4654147052355846
GNB Results: [0.44736842 0.44736842 0.43243243 0.44736842 0.44736842 0.44736842
 0.45945946 0.45945946 0.44444444 0.44444444]
GNB Mean: 0.44770823455033987
BNB Results: [0.44736842 0.44736842 0.44736842 0.44736842 0.44736842 0.44736842
 0.45945946 0.45945946 0.44444444 0.44444444]
BNB Mean: 0.4492018334123598
KNN Results: [0.44736842 0.44736842 0.32258065 0.3        0.38235294 0.43243243
 0.44444444 0.72222222 0.44444444 0.44444444]
KNN Mean: 0.43876584164310123
LSVM Results: [0.44736842 0.44736842 0.44736842 0.44736842 0.44736842 0.44736842
 0.45945946 0.45945946 0.44444444 0.44444444]
LSVM Mean: 0.4492018334123598
PSVM Results: [0.44736842 0.44736842 0.43243243 0.44736842 0.44736

In [19]:
from sklearn.model_selection import KFold, StratifiedKFold
from imblearn.combine import SMOTEENN, SMOTETomek
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

kf = StratifiedKFold(n_splits=10)

results = {}
for name in classifiers: results[name] = []
yt, yp = [], []

for sm in (SMOTEENN(sampling_strategy="minority"), SMOTETomek(sampling_strategy="minority")):
    for name in classifiers: results[name] = []
    print("\n", sm)
    for fold, (train_index, test_index) in enumerate(kf.split(active_df, target_df), 1):
        X_train = active_df.iloc[train_index]
        y_train = target_df.iloc[train_index]
        X_test = active_df.iloc[test_index]
        y_test = target_df.iloc[test_index]
        X_train_oversampled, y_train_oversampled = sm.fit_resample(X_train, y_train)
        for name, classifier in classifiers.items():
            classifier.fit(X_train_oversampled, y_train_oversampled)
            y_pred = classifier.predict(X_test)
            for x in y_test: yt.append(x)
            for x in y_pred: yp.append(x)
    
    # for name, res in results.items():
    #     print(f"{name}: {sum(res) / len(res)}")
    #     print(res)

    print(confusion_matrix(yt, yp))


 SMOTEENN(sampling_strategy='minority')
[[ 218  162]
 [1186  494]]

 SMOTETomek(sampling_strategy='minority')
[[ 395  365]
 [2213 1147]]


In [20]:
from sklearn.model_selection import KFold, StratifiedKFold
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

kf = StratifiedKFold(n_splits=10)

results = {}
for name in classifiers: results[name] = []
yt, yp = [], []

for sm in (SMOTE(sampling_strategy="minority"), SMOTETomek(sampling_strategy="minority")):
    for name in classifiers: results[name] = []
    print("\n", sm)
    for fold, (train_index, test_index) in enumerate(kf.split(active_df, target_df), 1):
        X_train = active_df.iloc[train_index]
        y_train = target_df.iloc[train_index]
        X_test = active_df.iloc[test_index]
        y_test = target_df.iloc[test_index]
        X_train_oversampled, y_train_oversampled = sm.fit_resample(X_train, y_train)
        for name, classifier in classifiers.items():
            classifier.fit(X_train_oversampled, y_train_oversampled)
            y_pred = classifier.predict(X_test)
            for x in y_test: yt.append(x)
            for x in y_pred: yp.append(x)
    
    # for name, res in results.items():
    #     print(f"{name}: {sum(res) / len(res)}")
    #     print(res)

    print(confusion_matrix(yt, yp))


 SMOTE(sampling_strategy='minority')
[[175 205]
 [987 693]]

 SMOTETomek(sampling_strategy='minority')
[[ 360  400]
 [2002 1358]]


In [21]:
for col in df:
    print(col)

Gender
Age
Onset
Blood Cultures
Sputum Cultures
MDR
Admission
COVID
Ventilation
CHF
DM
HTN
CAD
CKD
BMI
Antimicrobial
Stay Duration
Outcomes


In [22]:
columns = ["Gender", "Age", "Admission", "COVID", "Ventilation", "CHF", "DM", "HTN", "CAD", "CKD", "BMI"]
active_df = df[columns]
active_df.head()

Unnamed: 0,Gender,Age,Admission,COVID,Ventilation,CHF,DM,HTN,CAD,CKD,BMI
2,1,69,0,1,1,0,1,0,0,0,26.7
3,1,76,1,1,1,0,1,1,1,1,44.0
4,0,80,1,1,1,1,0,1,0,0,24.0
5,0,81,0,1,1,0,1,1,0,1,27.7
6,0,81,0,1,1,0,1,0,0,0,25.7


In [23]:
crossX = active_df.values
crossY = target_df

In [24]:
from sklearn.model_selection import KFold, StratifiedKFold
from imblearn.combine import SMOTEENN, SMOTETomek
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

kf = StratifiedKFold(n_splits=10)

results = {}
for name in classifiers: results[name] = []
yt, yp = [], []

for sm in (SMOTEENN(sampling_strategy="minority"), SMOTETomek(sampling_strategy="minority")):
    for name in classifiers: results[name] = []
    print("\n", sm)
    for fold, (train_index, test_index) in enumerate(kf.split(active_df, target_df), 1):
        X_train = active_df.iloc[train_index]
        y_train = target_df.iloc[train_index]
        X_test = active_df.iloc[test_index]
        y_test = target_df.iloc[test_index]
        X_train_oversampled, y_train_oversampled = sm.fit_resample(X_train, y_train)
        for name, classifier in classifiers.items():
            classifier.fit(X_train_oversampled, y_train_oversampled)
            y_pred = classifier.predict(X_test)
            for x in y_test: yt.append(x)
            for x in y_pred: yp.append(x)
    
    # for name, res in results.items():
    #     print(f"{name}: {sum(res) / len(res)}")
    #     print(res)

    print(confusion_matrix(yt, yp))


 SMOTEENN(sampling_strategy='minority')
[[ 224  156]
 [1080  600]]

 SMOTETomek(sampling_strategy='minority')
[[ 387  373]
 [1812 1548]]


In [25]:
from sklearn.model_selection import KFold, StratifiedKFold
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

kf = StratifiedKFold(n_splits=10)

results = {}
for name in classifiers: results[name] = []
yt, yp = [], []

for sm in (SMOTE(sampling_strategy="minority"), SMOTETomek(sampling_strategy="minority")):
    for name in classifiers: results[name] = []
    print("\n", sm)
    for fold, (train_index, test_index) in enumerate(kf.split(active_df, target_df), 1):
        X_train = active_df.iloc[train_index]
        y_train = target_df.iloc[train_index]
        X_test = active_df.iloc[test_index]
        y_test = target_df.iloc[test_index]
        X_train_oversampled, y_train_oversampled = sm.fit_resample(X_train, y_train)
        for name, classifier in classifiers.items():
            classifier.fit(X_train_oversampled, y_train_oversampled)
            y_pred = classifier.predict(X_test)
            for x in y_test: yt.append(x)
            for x in y_pred: yp.append(x)
    
    # for name, res in results.items():
    #     print(f"{name}: {sum(res) / len(res)}")
    #     print(res)

    print(confusion_matrix(yt, yp))


 SMOTE(sampling_strategy='minority')
[[174 206]
 [732 948]]

 SMOTETomek(sampling_strategy='minority')
[[ 358  402]
 [1480 1880]]


In [26]:
def testModel(model):
    kf = StratifiedKFold(n_splits=10)
    yt, yp = [], []
    for fold, (train_index, test_index) in enumerate(kf.split(active_df, target_df), 1):
        X_train = active_df.iloc[train_index]
        y_train = target_df.iloc[train_index]
        X_test = active_df.iloc[test_index]
        y_test = target_df.iloc[test_index]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        for x in y_test: yt.append(x)
        for x in y_pred: yp.append(x)
    return yt, yp

In [27]:
def flip(yt, yp):
    return yt, [1 if x == 0 else 0 for x in yp]

In [28]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

def scoreModel(yt, yp):
    print("f1:", f1_score(yt, yp))
    print("accuracy:", accuracy_score(yt, yp))
    print("roc-auc:", roc_auc_score(yt, yp))
    print("precision:", precision_score(yt, yp))
    print("recall:", recall_score(yt, yp))
    print("report:\n", classification_report(yt, yp))
    #recall of the positive class is also known as “sensitivity”
    #recall of the negative class is “specificity”.
    print("confusion:\n", confusion_matrix(yt, yp))

In [29]:
scoreModel(*flip(*testModel(SVC(kernel="sigmoid", class_weight="balanced", C=1))))

f1: 0.8132530120481928
accuracy: 0.6990291262135923
roc-auc: 0.5202067669172933
precision: 0.823170731707317
recall: 0.8035714285714286
report:
               precision    recall  f1-score   support

           0       0.21      0.24      0.23        38
           1       0.82      0.80      0.81       168

    accuracy                           0.70       206
   macro avg       0.52      0.52      0.52       206
weighted avg       0.71      0.70      0.70       206

confusion:
 [[  9  29]
 [ 33 135]]


In [30]:
scoreModel(*flip(*testModel(SVC(kernel="rbf", class_weight="balanced", C=1))))

f1: 0.8738461538461538
accuracy: 0.8009708737864077
roc-auc: 0.725250626566416
precision: 0.9044585987261147
recall: 0.8452380952380952
report:
               precision    recall  f1-score   support

           0       0.47      0.61      0.53        38
           1       0.90      0.85      0.87       168

    accuracy                           0.80       206
   macro avg       0.69      0.73      0.70       206
weighted avg       0.82      0.80      0.81       206

confusion:
 [[ 23  15]
 [ 26 142]]


In [31]:
scoreModel(*flip(*testModel(SVC(kernel="poly", degree=3, class_weight="balanced", C=1))))

f1: 0.8076923076923077
accuracy: 0.7087378640776699
roc-auc: 0.6381578947368423
precision: 0.875
recall: 0.75
report:
               precision    recall  f1-score   support

           0       0.32      0.53      0.40        38
           1       0.88      0.75      0.81       168

    accuracy                           0.71       206
   macro avg       0.60      0.64      0.60       206
weighted avg       0.77      0.71      0.73       206

confusion:
 [[ 20  18]
 [ 42 126]]


In [32]:
scoreModel(*testModel(SVC(kernel="linear", class_weight="balanced", C=.25)))

f1: 0.8264984227129337
accuracy: 0.7330097087378641
roc-auc: 0.6530388471177945
precision: 0.8791946308724832
recall: 0.7797619047619048
report:
               precision    recall  f1-score   support

           0       0.35      0.53      0.42        38
           1       0.88      0.78      0.83       168

    accuracy                           0.73       206
   macro avg       0.62      0.65      0.62       206
weighted avg       0.78      0.73      0.75       206

confusion:
 [[ 20  18]
 [ 37 131]]


In [33]:
scoreModel(*testModel(DecisionTreeClassifier(random_state=0, criterion="gini", splitter="random", class_weight="balanced")))

f1: 0.842406876790831
accuracy: 0.7330097087378641
roc-auc: 0.4901315789473685
precision: 0.8121546961325967
recall: 0.875
report:
               precision    recall  f1-score   support

           0       0.16      0.11      0.13        38
           1       0.81      0.88      0.84       168

    accuracy                           0.73       206
   macro avg       0.49      0.49      0.48       206
weighted avg       0.69      0.73      0.71       206

confusion:
 [[  4  34]
 [ 21 147]]


In [34]:
scoreModel(*testModel(DecisionTreeClassifier(random_state=0, criterion="entropy", splitter="random", class_weight="balanced")))

f1: 0.834319526627219
accuracy: 0.7281553398058253
roc-auc: 0.5380639097744361
precision: 0.8294117647058824
recall: 0.8392857142857143
report:
               precision    recall  f1-score   support

           0       0.25      0.24      0.24        38
           1       0.83      0.84      0.83       168

    accuracy                           0.73       206
   macro avg       0.54      0.54      0.54       206
weighted avg       0.72      0.73      0.73       206

confusion:
 [[  9  29]
 [ 27 141]]


In [35]:
scoreModel(*testModel(RandomForestClassifier(random_state=0, max_depth=3)))

f1: 0.8954423592493298
accuracy: 0.8106796116504854
roc-auc: 0.49702380952380953
precision: 0.8146341463414634
recall: 0.9940476190476191
report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        38
           1       0.81      0.99      0.90       168

    accuracy                           0.81       206
   macro avg       0.41      0.50      0.45       206
weighted avg       0.66      0.81      0.73       206

confusion:
 [[  0  38]
 [  1 167]]


In [36]:
scoreModel(*testModel(GaussianNB()))

f1: 0.8604651162790699
accuracy: 0.7669902912621359
roc-auc: 0.5720551378446115
precision: 0.8409090909090909
recall: 0.8809523809523809
report:
               precision    recall  f1-score   support

           0       0.33      0.26      0.29        38
           1       0.84      0.88      0.86       168

    accuracy                           0.77       206
   macro avg       0.59      0.57      0.58       206
weighted avg       0.75      0.77      0.76       206

confusion:
 [[ 10  28]
 [ 20 148]]


In [37]:
scoreModel(*testModel(BernoulliNB()))

f1: 0.8948787061994609
accuracy: 0.8106796116504854
roc-auc: 0.5072055137844611
precision: 0.8177339901477833
recall: 0.9880952380952381
report:
               precision    recall  f1-score   support

           0       0.33      0.03      0.05        38
           1       0.82      0.99      0.89       168

    accuracy                           0.81       206
   macro avg       0.58      0.51      0.47       206
weighted avg       0.73      0.81      0.74       206

confusion:
 [[  1  37]
 [  2 166]]


In [38]:
scoreModel(*testModel(KNeighborsClassifier(n_neighbors=7)))

f1: 0.8394366197183099
accuracy: 0.7233009708737864
roc-auc: 0.44345238095238093
precision: 0.7967914438502673
recall: 0.8869047619047619
report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        38
           1       0.80      0.89      0.84       168

    accuracy                           0.72       206
   macro avg       0.40      0.44      0.42       206
weighted avg       0.65      0.72      0.68       206

confusion:
 [[  0  38]
 [ 19 149]]


In [39]:
from sklearn.ensemble import AdaBoostClassifier
scoreModel(*testModel(AdaBoostClassifier(n_estimators=100)))

f1: 0.8152492668621701
accuracy: 0.6941747572815534
roc-auc: 0.4663220551378446
precision: 0.8034682080924855
recall: 0.8273809523809523
report:
               precision    recall  f1-score   support

           0       0.12      0.11      0.11        38
           1       0.80      0.83      0.82       168

    accuracy                           0.69       206
   macro avg       0.46      0.47      0.46       206
weighted avg       0.68      0.69      0.69       206

confusion:
 [[  4  34]
 [ 29 139]]


In [44]:
crossX

array([[ 1. , 69. ,  0. , ...,  0. ,  0. , 26.7],
       [ 1. , 76. ,  1. , ...,  1. ,  1. , 44. ],
       [ 0. , 80. ,  1. , ...,  0. ,  0. , 24. ],
       ...,
       [ 0. , 68. ,  1. , ...,  1. ,  1. , 35. ],
       [ 1. , 26. ,  1. , ...,  0. ,  1. , 22. ],
       [ 0. , 82. ,  0. , ...,  1. ,  0. , 44.9]])