In [1]:
import os

if not os.path.exists("heart-attack-prediction.csv"):
    !rm -rf sample_data
    from google.colab import files
    files.upload()
    !xz -d *.xz
    print("Uploaded files:")
    !ls
else:
    print("Already have files:")
    !ls
# Prepare dataframe
import pandas as pd
import numpy as np

df = (pd.read_csv("heart-attack-prediction.csv", na_values="?")
#         .replace("?", np.nan) # change ? to nan
        
        .drop(["ca", "thal", "slope"], axis=1)) # drop columns w/ too many nulls

df["restecg"] = df["restecg"].astype(pd.Int64Dtype())

# Encode feature cp using one-hot encoding
df = pd.concat([df, pd.get_dummies(df["cp"], prefix="cp", drop_first=True)], axis=1) # Don't drop first if using SVM
df = pd.concat([df, pd.get_dummies(df["restecg"], prefix="restecg", drop_first=True)], axis=1)


# Reorder and drop columns
df = df[["age", "sex", "cp_2", "cp_3", "cp_4", "trestbps", "chol", "fbs", "restecg_1", "restecg_2", "thalach", "exang", "oldpeak", "num"]]

df.info()
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="mean")

imputer.fit(df)

df = pd.DataFrame(imputer.transform(df), columns = df.columns)

df.head()
from sklearn.model_selection import train_test_split

x = df.drop("num", axis=1)
y = df["num"]


x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

print(len(x_train), 'train examples')
print(len(x_test), 'test examples')
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
models = {
    "Logistic Regression": LogisticRegression(random_state = 0),
    "Decision Tree": DecisionTreeClassifier(max_depth=5),
    "Random Forest": RandomForestClassifier(n_estimators=10, max_depth=5),
    "XGBoost": XGBClassifier(),
    "SVM (linear)": SVC(kernel="linear", gamma=0.00001, C=100),
    "SVM (polynomial)": SVC(kernel="poly", degree=2, gamma=0.001, C=10),
    "SVM (rbf)": SVC(kernel="rbf", gamma=0.001, C=10),
    "Naive Bayes": GaussianNB(),
    "KNN (k=1)": KNeighborsClassifier(n_neighbors=1, weights="distance"),
    "KNN (k=3)": KNeighborsClassifier(n_neighbors=3, weights="distance"),
    "KNN (k=5)": KNeighborsClassifier(n_neighbors=5, weights="distance"),
}
results = pd.DataFrame(columns=["Model", "True Positive", "False Negative", "False Positive", "True Positive", "Precision", "Recall", "Accuracy", "F1", "F2"])
from sklearn.metrics import confusion_matrix
from tqdm import tqdm
for name, model in tqdm(models.items()):
    
    model.fit(x_train, y_train)
    
    y_predict = model.predict(x_test)
    
    cm = confusion_matrix(y_predict, y_test)
    
    tn, fn, fp, tp = cm.ravel()
    
    acc = (tn+tp)/(tn+fn+fp+tp)
    prec = tp/(tp+fp)
    rec = tp/(tp+fn)
    f1 = 2*prec*rec/(prec+rec)
    f2 = 5*prec*rec/(4*prec+rec)
    
#    print("%20s\t[[%2d, %2d],[%2d, %2d]]\tacc: %6.2f%% prec: %6.2f%% rec: %6.2f%% f1: %3.2f f2: %3.2f"
#          %(name, tn, fn, fp, tp, 100*acc, 100*prec, 100*rec, f1, f2))
    
    results = results.append({
        "Model": name,
        "True Positive": tp,
        "False Negative": fn,
        "False Positive": fp,
        "True Positive": tp,
        "Precision": prec,
        "Recall": rec,
        "Accuracy": acc,
        "F1": f1,
        "F2": f2
    }, ignore_index=True)
results.sort_values("Recall", ascending=False, inplace=True)
results

Saving heart-attack-prediction.csv.xz to heart-attack-prediction.csv.xz
Uploaded files:
heart-attack-prediction.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        294 non-null    int64  
 1   sex        294 non-null    int64  
 2   cp_2       294 non-null    uint8  
 3   cp_3       294 non-null    uint8  
 4   cp_4       294 non-null    uint8  
 5   trestbps   293 non-null    float64
 6   chol       271 non-null    float64
 7   fbs        286 non-null    float64
 8   restecg_1  294 non-null    uint8  
 9   restecg_2  294 non-null    uint8  
 10  thalach    293 non-null    float64
 11  exang      293 non-null    float64
 12  oldpeak    294 non-null    float64
 13  num        294 non-null    int64  
dtypes: float64(6), int64(3), uint8(5)
memory usage: 22.2 KB
235 train examples
59 test examples


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
100%|██████████| 11/11 [00:25<00:00,  2.32s/it]


Unnamed: 0,Model,True Positive,False Negative,False Positive,True Positive.1,Precision,Recall,Accuracy,F1,F2
0,Logistic Regression,17,1,4,17,0.809524,0.944444,0.915254,0.871795,0.913978
3,XGBoost,17,1,6,17,0.73913,0.944444,0.881356,0.829268,0.894737
7,Naive Bayes,17,1,9,17,0.653846,0.944444,0.830508,0.772727,0.867347
5,SVM (polynomial),16,2,6,16,0.727273,0.888889,0.864407,0.8,0.851064
4,SVM (linear),15,3,5,15,0.75,0.833333,0.864407,0.789474,0.815217
1,Decision Tree,14,4,8,14,0.636364,0.777778,0.79661,0.7,0.744681
2,Random Forest,14,4,6,14,0.7,0.777778,0.830508,0.736842,0.76087
6,SVM (rbf),7,11,16,7,0.304348,0.388889,0.542373,0.341463,0.368421
8,KNN (k=1),7,11,18,7,0.28,0.388889,0.508475,0.325581,0.360825
9,KNN (k=3),7,11,14,7,0.333333,0.388889,0.576271,0.358974,0.376344
