In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory, one per line.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the heart-failure clinical records; the cells below all read from `df`.
DATA_CSV = "/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv"
df = pd.read_csv(DATA_CSV)

### What columns are available in the data?

In [None]:
# Display the column index so the available variable names are visible before exploring them.
df.columns

### What does each variable look like? Some variables look pretty skewed, so we take the log of them.

In [None]:
# Tally how many records fall in each distinct anaemia level and print the summary list.
levels, counts = np.unique(df.anaemia.values, return_counts=True)
print(["anaemia = %d : %d cases" % (level, count) for level, count in zip(levels, counts)])

In [None]:
# Tally how many records fall in each distinct diabetes level and print the summary list.
levels, counts = np.unique(df.diabetes.values, return_counts=True)
print(["diabetes = %d : %d cases" % (level, count) for level, count in zip(levels, counts)])

In [None]:
# Tally how many records fall in each distinct high_blood_pressure level and print the summary list.
levels, counts = np.unique(df.high_blood_pressure.values, return_counts=True)
print(["high_blood_pressure = %d : %d cases" % (level, count) for level, count in zip(levels, counts)])

In [None]:
# Tally how many records fall in each distinct sex level and print the summary list.
levels, counts = np.unique(df.sex.values, return_counts=True)
print(["sex = %d : %d cases" % (level, count) for level, count in zip(levels, counts)])

In [None]:
# Tally how many records fall in each distinct smoking level and print the summary list.
levels, counts = np.unique(df.smoking.values, return_counts=True)
print(["smoking = %d : %d cases" % (level, count) for level, count in zip(levels, counts)])

In [None]:
# Histogram of age on the natural-log scale.
fig, ax = plt.subplots()
ax.hist(np.log(df.age.values))
ax.set_xlabel("log(Age)", fontsize=14)
plt.show()

In [None]:
# Histogram of creatinine phosphokinase on the natural-log scale.
fig, ax = plt.subplots()
ax.hist(np.log(df.creatinine_phosphokinase.values))
ax.set_xlabel("log(Creatinine Phosphokinase)", fontsize=14)
plt.show()

In [None]:
# Histogram of ejection fraction on the natural-log scale.
fig, ax = plt.subplots()
ax.hist(np.log(df.ejection_fraction.values))
ax.set_xlabel("log(Ejection Fraction)", fontsize=14)
plt.show()

In [None]:
# Histogram of platelets on the raw (untransformed) scale.
fig, ax = plt.subplots()
ax.hist(df.platelets.values)
ax.set_xlabel("Platelets", fontsize=14)
plt.show()

In [None]:
# Histogram of serum creatinine on the natural-log scale.
fig, ax = plt.subplots()
ax.hist(np.log(df.serum_creatinine.values))
ax.set_xlabel("log(Serum Creatinine)", fontsize=14)
plt.show()

In [None]:
# Histogram of serum sodium on the natural-log scale.
fig, ax = plt.subplots()
ax.hist(np.log(df.serum_sodium.values))
ax.set_xlabel("log(Serum Sodium)", fontsize=14)
plt.show()

In [None]:
# Histogram of the time column on the raw (untransformed) scale.
fig, ax = plt.subplots()
ax.hist(df.time.values)
ax.set_xlabel("Time", fontsize=14)
plt.show()

### Split the data into training and testing sets

In [None]:
# Hold out 60 randomly drawn row positions for testing; the fixed seed makes the draw repeatable.
np.random.seed(8)
all_rows = np.arange(df.shape[0])
test = np.random.choice(all_rows, size=60, replace=False)
train = np.setdiff1d(all_rows, test)

# Column layout shared by X and Xtest: six log-transformed measurements first,
# then six columns used exactly as stored.
LOG_FEATURES = ["age", "creatinine_phosphokinase", "ejection_fraction",
                "platelets", "serum_creatinine", "serum_sodium"]
RAW_FEATURES = ["diabetes", "high_blood_pressure", "sex", "anaemia", "smoking", "time"]


def _design_matrix(rows):
    """Build the feature matrix (one row per entry of `rows`, one column per feature) from `df`."""
    logged = [np.log(df[col].values[rows]) for col in LOG_FEATURES]
    plain = [df[col].values[rows] for col in RAW_FEATURES]
    return np.array(logged + plain).T


X = _design_matrix(train)
Xtest = _design_matrix(test)

# Quadratic Discriminant Analysis

In [None]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [None]:
# Fit QDA on the training rows and report the fraction of held-out rows classified correctly.
y_train = df.DEATH_EVENT.values[train]
y_test = df.DEATH_EVENT.values[test]

clf = QuadraticDiscriminantAnalysis()
mdl = clf.fit(X, y_train)
fittedvals = mdl.predict(Xtest)
acc_qda = np.mean(fittedvals == y_test)
print(acc_qda)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Fit logistic regression (up to 500 solver iterations) and score it on the held-out rows.
y_train = df.DEATH_EVENT.values[train]
y_test = df.DEATH_EVENT.values[test]

clf = LogisticRegression(max_iter=500)
clf.fit(X, y_train)
fittedvals = clf.predict(Xtest)
acc_logreg = np.mean(fittedvals == y_test)
print(acc_logreg)

# Support Vector Machine

In [None]:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

### Which kernel has the best fit?

In [None]:
# Compare SVM kernels by 5-fold cross-validated accuracy on the TRAINING rows only.
# Ranking kernels by their accuracy on the held-out test rows and then reporting the
# winner's test accuracy uses the test set for model selection, which biases the
# reported figure upward.
from sklearn.model_selection import cross_val_score

y_train = df.DEATH_EVENT.values[train]
kernels = ['linear', 'poly', 'sigmoid', 'rbf']
for kern in kernels:
    # The scaler lives inside the pipeline, so it is re-fit on each training fold.
    candidate = make_pipeline(StandardScaler(), SVC(gamma='auto', kernel=kern))
    acc = cross_val_score(candidate, X, y_train, cv=5, scoring='accuracy').mean()
    print("Kernel: %s, Accuracy: %.3f"%(kern, acc))

In [None]:
# Final SVM: standardized features with a sigmoid kernel, scored on the held-out rows.
# The kernel is hard-coded; revisit it if the kernel comparison favours another choice.
y_train = df.DEATH_EVENT.values[train]
y_test = df.DEATH_EVENT.values[test]

clf = make_pipeline(StandardScaler(), SVC(gamma='auto', kernel='sigmoid'))
clf.fit(X, y_train)
fittedvals = clf.predict(Xtest)
acc_svm = np.mean(fittedvals == y_test)
print(acc_svm)

# K Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

### Use leave-one-out cross-validation to pick optimal neighbor count

In [None]:
# Leave-one-out misclassification rate of standardized KNN for each neighbor count 2..9.
y_train = df.DEATH_EVENT.values[train]
n_train = X.shape[0]

loocv = []
for n in np.arange(2,10):
    pred = []
    for i in range(n_train):
        keep = np.delete(np.arange(n_train), i)  # every training row except row i
        clf = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=n))
        clf.fit(X[keep,:], y_train[keep])
        pred.append(clf.predict(X[i:i+1,:])[0])
    loocv.append(np.mean(np.array(pred) != y_train))

In [None]:
# Plot the leave-one-out error rate against the neighbor count (2..9) held in `loocv`.
# Axis labels are added so the figure can be read without the surrounding code.
fig, ax = plt.subplots()
ax.scatter(np.arange(2,10), loocv)
ax.set_xlabel("Number of neighbors", fontsize=14)
ax.set_ylabel("LOOCV error rate", fontsize=14)
plt.show()

In [None]:
# Final standardized KNN with 7 neighbors, scored on the held-out rows.
y_train = df.DEATH_EVENT.values[train]
y_test = df.DEATH_EVENT.values[test]

clf = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=7))
clf.fit(X, y_train)
fittedvals = clf.predict(Xtest)
acc_knn = np.mean(fittedvals == y_test)
print(acc_knn)

### What if we didn't standardize the data?

In [None]:
# Leave-one-out misclassification rate of KNN on the unscaled features, neighbor counts 2..9.
y_train = df.DEATH_EVENT.values[train]
n_train = X.shape[0]

loocv = []
for n in np.arange(2,10):
    pred = []
    for i in range(n_train):
        keep = np.delete(np.arange(n_train), i)  # every training row except row i
        clf = KNeighborsClassifier(n_neighbors=n)
        clf.fit(X[keep,:], y_train[keep])
        pred.append(clf.predict(X[i:i+1,:])[0])
    loocv.append(np.mean(np.array(pred) != y_train))

In [None]:
# Plot the leave-one-out error rate of the unstandardized KNN against the neighbor count (2..9).
# Axis labels are added so the figure can be read without the surrounding code.
fig, ax = plt.subplots()
ax.scatter(np.arange(2,10), loocv)
ax.set_xlabel("Number of neighbors (unstandardized features)", fontsize=14)
ax.set_ylabel("LOOCV error rate", fontsize=14)
plt.show()

In [None]:
# Final KNN on the unscaled features with 8 neighbors, scored on the held-out rows.
y_train = df.DEATH_EVENT.values[train]
y_test = df.DEATH_EVENT.values[test]

clf = KNeighborsClassifier(n_neighbors=8)
clf.fit(X, y_train)
fittedvals = clf.predict(Xtest)
acc_knn2 = np.mean(fittedvals == y_test)
print(acc_knn2)

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

### Use leave-one-out cross-validation to pick optimal max_depth

In [None]:
# Leave-one-out misclassification rate of a decision tree for each max_depth 1..9.
y_train = df.DEATH_EVENT.values[train]
n_train = X.shape[0]

loocv = []
for n in np.arange(1,10):
    pred = []
    for i in range(n_train):
        keep = np.delete(np.arange(n_train), i)  # every training row except row i
        clf = DecisionTreeClassifier(random_state=0, max_depth=n)
        clf.fit(X[keep,:], y_train[keep])
        pred.append(clf.predict(X[i:i+1,:])[0])
    loocv.append(np.mean(np.array(pred) != y_train))

In [None]:
# Plot the leave-one-out error rate of the decision tree against max_depth (1..9).
# Axis labels are added so the figure can be read without the surrounding code.
fig, ax = plt.subplots()
ax.scatter(np.arange(1,10), loocv)
ax.set_xlabel("max_depth", fontsize=14)
ax.set_ylabel("LOOCV error rate", fontsize=14)
plt.show()

In [None]:
# Final decision tree limited to depth 2, scored on the held-out rows.
y_train = df.DEATH_EVENT.values[train]
y_test = df.DEATH_EVENT.values[test]

clf = DecisionTreeClassifier(random_state=0, max_depth=2)
clf.fit(X, y_train)
fittedvals = clf.predict(Xtest)
acc_tree = np.mean(fittedvals == y_test)
print(acc_tree)

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with depth-3 trees and a fixed seed, scored on the held-out rows.
y_train = df.DEATH_EVENT.values[train]
y_test = df.DEATH_EVENT.values[test]

clf = RandomForestClassifier(max_depth=3, random_state=0)
clf.fit(X, y_train)
fittedvals = clf.predict(Xtest)
acc_rf = np.mean(fittedvals == y_test)
print(acc_rf)

# Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Gradient boosting with depth-3 trees and a fixed seed, scored on the held-out rows.
y_train = df.DEATH_EVENT.values[train]
y_test = df.DEATH_EVENT.values[test]

clf = GradientBoostingClassifier(max_depth=3, random_state=0)
clf.fit(X, y_train)
fittedvals = clf.predict(Xtest)
acc_boost = np.mean(fittedvals == y_test)
print(acc_boost)

# XGBoost

In [None]:
import xgboost

In [None]:
# Use XGBoost's gradient-boosted classifier. XGBRFClassifier is the library's random-forest
# estimator, which would make this entry a second random forest rather than the boosting
# model that the section title and the "XGB" label in the final comparison describe.
y_train = df.DEATH_EVENT.values[train]
y_test = df.DEATH_EVENT.values[test]

clf = xgboost.XGBClassifier(max_depth=3, random_state=0).fit(X, y_train)
fittedvals = clf.predict(Xtest)
acc_xgb = np.sum(fittedvals == y_test)/len(fittedvals)
print(acc_xgb)

# LightGBM

In [None]:
import lightgbm

In [None]:
# LightGBM with depth-3 trees and a fixed seed, scored on the held-out rows.
y_train = df.DEATH_EVENT.values[train]
y_test = df.DEATH_EVENT.values[test]

clf = lightgbm.LGBMClassifier(max_depth=3, random_state=0)
clf.fit(X, y_train)
fittedvals = clf.predict(Xtest)
acc_lgbm = np.mean(fittedvals == y_test)
print(acc_lgbm)

# CatBoost

In [None]:
from catboost import CatBoostClassifier

In [None]:
# Train CatBoost quietly: its default logging prints a line per boosting iteration, which
# floods the notebook output. The seed is pinned explicitly, in line with random_state=0
# on the other tree-based models in this notebook.
y_train = df.DEATH_EVENT.values[train]
y_test = df.DEATH_EVENT.values[test]

clf = CatBoostClassifier(random_seed=0, verbose=0).fit(X, y_train)
# Flatten defensively: if predict() returned an (n, 1) column, the comparison below
# would broadcast against y_test to an (n, n) array and corrupt the accuracy.
fittedvals = np.ravel(clf.predict(Xtest))
acc_catb = np.sum(fittedvals == y_test)/len(fittedvals)
print(acc_catb)

# And the best classifier is ...

In [None]:
# Gather every model's held-out accuracy under its display label, then print the
# models from most to least accurate.
results = {
    "Quadratic Disc. Anal.": acc_qda,
    "Log. Reg.": acc_logreg,
    "SVM": acc_svm,
    "KNN": acc_knn,
    "KNN unstandardized": acc_knn2,
    "Decision Tree": acc_tree,
    "Random Forest": acc_rf,
    "Boosting": acc_boost,
    "XGB": acc_xgb,
    "Light GBM": acc_lgbm,
    "CatBoost": acc_catb,
}
mdl = np.array(list(results.keys()))
ACC = np.array(list(results.values()))

# Negating the accuracies turns argsort's ascending order into a descending ranking.
srt = np.argsort(-ACC)
for rank in srt:
    print(mdl[rank], ACC[rank])