In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file available under the Kaggle input directory,
# printing one full path per line.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

This is a dataset that keeps track of several variables including blood pressure and reports whether or not the patient had passed away by their next follow-up appointment.

In [None]:
# Load the heart-failure clinical records CSV from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv')

In [None]:
# Preview the first five rows of the loaded frame.
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# Class balance of the target: how many rows carry each DEATH_EVENT value.
df['DEATH_EVENT'].value_counts()

According to the data, 203 patients were not deceased on follow-up and 96 were. Since the output has a binary outcome (deceased / not deceased), I wanted to start with logistic regression to see which variables were important to the prediction of the outcome.

In [None]:
# Feature matrix: every column except the DEATH_EVENT target.
X = df.drop(columns=['DEATH_EVENT'])
X.head()

In [None]:
# Target vector: the DEATH_EVENT column on its own.
y = df.loc[:, 'DEATH_EVENT']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
import statsmodels.api as sm

# statsmodels does NOT add an intercept automatically (unlike scikit-learn's
# LogisticRegression, used below), so prepend a constant column explicitly.
# Without it the model is forced through the origin, which distorts the
# coefficients and their p-values.
logit_model = sm.Logit(y_train, sm.add_constant(X_train)).fit()

# Coefficient table with standard errors and p-values for every predictor.
logit_model.summary()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Fit a default logistic regression on the training rows, predict the
# held-out rows, and display the test-set accuracy as the cell output.
LR = LogisticRegression()
LR.fit(X_train, y_train)
yhat = LR.predict(X_test)
LR.score(X_test, y_test)

This model had an accuracy of 0.733. Above is the summary output for the logistic regression with all of the predictors included. Based on the summary, the only predictors with a p-value below 0.05 were age, ejection_fraction, serum_creatinine, and time. 

The time variable measures the number of days between these measurements being taken and the patient's follow-up appointment. For prediction purposes, the time variable may not actually be appropriate to include in the model since it is not a measurement that can be taken on the patient.

Next, all of the predictors with a p-value greater than 0.05 and the time variable will be dropped.

In [None]:
# Keep only the three predictors retained after the p-value screen
# (the time variable is deliberately excluded).
X = df.loc[:, ['age', 'ejection_fraction', 'serum_creatinine']]

# Pairwise correlations between the retained predictors.
X.corr()

**TODO:** check the correlation matrix above for multicollinearity among the three retained predictors.

In [None]:
# Re-split using the reduced feature set; same test fraction and seed as the
# earlier split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Refit the statsmodels logit on the reduced feature set.
# - `axis=1` is not a Logit parameter (it is a leftover from DataFrame.drop),
#   so it is removed.
# - statsmodels does not add an intercept on its own, so a constant column is
#   prepended explicitly to match the intercept that scikit-learn fits.
logit_model = sm.Logit(y_train, sm.add_constant(X_train)).fit()

# Coefficient table with standard errors and p-values.
logit_model.summary()

In [None]:
# Refit the scikit-learn logistic regression on the reduced feature set and
# display its accuracy on the held-out rows.
LR = LogisticRegression()
LR.fit(X_train, y_train)
yhat = LR.predict(X_test)
LR.score(X_test, y_test)

After keeping only age, ejection_fraction, and serum_creatinine, the accuracy improved slightly from 0.733 to 0.75. Based on the coefficients, higher age, higher serum creatinine level, and lower ejection fraction are associated with a greater likelihood of death before the follow-up appointment.

In [None]:
from sklearn.metrics import confusion_matrix

# Cross-tabulate true labels against the logistic-regression predictions
# on the held-out rows.
confusion_matrix(y_true=y_test, y_pred=yhat)

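**TODO:** interpret the confusion matrix here. In scikit-learn's output, rows are the true classes and columns are the predicted classes, so for this binary target the layout is `[[TN, FP], [FN, TP]]`, where the positive class is `DEATH_EVENT = 1`.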

In [None]:
# Show the first five rows of the full dataset again.
df.head()

In [None]:
# Show the count of each DEATH_EVENT value again.
df['DEATH_EVENT'].value_counts()

In [None]:
# Rebuild features and target as plain NumPy arrays, using all predictors.
X = df.drop(columns=['DEATH_EVENT']).to_numpy()
y = df['DEATH_EVENT'].to_numpy()

In [None]:
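# TODO: fit the ML models twice -- once with all variables and once with only the
# significant predictors (age, ejection_fraction, serum_creatinine) -- and compare.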

In [None]:
from sklearn import preprocessing

# Split BEFORE scaling. Fitting the scaler on the full dataset would leak the
# test rows' mean and standard deviation into the training features, so the
# scaler is fitted on the training rows only and then applied to both parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
k = 7

# Fit a k-nearest-neighbours classifier, then report accuracy on the rows it
# was trained on and on the held-out rows.
neigh = KNeighborsClassifier(n_neighbors=k)
neigh.fit(X_train, y_train)
yhat = neigh.predict(X_test)

train_accuracy = metrics.accuracy_score(y_train, neigh.predict(X_train))
test_accuracy = metrics.accuracy_score(y_test, yhat)
print("Train set Accuracy: ", train_accuracy)
print("Test set Accuracy: ", test_accuracy)

In [None]:
# Sweep k = 1 .. k_range - 1 and record the test-set accuracy for each value.
# mean_acc[n - 1] holds the accuracy obtained with n neighbours.
# NOTE: these scores come from the test set, so the accuracy of whichever k
# is picked from them is an optimistic estimate.
k_range = 30
mean_acc = np.zeros(k_range - 1)
for n in range(1, k_range):
    # Train the model with n neighbours and predict the held-out rows.
    neigh = KNeighborsClassifier(n_neighbors=n).fit(X_train, y_train)
    yhat = neigh.predict(X_test)
    mean_acc[n - 1] = metrics.accuracy_score(y_test, yhat)

mean_acc

In [None]:
# Plot test-set accuracy against the number of neighbours.
# Only one series is drawn, so the legend carries a single entry; the former
# '+/- 3xstd' label referred to a band that is never plotted.
fig, ax = plt.subplots()
ax.plot(range(1, k_range), mean_acc, 'g', label='Accuracy ')
ax.legend()
ax.set_ylabel('Accuracy ')
ax.set_xlabel('Number of Neighbors (k)')
fig.tight_layout()
plt.show()