In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the /kaggle/input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the directory being visited with the bare file name to get the full path.
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

# Part 1: Importing and Exploring Data

In [None]:
# Read the heart-failure clinical records CSV into a DataFrame.
DATA_PATH = '../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
df.head()

In [None]:
# Show the column labels of the dataset.
df.columns

In [None]:
df.tail()

# Part 2: Visualizing The Data

Some basic visualizations of some of the variables

In [None]:
# Pairwise scatter/density grid over the selected columns, coloured by outcome.
pairplot_columns = [
    'age',
    'creatinine_phosphokinase',
    'ejection_fraction',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
    'time',
]
sns.pairplot(df, hue='DEATH_EVENT', vars=pairplot_columns)

In [None]:
# Bar chart of how many rows fall into each DEATH_EVENT class.
# The column is passed by keyword: in current seaborn the only positional
# argument of countplot is `data`, so a bare Series would not be read as `x`.
sns.countplot(x='DEATH_EVENT', data=df)

In [None]:
# Bar chart of how many rows fall into each `smoking` value.
# The column is passed by keyword: in current seaborn the only positional
# argument of countplot is `data`, so a bare Series would not be read as `x`.
sns.countplot(x='smoking', data=df)

In [None]:
# Bar chart of how many rows fall into each `high_blood_pressure` value.
# The column is passed by keyword: in current seaborn the only positional
# argument of countplot is `data`, so a bare Series would not be read as `x`.
sns.countplot(x='high_blood_pressure', data=df)

In [None]:
# Bar chart of how many rows fall into each `sex` value.
# The column is passed by keyword: in current seaborn the only positional
# argument of countplot is `data`, so a bare Series would not be read as `x`.
sns.countplot(x='sex', data=df)

In [None]:
# Bar chart of how many rows fall into each `diabetes` value.
# The column is passed by keyword: in current seaborn the only positional
# argument of countplot is `data`, so a bare Series would not be read as `x`.
sns.countplot(x='diabetes', data=df)

In [None]:
# Age has many distinct values, so draw the per-age counts on a 20x10 figure.
plt.figure(figsize=(20, 10))

# The column is passed by keyword: in current seaborn the only positional
# argument of countplot is `data`, so a bare Series would not be read as `x`.
sns.countplot(x='age', data=df)

In [None]:
# Serum sodium against serum creatinine, one point per row, coloured by outcome.
sns.scatterplot(
    data=df,
    x='serum_sodium',
    y='serum_creatinine',
    hue='DEATH_EVENT',
)

In [None]:
# Distribution of age within each `sex` group.
sns.boxplot(data=df, x='sex', y='age')

In [None]:
# Annotated heatmap of the pairwise column correlations, drawn on a 20x10 figure.
plt.figure(figsize=(20, 10))
correlations = df.corr()
sns.heatmap(correlations, annot=True)

# Part 3: Building a Model 

For this, we're going to build a model based on time, ejection fraction, serum creatinine, and age.

In [None]:
# Feature matrix: the four columns the classifier is trained on.
feature_columns = ['time', 'ejection_fraction', 'serum_creatinine', 'age']
X = df[feature_columns]


In [None]:
X.head()

In [None]:
y = df['DEATH_EVENT']

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 1)

In [None]:
X_train

In [None]:
X_test

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
svc_model = SVC()
svc_model.fit(X_train, y_train)

In [None]:
y_predict = svc_model.predict(X_test)

In [None]:
cm = confusion_matrix(y_test, y_predict)

In [None]:
sns.heatmap(cm, annot = True)

In [None]:
# Min-max normalise the training features column by column:
# subtract each column's minimum, then divide by that column's (max - min) span.
min_train = X_train.min()
range_train = X_train.max() - min_train
X_train_scaled = (X_train - min_train) / range_train

In [None]:
sns.scatterplot(x = X_train['age'], y = X_train['time'], hue = y_train)

In [None]:
# Scale the test features with the statistics computed from the TRAINING split
# (min_train / range_train) so both splits go through the same transform.
# Deriving a separate min/range from X_test would place the two splits on
# different scales and let test-set statistics influence preprocessing.
# Test values outside the training range simply land outside [0, 1].
X_test_scaled = (X_test - min_train) / range_train

In [None]:
svc_model.fit(X_train_scaled, y_train)

In [None]:
y_predict = svc_model.predict(X_test_scaled)

In [None]:
cm = confusion_matrix(y_test, y_predict)

In [None]:
sns.heatmap(cm, annot=True)

In [None]:
print(classification_report(y_test, y_predict))

In [None]:
# Hyper-parameter candidates for the grid search: every C/gamma pair with an RBF kernel.
param_grid = dict(
    C=[0.1, 1, 10, 100],
    gamma=[1, 0.1, 0.01, 0.001],
    kernel=['rbf'],
)

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 4)

In [None]:
grid.fit(X_train_scaled, y_train)

In [None]:
grid.best_params_

In [None]:
grid_predictions = grid.predict(X_test_scaled)

In [None]:
cm = confusion_matrix(y_test, grid_predictions)

In [None]:
sns.heatmap(cm, annot=True)

In [None]:
print(classification_report(y_test, grid_predictions))

# Part 4: Building A Logistic Regression Model
Building a Logistic Regression Model based on time, ejection fraction, serum creatinine, age, and high blood pressure

In [None]:
# Feature matrix for the logistic-regression model: the four columns used
# earlier plus high_blood_pressure.
X = df[['time', 'ejection_fraction', 'serum_creatinine', 'age', 'high_blood_pressure']]

# Binary target.
y = df['DEATH_EVENT']

from sklearn.linear_model import LogisticRegression
# No train/test split here: the following cell performs the split
# (test_size=0.3, random_state=101). A split in this cell would be
# overwritten before anything used it.

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [None]:
# The features are fed in unscaled, so the solver's default cap of 100
# iterations can be reached before convergence; allow more iterations so the
# fit can run to completion instead of stopping early with a ConvergenceWarning.
logmodel = LogisticRegression(max_iter=1000)

In [None]:
logmodel.fit(X_train, y_train)

In [None]:
predictions = logmodel.predict(X_test)

In [None]:
print(classification_report(y_test, predictions))

# Part 5: KNN Model


In [None]:
from sklearn.model_selection import train_test_split
# Rebuild the four-feature matrix, the target and a 75/25 split for KNN.
# This is needed because the logistic-regression section reassigned X and the
# split variables using a five-feature matrix and different split settings.
X = df[['time', 'ejection_fraction', 'serum_creatinine', 'age']]
y = df['DEATH_EVENT']
# random_state is fixed so the split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 1)

In [None]:
from sklearn.neighbors import KNeighborsClassifier


In [None]:
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train,y_train)

In [None]:
pred = knn.predict(X_test)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_test,pred))

In [None]:
print(classification_report(y_test,pred))

In [None]:
# Misclassification rate on the test split for every K from 1 to 39.
# NOTE(review): K is being compared on the test split itself, so the error
# of the chosen K is an optimistic estimate.
error_rate = []

# Will take some time
for i in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors=i).fit(X_train, y_train)
    # Fraction of test rows whose predicted label differs from the true label.
    error_rate.append((knn.predict(X_test) != y_test).mean())

In [None]:
# Error rate as a function of K, drawn through the explicit Axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    range(1, 40),
    error_rate,
    color='blue',
    linestyle='dashed',
    marker='o',
    markerfacecolor='red',
    markersize=10,
)
ax.set_title('Error Rate vs. K Value')
ax.set_xlabel('K')
ax.set_ylabel('Error Rate')

In [None]:
# Refit KNN with K=10 and report its confusion matrix and per-class metrics.
knn = KNeighborsClassifier(n_neighbors=10)
pred = knn.fit(X_train, y_train).predict(X_test)

print('WITH K=10')
# Each report is preceded by a blank-line separator, matching the original layout.
for report in (confusion_matrix(y_test, pred), classification_report(y_test, pred)):
    print('\n')
    print(report)