# Importing Libraries

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix
%matplotlib inline

import os

# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

In [None]:
# Load the heart-failure clinical records into a DataFrame.
DATA_PATH = '../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows.
df.head(n=5)

In [None]:
# Summarise the frame: column names, non-null counts and dtypes.
df.info()

In [None]:
# Descriptive statistics, transposed so that each column of df becomes a row.
summary_stats = df.describe()
summary_stats.T

In [None]:
# Heatmap of the null mask: a cell is highlighted wherever a value is missing.
missing_mask = df.isnull()
sns.heatmap(missing_mask, cmap='viridis', cbar=False, yticklabels=False)

# Exploratory Data Analysis

In [None]:
# Pairwise correlation matrix of all columns, drawn on an explicit 8x6 axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(df.corr(), ax=ax, cmap='viridis', vmin=-1)

In [None]:

# Correlate every column with the target once (the original computed the full
# correlation matrix twice) and keep the entries whose absolute correlation
# exceeds 0.1. DEATH_EVENT itself is also listed, since its self-correlation is 1.
target_corr = df.corr()['DEATH_EVENT']
target_corr[target_corr.abs() > 0.1]
# The features 'age', 'ejection_fraction', 'serum_creatinine', 'serum_sodium', and 'time' have a considerable correlation with 'DEATH_EVENT'

In [None]:
# Number of rows per DEATH_EVENT value, split by sex.
sns.countplot(data=df, x='DEATH_EVENT', hue='sex')

In [None]:
# Show the class balance of each of these four columns.
for column in ('anaemia', 'diabetes', 'high_blood_pressure', 'smoking'):
    print(df[column].value_counts())

In [None]:
# 2x2 grid of DEATH_EVENT counts, each panel split by one categorical column.
fig, axes = plt.subplots(2, 2, figsize=(18, 10))

fig.suptitle('count plots')

# Panels are filled in row-major order (axes.flat): top-left, top-right,
# bottom-left, bottom-right. Trailing notes are the author's feature-selection remarks.
hue_columns = [
    'anaemia',              # should stay
    'smoking',              # can be removed
    'high_blood_pressure',  # should stay
    'diabetes',             # can be removed
]

# One loop instead of four copy-pasted calls; ending on a loop also keeps the
# cell from echoing the last Axes repr.
for ax, hue_column in zip(axes.flat, hue_columns):
    sns.countplot(ax=ax, x='DEATH_EVENT', hue=hue_column, data=df, palette='coolwarm')



In [None]:
# Outcome of the rows whose platelet count exceeds 600000.
high_platelets = df['platelets'] > 600000
df.loc[high_platelets, 'DEATH_EVENT']

In [None]:
# 3x2 grid of box plots: distribution of each numeric column per DEATH_EVENT value.
fig, axes = plt.subplots(3, 2, figsize=(18, 10))

fig.suptitle('box plots')

# Panels are filled in row-major order (axes.flat).
box_columns = [
    'age',
    'creatinine_phosphokinase',
    'ejection_fraction',
    'platelets',  # is this feature useful in predicting?
    'serum_creatinine',
    'serum_sodium',
]

# One loop instead of six copy-pasted calls; ending on a loop also keeps the
# cell from echoing the last Axes repr.
for ax, column in zip(axes.flat, box_columns):
    sns.boxplot(ax=ax, x='DEATH_EVENT', y=column, data=df)

In [None]:
# 3x2 grid of 30-bin histograms, one per numeric column.
fig, axes = plt.subplots(3, 2, figsize=(18, 10))

fig.suptitle('hist plots')

# (column, colour) per panel, in row-major order (axes.flat). A colour of None
# leaves seaborn's default colour in place. Trailing notes are the author's
# outlier-handling remarks.
hist_specs = [
    ('age', None),
    ('ejection_fraction', 'r'),         # drop greater than 60
    ('platelets', 'g'),                 # get rid of greater than 600000
    ('serum_creatinine', 'y'),          # i think we should keep it/ train model and then remove it and check again
    ('serum_sodium', 'k'),              # get rid of things below 125
    ('creatinine_phosphokinase', 'c'),  # drop greater than 3000
]

# One loop instead of six copy-pasted calls; ending on a loop also keeps the
# cell from echoing the last Axes repr.
for ax, (column, colour) in zip(axes.flat, hist_specs):
    sns.histplot(df[column], ax=ax, bins=30, color=colour)

In [None]:
# 30-bin histogram of the 'time' column.
sns.histplot(data=df, bins=30, x='time')

# Splitting Data & Feature Scaling

In [None]:
# Model inputs: the five columns selected as predictors; target: DEATH_EVENT.
feature_columns = [
    'ejection_fraction',
    'serum_creatinine',
    'time',
    'serum_sodium',
    'age',
]
x = df[feature_columns]
y = df['DEATH_EVENT']

In [None]:
# Standardise the selected features; fit_transform learns the scaling
# parameters from x and applies them in a single call.
scalerer = StandardScaler()
scaled_feature = scalerer.fit_transform(x)

In [None]:
# Hold out 30% of the rows for evaluation. A fixed random_state makes the split
# (and every metric derived from it) reproducible across kernel restarts, and
# stratify keeps the DEATH_EVENT class ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_feature,
    df['DEATH_EVENT'],
    test_size=0.30,
    random_state=42,
    stratify=df['DEATH_EVENT'],
)

# K-Nearest Neighbors Classifier

In [None]:
# Baseline: 1-nearest-neighbour classifier trained on the training split,
# then used to predict the held-out rows. fit() returns the fitted estimator.
knn = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)
pred = knn.predict(X_test)

In [None]:
# Confusion matrix followed by per-class precision/recall/F1 for the predictions.
knn_confusion = confusion_matrix(y_test, pred)
knn_report = classification_report(y_test, pred)
print(knn_confusion)
print(knn_report)



In [None]:
# Estimate the error rate of each candidate K (1..39) with 5-fold
# cross-validation on the training split only. Scoring the candidates against
# X_test would pick K using the same rows that are later used to report
# performance, which makes that report optimistic.
# NOTE(review): each fold trains on roughly 80% of X_train, which must still
# contain at least 39 rows for the largest K — confirm for smaller datasets.
error_rate = []

for i in range(1,40):

    knn = KNeighborsClassifier(n_neighbors=i)
    # One accuracy value per fold; the error rate is 1 - mean accuracy.
    fold_accuracy = cross_val_score(knn, X_train, y_train, cv=5, scoring='accuracy')
    error_rate.append(1 - fold_accuracy.mean())

In [None]:
# Plot the error rate recorded for each K on an explicit 10x6 axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    range(1, 40),
    error_rate,
    color='k',
    linestyle='dashed',
    marker='o',
    markersize=5,
)
ax.set_title('Error Rate vs. K Value')
ax.set_xlabel('K')
ax.set_ylabel('Error Rate')

In [None]:
# Refit the classifier with K=19 and report its results on the held-out rows.
chosen_k = 19
knn = KNeighborsClassifier(n_neighbors=chosen_k).fit(X_train, y_train)
pred = knn.predict(X_test)

# Each item is printed on its own; the '\n' entries add blank-line spacing.
report_parts = (
    'WITH K=19',
    '\n',
    confusion_matrix(y_test, pred),
    '\n',
    classification_report(y_test, pred),
)
for part in report_parts:
    print(part)

# Logistic Regression

In [None]:
# Hold out 30% of the rows for evaluating the logistic regression. A fixed
# random_state makes the split (and the reported metrics) reproducible across
# kernel restarts, and stratify keeps the DEATH_EVENT class ratio the same in
# both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_feature,
    df['DEATH_EVENT'],
    test_size=0.30,
    random_state=42,
    stratify=df['DEATH_EVENT'],
)

In [None]:
# Fit a default-configured logistic regression on the training split and
# predict the held-out rows. fit() returns the fitted estimator.
logmodel = LogisticRegression().fit(X_train, y_train)
predictions = logmodel.predict(X_test)

In [None]:
# Confusion matrix followed by per-class precision/recall/F1 for the predictions.
logreg_confusion = confusion_matrix(y_test, predictions)
logreg_report = classification_report(y_test, predictions)
print(logreg_confusion)
print(logreg_report)