In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.preprocessing as preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, plot_roc_curve, plot_precision_recall_curve, f1_score
from mlxtend.plotting import plot_learning_curves

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session



In [None]:
# Load the heart-failure clinical records dataset into a DataFrame.
df = pd.read_csv(r'/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv')
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted an extra "None" line.
df.info()

In [None]:
# Summary statistics of every numeric column. Left as the cell's last
# expression so the notebook renders it as a table instead of the wrapped
# plain-text dump that print() produces.
df.describe()

In [None]:
# Preview the first rows. Left as the cell's last expression so the notebook
# renders it as a table instead of the wrapped plain-text dump from print().
df.head()

In [None]:
# Pairwise correlation matrix over all columns of the dataset.
corr = df.corr()
# Show only the entries whose absolute correlation with DEATH_EVENT exceeds 0.1.
target_corr = corr['DEATH_EVENT']
print(target_corr[target_corr.abs() > 0.1])
# Heatmap of the full correlation matrix, drawn on the axes created here.
fig, ax = plt.subplots()
sns.heatmap(corr, ax=ax)
plt.show()

In [None]:
# Box plot of age for each DEATH_EVENT class.
fig, ax2 = plt.subplots()
sns.boxplot(x='DEATH_EVENT', y='age', data=df, ax=ax2)
plt.show()

In [None]:
# Bar plot of the mean DEATH_EVENT value for each anaemia group.
fig, ax3 = plt.subplots()
sns.barplot(x='anaemia', y='DEATH_EVENT', data=df, ax=ax3)
plt.show()

In [None]:
# Box plot of creatinine_phosphokinase for each DEATH_EVENT class.
fig, ax4 = plt.subplots()
sns.boxplot(x='DEATH_EVENT', y='creatinine_phosphokinase', data=df, ax=ax4)
plt.show()

In [None]:
# Bar plot of the mean DEATH_EVENT value for each diabetes group.
fig, ax5 = plt.subplots()
sns.barplot(x='diabetes', y='DEATH_EVENT', data=df, ax=ax5)
plt.show()

In [None]:
# Box plot of ejection_fraction for each DEATH_EVENT class.
fig, ax6 = plt.subplots()
sns.boxplot(x='DEATH_EVENT', y='ejection_fraction', data=df, ax=ax6)
plt.show()

In [None]:
# Bar plot of the mean DEATH_EVENT value for each high_blood_pressure group.
fig, ax7 = plt.subplots()
sns.barplot(x='high_blood_pressure', y='DEATH_EVENT', data=df, ax=ax7)
plt.show()

In [None]:
# Violin plot of platelets for each DEATH_EVENT class.
fig, ax8 = plt.subplots()
sns.violinplot(x='DEATH_EVENT', y='platelets', data=df, ax=ax8)
plt.show()

In [None]:
# Box plot of serum_creatinine for each DEATH_EVENT class.
fig, ax9 = plt.subplots()
sns.boxplot(x='DEATH_EVENT', y='serum_creatinine', data=df, ax=ax9)
plt.show()

In [None]:
# Box plot of serum_sodium for each DEATH_EVENT class.
fig, ax10 = plt.subplots()
sns.boxplot(x='DEATH_EVENT', y='serum_sodium', data=df, ax=ax10)
plt.show()

In [None]:
# Bar plot of the mean DEATH_EVENT value for each sex group.
fig, ax11 = plt.subplots()
sns.barplot(x='sex', y='DEATH_EVENT', data=df, ax=ax11)
plt.show()

In [None]:
# Bar plot of the mean DEATH_EVENT value for each smoking group.
fig, ax12 = plt.subplots()
sns.barplot(x='smoking', y='DEATH_EVENT', data=df, ax=ax12)
plt.show()

In [None]:
# Box plot of time for each DEATH_EVENT class.
fig, ax13 = plt.subplots()
sns.boxplot(x='DEATH_EVENT', y='time', data=df, ax=ax13)
plt.show()

In [None]:
# Features kept for modelling, plus the DEATH_EVENT target column.
selected_columns = ['age', 'anaemia', 'ejection_fraction', 'high_blood_pressure',
                    'serum_creatinine', 'serum_sodium', 'time', 'DEATH_EVENT']
# One-hot encode the two categorical flags. get_dummies(columns=...) removes the
# source columns and appends the `<column>_<value>` indicator columns after the
# remaining ones, which yields the same frame as the manual
# get_dummies / concat / drop(inplace=True) sequence without mutating in place.
df1 = pd.get_dummies(df[selected_columns], columns=['anaemia', 'high_blood_pressure'])
# info() prints its report itself and returns None; wrapping it in print()
# emitted an extra "None" line.
df1.info()

In [None]:
# Standardize the continuous features (zero mean, unit variance per column).
#
# All columns are scaled in a single fit so that `scaler` stays usable for every
# one of them; refitting the same scaler column by column left it fitted on
# `time` only. serum_creatinine is included as well: it was the only continuous
# feature left on its raw scale, which makes the L2 penalty act on it
# differently and makes its coefficient incomparable with the others.
#
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set statistics leak into the features. Consider fitting it on
# the training split only (e.g. via a sklearn Pipeline).
continuous_columns = ['age', 'ejection_fraction', 'serum_creatinine', 'serum_sodium', 'time']
scaler = preprocessing.StandardScaler()
df1[continuous_columns] = scaler.fit_transform(df1[continuous_columns])
#print(df1.head())

In [None]:
# Separate the feature matrix from the DEATH_EVENT target.
X = df1.drop(['DEATH_EVENT'], axis=1)
y = df1['DEATH_EVENT']
# Hold out 20% of the rows for evaluation. A fixed random_state makes the split
# -- and every metric and plot derived from it -- reproducible across runs;
# stratify=y keeps the DEATH_EVENT class ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)
# NOTE(review): C=0.1 / penalty='l2' presumably come from an earlier
# GridSearchCV run over C in [0.01, 0.1, 1, 10, 100] and penalty in
# ['l1', 'l2'] that used to sit here commented out -- re-run it to confirm.
lr = LogisticRegression(n_jobs=-1, C=0.1, penalty='l2')
lr.fit(X_train, y_train)
# Predicted class labels for the held-out rows.
pre_death_rate = lr.predict(X_test)
print('acc:', accuracy_score(y_test, pre_death_rate))
print('f1_score', f1_score(y_test, pre_death_rate))

In [None]:
# Confusion matrix of true vs. predicted labels on the test split,
# rendered as an annotated heatmap on the axes created here.
cm = confusion_matrix(y_test, pre_death_rate)
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, ax=ax)
plt.show()

In [None]:
# ROC curve of the fitted model evaluated on the test split.
roc_display = plot_roc_curve(lr, X_test, y_test)
plt.show()

In [None]:
# Pair every feature name with its fitted logistic-regression coefficient.
# lr.coef_ is 2-D (one row for this binary target); ravel() flattens it so the
# 'coef' column holds plain floats instead of one-element arrays.
cof = pd.DataFrame({
    'columns': list(X_train.columns),
    'coef': lr.coef_.ravel(),
})

# sort_values() returns a new frame rather than sorting in place; the result
# was previously discarded, so the table was printed unsorted.
cof = cof.sort_values(by='coef', ascending=False)
print(cof)