In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn.preprocessing as preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,  f1_score

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk /kaggle/input and print the full path of every file found.
for folder, _, files in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, entry) for entry in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the heart-failure clinical records CSV into the frame used by every later cell.
data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
)
# Preview the first five rows (the default for head()).
data.head(5)

In [None]:
# Structural overview of the frame; info() prints its report itself rather than
# returning a value, so it shows even though it is not the last line of the cell.
data.info()
# Last expression of the cell, so this summary-statistics table is what gets rendered.
data.describe()

In [None]:
# Histogram of patient age with a kernel-density overlay.
# `sns.distplot` is deprecated in seaborn; `histplot` with kde=True and
# stat='density' is the documented replacement for a density-normalised
# histogram plus KDE curve.
ax = sns.histplot(data=data, x='age', kde=True, stat='density')
ax.set(title='Age distribution', xlabel='age', ylabel='density');

In [None]:
# How many rows fall into each value of the `smoking` flag (checked before balancing).
data.loc[:, 'smoking'].value_counts()

In [None]:
# Build a subset with equally many smokers and non-smokers.
#
# The group size is derived from the data instead of a hard-coded row count,
# and rows are drawn at random with a fixed seed. Slicing the first N rows
# would make the subset depend on the file's row order, which is not
# guaranteed to be random with respect to the outcome.
smokers_all = data.loc[data['smoking'] == 1]
non_smokers_all = data.loc[data['smoking'] == 0]

# Down-sample both groups to the size of the smaller one (a no-op for that group).
n_per_group = min(len(smokers_all), len(non_smokers_all))

# sort_index() restores the original row order after sampling.
not_smoking = non_smokers_all.sample(n=n_per_group, random_state=42).sort_index()
smoking = smokers_all.sample(n=n_per_group, random_state=42).sort_index()

new_data = pd.concat([not_smoking, smoking])

In [None]:
# Confirm the class sizes of the `smoking` flag in the re-balanced subset.
new_data.loc[:, 'smoking'].value_counts()

In [None]:
# Count of each DEATH_EVENT value, with bars split by the `smoking` flag,
# drawn from the re-balanced subset.
sns.countplot(data=new_data, x='DEATH_EVENT', hue='smoking')

In [None]:
# Pairwise correlation of the columns of `data`, rendered as a heatmap on a
# 20x20-inch figure.
plt.figure(figsize=(20, 20))
sns.heatmap(data=data.corr())

In [None]:
# Mean serum creatinine per DEATH_EVENT value, as a bar chart.
#
# `sns.catplot` is a figure-level function: it always creates its own figure,
# so a preceding `plt.figure(...)` only produces an extra, empty figure and
# the requested size is never applied to the chart. The axes-level
# `sns.barplot` draws the same bar chart into the current figure, so the
# figsize below actually takes effect.
plt.figure(figsize=(30, 30))
sns.barplot(x='DEATH_EVENT', y='serum_creatinine', data=data)

In [None]:
# Serum creatinine among the patients with DEATH_EVENT == 1.
death = data.loc[data['DEATH_EVENT'] == 1]
# Kept only so that any other cell referring to this name keeps working;
# it is no longer combined with `death` (see below).
serum_creatinine = data['serum_creatinine']

# `death` already carries its own serum_creatinine column. Row-wise
# concatenating it with the full-population Series stacked one extra row per
# patient in `data` underneath the death rows, so the result was no longer
# restricted to deaths. Dropping the unwanted columns from `death` directly
# gives the intended subset.
df = death.drop(columns=['age', 'anaemia', 'creatinine_phosphokinase', 'diabetes', 'ejection_fraction',
                         'high_blood_pressure', 'platelets', 'serum_sodium', 'sex', 'smoking', 'time'])

# Only the last expression of a cell is rendered, so the summary statistics
# are the single output here (a non-final `df.head()` would be discarded).
df.describe()

In [None]:
# Distribution of serum creatinine in `df`: density-normalised histogram with
# a KDE overlay on a 12x12-inch figure.
# `sns.distplot` is deprecated in seaborn; `histplot(kde=True, stat='density')`
# is the documented replacement.
plt.figure(figsize=(12, 12))
sns.histplot(df['serum_creatinine'], kde=True, stat='density')
plt.show()

In [None]:
# Standardise four columns of `data` in place. A single StandardScaler object
# is re-fitted for each column in turn, so after this cell it holds the
# statistics of the last column ('time') only.
# NOTE(review): the scaler is fitted on every row of `data`, and the
# train/test split happens in a later cell, so test rows contribute to the
# scaling statistics. Consider fitting on the training rows only.
scaler = preprocessing.StandardScaler()

# fit_transform needs 2-D input, so each column is extracted as an (n_rows, 1) array.
age_pharma = scaler.fit_transform(data[['age']].to_numpy())
data['age'] = age_pharma

ejection_fraction_pharma = scaler.fit_transform(data[['ejection_fraction']].to_numpy())
data['ejection_fraction'] = ejection_fraction_pharma

serum_sodium_pharma = scaler.fit_transform(data[['serum_sodium']].to_numpy())
data['serum_sodium'] = serum_sodium_pharma

time_pharma = scaler.fit_transform(data[['time']].to_numpy())
data['time'] = time_pharma

In [None]:
# Separate the features from the DEATH_EVENT target and hold out 20% of the
# rows for testing.
X = data.drop(['DEATH_EVENT'], axis=1)
y = data['DEATH_EVENT']

# random_state pins the shuffle so that Restart & Run All reproduces the same
# split (and therefore the same tuned parameters and accuracy below).
# stratify=y keeps the DEATH_EVENT class proportions equal in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Tune the inverse regularisation strength C of an L2-penalised logistic
# regression with cross-validated grid search on the training split.
#
# Parallelism is requested on GridSearchCV, where the independent
# (parameter, fold) fits can run concurrently. Per the scikit-learn docs,
# LogisticRegression's own n_jobs only parallelises over classes in
# one-vs-rest mode, so it does nothing useful for a single binary target
# (DEATH_EVENT is presumably 0/1 here; verify).
lr = LogisticRegression()
grid_values = {'C': [0.01, 0.1, 1, 10, 100], 'penalty': ['l2']}
clf = GridSearchCV(lr, param_grid=grid_values, n_jobs=-1)
clf.fit(X_train, y_train)
print(clf.best_params_)

In [None]:
# Evaluate the tuned model on the held-out test split.
#
# The model comes from the grid search itself rather than from a hand-copied
# C value: a hard-coded C silently goes stale whenever the search picks a
# different value (e.g. after a different train/test split). GridSearchCV
# refits the best configuration on the whole training split by default
# (refit=True), so best_estimator_ is already fitted and ready to predict.
lr = clf.best_estimator_
pre_death_rate = lr.predict(X_test)
print('acc:', accuracy_score(y_test, pre_death_rate))
