**In this work I study the data and build a logistic regression model with scikit-learn to predict heart failure deaths**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt 
%matplotlib inline
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.feature_selection import RFE
from sklearn.metrics import roc_curve, roc_auc_score

In [None]:
# Load the heart-failure clinical records and preview the first ten rows.
df = pd.read_csv(
    filepath_or_buffer="../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv"
)
df.head(n=10)

In [None]:
# Show the dtype of every column first, then the (rows, columns) shape.
for summary in (df.dtypes, df.shape):
    print(summary)

In [None]:
# Descriptive statistics for the columns of `df` (rendered as the cell output).
df.describe()

 **Data**
 1. age: (quantitative)
 2. anaemia: Decrease of red blood cells or hemoglobin (categorical)
 3. creatinine_phosphokinase: Level of the CPK enzyme in the blood (mcg/L) (quantitative)
 4. diabetes: If the patient has diabetes (categorical)
 5. ejection_fraction: Percentage of blood leaving the heart at each contraction (quantitative)
 6. high_blood_pressure: If the patient has hypertension (categorical)
 7. platelets: Platelets in the blood (kiloplatelets/mL) (quantitative)
 8. serum_creatinine: Level of serum creatinine in the blood (mg/dL) (quantitative)
 9. serum_sodium: Level of serum sodium in the blood (mEq/L) (quantitative)
 10. sex: Woman or man (categorical)
 11. smoking: If the patient smokes or not  (categorical)
 12. time: Follow-up period (days) (quantitative)
 13. DEATH_EVENT: Whether the patient died during the follow-up period (categorical). This is the value to predict.
 
The dataset has 13 columns and 299 patient records, with no missing values.

**Quantitative analysis**

In [None]:
# Pairwise scatter plots / histograms restricted to the quantitative columns.
pairplot_columns = [
    "age",
    "creatinine_phosphokinase",
    "ejection_fraction",
    "platelets",
    "serum_creatinine",
    "serum_sodium",
    "time",
]
sns.pairplot(df, vars=pairplot_columns)

**Categorical analysis**

In [None]:
# Build a frame with only the categorical columns by removing every
# quantitative column from `df` in a single call.
columns_to_remove = [
    "age",
    "creatinine_phosphokinase",
    "ejection_fraction",
    "platelets",
    "serum_creatinine",
    "serum_sodium",
    "time",
]
categorical_data = df.drop(columns=columns_to_remove)
categorical_data.head(10)

In [None]:
# Apply the "bmh" matplotlib style, then draw a count plot of the `sex` column.
plt.style.use("bmh")
sex_grid = sns.catplot(data=categorical_data, y="sex", kind="count")
sex_grid.set(title="Count of women and men");

In [None]:
# Count plot of the target column to show how the two outcomes are distributed.
death_grid = sns.catplot(data=categorical_data, y="DEATH_EVENT", kind="count")
death_grid.set(title="Deceased cases");

#### The imbalance between positive and negative cases is a problem for prediction, so we undersample the "0" (non-deceased) cases

In [None]:
# Rename the target column to `y`, then split the rows by outcome class.
df = df.rename(columns={'DEATH_EVENT': 'y'})
outcome = df['y']
df_0 = df.loc[outcome == 0]
df_1 = df.loc[outcome == 1]
for tag, subset in (("0 shape", df_0), ("1 shape", df_1)):
    print(tag, subset.shape)

In [None]:
# Undersample the y == 0 rows so both classes end up with the same number of
# rows. The sample size is derived from the data instead of being hard-coded,
# so the cell stays correct if the input file changes; `min` guards against
# requesting more rows than df_0 holds.
n_per_class = min(len(df_0), len(df_1))
df_0_reduced = df_0.sample(n=n_per_class, random_state=100)

# Stack the two classes and shuffle the rows (frac=1 returns every row in a
# random order); the fixed seed keeps the result reproducible.
df_reduced = pd.concat([df_0_reduced, df_1], axis=0)
df_reduced = df_reduced.sample(frac=1, random_state=100)
df_reduced.head()

In [None]:
# (rows, columns) of the undersampled frame.
df_reduced.shape

In [None]:
# Count plot of the target after undersampling.
# `sns.catplot` is a figure-level function that always creates its own figure,
# so a preceding `plt.figure(...)` only produces an extra empty figure. The
# 5x5-inch size is therefore requested through `height` / `aspect` instead.
sns.catplot(
    y="y", data=df_reduced, kind="count", height=5, aspect=1
).set(title="Deceased cases");

In [None]:
# Heatmap of the correlation of every column with the target `y`
# (a single-column slice of the full correlation matrix).
fig, ax = plt.subplots(figsize=(5, 10))
target_corr = df_reduced.corr()[["y"]]
sns.heatmap(target_corr, annot=True, linewidth=0.5, cmap="RdBu", ax=ax);

**Convert data to dummies**

In [None]:
# Replace the 0/1 codes of every binary column with string labels so that the
# columns can be one-hot encoded afterwards.
binary_labels = {0: "cero", 1: "uno"}
binary_columns = ["anaemia", "diabetes", "high_blood_pressure", "sex", "smoking", "y"]
for column in binary_columns:
    df_reduced[column] = df_reduced[column].replace(binary_labels)
df_reduced.head()

In [None]:
# One-hot encode the string-labelled columns.
#
# The `np.object` alias was deprecated in NumPy 1.20 and removed in 1.24, so
# `df_reduced.dtypes == np.object` raises AttributeError on current NumPy.
# Selecting every non-numeric column avoids the alias and also works when
# pandas stores the labels with a dedicated string dtype instead of `object`.
columns_to_dummy = df_reduced.select_dtypes(exclude="number").columns

# Passing `columns` explicitly guarantees the encoded columns and the prefix
# list always line up. `dtype=np.uint8` keeps the indicators as 0/1 integers
# regardless of the pandas version (newer releases default to bool).
df_reduced_w_dummy = pd.get_dummies(
    df_reduced,
    columns=list(columns_to_dummy),
    prefix=list(columns_to_dummy),
    dtype=np.uint8,
)
df_reduced_w_dummy.head()

In [None]:
# Each binary column produced two complementary dummies; keep only one per
# original column by dropping its counterpart.
redundant_dummies = [
    "anaemia_uno",
    "diabetes_uno",
    "high_blood_pressure_uno",
    "sex_uno",
    "smoking_uno",
    "y_cero",
]
df_reduced_w_dummy = df_reduced_w_dummy.drop(columns=redundant_dummies)
df_reduced_w_dummy.head()

In [None]:
# Continue with the encoded frame under the name `df_reduced` and plot the
# correlation of every column with the encoded target `y_uno`.
df_reduced = df_reduced_w_dummy
fig, ax = plt.subplots(figsize=(5, 10))
encoded_target_corr = df_reduced.corr()[["y_uno"]]
sns.heatmap(encoded_target_corr, annot=True, linewidth=0.5, cmap="RdBu", ax=ax);

#### In general, the features do not correlate strongly with the variable to predict. The variables most correlated with deceased cases are:
1. age
2. ejection_fraction
3. serum_creatinine
4. serum_sodium
5. time

In [None]:
# Remove the features that are not used by the model, in a single call.
unused_features = [
    "creatinine_phosphokinase",
    "platelets",
    "anaemia_cero",
    "diabetes_cero",
    "high_blood_pressure_cero",
    "sex_cero",
    "smoking_cero",
]
df_reduced = df_reduced.drop(columns=unused_features)
df_reduced.head()

In [None]:
# Separate the features from the target and hold out 30% of the rows for
# testing; the fixed seed makes the split reproducible.
X = df_reduced.drop(columns="y_uno")
y = df_reduced["y_uno"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=100,
)

In [None]:
# Fit a logistic regression on the training split, store the class predictions
# for both splits, and report the accuracy (`score`) on each of them.
clf = LogisticRegression(solver="liblinear")
clf.fit(X_train, y_train)
y_train_hat = clf.predict(X_train)
y_test_hat = clf.predict(X_test)
for heading, split_X, split_y in (
    ("Traning", X_train, y_train),
    ("Test", X_test, y_test),
):
    print(heading)
    print(clf.score(split_X, split_y))

In [None]:
# ROC curve of the classifier on the test split.
plt.figure(figsize = (10, 6))
# Diagonal reference line: the expected curve of a random classifier.
plt.plot([0,1], [0,1], 'r--')

# Probability of the positive class (second column of predict_proba).
probs = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, probs)
# The AUC must be computed from the same continuous scores as the curve.
# Using the hard 0/1 predictions (y_test_hat) collapses the ranking to a
# single threshold and reports a value that does not match the plotted curve.
roc_auc = roc_auc_score(y_test, probs)
label = 'AUC:' + ' {0:.2f}'.format(roc_auc)
plt.plot(fpr, tpr, color="#514EBD", label = label, linewidth = 4)
plt.xlabel('FPR', fontsize = 16)
plt.ylabel('TPR', fontsize = 16)
plt.title('ROC', fontsize = 16)
plt.legend();
plt.savefig("ROC cardio.jpeg", bbox_inches='tight')

In [None]:
# Draw a reproducible sample of 90 rows for the CAP analysis and check how
# many of them belong to each class.
df_cap = df_reduced.sample(n=90, random_state=20)
cap_X = df_cap.drop(columns="y_uno")
df_cap["y_uno"].value_counts()

In [None]:
# Cumulative Accuracy Profile (CAP): positives captured when people are
# reviewed in order of decreasing predicted probability.
# NOTE(review): the sample is drawn from `df_reduced`, which also contains the
# rows used to train `clf`, so this curve is not a pure hold-out evaluation.
df_cap = df_reduced.sample(90,random_state=20)
cap_X = df_cap.drop("y_uno", axis=1)

# Derive the sizes from the sample instead of hard-coding 90 / 44 / 43, so the
# reference lines stay correct if the sample or the seed changes.
n_total = len(df_cap)
n_positive = int(df_cap["y_uno"].sum())
# Index of the last person a perfect model would need to review.
cutoff = max(n_positive - 1, 0)

plt.figure(figsize = (10, 6))
# Random model: positives are found at a constant rate.
plt.plot([0, n_total], [0, n_positive], 'r--', label="Random model")
# Perfect model: every reviewed person is a positive until all are found.
perfect_model = np.minimum(np.arange(1, n_total + 1), n_positive)
plt.plot(perfect_model, label="Perfect model");

# Actual model: rank by predicted probability and accumulate the true labels.
df_cap["probs"] = clf.predict_proba(cap_X)[:, 1]
df_cap = df_cap.sort_values(by="probs",ascending=False)
probs = np.cumsum(df_cap.y_uno).values
plt.plot(probs, color="#098B4A", label="Logistic regression", linewidth = 4);

# Mark how many positives the model captured at the perfect-model cutoff.
plt.axvline(x=cutoff, color="#100F3E", linestyle='--',alpha=0.4)
plt.axhline(y=probs[cutoff], color="#100F3E", linestyle='--',alpha=0.4);
plt.xlabel('Total of people', fontsize = 16)
plt.ylabel('Total of heart failure cases', fontsize = 16)
plt.title('CAP', fontsize = 16)
plt.legend();
print (probs[cutoff]);