# Import 

In [None]:
import pandas as pd
import pymc3 as pm
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

# Load Dataset

In [None]:
# Read the heart-failure records. index_col=False tells pandas not to use the
# first CSV column as the row index.
df = pd.read_csv(
    '../src/data/heart-failure.csv',
    index_col=False,
    sep=',',
)

# Function

In [None]:
def df_train(number_register, data=None):
    """Return an independent copy of the first ``number_register`` rows.

    Args:
        number_register: Number of leading rows to keep (forwarded to
            ``DataFrame.head``).
        data: Frame to slice. When omitted, the notebook-level ``df`` is used,
            which keeps existing ``df_train(n)`` calls working unchanged.

    Returns:
        A new DataFrame holding the selected rows; modifying it does not
        affect the source frame.
    """
    source = df if data is None else data
    # Slice first, then copy: only the kept rows are duplicated.
    return source.head(number_register).copy()

def df_test(number_register, data=None):
    """Return an independent copy of the last ``number_register`` rows.

    Args:
        number_register: Number of trailing rows to keep (forwarded to
            ``DataFrame.tail``).
        data: Frame to slice. When omitted, the notebook-level ``df`` is used,
            which keeps existing ``df_test(n)`` calls working unchanged.

    Returns:
        A new DataFrame holding the selected rows; modifying it does not
        affect the source frame.
    """
    source = df if data is None else data
    # Slice first, then copy: only the kept rows are duplicated.
    return source.tail(number_register).copy()

# Clean Data

In [None]:
# Number of missing values in each column.
df_null = df.isna().sum()

In [None]:
# Rename the outcome column to "target".
df = df.rename({'DEATH_EVENT': 'target'}, axis='columns')

In [None]:
# Change data type: store age and platelets as integers (any fractional part
# is truncated by the cast).
df = df.astype({'age': int, 'platelets': int})

# Analyze real data

In [None]:
# Display summary statistics for the columns of the cleaned real data.
df.describe()

In [None]:
# Pearson correlation between every pair of columns.
matrix = df.corr(method='pearson')
n_ticks = len(matrix.columns)
plt.figure(figsize=(20, 9))
sns.heatmap(matrix, annot=True, cmap=sns.diverging_palette(188, 143, n=143))
# seaborn draws cell i across [i, i + 1], so a label has to sit at i + 0.5 to
# line up with the cell centre; integer positions land on the cell borders.
# Labels are taken from the matrix itself so they always match its columns.
tick_positions = np.arange(n_ticks) + 0.5
plt.xticks(tick_positions, matrix.columns, rotation='vertical')
plt.yticks(tick_positions, matrix.columns)
plt.title('Matriz de correlacion')
plt.show()

In [None]:
# Minimum and maximum of each modelled feature and of the target.
df_describe_m = df.describe().loc[
    ['min', 'max'],
    ['age', 'ejection_fraction', 'serum_creatinine', 'anaemia', 'diabetes',
     'high_blood_pressure', 'sex', 'smoking', 'target'],
]
df_describe_m

# Split train - test data

In [None]:
# The first 275 rows form the training set. The slice is taken from df
# directly rather than through the df_train() helper: this assignment rebinds
# the name df_train from the helper function to a DataFrame, so a cell that
# calls the helper fails with "'DataFrame' object is not callable" as soon as
# it is run a second time.
df_train = df.head(275).copy()

In [None]:
# Hold out the last rows for testing. At most 25 rows are taken, capped at
# what is left after the 275 training rows, so the two sets never share a row
# (head(275) and tail(25) overlap whenever the file has fewer than 300 rows).
# The slice is taken from df directly so the cell can be re-run even though
# the name df_test is rebound from the helper function to a DataFrame here.
df_test = df.tail(min(25, max(0, len(df) - 275))).copy()
# Keep only the columns used by the model plus the target.
df_test = df_test.drop(columns=['time', "platelets", "creatinine_phosphokinase", "serum_sodium"])

# Steps to create Generative Bayesian model

## Split features and target

In [None]:
# Target as a one-column frame; features are what remains once the target and
# the unused columns are removed.
y_real = df_train[['target']]
x_real = df_train.drop(
    columns=['target', 'time', "platelets", "creatinine_phosphokinase", "serum_sodium"],
)

## Split binary and numeric features


In [None]:
# The five 0/1 feature columns; the shape is shown as a quick check.
binary_data = x_real.loc[:, ['anaemia', 'diabetes', 'high_blood_pressure', 'sex', 'smoking']]
binary_data.shape

In [None]:
# Age as a one-column frame.
numeric_age = x_real[['age']]
numeric_age.shape

In [None]:
# Ejection fraction as a one-column frame.
numeric_eyection = x_real[['ejection_fraction']]
numeric_eyection.shape

In [None]:
# Serum creatinine as a one-column frame.
numeric_creatinine = x_real[['serum_creatinine']]
numeric_creatinine.shape

## Describe DF


In [None]:
# Summary statistics restricted to the eight modelled features.
df_real = df.describe().loc[
    :,
    ['age', 'ejection_fraction', 'serum_creatinine', 'anaemia', 'diabetes',
     'high_blood_pressure', 'sex', 'smoking'],
]
df_real

# Generative model

In [None]:
# Build the model used to generate synthetic records. x_binary is declared
# without data; the three numeric features and the target are tied to the real
# training frames through ``observed=``.
generative_model = pm.Model()
with generative_model:
    # One Bernoulli(p=0.5) variable per cell of the binary feature table.
    x_binary = pm.Bernoulli('x_binary',p=0.5,shape=binary_data.shape)
    # Normal centred on the mean of the observed ages, with a fixed scale of 12.
    x_age = pm.Normal('x_age',mu=numeric_age.mean(),sd=12,shape=numeric_age.shape, observed=numeric_age)
    # Half-normal with a fixed scale of 20.
    x_ejection = pm.HalfNormal('x_ejection',sd=20,shape=numeric_eyection.shape, observed=numeric_eyection)
    # Normal centred on the observed mean, scale 2, truncated to [0.5, 10].
    # Note the scale keyword is spelled ``sigma`` here and ``sd`` above.
    x_creatinine = pm.TruncatedNormal('x_creatinine',mu=numeric_creatinine.mean(),sigma=2,lower=0.5,upper=10,shape=numeric_creatinine.shape,observed=numeric_creatinine)

    # NOTE(review): the probabilities below are computed from the real data
    # frames, not from the random variables declared above, so y_target does
    # not depend on x_binary / x_age / x_ejection / x_creatinine. Confirm this
    # is intended.
    p_binary = binary_data
    # Inverse-logit maps each numeric column into the interval (0, 1).
    p_age = pm.invlogit(numeric_age)
    p_ejection = pm.invlogit(numeric_eyection)
    p_creatinine = pm.invlogit(numeric_creatinine)

    # Place the per-feature blocks side by side in one probability matrix.
    p = pm.math.concatenate([p_binary, p_age,p_ejection,p_creatinine], axis=1)

    # NOTE(review): p has one column per feature while y_real has a single
    # column, so the likelihood relies on broadcasting; verify the shape of
    # the values sampled for y_target.
    y_target = pm.Bernoulli('y_target',p=p, observed=y_real)

In [None]:
# No posterior has been sampled for this model, and sample_posterior_predictive
# needs a real trace as its first argument; a bare string is not one. The
# records are therefore drawn straight from the model as declared.
# sample_prior_predictive returns a dict that maps each requested variable
# name to an array whose first axis indexes the 10 draws.
with generative_model:
    trace = pm.sample_prior_predictive(
        samples=10,
        var_names=["x_binary", "x_age", "x_ejection", "x_creatinine", "y_target"],
    )

In [None]:
# Generated values for the 'x_binary' variable requested in the sampling call.
binary_generative = trace['x_binary']
# Uncomment to inspect the raw array:
#binary_generative

In [None]:
# Generated values for the 'x_age' variable requested in the sampling call.
age_generative = trace['x_age']
# Uncomment to inspect the raw array:
#age_generative

In [None]:
# Generated values for the 'x_ejection' variable requested in the sampling call.
ejection_generative = trace['x_ejection']
# Uncomment to inspect the raw array:
#ejection_generative

In [None]:
# Generated values for the 'x_creatinine' variable requested in the sampling call.
creatinine_generative = trace['x_creatinine']
# Uncomment to inspect the raw array:
#creatinine_generative

In [None]:
# Generated values for the 'y_target' variable requested in the sampling call.
target_generative = trace['y_target']
# Display the array shape to see how many values were produced per draw.
target_generative.shape

In [None]:
# Collapse every leading axis so each generated record becomes one row, then
# label the columns with the binary feature names.
binary_generative = pd.DataFrame(
    binary_generative.reshape(-1, binary_generative.shape[-1]),
    columns=['anaemia', 'diabetes', 'high_blood_pressure', 'sex', 'smoking'],
)

In [None]:
# Collapse every leading axis so each generated record becomes one row of an
# 'age' frame.
age_generative = pd.DataFrame(
    age_generative.reshape(-1, age_generative.shape[-1]),
    columns=['age'],
)

In [None]:
# Collapse every leading axis so each generated record becomes one row of an
# 'ejection_fraction' frame.
ejection_generative = pd.DataFrame(
    ejection_generative.reshape(-1, ejection_generative.shape[-1]),
    columns=['ejection_fraction'],
)

In [None]:
# Collapse every leading axis so each generated record becomes one row of a
# 'serum_creatinine' frame.
creatinine_generative = pd.DataFrame(
    creatinine_generative.reshape(-1, creatinine_generative.shape[-1]),
    columns=['serum_creatinine'],
)

In [None]:
# Stack every generated target value into a single 'target' column.
# NOTE(review): if y_target was drawn with more than one value per record,
# this column is longer than the feature frames and its rows no longer line
# up with them; check target_generative.shape before relying on it.
target_generative = pd.DataFrame(
    target_generative.reshape(-1, 1),
    columns=['target'],
)

In [None]:
# Put the generated feature and target frames side by side, aligned on the row
# index.
df_generative = pd.concat(
    [age_generative, creatinine_generative, ejection_generative,
     binary_generative, target_generative],
    axis='columns',
)
# Rows where any part has no value are dropped, which also trims the result to
# the length of the shortest part.
df_generative = df_generative.dropna()
df_generative

# Analyze generated data

In [None]:
# Display summary statistics of the generated records.
df_generative.describe()

In [None]:
# Summary statistics of the real data for the modelled features and target,
# kept for comparison with the generated records.
df_describe = df.describe().loc[
    :,
    ['age', 'ejection_fraction', 'serum_creatinine', 'anaemia', 'diabetes',
     'high_blood_pressure', 'sex', 'smoking', 'target'],
]

# Concat

In [None]:
# Real training frame: features followed by the target column.
df_real = pd.concat((x_real, y_real), axis='columns')

In [None]:
# Put the generated columns in the same order as the real training frame.
df_generative = df_generative[['age', 'anaemia', 'diabetes', 'ejection_fraction',
                               'high_blood_pressure', 'serum_creatinine', 'sex',
                               'smoking', 'target']]
# Cast only the whole-number columns to int. serum_creatinine is a continuous
# measurement that stays float in the real data and is modelled with a lower
# bound of 0.5; truncating it would turn every draw below 1 into 0.
df_generative = df_generative.astype(
    {column: int for column in df_generative.columns if column != 'serum_creatinine'}
)

In [None]:
# Reserve the last 50 generated rows for the test set.
df_test_generative = df_generative.iloc[-50:]

In [None]:
# Keep the first 2700 generated rows for training.
df_generative = df_generative.iloc[:2700]

In [None]:
# Training set: the real training rows followed by the generated rows.
# NOTE(review): both parts keep their own row labels, so index values repeat.
df = pd.concat((df_real, df_generative), axis='index')
# To export: df.to_csv('train_heart_failure.csv', index=False)
# (do not assign the result; to_csv returns None when given a path).

In [None]:
# Test set: the held-out real rows followed by the reserved generated rows.
# NOTE(review): both parts keep their own row labels, so index values repeat.
df_test = pd.concat((df_test, df_test_generative), axis='index')
# To export: df_test.to_csv('test_heart_failure.csv', index=False)
# (do not assign the result; to_csv returns None when given a path).