In [68]:
import pandas as pd
import pymc3 as pm
import numpy as np
import matplotlib.pyplot as plt


# Load Dataset

In [88]:
# Read the heart-failure records from the CSV stored next to the notebook.
# index_col=False keeps pandas from using the first column as the row index.
df = pd.read_csv(
    './heart-failure.csv',
    sep=',',
    index_col=False,
)
df

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.00,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.00,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.00,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.00,2.7,116,0,0,8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,62.0,0,61,1,38,1,155000.00,1.1,143,1,1,270,0
295,55.0,0,1820,0,38,0,270000.00,1.2,139,0,0,271,0
296,45.0,0,2060,1,60,0,742000.00,0.8,138,0,0,278,0
297,45.0,0,2413,0,38,0,140000.00,1.4,140,1,1,280,0


# Function

In [89]:
def df_train(number_register, data=None):
    """Return an independent copy of the first ``number_register`` rows.

    Parameters
    ----------
    number_register : int
        Number of leading rows to keep; forwarded unchanged to
        ``DataFrame.head``.
    data : pandas.DataFrame, optional
        Frame to slice. When omitted, the notebook-level ``df`` is used, so
        existing ``df_train(n)`` calls behave as before.

    Returns
    -------
    pandas.DataFrame
        A copy of the selected rows; editing it does not modify the source
        frame.
    """
    source = df if data is None else data
    # Slice first and copy second: only the kept rows are duplicated, and no
    # local variable reuses the function's own name.
    return source.head(number_register).copy()

def df_test(number_register, data=None):
    """Return an independent copy of the last ``number_register`` rows.

    Parameters
    ----------
    number_register : int
        Number of trailing rows to keep; forwarded unchanged to
        ``DataFrame.tail``.
    data : pandas.DataFrame, optional
        Frame to slice. When omitted, the notebook-level ``df`` is used, so
        existing ``df_test(n)`` calls behave as before.

    Returns
    -------
    pandas.DataFrame
        A copy of the selected rows (original index labels preserved);
        editing it does not modify the source frame.
    """
    source = df if data is None else data
    # Slice first and copy second: only the kept rows are duplicated, and no
    # local variable reuses the function's own name.
    return source.tail(number_register).copy()

# Clean Data

In [90]:
# Count the missing values in each column.
df_null = df.isna().sum()
df_null

age                         0
anaemia                     0
creatinine_phosphokinase    0
diabetes                    0
ejection_fraction           0
high_blood_pressure         0
platelets                   0
serum_creatinine            0
serum_sodium                0
sex                         0
smoking                     0
time                        0
DEATH_EVENT                 0
dtype: int64

In [91]:
# Give the outcome column a generic name: DEATH_EVENT -> target.
df = df.rename({'DEATH_EVENT': 'target'}, axis='columns')

# Split train - test data

In [92]:
# Training split: the first 275 rows of the renamed frame, as an independent copy.
# The split is built directly rather than via `df_train(275)`: this cell binds the
# name `df_train` to a DataFrame, so calling the helper of the same name would
# raise TypeError ('DataFrame' object is not callable) when the cell is re-run.
df_train = df.head(275).copy()
df_train

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,target
0,75.0,0,582,0,20,1,265000.00,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.00,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.00,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.00,2.7,116,0,0,8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
270,44.0,0,582,1,30,1,263358.03,1.6,130,1,1,244,0
271,51.0,0,582,1,40,0,221000.00,0.9,134,0,0,244,0
272,67.0,0,213,0,38,0,215000.00,1.2,133,0,0,245,0
273,42.0,0,64,0,40,0,189000.00,0.7,140,1,0,245,0


In [93]:
#df_test = df_test(25)
#df_test

# Creating Generative Bayesian model

In [94]:
# Separate the training frame into predictors (x_real) and outcome (y_real).
y_real = df_train['target']
x_real = df_train.drop(columns='target')

In [95]:
# Keep the dimensions of the predictor matrix and the target vector for inspection.
x_shape, y_shape = x_real.shape, y_real.shape

# Notas
- sd o sigma: desviación estándar

In [108]:
# Size every random variable from the training data instead of hard-coding
# (275, 12), so the model stays valid if the split size or the predictor set changes.
n_rows, n_features = x_real.shape

with pm.Model() as generative_model:
    # Latent predictor values: one Normal per cell, centred on that column's mean
    # (the length-n_features mean vector broadcasts across rows), unit std. dev.
    x = pm.Normal('x', mu=x_real.mean().values, sd=1.0, shape=(n_rows, n_features))
    # Synthetic predictors: a second unit-sd Normal centred on the latent values.
    x_sintetico = pm.Normal('x_sintetico', mu=x, sd=1.0, shape=(n_rows, n_features))

    # Same two-level construction for the target, centred on its overall mean.
    y = pm.Normal('y', mu=y_real.mean(), sd=1.0, shape=n_rows)
    y_sintetico = pm.Normal('y_sintetico', mu=y, sd=1.0, shape=n_rows)

    # NOTE(review): no variable is given `observed=`, so the data enters the model
    # only through the column means used as `mu`. sd=1.0 is also applied to every
    # column regardless of its scale -- confirm both are intended.

In [101]:
with generative_model:
    # Draw 1000 samples after 1000 tuning steps.
    # random_seed makes the draws reproducible across kernel restarts.
    # return_inferencedata=False pins the MultiTrace return type, which the
    # dict-style lookups `samplers['x_sintetico']` / `samplers['y_sintetico']`
    # in the following cell rely on.
    samplers = pm.sample(
        1000,
        tune=1000,
        random_seed=42,
        return_inferencedata=False,
    )

  return wrapped_(*args_, **kwargs_)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [y_sintetico, y, x_sintetico, x]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 25 seconds.
The acceptance probability does not match the target. It is 0.7173003749623665, but should be close to 0.8. Try to increase the number of tuning steps.
The acceptance probability does not match the target. It is 0.7056572757819586, but should be close to 0.8. Try to increase the number of tuning steps.


In [102]:
# Pull the draws of both synthetic variables out of the trace by name.
x_sintetico_generado, y_sintetico_generado = (
    samplers[var_name] for var_name in ('x_sintetico', 'y_sintetico')
)

In [103]:
# Display the array of x_sintetico draws taken from the trace.
x_sintetico_generado

array([[[ 6.14981655e+01,  1.06936352e+00,  5.64932158e+02, ...,
          3.76864245e+00,  3.80939665e+00,  1.20504764e+02],
        [ 6.09067493e+01,  2.15290877e+00,  5.68175442e+02, ...,
          2.10352405e+00, -2.42784148e+00,  1.18291714e+02],
        [ 6.02445254e+01, -2.20564686e-01,  5.68399579e+02, ...,
          2.23957226e-01, -3.95405740e-01,  1.20290324e+02],
        ...,
        [ 6.16185256e+01,  7.86573286e-01,  5.66192111e+02, ...,
         -1.23254340e+00, -9.34937060e-01,  1.19899179e+02],
        [ 6.45021760e+01,  2.13698515e+00,  5.70059213e+02, ...,
          2.62800039e+00,  5.47419906e-01,  1.18472747e+02],
        [ 6.19763499e+01,  1.33148588e+00,  5.67552621e+02, ...,
          8.07772126e-02,  4.91455291e-01,  1.20191131e+02]],

       [[ 5.94570945e+01, -8.25970643e-01,  5.66906991e+02, ...,
          9.70502165e-01, -1.35863740e+00,  1.19865003e+02],
        [ 6.12425324e+01, -1.87351470e-01,  5.67394377e+02, ...,
          2.46153512e+00,  2.72252068e

In [104]:
# Collapse every leading axis so that each row holds one synthetic record and the
# last axis (one entry per predictor) becomes the columns.
x_sintetico_generado = x_sintetico_generado.reshape(-1, x_sintetico_generado.shape[-1])
# Take the column names from x_real instead of repeating them by hand, so the
# labels cannot drift from the predictors the model was built on.
sintetico = pd.DataFrame(x_sintetico_generado, columns=list(x_real.columns))
sintetico

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time
0,61.498166,1.069364,564.932158,1.570609,37.695363,1.547605,261120.890271,1.128949,137.194589,3.768642,3.809397,120.504764
1,60.906749,2.152909,568.175442,3.133641,36.561359,3.215022,261122.130988,0.328436,139.046945,2.103524,-2.427841,118.291714
2,60.244525,-0.220565,568.399579,0.501693,37.252744,-0.076214,261123.689105,1.585256,135.665389,0.223957,-0.395406,120.290324
3,58.925194,-0.039421,564.674496,0.646949,39.570928,2.862661,261121.154744,1.808066,137.255407,1.271579,-1.554904,119.317748
4,58.888526,-0.101393,567.417735,1.056221,37.229776,0.274308,261120.741468,-0.284489,135.360360,-0.081226,0.817079,119.753538
...,...,...,...,...,...,...,...,...,...,...,...,...
1099995,61.080945,1.717000,566.062842,-2.182913,38.247934,0.794536,261120.991880,1.978788,137.062971,0.945638,0.523586,119.155208
1099996,59.648416,0.846187,567.632397,1.427344,36.049323,1.596557,261122.410082,2.630710,137.044860,-2.611659,-0.425698,122.142637
1099997,60.858138,0.265794,568.266441,1.989097,35.913845,0.036895,261122.009038,0.176036,136.169358,2.805134,1.090715,119.549259
1099998,60.239405,1.461212,569.605143,1.006256,37.305898,-0.348422,261121.280056,0.375568,136.510623,-0.598237,-2.036163,118.960072


In [105]:
# Display the array of y_sintetico draws taken from the trace.
y_sintetico_generado

array([[ 0.04140244,  0.97678277,  1.08966528, ...,  0.32393579,
        -0.95027853,  2.7724573 ],
       [-0.63082137, -1.6759386 , -1.42997282, ...,  0.49932358,
         1.09202894,  1.15981966],
       [-0.37688669, -1.73261907,  3.83468872, ...,  0.76951639,
        -0.95522383,  1.34315317],
       ...,
       [ 1.49231617, -0.00399713,  1.10803142, ..., -0.69324051,
        -0.65016347,  0.89684097],
       [-0.30893356,  0.19957957, -2.03342297, ...,  0.69103948,
        -0.23053151, -0.38664374],
       [ 1.31135249,  1.36238451,  0.42872312, ...,  0.38341332,
         0.25698962, -0.06874735]])

In [112]:
# Flatten the target draws into a single 1-D vector and expose them as a
# one-column frame named 'target'.
y_sintetico_generado = y_sintetico_generado.flatten()
sintetic = pd.Series(y_sintetico_generado, name='target').to_frame()
sintetic

Unnamed: 0,target
0,0.041402
1,0.976783
2,1.089665
3,-0.658060
4,0.574391
...,...
1099995,0.382164
1099996,2.197475
1099997,0.383413
1099998,0.256990
