In [1]:
import pandas as pd
import pymc3 as pm
import numpy as np
import matplotlib.pyplot as plt

# Load Dataset

In [2]:
# Read the heart-failure records; index_col=False stops pandas from using the
# first CSV column as the index.
df = pd.read_csv('./heart-failure.csv', index_col=False)
df

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.00,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.00,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.00,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.00,2.7,116,0,0,8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,62.0,0,61,1,38,1,155000.00,1.1,143,1,1,270,0
295,55.0,0,1820,0,38,0,270000.00,1.2,139,0,0,271,0
296,45.0,0,2060,1,60,0,742000.00,0.8,138,0,0,278,0
297,45.0,0,2413,0,38,0,140000.00,1.4,140,1,1,280,0


# Function

In [3]:
def df_train(number_register, data=None):
    """Return an independent copy of the first ``number_register`` rows.

    Args:
        number_register: Row count forwarded to ``DataFrame.head``.
        data: DataFrame to slice. Defaults to the notebook-level ``df``,
            looked up at call time.

    Returns:
        A new DataFrame; modifying it does not affect the source frame.
    """
    frame = df if data is None else data
    # Slice first, then copy: only the selected rows are duplicated.
    return frame.head(number_register).copy()

def df_test(number_register, data=None):
    """Return an independent copy of the last ``number_register`` rows.

    Args:
        number_register: Row count forwarded to ``DataFrame.tail``.
        data: DataFrame to slice. Defaults to the notebook-level ``df``,
            looked up at call time.

    Returns:
        A new DataFrame; modifying it does not affect the source frame.
    """
    frame = df if data is None else data
    # Slice first, then copy: only the selected rows are duplicated.
    return frame.tail(number_register).copy()

def min_value(column_name, data=None):
    """Print and return the smallest value found in one column.

    Args:
        column_name: Label of the column to inspect.
        data: DataFrame to read from. Defaults to the notebook-level ``df``,
            looked up at call time.

    Returns:
        The result of ``.min()`` on the selected column.
    """
    frame = df if data is None else data
    smallest = frame[column_name].min()
    print(f"El valor min {column_name} es {smallest}")
    return smallest

def max_value(column_name, data=None):
    """Print and return the largest value found in one column.

    Args:
        column_name: Label of the column to inspect.
        data: DataFrame to read from. Defaults to the notebook-level ``df``,
            looked up at call time.

    Returns:
        The result of ``.max()`` on the selected column.
    """
    frame = df if data is None else data
    largest = frame[column_name].max()
    print(f"El valor max {column_name} es {largest}")
    return largest

# Clean Data

In [4]:
# Count the missing values in each column
df_null = df.isna().sum()
df_null

age                         0
anaemia                     0
creatinine_phosphokinase    0
diabetes                    0
ejection_fraction           0
high_blood_pressure         0
platelets                   0
serum_creatinine            0
serum_sodium                0
sex                         0
smoking                     0
time                        0
DEATH_EVENT                 0
dtype: int64

In [5]:
# Give the outcome column the name the modelling cells refer to
df = df.rename({'DEATH_EVENT': 'target'}, axis='columns')

## Analyze data

### Numeric columns

In [6]:
# Range of the age column
min_value_age, max_value_age = min_value('age'), max_value('age')

El valor min age es 40.0
El valor max age es 95.0


In [7]:
# Range of the creatinine_phosphokinase column
min_value_creatine, max_value_creatine = (
    min_value('creatinine_phosphokinase'),
    max_value('creatinine_phosphokinase'),
)

El valor min creatinine_phosphokinase es 23
El valor max creatinine_phosphokinase es 7861


In [8]:
# Range of the ejection_fraction column
min_value_ejection, max_value_ejection = (
    min_value('ejection_fraction'),
    max_value('ejection_fraction'),
)

El valor min ejection_fraction es 14
El valor max ejection_fraction es 80


In [9]:
# Range of the platelets column
min_value_platelets, max_value_platelets = (
    min_value('platelets'),
    max_value('platelets'),
)

El valor min platelets es 25100.0
El valor max platelets es 850000.0


In [10]:
# Range of the serum_creatinine column
min_value_creatinine, max_value_creatinine = (
    min_value('serum_creatinine'),
    max_value('serum_creatinine'),
)

El valor min serum_creatinine es 0.5
El valor max serum_creatinine es 9.4


In [11]:
# Range of the serum_sodium column
min_value_sodium, max_value_sodium = (
    min_value('serum_sodium'),
    max_value('serum_sodium'),
)

El valor min serum_sodium es 113
El valor max serum_sodium es 148


In [12]:
# Range of the time column
min_value_time, max_value_time = min_value('time'), max_value('time')

El valor min time es 4
El valor max time es 285


### Binary columns

In [13]:
# Range of the anaemia flag
min_value_anemia, max_value_anemia = min_value('anaemia'), max_value('anaemia')

El valor min anaemia es 0
El valor max anaemia es 1


In [14]:
# Range of the diabetes flag
min_value_diabetes, max_value_diabetes = min_value('diabetes'), max_value('diabetes')

El valor min diabetes es 0
El valor max diabetes es 1


In [15]:
# Range of the high_blood_pressure flag
min_value_pressure, max_value_pressure = (
    min_value('high_blood_pressure'),
    max_value('high_blood_pressure'),
)

El valor min high_blood_pressure es 0
El valor max high_blood_pressure es 1


In [16]:
# Range of the sex flag
min_value_sex, max_value_sex = min_value('sex'), max_value('sex')

El valor min sex es 0
El valor max sex es 1


In [17]:
# Range of the smoking flag
min_value_smoking, max_value_smoking = min_value('smoking'), max_value('smoking')

El valor min smoking es 0
El valor max smoking es 1


# Split train - test data

In [18]:
# Take the first 275 rows as the training split. The slice is built directly
# from `df` rather than by calling df_train(275): this assignment rebinds the
# name `df_train` to a DataFrame, so calling it as a function would raise
# "'DataFrame' object is not callable" when the cell is run a second time.
# NOTE(review): the displayed rows look ordered by `time`, so head() is not a
# random sample of the data -- confirm that this split is intended.
df_train = df.head(275).copy()
df_train

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,target
0,75.0,0,582,0,20,1,265000.00,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.00,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.00,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.00,2.7,116,0,0,8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
270,44.0,0,582,1,30,1,263358.03,1.6,130,1,1,244,0
271,51.0,0,582,1,40,0,221000.00,0.9,134,0,0,244,0
272,67.0,0,213,0,38,0,215000.00,1.2,133,0,0,245,0
273,42.0,0,64,0,40,0,189000.00,0.7,140,1,0,245,0


In [19]:
#df_test = df_test(25)
#df_test

# Creating Generative Bayesian model

In [20]:
# Separate the target column (y) from the predictor columns (x)
y_real = df_train['target']
x_real = df_train.drop(columns='target')

In [21]:
# Record the dimensions of the predictor matrix and of the target vector
x_shape, y_shape = x_real.shape, y_real.shape

# Notas
- sd o sigma: desviación estándar

In [22]:
# Dimensions come from the training predictors, so the model follows the data
# when the split size or the set of columns changes.
n_rows, n_features = x_real.shape

# NOTE(review): no variable receives `observed=`, so sampling draws from these
# distributions without conditioning on the data. Every sd is fixed at 1.0
# regardless of the column's scale, and the 0/1 target is modelled as a
# continuous Normal. Confirm that this is the intended generative model.
with pm.Model() as generative_model:
    # Latent feature matrix centred on the per-column training means; the
    # length-n_features mean vector broadcasts across the n_rows rows.
    x = pm.Normal('x', mu=x_real.mean().values, sd=1.0, shape=(n_rows, n_features))
    x_sintetico = pm.Normal('x_sintetico', mu=x, sd=1.0, shape=(n_rows, n_features))

    # Latent target vector centred on the training mean of the target column.
    y = pm.Normal('y', mu=y_real.mean(), sd=1.0, shape=n_rows)
    y_sintetico = pm.Normal('y_sintetico', mu=y, sd=1.0, shape=n_rows)

In [23]:
# Draw 1000 samples per chain after 1000 tuning steps. The fixed seed makes
# re-runs reproducible, and return_inferencedata=False pins the MultiTrace
# return type so that name lookups such as samplers['x_sintetico'] keep working.
with generative_model:
    samplers = pm.sample(1000, tune=1000, random_seed=42,
                         return_inferencedata=False)

  return wrapped_(*args_, **kwargs_)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [y_sintetico, y, x_sintetico, x]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 43 seconds.
The acceptance probability does not match the target. It is 0.718161737591146, but should be close to 0.8. Try to increase the number of tuning steps.


In [24]:
# Pull the sampled values of both synthetic variables out of the trace
x_sintetico_generado, y_sintetico_generado = (
    samplers['x_sintetico'],
    samplers['y_sintetico'],
)

In [25]:
# Inspect the sampled feature values taken from the trace (NumPy abbreviates
# large arrays when displaying them).
x_sintetico_generado

array([[[ 6.01862660e+01, -3.72510439e-01,  5.67224441e+02, ...,
         -4.37421761e-01, -2.04495972e-01,  1.19589184e+02],
        [ 6.09307997e+01, -2.78667380e+00,  5.68632809e+02, ...,
          3.60107162e+00,  8.58622511e-01,  1.21250835e+02],
        [ 6.29129187e+01, -3.22922761e-01,  5.65744452e+02, ...,
          1.05732214e+00, -2.52904824e-03,  1.19779187e+02],
        ...,
        [ 6.18643809e+01,  1.74247160e+00,  5.67907573e+02, ...,
          2.05594467e+00,  1.65094675e-01,  1.17142897e+02],
        [ 6.10019842e+01,  1.76360247e+00,  5.68365793e+02, ...,
         -1.44845623e+00,  5.77811315e-01,  1.19406936e+02],
        [ 6.05993456e+01, -3.46843788e-01,  5.67524837e+02, ...,
         -5.41603857e-01,  3.02541829e+00,  1.21044229e+02]],

       [[ 6.43827792e+01,  7.90660487e-01,  5.65943242e+02, ...,
          1.65433995e+00,  2.44669760e+00,  1.18571645e+02],
        [ 6.27323179e+01,  2.04953908e+00,  5.64152212e+02, ...,
          3.81618776e+00,  2.62633960e

In [26]:
# Collapse every leading axis so that each sampled record becomes one row, then
# label the columns from the predictor frame instead of a hand-typed list, so
# the labels cannot drift from the columns the model was built on.
x_sintetico_generado = x_sintetico_generado.reshape(-1, x_sintetico_generado.shape[-1])
sintetico = pd.DataFrame(x_sintetico_generado, columns=x_real.columns)
sintetico

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time
0,60.186266,-0.372510,567.224441,0.173071,39.839519,-1.136983,261119.087883,0.440664,135.220729,-0.437422,-0.204496,119.589184
1,60.930800,-2.786674,568.632809,0.987488,37.198455,-0.847710,261123.177887,2.950758,137.471088,3.601072,0.858623,121.250835
2,62.912919,-0.322923,565.744452,-0.339745,35.602179,0.240370,261119.945049,-0.097689,138.308307,1.057322,-0.002529,119.779187
3,61.848312,-0.952033,568.283227,-1.896220,38.026088,2.279739,261119.908373,2.147732,138.155153,1.814994,1.179037,117.765122
4,62.633397,2.164020,566.447212,-0.728199,36.183124,3.552983,261121.741915,1.799081,137.457236,-3.026933,0.929580,116.945867
...,...,...,...,...,...,...,...,...,...,...,...,...
1099995,62.861971,0.879506,566.079993,1.847524,40.196991,-1.379546,261121.093342,1.688804,134.977789,1.042972,-1.277695,119.202653
1099996,60.540682,0.781070,568.596781,0.383379,40.373026,0.027695,261122.935621,1.514755,137.505934,-0.056636,1.769699,115.880552
1099997,57.276137,2.577004,568.502424,2.841277,36.862749,-1.391282,261121.025608,4.163943,136.504836,0.424068,0.736628,121.205702
1099998,59.543304,0.987500,570.301633,-1.104723,38.443542,0.907136,261121.761911,3.073553,137.003990,1.891172,-2.866035,120.387299


In [27]:
# Inspect the sampled target values taken from the trace (NumPy abbreviates
# large arrays when displaying them).
y_sintetico_generado

array([[-1.43431473,  0.57013016, -1.88297449, ...,  3.7113344 ,
         0.80205195, -1.53162098],
       [ 0.1463058 ,  0.86717583,  2.06923482, ..., -2.39621955,
         0.55928071,  0.51095486],
       [-1.92643755, -1.55973937,  1.61326229, ..., -0.07923502,
         0.64705781,  2.19021291],
       ...,
       [ 3.26041646,  3.16264139,  0.96100516, ...,  0.85012344,
        -0.88635232, -0.89789306],
       [-2.71983532, -2.17781292, -0.16297358, ..., -1.05363783,
         0.83221564,  0.8024035 ],
       [ 2.69893361,  2.29557481, -0.29156242, ...,  1.98380905,
         0.37816809, -0.12404794]])

In [28]:
# Flatten the sampled target array into one long vector and wrap it in a
# single-column frame named 'target'.
y_sintetico_generado = y_sintetico_generado.flatten()
sintetic = pd.DataFrame(dict(target=y_sintetico_generado))
sintetic

Unnamed: 0,target
0,-1.434315
1,0.570130
2,-1.882974
3,1.673105
4,-2.973259
...,...
1099995,2.092998
1099996,1.584668
1099997,1.983809
1099998,0.378168
