In [None]:
import pandas as pd

# Relative path of the heart-disease CSV that the rest of the notebook works from.
FILE_PATH = '../input/heart-disease-uci/heart.csv'

# Read the raw table into `df` and preview its first rows.
df = pd.read_csv(FILE_PATH)
df.head()

In [None]:
# Descriptive headers applied positionally: the i-th name replaces the i-th
# existing column, so `df` must have exactly 14 columns in this order.
COLUMN_NAMES = (
    'age',
    'sex',
    'chest_pain_type',
    'resting_blood_pressure',
    'cholesterol',
    'fasting_blood_sugar',
    'rest_ecg',
    'max_heart_rate_achieved',
    'exercise_induced_angina',
    'st_depression',
    'st_slope',
    'num_major_vessels',
    'thalassemia',
    'target',
)
df.columns = list(COLUMN_NAMES)

In [None]:
# Text labels for the integer-coded categorical columns, keyed by column name.
# Codes without an entry (e.g. 0 in `st_slope` and `thalassemia`) are left as
# they are, so they can still be filtered on afterwards.
CATEGORY_LABELS = {
    'sex': {
        0: 'female',
        1: 'male',
    },
    'chest_pain_type': {
        0: 'asymptomatic',
        1: 'typical angina',
        2: 'atypical angina',
        3: 'non-anginal pain',
        4: 'asymptomatic',
    },
    'fasting_blood_sugar': {
        0: 'lower than 120mg/ml',
        1: 'greater than 120mg/ml',
    },
    'rest_ecg': {
        0: 'normal',
        1: 'ST-T wave abnormality',
        2: 'left ventricular hypertrophy',
    },
    'exercise_induced_angina': {
        0: 'no',
        1: 'yes',
    },
    'st_slope': {
        1: 'upsloping',
        2: 'flat',
        3: 'downsloping',
    },
    'thalassemia': {
        1: 'normal',
        2: 'fixed defect',
        3: 'reversable defect',
    },
}

# Assign whole columns instead of using chained indexing
# (`df[col][mask] = value`): the chained form writes through an intermediate
# object, triggers SettingWithCopyWarning, and does not modify `df` at all
# when pandas Copy-on-Write is active.
for column, labels in CATEGORY_LABELS.items():
    df[column] = df[column].replace(labels)

df.head(100)

In [None]:
# Remove, in place, every row whose `st_slope` or `thalassemia` value is 0.
unlabelled_rows = (df['st_slope'] == 0) | (df['thalassemia'] == 0)
df.drop(index=df.index[unlabelled_rows], inplace=True)

In [None]:
# Columns to one-hot encode. Each becomes a group of indicator columns named
# `<column>_<value>`; the source column itself is removed.
CATEGORICAL_COLUMNS = [
    'sex',
    'chest_pain_type',
    'fasting_blood_sugar',
    'rest_ecg',
    'exercise_induced_angina',
    'st_slope',
    'thalassemia',
]

# A single get_dummies call replaces seven per-column calls, a concat and a
# drop. The untouched columns keep their order and the indicator groups are
# appended in the order listed above, so the resulting layout is unchanged.
df = pd.get_dummies(df, columns=CATEGORICAL_COLUMNS)

In [None]:
# Number of columns after encoding (features plus `target`).
len(df.columns)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every column of `df` with a min-max scaler and wrap the result in a
# new frame that reuses the column labels (the row index starts again at 0).
min_max_scaler = MinMaxScaler()
x = df.values
x_scaled = min_max_scaler.fit(x).transform(x)
df_scaled = pd.DataFrame(data=x_scaled, columns=df.columns)

In [None]:
# Persist both the cleaned and the scaled tables. `to_csv` is called with its
# defaults, so the row index is written as the first column of each file.
for frame, csv_path in (
    (df, './clean_heart.csv'),
    (df_scaled, './clean_scaled_heart.csv'),
):
    frame.to_csv(csv_path)

# GAN

## Dataset Creation

In [None]:
import tensorflow as tf

batch_size = 32


def _class_features(label):
    """Return the scaled feature rows with `target == label` as a float32 tensor."""
    rows = df_scaled[df_scaled['target'] == label]
    features = rows.drop(columns='target').values
    return tf.cast(tf.convert_to_tensor(features), tf.float32)


# One feature tensor per class; each GAN only ever sees a single class.
x_0 = _class_features(0)
x_1 = _class_features(1)

# Shuffled mini-batch pipelines (the final batch of an epoch may be smaller).
dataset_0 = tf.data.Dataset.from_tensor_slices(x_0).shuffle(1000).batch(batch_size)
dataset_1 = tf.data.Dataset.from_tensor_slices(x_1).shuffle(1000).batch(batch_size)

## Define Model

In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense

def build_generator(latent_dim=16, n_features=24):
    """Build the generator: latent vector -> synthetic feature row.

    Args:
        latent_dim: Size of the latent noise vector. The default (16) matches
            the noise drawn in the GAN training step of this notebook.
        n_features: Number of output features. The default (24) matches the
            input width of the discriminator.

    Returns:
        An uncompiled Keras ``Sequential`` model. The final sigmoid keeps
        every generated value inside (0, 1).
    """
    return Sequential([
        # Keras documents `shape` as a tuple; a bare int is rejected by
        # newer Keras releases.
        Input(shape=(latent_dim,)),
        Dense(16, activation='relu'),
        Dense(16, activation='relu'),
        Dense(n_features, activation='sigmoid'),
    ])


def build_discriminator(n_features=24):
    """Build the discriminator: feature row -> single real/fake logit.

    Args:
        n_features: Width of the input rows. The default (24) matches the
            generator's output width.

    Returns:
        An uncompiled Keras ``Sequential`` model whose output is one raw
        logit per row.
    """
    return Sequential([
        # Keras documents `shape` as a tuple; a bare int is rejected by
        # newer Keras releases.
        Input(shape=(n_features,)),
        Dense(16, activation='relu'),
        Dense(8, activation='relu'),
        # No activation: softmax over a single unit is constantly 1.0 (with a
        # zero gradient), so the discriminator could never learn. A raw logit
        # is what BinaryCrossentropy(from_logits=True), the GAN loss used in
        # this notebook, expects.
        Dense(1),
    ])

In [None]:
def train_step(real_data, generator, discriminator):
    """Run one discriminator update followed by one generator update.

    Relies on the module-level ``batch_size``, ``loss_fn``, ``d_optimizer``
    and ``g_optimizer``.

    Args:
        real_data: Batch of real feature rows.
        generator: Model mapping 16-dimensional noise to feature rows.
        discriminator: Model scoring feature rows.

    Returns:
        ``(d_loss, g_loss, generated_data)`` where ``generated_data`` is the
        fake batch that was shown to the discriminator.
    """
    # Fake batch for the discriminator pass (created outside any tape, so the
    # generator receives no gradient from it).
    noise = tf.random.normal(shape=(batch_size, 16))
    generated_data = generator(noise)
    mixed_batch = tf.concat([generated_data, real_data], axis=0)

    # Label convention: generated rows -> 1, real rows -> 0, plus a little
    # uniform noise added to every label.
    fake_targets = tf.ones((batch_size, 1))
    real_targets = tf.zeros((real_data.shape[0], 1))
    targets = tf.concat([fake_targets, real_targets], axis=0)
    targets = targets + 0.05 * tf.random.uniform(targets.shape)

    # Discriminator update.
    with tf.GradientTape() as d_tape:
        d_scores = discriminator(mixed_batch)
        d_loss = loss_fn(targets, d_scores)
    d_grads = d_tape.gradient(d_loss, discriminator.trainable_weights)
    d_optimizer.apply_gradients(zip(d_grads, discriminator.trainable_weights))

    # Fresh noise for the generator pass; the generator is rewarded when its
    # samples are scored with the "real" label (0).
    noise = tf.random.normal(shape=(batch_size, 16))
    wanted_targets = tf.zeros((batch_size, 1))

    # Generator update: gradients are taken with respect to the generator's
    # weights only, so the discriminator is left unchanged here.
    with tf.GradientTape() as g_tape:
        g_scores = discriminator(generator(noise))
        g_loss = loss_fn(wanted_targets, g_scores)
    g_grads = g_tape.gradient(g_loss, generator.trainable_weights)
    g_optimizer.apply_gradients(zip(g_grads, generator.trainable_weights))

    return d_loss, g_loss, generated_data

def train(epochs, dataset, generator, discriminator):
    """Train one generator/discriminator pair for ``epochs`` passes over ``dataset``.

    Prints the epoch number at the start of every epoch and both losses on
    every 200th step (step 0 included).
    """
    for epoch in range(epochs):
        print(epoch)
        for step, real_data in enumerate(dataset):
            d_loss, g_loss, _ = train_step(real_data, generator, discriminator)
            if step % 200:
                continue
            # Periodic progress report.
            print("discriminator loss at step %d: %.2f" % (step, d_loss))
            print("adversarial loss at step %d: %.2f" % (step, g_loss))

## Create Two Models

In [None]:
# Step size shared by both Adam optimizers.
GAN_LEARNING_RATE = 0.0001

# One independent generator/discriminator pair per target class.
generator_0, discriminator_0 = build_generator(), build_discriminator()
generator_1, discriminator_1 = build_generator(), build_discriminator()

# NOTE(review): the GAN train step reads these module-level optimizers, so
# both GAN pairs share them. Depending on the Keras version, an optimizer that
# has already been applied to one set of variables may reject a second set;
# confirm before training both pairs in the same session.
d_optimizer = tf.keras.optimizers.Adam(learning_rate=GAN_LEARNING_RATE)
g_optimizer = tf.keras.optimizers.Adam(learning_rate=GAN_LEARNING_RATE)

# The loss is configured for raw logits as discriminator output.
loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)

In [None]:
# Number of epochs intended for the class-0 GAN.
epochs = 20000
# Training is disabled: while the call below stays commented out, generator_0
# keeps the weights it was created with (unless they are loaded elsewhere).
# train(epochs, dataset_0, generator_0, discriminator_0)

In [None]:
# Number of epochs intended for the class-1 GAN.
epochs = 20000
# Training is disabled: while the call below stays commented out, generator_1
# keeps the weights it was created with (unless they are loaded elsewhere).
# train(epochs, dataset_1, generator_1, discriminator_1)

In [None]:
# Sample 20000 latent vectors, push them through the class-0 generator and
# label the outputs with every column of `df` except `target`.
latent_0 = tf.random.normal([20000, 16])
columns_0 = df.drop(columns='target').columns
synthetic_0 = pd.DataFrame(generator_0(latent_0).numpy(), columns=columns_0)
synthetic_0.head()

In [None]:
import numpy as np

# Every class-0 synthetic row gets target 0.0.
label_0 = np.zeros(20000)
synthetic_0['target'] = label_0
synthetic_0.head()

In [None]:
# Sample 20000 latent vectors, push them through the class-1 generator and
# label the outputs with every column of `df` except `target`.
latent_1 = tf.random.normal([20000, 16])
columns_1 = df.drop(columns='target').columns
synthetic_1 = pd.DataFrame(generator_1(latent_1).numpy(), columns=columns_1)
synthetic_1

In [None]:
# Every class-1 synthetic row gets target 1.0.
label_1 = np.ones(20000)
synthetic_1['target'] = label_1
synthetic_1

In [None]:
# Stack the class-0 rows on top of the class-1 rows. Each part keeps its own
# row labels, so index values repeat across the two halves.
synthetic_parts = [synthetic_0, synthetic_1]
synthetic_merge = pd.concat(synthetic_parts, axis=0)
synthetic_merge

In [None]:
# synthetic_merge.to_csv('./synthetic_merge.csv')

# Prediction Model

## Dataset Creation

In [None]:
import pandas as pd

syn_path = '../input/syntheticheartdiseaseuci/synthetic_merge.csv'
df_syn = pd.read_csv(syn_path)

# A CSV written by DataFrame.to_csv with its default settings starts with a
# header-less index column, which read_csv brings back as 'Unnamed: 0'.
# Left in place it would be treated as a feature by everything below.
# NOTE(review): presumably this file was exported that way - confirm. When no
# such column exists this is a no-op.
index_like_columns = [
    name for name in df_syn.columns if str(name).startswith('Unnamed:')
]
df_syn = df_syn.drop(columns=index_like_columns)
df_syn.head()

In [None]:
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split

# Labels first, then the feature matrix (every column except `target`).
y = df_syn['target'].values
x = df_syn.drop(columns='target').values

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Project the features onto 5 principal components; `x` is overwritten with
# the projected data.
pca = PCA(n_components=5)
x = pca.fit_transform(x)

# Scatter the first two components, one colour per class.
for label in (0, 1):
    class_points = x[y == label]
    plt.scatter(class_points[:, 0], class_points[:, 1])

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the PCA components; `x_scaled` is what the model is trained on.
min_max_scaler = MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)

# Plot the *scaled* components per class. Plotting `x` here (as before) only
# redrew the unscaled figure from the previous cell.
for i in range(2):
    d = x_scaled[np.where(y == i)]
    plt.scatter(d[:, 0], d[:, 1])

In [None]:
# Half of the data is held out for testing; 20% of the remainder is used for
# validation. Both splits use a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.5, random_state=1)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=1)


def _as_float32(array):
    """Convert an array to a float32 tensor."""
    return tf.cast(tf.convert_to_tensor(array), tf.float32)


x_train, y_train = _as_float32(x_train), _as_float32(y_train)
x_val, y_val = _as_float32(x_val), _as_float32(y_val)
x_test, y_test = _as_float32(x_test), _as_float32(y_test)

In [None]:
def _batched(features, labels):
    """Build a shuffled (buffer 10000) dataset of (features, labels) batches of 256."""
    pairs = tf.data.Dataset.from_tensor_slices((features, labels))
    return pairs.shuffle(10000).batch(256)


train_dataset = _batched(x_train, y_train)
val_dataset = _batched(x_val, y_val)
test_dataset = _batched(x_test, y_test)

## Prediction Model

In [None]:
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras import Sequential

# Small binary classifier over the 5 PCA components; the final sigmoid makes
# the output a probability in (0, 1).
model = Sequential([
    # Keras documents `shape` as a tuple; a bare int is rejected by newer
    # Keras releases.
    Input(shape=(5,)),
    Dense(3, activation='relu'),
    Dense(1, activation='sigmoid'),
])

optimizer = tf.keras.optimizers.Adam()
# The model already applies a sigmoid, so the loss must treat its output as
# probabilities. With from_logits=True the sigmoid was applied a second time
# inside the loss, distorting the loss value and its gradients.
loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=False)
# BinaryAccuracy thresholds predictions at 0.5 by default, which is the
# right cut-off for sigmoid probabilities.
train_acc_metric = tf.keras.metrics.BinaryAccuracy()
val_acc_metric = tf.keras.metrics.BinaryAccuracy()

def train_step(x, y):
    """Apply one optimizer update on a single batch and return its loss.

    Uses the module-level ``model``, ``loss_fn`` and ``optimizer``, and feeds
    the batch's predictions into ``train_acc_metric``.
    """
    with tf.GradientTape() as tape:
        outputs = model(x)
        batch_loss = loss_fn(y, outputs)

    weights = model.trainable_weights
    gradients = tape.gradient(batch_loss, weights)
    optimizer.apply_gradients(zip(gradients, weights))

    train_acc_metric.update_state(y, outputs)
    return batch_loss

def train(epochs):
    """Train ``model`` for ``epochs`` epochs and collect per-epoch metrics.

    The recorded losses are those of the *last* training / validation batch of
    each epoch; the accuracies are accumulated over the whole epoch and reset
    afterwards.

    Returns:
        ``(train_loss_history, val_loss_history, train_acc_history,
        val_acc_history)``, one entry per epoch in each list.
    """
    history = {'train_loss': [], 'val_loss': [], 'train_acc': [], 'val_acc': []}

    for epoch in range(epochs):
        # Optimisation pass.
        for batch_x, batch_y in train_dataset:
            train_loss = train_step(batch_x, batch_y)

        # Validation pass (no weight updates).
        for batch_x, batch_y in val_dataset:
            outputs = model(batch_x)
            val_loss = loss_fn(batch_y, outputs)
            val_acc_metric.update_state(batch_y, outputs)

        train_acc = train_acc_metric.result()
        val_acc = val_acc_metric.result()
        print('EPOCH: %d - TRAIN LOSS: %.3f - VAL LOSS: %.3f' % (epoch, train_loss, val_loss))
        print('EPOCH: %d - TRAIN ACCU: %.3f - VAL ACCU: %.3f' % (epoch, train_acc, val_acc))

        history['train_loss'].append(train_loss)
        history['val_loss'].append(val_loss)
        history['train_acc'].append(train_acc)
        history['val_acc'].append(val_acc)

        # Start the next epoch with empty accuracy accumulators.
        train_acc_metric.reset_states()
        val_acc_metric.reset_states()

    return (
        history['train_loss'],
        history['val_loss'],
        history['train_acc'],
        history['val_acc'],
    )


train_loss, val_loss, train_acc, val_acc = train(25)

In [None]:
test_acc_metric = tf.keras.metrics.BinaryAccuracy()


def test():
    """Print the accuracy of ``model`` over ``test_dataset``.

    Accumulates binary accuracy over every test batch, prints it, then resets
    the metric so that a repeated call starts from zero.
    """
    # Only accuracy is reported, so the per-batch loss that used to be
    # computed here (and then discarded) has been removed.
    for x_batch, y_batch in test_dataset:
        test_acc_metric.update_state(y_batch, model(x_batch))
    print('TEST ACCU: %.3f' % (test_acc_metric.result()))
    test_acc_metric.reset_states()


test()

In [None]:
# Draw the four per-epoch curves on one set of axes, in legend order.
for curve in (train_loss, val_loss, train_acc, val_acc):
    plt.plot(curve)

plt.legend(['Train Loss', 'Val Loss', 'Train Accuracy', 'Val Accuracy'])
plt.xlabel('epoch')
plt.title('Metrics')

plt.show()