In [13]:
import tensorflow as tf
from tensorflow.keras import layers, models
import numpy as np
import os

In [56]:
# Load the tfrecords file
#dataset = tf.data.TFRecordDataset("/workspaces/Chip_Seq_GAN_Peaks/data/fake/rep1_bed.tfrecords")

# Define a function to parse the TFRecords
def parse_tfrecord_fn(example):
    feature_description = {
#        'label': tf.io.FixedLenFeature([], tf.int64),
        'chromosome': tf.io.FixedLenFeature([], tf.int64),
        'start': tf.io.FixedLenFeature([], tf.int64),
        'end': tf.io.FixedLenFeature([], tf.int64),
        'feature1': tf.io.FixedLenFeature([], tf.float32),
        'feature2': tf.io.FixedLenFeature([], tf.float32),
        'feature3': tf.io.FixedLenFeature([], tf.float32),
        'feature4': tf.io.FixedLenFeature([], tf.int64)
#        ,'replica_id': tf.io.FixedLenFeature([], tf.string)
    }
    example = tf.io.parse_single_example(example, feature_description)
    return example

# Define a function to create a dataset from TFRecords files
def create_dataset_from_tfrecords(tfrecord_pattern, batch_size=32, buffer_size=10000):
    files = tf.data.Dataset.list_files(tfrecord_pattern)
    dataset = files.interleave(
        lambda filename: tf.data.TFRecordDataset(filename),
        cycle_length=4, num_parallel_calls=tf.data.AUTOTUNE
    )
    dataset = dataset.shuffle(buffer_size)
    dataset = dataset.map(parse_tfrecord_fn, num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(buffer_size=tf.data.AUTOTUNE)
    return dataset

# Specify the TFRecords pattern
tfrecord_pattern_real = "/workspaces/Chip_Seq_GAN_Peaks/data/real/*.tfrecords"
tfrecord_pattern_fake = "/workspaces/Chip_Seq_GAN_Peaks/data/fake/*.tfrecords"

# Create a dataset
batch_size = 32  # Adjust as needed
buffer_size = 1500  # Adjust as needed
dataset_real = create_dataset_from_tfrecords(tfrecord_pattern_real, batch_size=batch_size, buffer_size=buffer_size)
dataset_fake = create_dataset_from_tfrecords(tfrecord_pattern_fake, batch_size=batch_size, buffer_size=buffer_size)


In [57]:
# Define a function to build the generator model
def build_generator():
    # Define your generator model architecture
    # Example:
    model = models.Sequential()
    model.add(layers.Dense(128, input_shape=(100,), activation='relu'))
    model.add(layers.Dense(7, activation='linear'))
    return model

# Define a function to build the discriminator model
def build_discriminator():
    # Define your discriminator model architecture
    # Example:
    model = models.Sequential()
    model.add(layers.Dense(128, input_shape=(7,), activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    return model

# Define the loss functions
cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=True)

def discriminator_loss(real_output, fake_output):
    real_loss = cross_entropy(tf.ones_like(real_output), real_output)
    fake_loss = cross_entropy(tf.zeros_like(fake_output), fake_output)
    total_loss = real_loss + fake_loss
    return total_loss

def generator_loss(fake_output):
    return cross_entropy(tf.ones_like(fake_output), fake_output)

# Define optimizers
generator_optimizer = tf.keras.optimizers.Adam(1e-4)
discriminator_optimizer = tf.keras.optimizers.Adam(1e-4)

# Define a function to generate synthetic data
def generate_synthetic_data(batch_size=32):
    noise = tf.random.normal([batch_size, 100])
    generated_data = generator(noise, training=False)
    return generated_data

# Create the generator and discriminator
generator = build_generator()
discriminator = build_discriminator()

# Training loop
@tf.function
def train_step(real_data):
    """    real_data = tf.concat([
            tf.cast(real_data[0], tf.float32),
            tf.cast(real_data[1], tf.float32),
            tf.cast(real_data[2], tf.float32),
            tf.cast(real_data[3], tf.float32),
            tf.cast(real_data[4], tf.float32),
            tf.cast(real_data[5], tf.float32),
            tf.cast(real_data[6], tf.float32)
        ], axis=-1)
    """
    batch_size = real_data['feature1'].shape[0]
    noise = tf.random.normal([batch_size, 100])

    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        generated_data = generator(noise, training=True)

        real_output = discriminator(real_data, training=True)
        fake_output = discriminator(generated_data, training=True)

        gen_loss = generator_loss(fake_output)
        disc_loss = discriminator_loss(real_output, fake_output)

    gradients_of_generator = gen_tape.gradient(gen_loss, generator.trainable_variables)
    gradients_of_discriminator = disc_tape.gradient(disc_loss, discriminator.trainable_variables)

    generator_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables))
    discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))

# Load your TFRecords dataset
dataset = dataset_real

# Training the GAN
num_epochs = 10000
batch_size = 32

for epoch in range(num_epochs):
    for real_data in dataset.batch(batch_size):
        train_step(real_data)

    if epoch % 1000 == 0:
        print(f"Epoch {epoch}/{num_epochs}, Generator Loss: {gen_loss}, Discriminator Loss: {disc_loss}")

# Generate synthetic data using the trained generator
synthetic_data = generate_synthetic_data(batch_size=1000)
print("Generated Synthetic Data:")
print(synthetic_data)


ValueError: in user code:

    File "/tmp/ipykernel_41034/3605413461.py", line 64, in train_step  *
        real_output = discriminator(real_data, training=True)
    File "/home/codespace/.python/current/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler  **
        raise e.with_traceback(filtered_tb) from None
    File "/home/codespace/.python/current/lib/python3.10/site-packages/keras/src/engine/input_spec.py", line 197, in assert_input_compatibility
        raise ValueError(

    ValueError: Missing data for input "dense_62_input". You passed a data dictionary with keys ['chromosome', 'end', 'feature1', 'feature2', 'feature3', 'feature4', 'start']. Expected the following keys: ['dense_62_input']


In [58]:
dataset = real_data
dataset['chromosome'] = tf.cast(dataset['chromosome'], tf.float32)
dataset['start'] = tf.cast(dataset['start'], tf.float32)
dataset['end'] = tf.cast(dataset['end'], tf.float32)
dataset['feature1'] = tf.cast(dataset['feature1'], tf.float32)
dataset['feature2'] = tf.cast(dataset['feature2'], tf.float32)
dataset['feature3'] = tf.cast(dataset['feature3'], tf.float32)
dataset['feature4'] = tf.cast(dataset['feature4'], tf.float32)

In [59]:
input_features = tf.concat(
    [dataset['chromosome'], dataset['start'], dataset['end'],
     dataset['feature1'], dataset['feature2'], dataset['feature3'], dataset['feature4']], axis=-1)

In [60]:
input_features

<tf.Tensor: shape=(32, 224), dtype=float32, numpy=
array([[1.30e+10, 1.70e+10, 9.00e+09, ..., 1.01e+02, 1.46e+02, 1.93e+02],
       [1.20e+10, 9.00e+09, 1.00e+09, ..., 7.80e+01, 9.70e+01, 1.71e+02],
       [1.70e+10, 1.70e+10, 1.00e+09, ..., 9.80e+01, 1.29e+02, 2.16e+02],
       ...,
       [1.10e+10, 1.00e+09, 1.90e+10, ..., 2.77e+02, 2.49e+02, 1.42e+02],
       [1.00e+09, 2.00e+09, 1.00e+09, ..., 2.53e+02, 1.68e+02, 1.68e+02],
       [1.90e+10, 3.00e+09, 6.00e+09, ..., 1.44e+02, 1.33e+02, 2.10e+02]],
      dtype=float32)>