# TASK 10.A

In [None]:
from google.colab import drive

# Mount Google Drive at the Colab mount point used by this notebook.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


Synthetic Data Generation for EEG Data Using a VAE (evaluated with MSE, cosine similarity, and Pearson correlation)

In [None]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr
# Seed NumPy and TensorFlow so that weight initialisation, the
# reparameterisation noise and the latent samples drawn later are repeatable
# between runs (as far as the backend allows).
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Define paths
# NOTE(review): these are relative paths, so they resolve against the current
# working directory -- presumably /content on Colab, where Drive is mounted at
# /content/drive. Confirm before running elsewhere.
base_path = "drive/MyDrive/EEG_Data/train_data"
save_base_path = "drive/MyDrive/EEG_Data/synthetic_data"
os.makedirs(save_base_path, exist_ok=True)

# List of folders (classes) in train_data.
# os.listdir returns entries in arbitrary order; sort so classes are always
# processed in the same sequence.
folders = sorted(
    f for f in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, f))
)

# Define VAE class

class VAE(tf.keras.Model):
    """Fully connected variational autoencoder for flat feature vectors.

    The encoder maps an ``input_dim``-wide vector to the mean and log-variance
    of a diagonal Gaussian over ``latent_dim`` latent variables. The decoder
    maps a latent vector back to ``input_dim`` values passed through a sigmoid,
    so reconstructions always lie in (0, 1).

    NOTE(review): the sigmoid output presumes the training data is scaled to
    [0, 1] -- confirm against the data loading code.

    Args:
        latent_dim: Number of latent variables.
        input_dim: Number of features in each input sample.
    """

    def __init__(self, latent_dim, input_dim):
        super().__init__()
        self.latent_dim = latent_dim

        # Encoder: the last layer emits 2 * latent_dim values which `encode`
        # splits into z_mean and z_log_var.
        # `layers.Input(shape=...)` is used instead of
        # `InputLayer(input_shape=...)`, whose `input_shape` argument is
        # deprecated in Keras 3.
        self.encoder = tf.keras.Sequential([
            layers.Input(shape=(input_dim,)),
            layers.Dense(128, activation='relu'),
            layers.Dense(64, activation='relu'),
            layers.Dense(latent_dim * 2),  # Outputs z_mean and z_log_var
        ])

        # Decoder: mirrors the encoder and ends in a sigmoid.
        self.decoder = tf.keras.Sequential([
            layers.Input(shape=(latent_dim,)),
            layers.Dense(64, activation='relu'),
            layers.Dense(128, activation='relu'),
            layers.Dense(input_dim, activation='sigmoid'),
        ])

    def encode(self, x):
        """Return ``(z_mean, z_log_var)`` for a batch of inputs ``x``."""
        z_mean, z_log_var = tf.split(self.encoder(x), num_or_size_splits=2, axis=1)
        return z_mean, z_log_var

    def reparameterize(self, z_mean, z_log_var):
        """Sample ``z = z_mean + exp(0.5 * z_log_var) * eps`` with ``eps ~ N(0, I)``."""
        epsilon = tf.random.normal(shape=tf.shape(z_mean))
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon

    def decode(self, z):
        """Map latent vectors ``z`` to reconstructions via the decoder network."""
        return self.decoder(z)

    def train_step(self, data):
        """Run one optimisation step and return the loss components.

        Args:
            data: Either a batch of inputs, or a tuple whose first element is
                the batch of inputs (as produced by ``fit(x, y)``); any
                remaining tuple elements are ignored.

        Returns:
            Dict with keys ``"loss"``, ``"reconstruction_loss"`` and
            ``"kl_loss"``.
        """
        if isinstance(data, tuple):
            x = data[0]
        else:
            x = data

        with tf.GradientTape() as tape:
            # Forward pass
            z_mean, z_log_var = self.encode(x)
            z = self.reparameterize(z_mean, z_log_var)
            x_reconstructed = self.decode(z)

            # Guard against a dtype mismatch (e.g. float64 input vs. float32
            # network output) in the subtraction below.
            x = tf.cast(x, x_reconstructed.dtype)

            # Reconstruction loss: squared error summed over the features of
            # each sample, then averaged over the batch. Averaging over every
            # element instead would shrink this term by a factor of input_dim
            # relative to the KL term (which is summed over latent dimensions),
            # letting the KL term dominate and pushing the posterior to
            # collapse onto the prior.
            reconstruction_loss = tf.reduce_mean(
                tf.reduce_sum(tf.square(x - x_reconstructed), axis=1)
            )

            # KL divergence between the approximate posterior and N(0, I),
            # summed over latent dimensions and averaged over the batch.
            kl_loss = -0.5 * tf.reduce_sum(
                1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var), axis=1
            )
            kl_loss = tf.reduce_mean(kl_loss)

            # Total loss
            total_loss = reconstruction_loss + kl_loss

        # Backpropagation
        gradients = tape.gradient(total_loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

        # Return a dictionary of metrics
        return {"loss": total_loss, "reconstruction_loss": reconstruction_loss, "kl_loss": kl_loss}


def generate_synthetic_data(real_data, latent_dim, num_samples, save_dir,
                            num_datasets=100, epochs=5, batch_size=32):
    """Train a VAE on ``real_data`` and save decoded prior samples to disk.

    A fresh VAE is fitted to ``real_data``. Then, ``num_datasets`` times,
    latent vectors are drawn from a standard normal, decoded, saved as
    ``synthetic_data_<k>.npy`` in ``save_dir`` and compared against
    ``real_data`` (MSE, cosine similarity and Pearson correlation over the
    flattened arrays). The metrics are printed, not returned.

    Args:
        real_data: 2-D array; ``real_data.shape[1]`` is used as the VAE input
            width and ``real_data.shape[0]`` as the number of rows generated
            per synthetic dataset.
        latent_dim: Size of the VAE latent space.
        num_samples: Currently unused; kept so existing callers keep working.
            NOTE(review): every synthetic dataset has as many rows as
            ``real_data`` because the element-wise metrics below need matching
            shapes. Decide whether this argument should control the row count
            and adapt the metrics accordingly.
        save_dir: Directory the ``.npy`` files are written to; created if it
            does not exist.
        num_datasets: How many synthetic datasets to generate and save.
        epochs: Number of training epochs.
        batch_size: Training batch size.
    """
    os.makedirs(save_dir, exist_ok=True)

    # Define input dimension
    input_dim = real_data.shape[1]

    # Instantiate VAE
    vae = VAE(latent_dim=latent_dim, input_dim=input_dim)
    vae.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3))

    # Train the VAE
    vae.fit(real_data, real_data, epochs=epochs, batch_size=batch_size, verbose=1)

    # The flattened view of the real data is identical for every iteration, so
    # compute it once instead of inside the loop.
    real_flat = real_data.flatten()
    real_row = real_flat.reshape(1, -1)
    num_rows = real_data.shape[0]

    # Generate synthetic data
    for i in range(num_datasets):
        # Sample from the N(0, I) prior; cast to float32 to match the dtype
        # Keras layers use by default.
        z_samples = np.random.normal(size=(num_rows, latent_dim)).astype("float32")
        synthetic_data = vae.decode(z_samples).numpy()

        # Save synthetic data
        np.save(os.path.join(save_dir, f"synthetic_data_{i+1}.npy"), synthetic_data)

        # Calculate and print similarity metrics.
        # NOTE(review): rows of synthetic_data are decoded from random latent
        # vectors and are not paired with rows of real_data, so these
        # element-wise comparisons depend on row order -- interpret with care.
        synthetic_flat = synthetic_data.flatten()
        mse = mean_squared_error(real_data, synthetic_data)
        cosine_sim = cosine_similarity(real_row, synthetic_flat.reshape(1, -1))[0][0]
        pearson_corr, _ = pearsonr(real_flat, synthetic_flat)

        print(f"Metrics for synthetic data {i+1}:")
        print(f"  MSE: {mse}")
        print(f"  Cosine Similarity: {cosine_sim}")
        print(f"  Pearson Correlation: {pearson_corr}")
        print("-" * 50)

# Iterate over folders and generate synthetic data
latent_dim = 10
num_synthetic_samples = 150

for folder in folders:
    print(f"Processing folder: {folder}")

    folder_path = os.path.join(base_path, folder)
    save_dir = os.path.join(save_base_path, folder)
    os.makedirs(save_dir, exist_ok=True)

    # Collect all .npy files in the folder. Sorted so the row order of the
    # stacked array does not depend on the arbitrary order os.listdir returns.
    file_paths = sorted(
        os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith(".npy")
    )

    # np.vstack raises on an empty list, so skip class folders that contain no
    # .npy files instead of aborting the whole run.
    if not file_paths:
        print(f"No .npy files found in folder '{folder}'; skipping.")
        continue

    # Stack the per-file arrays row-wise into one training matrix.
    real_data = np.vstack([np.load(file) for file in file_paths])

    # Generate and save synthetic data
    generate_synthetic_data(real_data, latent_dim, num_synthetic_samples, save_dir)
    print(f"Synthetic data generation for folder '{folder}' is complete!")
print("All synthetic data generation complete!")


Processing folder: Normal




Epoch 1/5
[1m1658/1658[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - kl_loss: 4.8431e-05 - loss: 0.0035 - reconstruction_loss: 0.0035
Epoch 2/5
[1m1658/1658[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - kl_loss: 3.4766e-06 - loss: 2.7994e-04 - reconstruction_loss: 2.7646e-04
Epoch 3/5
[1m1658/1658[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - kl_loss: 3.2462e-06 - loss: 2.7913e-04 - reconstruction_loss: 2.7588e-04
Epoch 4/5
[1m1658/1658[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - kl_loss: 3.5936e-06 - loss: 2.7936e-04 - reconstruction_loss: 2.7577e-04
Epoch 5/5
[1m1658/1658[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - kl_loss: 6.0251e-06 - loss: 2.8176e-04 - reconstruction_loss: 2.7573e-04
Metrics for synthetic data 1:
  MSE: 0.00027600726851156925
  Cosine Similarity: 0.8746626396368249
  Pearson Correlation: 0.6628395409112384
--------------------------------------------------
Metri



[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 42ms/step - kl_loss: 0.0014 - loss: 0.0995 - reconstruction_loss: 0.0981
Epoch 2/5
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - kl_loss: 0.0014 - loss: 0.0027 - reconstruction_loss: 0.0013
Epoch 3/5
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - kl_loss: 4.9970e-04 - loss: 0.0014 - reconstruction_loss: 9.4566e-04
Epoch 4/5
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - kl_loss: 2.4216e-04 - loss: 0.0011 - reconstruction_loss: 8.3032e-04
Epoch 5/5
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - kl_loss: 2.5901e-04 - loss: 0.0010 - reconstruction_loss: 7.7706e-04
Metrics for synthetic data 1:
  MSE: 0.0007705991687696336
  Cosine Similarity: 0.7562694930108546
  Pearson Correlation: 0.6294683474810847
--------------------------------------------------
Metrics for synthetic data 2:
  MSE: 0.000774323176165591
  Cos



Epoch 1/5
[1m1304/1304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - kl_loss: 1.4898e-04 - loss: 0.0049 - reconstruction_loss: 0.0047
Epoch 2/5
[1m1304/1304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - kl_loss: 1.1272e-06 - loss: 0.0010 - reconstruction_loss: 0.0010
Epoch 3/5
[1m1304/1304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - kl_loss: 1.7950e-06 - loss: 0.0010 - reconstruction_loss: 0.0010
Epoch 4/5
[1m1304/1304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - kl_loss: 6.0711e-06 - loss: 0.0010 - reconstruction_loss: 0.0010
Epoch 5/5
[1m1304/1304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - kl_loss: 4.4949e-07 - loss: 0.0010 - reconstruction_loss: 0.0010
Metrics for synthetic data 1:
  MSE: 0.0010271700083456022
  Cosine Similarity: 0.9065002458412255
  Pearson Correlation: 0.756596464465841
--------------------------------------------------
Metrics for synthetic data 2:
  MSE: 0.



[1m324/324[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 10ms/step - kl_loss: 3.5280e-04 - loss: 0.0165 - reconstruction_loss: 0.0162
Epoch 2/5
[1m324/324[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - kl_loss: 7.0350e-05 - loss: 5.1119e-04 - reconstruction_loss: 4.4084e-04
Epoch 3/5
[1m324/324[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - kl_loss: 3.2682e-05 - loss: 4.5737e-04 - reconstruction_loss: 4.2469e-04
Epoch 4/5
[1m324/324[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - kl_loss: 5.8769e-06 - loss: 4.2844e-04 - reconstruction_loss: 4.2256e-04
Epoch 5/5
[1m324/324[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - kl_loss: 1.2190e-07 - loss: 4.2123e-04 - reconstruction_loss: 4.2110e-04
Metrics for synthetic data 1:
  MSE: 0.0004216938245822881
  Cosine Similarity: 0.822746466208517
  Pearson Correlation: 0.7195938719565342
--------------------------------------------------
Metrics for synthetic data