<a href="https://colab.research.google.com/github/prysie/Orange-Neural-Net-Classifier/blob/master/PBT_for_GAN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Surrogate

In [16]:
!pip install torch torchvision torchaudio



Required for the parrellism

In [17]:
import torch
torch.cuda.is_available()
if not torch.cuda.is_available():
    print("CUDA is not available. Running on CPU.")

In [18]:
import multiprocessing as mp
mp.set_start_method('spawn', force=True)
import torch.multiprocessing as torch_mp

If the training file exists train the surrogate Random Forest for usage in the GAN

In [19]:
hyperparameters = {
    'learning_rate_g': [1e-4, 2e-4, 5e-4],
    'learning_rate_d': [1e-4, 2e-4, 5e-4],
    'beta1': [0.5, 0.7, 0.9],
    'beta2': [0.9, 0.99, 0.999],
    'batch_size': [8, 16, 32, 64],
    'replay_buffer_size': [10000, 50000, 100000],
    'n_critic': [1, 5, 10],
    'gradient_penalty_weight': [5, 10, 20]
}

In [20]:
from google.colab import drive
import os
from datetime import datetime

def save_training_data(training_data, base_filename='gan_training_data'):
    # Mount Google Drive
    try:
        drive.mount('/content/drive')
    except DriveAlreadyMountedException:
        print("Google Drive is already mounted. Proceeding without remounting.")

    # Create the directory path on Google Drive
    drive_path = '/content/drive/MyDrive/PBT/Datasets/GAN'
    os.makedirs(drive_path, exist_ok=True)

    # Generate a unique filename with a timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{base_filename}_{timestamp}.csv"

    # Save the data to Google Drive
    file_path = os.path.join(drive_path, filename)
    training_data.to_csv(file_path, index=False)
    print(f"Data saved to {file_path}")

In [21]:
def load_latest_training_data(save_dir='/content/drive/MyDrive/PBT/Datasets/GAN'):
    # Mount Google Drive
    try:
        drive.mount('/content/drive')
    except:
        print("Google Drive is already mounted. Proceeding without remounting.")

    # Get the list of CSV files in the save directory
    csv_files = [f for f in os.listdir(save_dir) if f.endswith('.csv')]

    # Sort the files by modification time to get the latest
    latest_file = sorted(csv_files, key=lambda x: os.path.getmtime(os.path.join(save_dir, x)), reverse=True)[0]

    # Load the latest file into a DataFrame
    latest_file_path = os.path.join(save_dir, latest_file)
    training_data = pd.read_csv(latest_file_path)

    print(f"Loaded training data from {latest_file_path}")
    return training_data

The Random Forest Regressor is being trained to predict the Inception Score, a widely used metric to evaluate the quality of generated images from Generative Adversarial Networks (GANs).

In [22]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import pandas as pd
import os

def train_surrogate_model(training_data_file):
    # Check if the training data file exists and is not empty
    if not os.path.exists(training_data_file) or os.stat(training_data_file).st_size == 0:
        print(f"Training data file '{training_data_file}' does not exist or is empty. Skipping surrogate model training.")
        return None

    # Load the training data from the file
    training_data = load_latest_training_data(training_data_file)

    # Check if the DataFrame is empty
    if training_data.empty:
        print("Loaded training data is empty. Skipping surrogate model training.")
        return None

    # Extract the hyperparameters and the Inception Score
    X = training_data[['learning_rate_d', 'learning_rate_g', 'beta1', 'beta2', 'latent_dim', 'batch_size', 'replay_buffer_size', 'n_critic', 'gradient_penalty_weight']]
    y = training_data['inception_score']

    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train the Random Forest Regressor
    rf = RandomForestRegressor(n_estimators=100)
    rf.fit(X_train, y_train)

    print("Surrogate model trained successfully.")
    return rf

# Example usage
training_data_file = '/content/drive/MyDrive/PBT/Datasets/GAN/training_data.csv'
surrogate_model = train_surrogate_model(training_data_file)


Training data file '/content/drive/MyDrive/PBT/Datasets/GAN/training_data.csv' does not exist or is empty. Skipping surrogate model training.


# GAN


In [23]:
import torch
from torch.utils.data import DataLoader
from torchvision import datasets, transforms, models
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd

# Define the data loader
transform = transforms.Compose([
    transforms.Resize((64, 64)),  # Resize real images to match Generator output
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2471, 0.2435, 0.2616])
])

dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
data_loader = DataLoader(dataset, batch_size=128, shuffle=True, num_workers=4)

# Print dataset size
print("Dataset size:", len(dataset))

# Print the shape of data samples
data_iter = iter(data_loader)
images, labels = next(data_iter)

print("Batch shape:", images.shape)
print("Single image shape:", images[0].shape)

# Load the pretrained ResNet18 model
cifar_classifier = models.resnet18(pretrained=True)

# Adapt the classifier for CIFAR-10
num_ftrs = cifar_classifier.fc.in_features
cifar_classifier.fc = torch.nn.Linear(num_ftrs, 10)  # CIFAR-10 has 10 classes

# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
cifar_classifier = cifar_classifier.to(device)
cifar_classifier.eval()



Files already downloaded and verified
Dataset size: 50000
Batch shape: torch.Size([128, 3, 64, 64])
Single image shape: torch.Size([3, 64, 64])




ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

GAN Model Definition

In [24]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
from torchvision.transforms import Resize
resize = Resize((64, 64))

class Generator(nn.Module):
    def __init__(self, latent_dim=100, num_channels=3):
        super(Generator, self).__init__()
        self.latent_dim = latent_dim
        self.num_channels = num_channels

        self.model = nn.Sequential(
            nn.ConvTranspose2d(latent_dim, 512, 4, 1, 0, bias=False),
            nn.BatchNorm2d(512),
            nn.ReLU(True),
            nn.ConvTranspose2d(512, 256, 4, 2, 1, bias=False),
            nn.BatchNorm2d(256),
            nn.ReLU(True),
            nn.ConvTranspose2d(256, 128, 4, 2, 1, bias=False),
            nn.BatchNorm2d(128),
            nn.ReLU(True),
            nn.ConvTranspose2d(128, 64, 4, 2, 1, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(True),
            nn.ConvTranspose2d(64, num_channels, 4, 2, 1, bias=False),
            nn.Tanh()
        )

    def forward(self, z):
        #print(f"Generator input z shape: {z.shape}")
        z = z.view(z.size(0), self.latent_dim, 1, 1)
        output = self.model(z)
        #print(f"Generator output shape: {output.shape}")
        return output


class Discriminator(nn.Module):
    def __init__(self, num_channels=3):
        super(Discriminator, self).__init__()
        self.num_channels = num_channels

        self.model = nn.Sequential(
            nn.Conv2d(num_channels, 64, 4, 2, 1, bias=False),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(64, 128, 4, 2, 1, bias=False),
            nn.BatchNorm2d(128),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(128, 256, 4, 2, 1, bias=False),
            nn.BatchNorm2d(256),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(256, 512, 4, 2, 1, bias=False),
            nn.BatchNorm2d(512),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(512, 1, 4, 1, 0, bias=False)
        )

    def forward(self, x):
        #print(f"Discriminator input x shape: {x.shape}")
        output = self.model(x)
        output = output.view(-1, 1).squeeze(1)
        #print(f"Discriminator output shape: {output.shape}")
        return output

Training Loop with WGAN-GP Objective

Step Each step consists of K = 5 gradient descent updates of the discriminator followed by a single update
of the generator using the Adam optimiser (Kingma & Ba, 2015). We train using the WGAN-GP (Gulrajani
11
et al., 2017) objective where the discriminator estimates the Wasserstein distance between the real and generated data distributions, with the Lipschitz constraint enforced by regularising its input gradient to have a unit
norm. See Appendix A.3 for additional details.

In [25]:
def compute_gradient_penalty(discriminator, real_data, fake_data, device, gradient_penalty_weight):
    """ Compute the gradient penalty for the WGAN-GP. """
    alpha = torch.rand(real_data.size(0), 1, 1, 1, device=device)
    alpha = alpha.expand_as(real_data)

    interpolated = alpha * real_data.data + (1 - alpha) * fake_data.data
    interpolated.requires_grad = True

    prob_interpolated = discriminator(interpolated)

    gradients = torch.autograd.grad(outputs=prob_interpolated, inputs=interpolated,
                                    grad_outputs=torch.ones(prob_interpolated.size()).to(device),
                                    create_graph=True, retain_graph=True, only_inputs=True)[0]

    gradients = gradients.view(gradients.size(0), -1)
    gradient_penalty = ((gradients.norm(2, dim=1) - 1) ** 2).mean() * gradient_penalty_weight
    #print(f"Gradient tensor shape: {gradients.shape}")
    return gradient_penalty

def train_step(generator, discriminator, gen_optimizer, disc_optimizer, real_data, batch_size, device, epoch):
  start_time = time.time()
  real_data = resize(real_data).to(device)
  # Multiple discriminator updates
  for _ in range(5):
      start_disc_time = time.time()
      # Sample noise as generator input
      z = torch.randn(batch_size, 100, device=device)
      fake_data = generator(z).detach()

      # Reset gradients
      disc_optimizer.zero_grad()

      # Discriminator loss
      real_loss = discriminator(real_data)
      fake_loss = discriminator(fake_data)
      disc_loss = -(torch.mean(real_loss) - torch.mean(fake_loss))

      # WGAN-GP gradient penalty
      gradient_penalty = compute_gradient_penalty(discriminator, real_data, fake_data, device)
      disc_loss += gradient_penalty

      # Backprop and optimize
      disc_loss.backward()
      disc_optimizer.step()
      end_disc_time = time.time()
      logging.info(f"Completed discriminator update {i+1}: took {end_disc_time - start_disc_time} seconds")

  # Generator update
  logging.info("Starting generator update")
  start_gen_time = time.time()
  gen_optimizer.zero_grad()
  z = torch.randn(batch_size, 100, device=device)
  fake_data = generator(z)
  gen_loss = -torch.mean(discriminator(fake_data))
  gen_loss.backward()
  gen_optimizer.step()
  end_gen_time = time.time()
  logging.info(f"Completed generator update: took {end_gen_time - start_gen_time} seconds")

  # Log the losses and hyperparameters
  learning_rate_d = disc_optimizer.param_groups[0]['lr']
  learning_rate_g = gen_optimizer.param_groups[0]['lr']
  d_loss_value = d_loss.item()  # Assuming d_loss is calculated in your training code
  g_loss_value = g_loss.item()  # Assuming g_loss is calculated in your training code

  # Append to the DataFrame
  training_data.loc[len(training_data)] = [epoch, d_loss_value, g_loss_value, learning_rate_d, learning_rate_g]
  end_time = time.time()
  logging.info(f"Finished train_step at {end_time}, duration: {end_time - start_time} seconds")

  # Return losses if needed
  return d_loss.item(), g_loss.item()

computed from the outputs of a pretrained CIFAR classifier (as used in Rosca et al. (2017)) rather than an
ImageNet classifier, to avoid directly optimising the final performance metric. The CIFAR Inception scorer
uses a much smaller network, making evaluation much faster



In [26]:
import torch
from torch.nn.functional import softmax
from scipy.stats import entropy

torch.set_num_threads(2)

def inception_score(imgs, cifar_classifier, cuda=False, batch_size=32, splits=1):
  start_time = time.time()
  """Compute the Inception Score for generated images using CIFAR classifier."""
  N = len(imgs)
  dataloader = torch.utils.data.DataLoader(imgs, batch_size=batch_size)

  # Move classifier to GPU if using CUDA
  if cuda:
      cifar_classifier.cuda()

  preds = []
  with torch.no_grad():
      for batch in dataloader:
          if cuda:
              batch = batch.cuda()
          pred = cifar_classifier(batch)
          preds.append(softmax(pred, dim=1).cpu().numpy())

  # Now compute the mean kl-div
  preds = np.concatenate(preds, 0)
  split_scores = []

  for k in range(splits):
      part = preds[k * (N // splits): (k+1) * (N // splits), :]
      py = np.mean(part, axis=0)
      scores = []
      for i in range(part.shape[0]):
          pyx = part[i, :]
          scores.append(entropy(pyx, py))
      split_scores.append(np.exp(np.mean(scores)))
  end_time = time.time()  # Add this line to record the end time
  print(f"Worker {self.id}: Inception score time: {end_time - start_time:.2f} seconds")
  return np.mean(split_scores), np.std(split_scores)

# Example usage:
# imgs is a batch of generated images from your GAN
# cifar_classifier is your pretrained CIFAR classifier
# inception_score_value, std_dev = inception_score(imgs, cifar_classifier, cuda=True)

PBT

This guy does all the work - running in a multiprocessor framework for parrellism

In [27]:
import uuid
import torch.multiprocessing as torch_mp
import torch.optim as optim
import pandas as pd
import time

class Worker(torch_mp.Process):
  def __init__(self, generator, discriminator, gen_lr, disc_lr, beta1, beta2, cifar_classifier, device, training_data, use_surrogate=False):
      super(Worker, self).__init__()
      self.id = uuid.uuid4()
      self.generator = generator
      self.discriminator = discriminator
      self.gen_optimizer = optim.Adam(generator.parameters(), lr=gen_lr, betas=(beta1, beta2))
      self.disc_optimizer = optim.Adam(discriminator.parameters(), lr=disc_lr, betas=(beta1, beta2))
      self.cifar_classifier = cifar_classifier
      self.device = device
      self.inception_score = 0
      self.training_data = training_data
      self.use_surrogate = use_surrogate
      self.latent_dim = 100
      self.batch_size = hyperparameters['batch_size'][0]
      print(f"Worker {self.id}: Batch size: {self.batch_size}")
      self.replay_buffer_size = hyperparameters['replay_buffer_size'][0]
      self.n_critic = hyperparameters['n_critic'][0]
      self.gradient_penalty_weight = hyperparameters['gradient_penalty_weight'][0]
      self.gradient_penalty_weight = hyperparameters['gradient_penalty_weight'][0]

  def run(self, data_loader, steps):
      self.train(data_loader, steps)
      self.inception_score = self.evaluate()
      return self.inception_score

  def set_batch_size(self, batch_size):
      self.batch_size = batch_size
      return self.batch_size

  def set_replay_buffer_size(self, replay_buffer_size):
      self.replay_buffer_size = replay_buffer_size
      return self.replay_buffer_size

  def set_n_critic(self, n_critic):
      self.n_critic = n_critic
      return self.n_critic

  def set_gradient_penalty_weight(self, gradient_penalty_weight):
      self.gradient_penalty_weight = gradient_penalty_weight
      return self.gradient_penalty_weight

  def get_inception_score(self):
      return self.inception_score


  def train(self, data_loader, steps):
    start_time = time.time()
    for step in range(steps):
      for _ in range(5):
        real_data, _ = next(iter(data_loader))
        real_data = real_data.to(self.device)
        batch_size = real_data.size(0)

        self.disc_optimizer.zero_grad()

        z = torch.randn(batch_size, self.latent_dim, 1, 1).to(self.device)
        fake_data = self.generator(z)

        real_loss = self.discriminator(real_data).mean()
        fake_loss = self.discriminator(fake_data.detach()).mean()
        disc_loss = fake_loss - real_loss

        gp = compute_gradient_penalty(self.discriminator, real_data, fake_data, self.device, self.gradient_penalty_weight)
        disc_loss += gp
        disc_loss.backward()
        self.disc_optimizer.step()

        self.gen_optimizer.zero_grad()

        z = torch.randn(batch_size, self.latent_dim, device=self.device)
        fake_data = self.generator(z)
        gen_loss = -self.discriminator(fake_data).mean()
        gen_loss.backward()
        self.gen_optimizer.step()
        torch.cuda.empty_cache()
        self.log_data(step, gen_loss.item(), disc_loss.item(), real_loss.item(), fake_loss.item())

    end_time = time.time()  # Add this line to record the end time
    print(f"Worker {self.id}: Training time: {end_time - start_time:.2f} seconds")

  def log_data(self, step, gen_loss, disc_loss, real_loss, fake_loss):
      new_row = pd.DataFrame({
          'step': [step],
          'gen_loss': [gen_loss],
          'disc_loss': [disc_loss],
          'real_loss': [real_loss],
          'fake_loss': [fake_loss],
          'learning_rate_g': [self.gen_optimizer.param_groups[0]['lr']],
          'learning_rate_d': [self.disc_optimizer.param_groups[0]['lr']],
          'beta1': [self.gen_optimizer.param_groups[0]['betas'][0]],
          'beta2': [self.gen_optimizer.param_groups[0]['betas'][1]],
          'batch_size': [self.batch_size],
          'latent_dim': [self.latent_dim],
          'replay_buffer_size': [self.replay_buffer_size],
          'n_critic': [self.n_critic],
          'gradient_penalty_weight': [self.gradient_penalty_weight]
      })
      self.training_data = pd.concat([self.training_data, new_row], ignore_index=True)

  def evaluate(self, num_samples=50000, surrogate_threshold=7.0):
    start_time = time.time()
    if not self.use_surrogate:
      self.generator.eval()
      z = torch.randn(num_samples, 100, 1, 1).to(self.device)
      with torch.no_grad():
          fake_data = self.generator(z)
      fake_data = fake_data.cpu()
      self.inception_score, _ = inception_score(fake_data.cpu(), self.cifar_classifier.cpu(), cuda=False, batch_size=32, splits=10)
      self.generator.train()
      end_time = time.time()
      print(f"Worker {self.id}: Evaluation time: {end_time - start_time:.2f} seconds")
      return self.inception_score

  def exploit(self, population):
    start_time = time.time()
    print("Exploiting better hyperparameters...")

    # Determine selection method and select a better worker based on the Inception score
    if np.random.random() < 0.5:
        # Truncation selection
        print("Using truncation selection...")
        idx = np.argsort([w.inception_score for w in population])[-int(len(population) * 0.2):]
        better_worker = population[np.random.choice(idx)]
    else:
        # Binary tournament selection
        print("Using binary tournament selection...")
        idx = np.random.choice(len(population), size=2, replace=False)
        better_worker = population[idx[0]] if population[idx[0]].inception_score > population[idx[1]].inception_score else population[idx[1]]

    # Print out the old and new learning rates and hyperparameters
    print(f"Old Generator LR: {self.gen_optimizer.param_groups[0]['lr']}, New Generator LR: {better_worker.gen_optimizer.param_groups[0]['lr']}")
    print(f"Old Discriminator LR: {self.disc_optimizer.param_groups[0]['lr']}, New Discriminator LR: {better_worker.disc_optimizer.param_groups[0]['lr']}")

    # Load the state dict from better_worker to current worker
    self.generator.load_state_dict(better_worker.generator.state_dict())
    self.discriminator.load_state_dict(better_worker.discriminator.state_dict())
    self.gen_optimizer.load_state_dict(better_worker.gen_optimizer.state_dict())
    self.disc_optimizer.load_state_dict(better_worker.disc_optimizer.state_dict())
    print(f"Worker {self.id}: Exploit time: {end_time - start_time:.2f} seconds")

    def explore(self):
      start_time = time.time()
      # Print initial hyperparameters
      print("Exploring new hyperparameters...")
      print(f"Initial learning rates - Generator: {self.gen_optimizer.param_groups[0]['lr']}, Discriminator: {self.disc_optimizer.param_groups[0]['lr']}")
      print(f"Initial batch size: {self.batch_size}, Initial latent dimension: {self.latent_dim}")

      # Update learning rates
      self.gen_optimizer.param_groups[0]['lr'] *= np.random.uniform(0.5, 2.0)
      self.disc_optimizer.param_groups[0]['lr'] *= np.random.uniform(0.5, 2.0)

      # Update batch size and latent dimension
      old_batch_size = self.batch_size
      old_latent_dim = self.latent_dim
      self.batch_size = np.random.choice(hyperparameters['batch_size'])
      self.latent_dim = np.random.choice(hyperparameters['latent_dim'])

      # Print updated hyperparameters
      print(f"Updated learning rates - Generator: {self.gen_optimizer.param_groups[0]['lr']}, Discriminator: {self.disc_optimizer.param_groups[0]['lr']}")
      print(f"Updated batch size from {old_batch_size} to {self.batch_size}")
      print(f"Updated latent dimension from {old_latent_dim} to {self.latent_dim}")
      print(f"Worker {self.id}: Explore time: {end_time - start_time:.2f} seconds")


Helper class to parallise the Worker

In [28]:
def worker_fn(worker, data_loader, steps, queue):
    inception_score = worker.run(data_loader, steps)
    queue.put((worker.id, inception_score))

Run the experiement

In [29]:
def run_pbt(population_size, generations, data_loader, cifar_classifier, device, training_data, use_surrogate=False, debug_mode=False, use_pbt=True):
    population = []
    for _ in range(population_size):
        # Sample hyperparameters from the predefined ranges
        gen_lr = np.random.choice(hyperparameters['learning_rate_g'])
        disc_lr = np.random.choice(hyperparameters['learning_rate_d'])
        beta1 = np.random.choice(hyperparameters['beta1'])
        beta2 = np.random.choice(hyperparameters['beta2'])
        batch_size = np.random.choice(hyperparameters['batch_size'])
        replay_buffer_size = np.random.choice(hyperparameters['replay_buffer_size'])
        n_critic = np.random.choice(hyperparameters['n_critic'])
        gradient_penalty_weight = np.random.choice(hyperparameters['gradient_penalty_weight'])

        # Create a new worker with the sampled hyperparameters
        worker = Worker(Generator().to(device),
                        Discriminator().to(device),
                        gen_lr, disc_lr, beta1, beta2,
                        cifar_classifier, device, training_data, use_surrogate=use_surrogate)
        worker.set_batch_size(batch_size)
        worker.set_replay_buffer_size(replay_buffer_size)
        worker.set_n_critic(n_critic)
        worker.set_gradient_penalty_weight(gradient_penalty_weight)
        population.append(worker)

    all_inception_scores = []
    best_inception_scores = []
    mean_inception_scores = []
    best_models = []

    if debug_mode:
        training_dataset, _ = torch.utils.data.random_split(dataset, [50, len(dataset) - 50])
        training_dataloader = DataLoader(training_dataset, batch_size=12, shuffle=True, num_workers=2)
        train_steps = 5
        print(f"Debug mode: Training dataset size: {len(training_dataset)}")
        print(f"Debug mode: Training steps: {train_steps}")
    else:
        training_dataloader = data_loader
        train_steps = 5000

    for generation in range(generations):
        print(f"Generation {generation + 1}")

        queue = mp.Queue()
        processes = []

        if debug_mode:
            training_dataset, _ = torch.utils.data.random_split(dataset, [50, len(dataset) - 50])
            training_dataloader = DataLoader(training_dataset, batch_size=12, shuffle=True, num_workers=2)
            train_steps = 5
        else:
            training_dataloader = data_loader
            train_steps = 5000

        for worker in population:
            worker.start()
            processes.append(worker)

        for worker in processes:
            inception_score = worker.run(training_dataloader, train_steps)
            queue.put(inception_score)

        # Collect the results
        inception_scores = [queue.get() for _ in range(len(population))]
        inception_scores.sort(key=lambda x: x[0])  # Sort by worker ID
        inception_scores = [score for _, score in inception_scores]

        for worker_idx, inception_score in enumerate(inception_scores):
            print(f"Worker {worker_idx + 1} Inception Score: {inception_score:.4f}")
            all_inception_scores.append(inception_score)

        if use_pbt:
          # Record best and mean Inception scores for the current generation
          best_inception_score = max(worker.inception_score for worker in population)
          mean_inception_score = sum(worker.inception_score for worker in population) / len(population)
          best_inception_scores.append(best_inception_score)
          mean_inception_scores.append(mean_inception_score)

          # Record the best model for the current generation
          best_model_idx = np.argmax([worker.inception_score for worker in population])
          best_models.append(population[best_model_idx].generator.state_dict())

          # Exploit and explore
          for worker in population:
            if np.random.random() < 0.5:
              worker.exploit(population)
            else:
              worker.explore()

    if use_pbt:
      # Visualize generated samples from the best models
      best_model = Generator().to(device)
      for i, model_state_dict in enumerate(best_models):
        best_model.load_state_dict(model_state_dict)
        z = torch.randn(16, 100, 1, 1).to(device)
        samples = best_model(z)
        samples = (samples + 1) / 2  # Rescale to [0, 1]
        plt.figure(figsize=(10, 10))
        for j in range(16):
            plt.subplot(4, 4, j + 1)
            plt.imshow(samples[j].permute(1, 2, 0).cpu().numpy())
            plt.axis('off')
        plt.title(f"Best Model at Generation {i + 1}")
        plt.show()

    return population

In [30]:
#if __name__ == '__main__':
#  mp.set_start_method('spawn')  # Set the start method for multiprocessing

training_data = pd.DataFrame(columns=['step', 'gen_loss', 'disc_loss', 'real_loss', 'fake_loss',
                                      'learning_rate_g', 'learning_rate_d', 'beta1', 'beta2',
                                      'batch_size', 'latent_dim', 'replay_buffer_size', 'n_critic',
                                      'gradient_penalty_weight'])

population_size = 2
generations = 1
#population_size = 45
#generations = 400

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Run without the surrogate model to collect training data
final_population = run_pbt(population_size, generations, data_loader, cifar_classifier, device, training_data, debug_mode=True, use_surrogate=False)
save_training_data(training_data)

final_population = run_pbt(population_size, generations, data_loader, cifar_classifier, device, training_data, debug_mode=True, use_surrogate=True)


Worker cdbf4ff2-e580-40b2-9a9e-1910eed4457c: Batch size: 8
Worker 22760dcb-61d5-40e7-b8c3-01793242e758: Batch size: 8
Debug mode: Training dataset size: 50
Debug mode: Training steps: 5
Generation 1
Worker cdbf4ff2-e580-40b2-9a9e-1910eed4457c: Training time: 225.97 seconds


OutOfMemoryError: CUDA out of memory. Tried to allocate 6.10 GiB. GPU 0 has a total capacity of 22.17 GiB of which 3.20 GiB is free. Process 4423 has 18.96 GiB memory in use. Of the allocated memory 18.68 GiB is allocated by PyTorch, and 45.95 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

Visualisation of Results

In [None]:
print(training_data.shape())

In [None]:
data = pd.read_csv('gan_training_data.csv')

# Plot learning curves
plt.figure(figsize=(12, 6))
for key in ['Real_Loss', 'Fake_Loss', 'Gen_Loss']:
    plt.plot(data[key], label=key)
plt.xlabel('Training Steps')
plt.ylabel('Loss')
plt.title('Training Loss Curves')
plt.legend()
plt.show()

# Plot Inception Scores
plt.figure(figsize=(12, 6))
plt.plot(data['Inception_Score'], label='Inception Score')
plt.xlabel('Training Steps')
plt.ylabel('Inception Score')
plt.title('Inception Score Over Time')
plt.legend()
plt.show()

GAN Random Forest Surrogate Model