We use DeepSEA as the biological critic. It's input is encoded A, G, C, T and takes in sequences of length 1000.

We set up our generator to also create one-hot encoded strings of A, G, C, T of length 1000.

# Data preprocessing

In [4]:
from Bio import SeqIO
from textwrap import wrap
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

import numpy as np
import random

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [5]:
ngpu=1
device = torch.device("cuda:0" if (torch.cuda.is_available() and ngpu > 0) else "cpu")

Split reference genome into 131-kb samples to match the input size of Basenji

We also convert lower-case soft-masked regions of the DNA to uppercase for proper one-hot encoding 

See: https://bioinformatics.stackexchange.com/questions/225/uppercase-vs-lowercase-letters-in-reference-genome

In [6]:
sample_length = 1000

In [7]:
# nucleic_acids = "ACGT"
# samples = []

# if len(samples) == 0:
#   for i, record in enumerate(SeqIO.parse("/content/GCF_000001405.39_GRCh38.p13_genomic.fna", "fasta")):
#     if i <= 5:
#       sequence = str(record.seq)
#       for sample in wrap(sequence, sample_length):
#         sample = sample.upper()
#         if set(sample) == set(nucleic_acids) and len(sample) == sample_length: # Ensure string only contains proper nucleic acids
#           samples.append(sample)

In [8]:
nucleic_acids = "AGCT"

num_training_samples = 100
validation_samples = []
num_validation_samples = 100

def generate_samples(num_samples):
  samples = []
  for i in range(num_samples):
    sample = ""
    for i in range(sample_length):
      base_selection = int(random.random()*4)
      sample += nucleic_acids[base_selection]
    samples.append(sample)
  return samples

raw_training_samples = generate_samples(num_training_samples)

raw_validation_samples = generate_samples(num_validation_samples)

Initialize one hot encoder

In [9]:
onehot_encoder = OneHotEncoder(sparse=False)
label_encoder = LabelEncoder()

integer_encoded = label_encoder.fit_transform(list(nucleic_acids))
onehot_encoder = OneHotEncoder(sparse=False)

integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoder.fit(np.asarray(list(nucleic_acids)).reshape(-1, 1))

OneHotEncoder(sparse=False)

In [10]:
# One-hot encode each input string
batch_size = 100
use_channels = False

def encode_and_resize_samples(samples):
  one_hot_samples = np.array([np.array(onehot_encoder.transform(np.asarray(list(sample)).reshape(-1, 1))) for sample in samples])
  if use_channels:
    one_hot_samples2 = np.swapaxes(np.swapaxes(np.expand_dims(one_hot_samples, axis=3), 1, 2), 2, 3)
  else:
    one_hot_samples2 = np.swapaxes(np.expand_dims(one_hot_samples, axis=3), 1, 3)

  return one_hot_samples2

In [11]:
resized_training_samples = encode_and_resize_samples(raw_training_samples)
resized_validation_samples = encode_and_resize_samples(raw_validation_samples)

In [12]:
resized_training_samples[0].shape

(1, 4, 1000)

Reshape and batch one-hot encoded input sequences

In [13]:
def batchify(samples: np.ndarray, batch_size: int) -> np.ndarray:
    for i in list(range(0, len(samples), batch_size)):
      yield samples[i:i+batch_size]

In [14]:
batched_training_samples = list(batchify(resized_training_samples, batch_size))
batched_validation_samples = list(batchify(resized_validation_samples, batch_size))

In [15]:
# Convert one-hot encoded genome samples to PyTorch tensors
training_samples = torch.as_tensor(batched_training_samples).to(device, dtype=torch.float) 
validation_samples = torch.as_tensor(batched_validation_samples).to(device, dtype=torch.float) 

# Set up Pytorch for GPU

In [16]:
device = torch.device("cuda:0" if (torch.cuda.is_available() and ngpu > 0) else "cpu")

# Train DeepSEA-like model

In [17]:
# Format DeepSEA training data
import h5py
h5f = h5py.File("/gpfs/home/crsmall/CS2952G/ganome/data/deepsea_train/train.mat", "r")

In [24]:
ds_trainx = h5f["trainxdata"]
ds_trainy = h5f["traindata"]

In [40]:
batch_size = 100

ds_trainx = torch.as_tensor(ds_trainx.reshape((-1, batch_size, 1, 4, 1000))).to(device, dtype=torch.float)
ds_trainy = torch.as_tensor(ds_trainy.reshape((-1, batch_size, 919))).to(device, dtype=torch.float)

RuntimeError: ignored

In [20]:
"""
DeepSEA architecture (Zhou & Troyanskaya, 2015).
"""
import numpy as np
import torch
import torch.nn as nn


class DeepSEA(nn.Module):
    def __init__(self):
        super(DeepSEA, self).__init__()
        kernel_size = 10
        num_channels = 1
        n_deepsea_features = 256
        n_output_features = 919

        self.main = nn.Sequential(
          nn.Conv2d(num_channels, n_deepsea_features, (4, kernel_size)),
          nn.BatchNorm2d(n_deepsea_features),
          nn.LeakyReLU(0.2 ,inplace=True),
          # Second layer
          nn.Conv1d(n_deepsea_features, int(n_deepsea_features/2), kernel_size), # <- This kernel size has to be 1,x for some reason
          nn.BatchNorm2d(int(n_deepsea_features/2)),
          nn.LeakyReLU(0.2 ,inplace=True),
          # Third layer
          nn.Conv1d(int(n_deepsea_features/2), n_deepsea_features, kernel_size),
          nn.BatchNorm2d(n_deepsea_features),
          nn.ReLU(True),
          # Fourth layer
          nn.Linear(n_deepsea_features, n_output_features),
          nn.ReLU(True),
          nn.Linear(n_output_features, n_output_features),
          nn.Sigmoid()
        )

    def forward(self, x):
        return self.main(x)

In [21]:
deepsea = DeepSEA().to(device)

In [18]:
ds_criterion = nn.MSELoss()
ds_optimizer = optim.Adam(deepsea.parameters())

NameError: name 'deepsea' is not defined

In [47]:

num_ds_epochs = 10


for epoch in range(num_ds_epochs):
  for i in range(0, batch_size):
      training_batch = ds_trainx[i]
      training_labels = ds_trainy[i]

      deepsea.zero_grad()
      output = deepsea.forward(training_batch)

      training_loss = ds_criterion(output, training_labels)

      training_loss.backward()
      ds_optimizer.step()

      training_loss = training_loss.item()

      # Compute validation loss across all validation samples
      validation_total_loss = 0
      for validation_batch in validation_samples:
        validation_output = net(validation_batch)
        validation_total_loss += ds_criterion(validation_output, validation_batch)

      validation_loss = validation_total_loss/num_validation_samples

      print(f"Epoch {epoch}, Batch: {i}: Training Loss: {training_loss}, Validation Loss: {validation_loss}")
      print('[%d, %5d] loss: %.3f' %
            (epoch + 1, i + 1, running_loss))
      running_loss = 0.0

RuntimeError: ignored

# Simple NN Generator

In [28]:
num_channels = 1  # Only one channel for genomic data
n_generator_features = 256  # Size of feature maps in generator
n_discriminator_features = 64  # Size of feature maps in discriminator
kernel_size = (4, 10)
input_size = sample_length
ngpu = 1

class Net(nn.Module):
  def __init__(self, ngpu):
    super(Net, self).__init__()
    self.ngpu = ngpu
    self.main = nn.Sequential(
        # First layer
        nn.Conv2d(num_channels, n_generator_features, kernel_size),
        nn.BatchNorm2d(n_generator_features),
        nn.LeakyReLU(0.2 ,inplace=True),
        # Second layer
        nn.Conv2d(n_generator_features, int(n_generator_features/2), (1,10)), # <- This kernel size has to be 1,x for some reason
        nn.BatchNorm2d(int(n_generator_features/2)),
        nn.LeakyReLU(0.2 ,inplace=True),
        nn.ConvTranspose2d(int(n_generator_features/2), n_generator_features, (1, 10)),
        nn.BatchNorm2d(n_generator_features),
        nn.ReLU(True),
        # Output Layer
        nn.ConvTranspose2d(n_generator_features, num_channels, kernel_size),
        nn.Softmax(dim=2)
    )

  def forward(self, input):
    return self.main(input)

In [29]:
# Arbitrarily taken from: https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html
def weights_init(m):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1:
        nn.init.normal_(m.weight.data, 0.0, 0.02)
    elif classname.find('BatchNorm') != -1:
        nn.init.normal_(m.weight.data, 1.0, 0.02)
        nn.init.constant_(m.bias.data, 0)

In [30]:
# Instantiate our model with the detected GPU
net = Net(1).to(device)

# Initialize the weights in our model
# net.apply(weights_init)

# Print the model
print(net)

Net(
  (main): Sequential(
    (0): Conv2d(1, 256, kernel_size=(4, 10), stride=(1, 1))
    (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): LeakyReLU(negative_slope=0.2, inplace=True)
    (3): Conv2d(256, 128, kernel_size=(1, 10), stride=(1, 1))
    (4): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): LeakyReLU(negative_slope=0.2, inplace=True)
    (6): ConvTranspose2d(128, 256, kernel_size=(1, 10), stride=(1, 1))
    (7): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (8): ReLU(inplace=True)
    (9): ConvTranspose2d(256, 1, kernel_size=(4, 10), stride=(1, 1))
    (10): Softmax(dim=2)
  )
)


In [31]:
# # Visualize our model
# from torchviz import make_dot
# y = net(one_hot_samples3)
# make_dot(y)

In [32]:
criterion = nn.BCELoss(reduction='sum')
# optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
optimizer = optim.Adam(net.parameters())

In [33]:
HEIGHT = batch_size
WIDTH = 1000
DEPTH = 4

num_epochs = 100

for epoch in range(num_epochs):
  running_loss = 0.0
  for i, training_batch in enumerate(training_samples):
        
    net.zero_grad()
    output = net(training_batch)

    ds_ts = deepsea(training_batch)
    ds_o = deepsea(o)

    # Training loss to recreate sequences
    # criterion = nn.BCELoss(reduction='sum')
    # training_loss = criterion(output, training_batch)
    
    # Training loss to minimize DeepSEA distances
    criterion = nn.MSELoss(reduction='sum')

    output_pred = torch.as_tensor(output_pred_vals[0]).to(device, dtype=torch.float) 
    training_pred = torch.as_tensor(training_pred_vals[0]).to(device, dtype=torch.float) 

    training_loss = criterion(output_pred, training_pred)

    training_loss.backward()
    optimizer.step()

    training_loss = training_loss.item()

    # Compute validation loss across all validation samples
    validation_total_loss = 0
    for validation_batch in validation_samples:
      validation_output = net(validation_batch)
      validation_total_loss += criterion(validation_output, validation_batch)

    validation_loss = validation_total_loss/num_validation_samples

    print(f"Epoch {epoch}, Batch: {i}: Training Loss: {training_loss}, Validation Loss: {validation_loss}")
    print('[%d, %5d] loss: %.3f' %
          (epoch + 1, i + 1, running_loss))
    running_loss = 0.0

RuntimeError: ignored

In [26]:
y = net(validation_samples[0])

In [None]:
torch.argmax(validation_samples[0][0], dim=1)

In [None]:
torch.argmax(y[0], dim=1)

In [None]:
o = output.detach().cpu().numpy()

In [None]:
(o == o.max(axis=2)[:,None]).astype(int).reshape(1, 100, 4, 1000).squeeze(0).shape

In [None]:
deepsea_training_tensor

In [None]:
deepsea_output_tensor

In [None]:
len(training_pred_vals)

In [None]:
!zip -r /content/deepsea_train.zip /content/deepsea_train