## Installations + Imports

In [None]:
pip install rdkit

In [None]:
pip install tape_proteins

In [None]:
pip install transformers

In [None]:
pip install deepchem

In [None]:
import tensorflow as tf
import torch
from tape import ProteinBertModel, TAPETokenizer
import torch.nn as nn
from rdkit import Chem
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# from transformers import GPT2LMHeadModel, GPT2Tokenizer, AutoModel, AutoTokenizer, GPTNeoForCausalLM

## Preprocess Dataset + Create DataLoaders

In [None]:
def preprocess(csv_path='SMILES_Keyboard.csv', max_smiles_length=75,
               test_size=0.2, random_state=69):
    """Load SMILES strings, one-hot encode them and split them into train/val sets.

    Every kept SMILES string is wrapped as ``START + smiles + END`` and
    right-padded with the PAD token to ``max_smiles_length + 2`` characters,
    then turned into a ``(max_smiles_length + 2, vocab_size)`` integer matrix
    with exactly one 1 per row.

    Args:
        csv_path: CSV file that contains a ``SMILES`` column.
        max_smiles_length: strings longer than this are dropped to conserve
            memory and compute.
        test_size: fraction of the data forwarded to ``train_test_split``.
        random_state: seed forwarded to ``train_test_split``.

    Returns:
        tuple: ``(train, val, char_to_index, index_to_char)`` where ``train``
        and ``val`` are lists of one-hot matrices and the two dicts map
        characters to vocabulary indices and back.

    Raises:
        ValueError: if one of the special tokens already occurs in the
            dataset, because it could then no longer be told apart from a
            real SMILES character.
    """
    df = pd.read_csv(csv_path)
    # Missing entries cannot be encoded, so they are dropped up front.
    smiles_list = df['SMILES'].dropna().tolist()
    smiles_list = [s for s in smiles_list if len(s) <= max_smiles_length]

    # Special tokens.
    START_TOKEN = '$'
    END_TOKEN = '&'
    PAD_TOKEN = '^'
    special_tokens = [START_TOKEN, END_TOKEN, PAD_TOKEN]

    unique_chars = set()
    for smiles in smiles_list:
        unique_chars.update(smiles)

    clashes = unique_chars.intersection(special_tokens)
    if clashes:
        raise ValueError(
            f"Special token(s) {sorted(clashes)} already occur in the SMILES data"
        )

    # Regular characters in sorted order, special tokens at the end.
    vocab = sorted(unique_chars) + special_tokens

    char_to_index = {char: idx for idx, char in enumerate(vocab)}
    index_to_char = {idx: char for char, idx in char_to_index.items()}

    # Room for the longest kept string plus the start and end tokens.
    max_length = max_smiles_length + 2
    row_positions = np.arange(max_length)

    def one_hot_encode(smiles):
        """Return the (max_length, vocab_size) one-hot matrix for one string."""
        tokens = START_TOKEN + smiles + END_TOKEN
        tokens += PAD_TOKEN * (max_length - len(tokens))

        encoded = np.zeros((max_length, len(vocab)), dtype=int)
        # Dict lookup instead of vocab.index(): O(1) per character.
        columns = [char_to_index[char] for char in tokens]
        # Every row, padding included, receives exactly one 1.
        encoded[row_positions, columns] = 1
        return encoded

    encoded_data = [one_hot_encode(smiles) for smiles in smiles_list]

    train, val = train_test_split(
        encoded_data, test_size=test_size, random_state=random_state
    )

    # Hand the splits and the vocabulary maps back to the caller; without
    # this they would be lost when the function returns.
    return train, val, char_to_index, index_to_char

In [None]:
from torch.utils.data import Dataset

class SMILESDataset(Dataset):
    """Thin ``Dataset`` wrapper around an indexable collection of samples."""

    def __init__(self, word_list):
        """Keep a reference to ``word_list``; nothing is copied or converted."""
        super().__init__()
        self.word_list = word_list

    def __len__(self):
        """Number of samples held by the dataset."""
        sample_count = len(self.word_list)
        return sample_count

    def __getitem__(self, idx):
        """Return the sample stored at position ``idx`` unchanged."""
        sample = self.word_list[idx]
        return sample

In [None]:
from torch.utils.data import DataLoader

# Wrap both splits in datasets and batch them 32 at a time.
# NOTE(review): `train` and `val` are only assigned inside preprocess() in
# this file - confirm they exist at module scope before this cell runs.
train_dataset, val_dataset = SMILESDataset(train), SMILESDataset(val)

# Only the training batches are shuffled.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=32)

## Network Architecture: 3 modules — AASE (Amino Acid Sequence Embedding), SFI (Structural Feature Inference), MG (Molecular Generation)

In [None]:
# Amino Acid Sequence Embedding module.

class AASEModule:
    """Amino Acid Sequence Embedding: pretrained TAPE model plus its tokenizer."""

    def __init__(self):
        """Load the pretrained ``bert-base`` protein model and the IUPAC tokenizer."""
        self.model = ProteinBertModel.from_pretrained('bert-base')
        self.tokenizer = TAPETokenizer(vocab='iupac')

    def get_embedding(self, sequence):
        """Embed one amino-acid sequence as a single fixed-size vector.

        Args:
            sequence: amino-acid sequence handed to the tokenizer.

        Returns:
            The per-token model output averaged over dimension 1, i.e. one
            row per input sequence.
        """
        # Convert the encoded ids directly and add the batch dimension
        # afterwards. Wrapping the encoder result in a Python list first
        # (torch.tensor([ids])) goes through PyTorch's slow list-of-arrays
        # conversion path when the tokenizer returns an ndarray.
        token_ids = torch.as_tensor(
            self.tokenizer.encode(sequence), dtype=torch.long
        ).unsqueeze(0)

        with torch.no_grad():  # No need to compute gradients for this step
            outputs = self.model(token_ids)

        # The first output is the per-token sequence output. The pooled
        # output is not trained, so the mean over tokens is used instead.
        sequence_output = outputs[0]
        sequence_mean = sequence_output.mean(dim=1)

        return sequence_mean

In [None]:
# Perceptron layer for Structural Feature Inference.

class Perceptron(nn.Module):
    """Single fully connected layer mapping ``input_dim`` to ``output_dim``."""

    def __init__(self, input_dim, output_dim=512):
        super().__init__()
        self.fc = nn.Linear(in_features=input_dim, out_features=output_dim)

    def forward(self, x):
        """Apply the linear projection to ``x``."""
        projected = self.fc(x)
        return projected

# Input width of the perceptron; the output width keeps its default of 512.
protein_feature_dim = 768
perceptron = Perceptron(input_dim=protein_feature_dim)

In [None]:
# GAN that learns to convert the AASE embedding into a protein latent representation.

class Generator(nn.Module):
    """Three-layer MLP that maps ``[noise, condition]`` to an output vector."""

    def __init__(self, noise_dim, condition_dim, output_dim):
        super().__init__()
        hidden_width = 512
        # The first layer consumes the concatenation built in forward().
        stack = [
            nn.Linear(noise_dim + condition_dim, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, output_dim),
        ]
        self.fc = nn.Sequential(*stack)

    def forward(self, noise, condition):
        """Concatenate ``noise`` and ``condition`` along dim 1 and run the MLP."""
        joint_input = torch.cat((noise, condition), dim=1)
        return self.fc(joint_input)

noise_dim = 100
generator_output_dim = 512
# The condition is the 768-wide protein embedding (the same width that
# DrugGeneratorModel passes). Generator adds noise_dim to it internally, so
# the condition width must not already include the noise (768, not 868).
generator = Generator(noise_dim, 768, generator_output_dim)

class Discriminator(nn.Module):
    """Three-layer MLP that scores an input vector with a single raw output."""

    def __init__(self, input_dim):
        super().__init__()
        hidden_width = 512
        leak = 0.2
        stack = [
            nn.Linear(input_dim, hidden_width),
            nn.LeakyReLU(leak),
            nn.Linear(hidden_width, hidden_width),
            nn.LeakyReLU(leak),
            nn.Linear(hidden_width, 1),
        ]
        self.fc = nn.Sequential(*stack)

    def forward(self, x):
        """Return the un-normalised score for every row of ``x``."""
        score = self.fc(x)
        return score

# The discriminator consumes vectors of the generator's output width.
discriminator = Discriminator(input_dim=generator_output_dim)

In [None]:
# Molecular Generation conditioned on latent representation.

class Encoder(nn.Module):
    """Two-layer bidirectional LSTM encoder followed by a linear projection."""

    def __init__(self, vocab_size=42, seq_len=77, hidden_dim=512, latent_dim=512):
        super().__init__()

        # Each direction gets half of hidden_dim so that the concatenated
        # final states are hidden_dim wide. seq_len is accepted but not used
        # by any layer here.
        self.encoder_lstm = nn.LSTM(
            input_size=vocab_size,
            hidden_size=hidden_dim // 2,
            num_layers=2,
            bidirectional=True,
            batch_first=True,
        )
        self.encoder_ff = nn.Linear(hidden_dim, latent_dim)

    def forward(self, x):
        """Encode a batch of sequences into one latent vector per sequence."""
        _, (final_hidden, _) = self.encoder_lstm(x)
        # The last two entries of the final hidden state are joined along
        # the feature axis before the projection.
        joined = torch.cat((final_hidden[-2], final_hidden[-1]), dim=1)
        return self.encoder_ff(joined)

class Decoder(nn.Module):
    """Stacked LSTM decoder whose initial hidden state is the latent vector.

    Args:
        vocab_size: width of the one-hot input and of the output logits.
        hidden_dim: LSTM hidden size; the latent vector passed to
            ``forward`` must have this width because it is used as the
            initial hidden state.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, vocab_size=42, hidden_dim=512, num_layers=4):
        super(Decoder, self).__init__()

        self.decoder_lstm = nn.LSTM(vocab_size, hidden_dim, num_layers=num_layers, batch_first=True)
        self.decoder_ff = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, latent):
        """Decode ``x`` conditioned on ``latent``.

        Args:
            x: batch-first input sequence whose last dimension is the
                vocabulary size.
            latent: one vector per batch element, ``hidden_dim`` wide.

        Returns:
            Un-normalised vocabulary scores for every position of ``x``.
        """
        # Read the depth from the LSTM itself so the initial state always
        # matches the configured number of layers.
        depth = self.decoder_lstm.num_layers
        # Every layer starts from the same latent vector; cell state is zero.
        h0 = latent.unsqueeze(0).repeat(depth, 1, 1)
        c0 = torch.zeros_like(h0)

        lstm_out, _ = self.decoder_lstm(x, (h0, c0))
        output = self.decoder_ff(lstm_out)

        return output

class Autoencoder(nn.Module):
    """Encoder/decoder pair: the input is encoded, then decoded again."""

    def __init__(self, vocab_size=42, seq_len=77, hidden_dim=512, latent_dim=512):
        super().__init__()

        self.encoder = Encoder(
            vocab_size=vocab_size,
            seq_len=seq_len,
            hidden_dim=hidden_dim,
            latent_dim=latent_dim,
        )
        self.decoder = Decoder(vocab_size=vocab_size, hidden_dim=hidden_dim)

    def forward(self, x):
        """Encode ``x`` and decode it back.

        The decoder receives the same ``x`` as its input sequence, together
        with the latent vector produced by the encoder.
        """
        return self.decoder(x, self.encoder(x))


## Model Training + Inference

In [None]:
from tqdm import tqdm

model = Autoencoder()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
num_epochs = 4


def _next_token_loss(batch):
    """Return the teacher-forced next-token loss for one one-hot batch.

    The whole sequence is encoded, but the decoder only sees tokens
    0..T-2 and is scored against tokens 1..T-1. Feeding the decoder token t
    and scoring it against the very same token t lets it learn a plain
    copy of its input, which is useless for step-by-step generation
    (generate_smiles feeds the previous token and expects the next one).
    """
    batch = batch.float()
    latent = model.encoder(batch)

    logits = model.decoder(batch[:, :-1], latent)
    targets = torch.argmax(batch[:, 1:], dim=-1)

    return criterion(logits.reshape(-1, logits.size(-1)), targets.reshape(-1))


for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    total_train_loss = 0

    for i, batch in enumerate(tqdm(train_loader)):
        optimizer.zero_grad()

        loss = _next_token_loss(batch)

        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_loader)

    # Validation.
    model.eval()
    total_val_loss = 0

    with torch.no_grad():
        for batch in val_loader:
            total_val_loss += _next_token_loss(batch).item()

    avg_val_loss = total_val_loss / len(val_loader)

    print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {avg_train_loss}, Validation Loss: {avg_val_loss}")

# NOTE(review): this pickles the whole module object, not just its weights,
# so loading it requires the class definitions above to be importable.
torch.save(model, 'TGan_weights.pth')


100%|██████████| 442/442 [19:39<00:00,  2.67s/it]


Epoch 1/4, Training Loss: 0.8146875046058748, Validation Loss: 0.11592584096633636


100%|██████████| 442/442 [20:14<00:00,  2.75s/it]


Epoch 2/4, Training Loss: 0.05951944710813227, Validation Loss: 0.03122221213673149


100%|██████████| 442/442 [20:52<00:00,  2.83s/it]


Epoch 3/4, Training Loss: 0.1680420453584336, Validation Loss: 0.06258153244181797


100%|██████████| 442/442 [21:32<00:00,  2.92s/it]


Epoch 4/4, Training Loss: 0.037058004810226176, Validation Loss: 0.022935522292312736


In [None]:
class DrugGeneratorModel(nn.Module):
    """Bundles the sequence-embedding module with a generator/discriminator pair."""

    def __init__(self):
        super().__init__()

        # Amino-acid sequence embeddings.
        self.aase_module = AASEModule()

        # Generator conditioned on a 768-wide embedding, and the matching
        # discriminator over the generator's output width.
        self.generator = Generator(noise_dim, 768, generator_output_dim)
        self.discriminator = Discriminator(generator_output_dim)

    def amino_to_embedding(self, amino_sequence):
        """Embed ``amino_sequence`` with the AASE module."""
        embedding = self.aase_module.get_embedding(amino_sequence)
        return embedding

    def generate(self, amino_embedding, noise):
        """Run the generator on ``noise`` conditioned on ``amino_embedding``."""
        generated = self.generator(noise, amino_embedding)
        return generated

    def discriminate(self, samples):
        """Score ``samples`` with the discriminator."""
        scores = self.discriminator(samples)
        return scores


In [None]:
# Build the combined model.
dGEN = DrugGeneratorModel()

# Step 1: embed an amino-acid sequence.
sequence = 'GCTVEDRCLIGMGAILLNGCVIGSGSLVAAGALITQ'
amino_embedding = dGEN.amino_to_embedding(sequence)

# Step 2: draw one random noise vector.
noise = torch.randn(1, noise_dim)

# Step 3: generate a latent vector from the embedding and the noise.
generated_latent = dGEN.generate(amino_embedding=amino_embedding, noise=noise)

# Detach from the autograd graph and convert to a NumPy array.
generated_latent_np = generated_latent.detach().numpy()
print(generated_latent_np.shape)


100%|██████████| 567/567 [00:00<00:00, 1626655.52B/s]
100%|██████████| 370264230/370264230 [00:10<00:00, 36771088.48B/s]


(1, 512)


  token_ids = torch.tensor([self.tokenizer.encode(sequence)])


In [None]:
def generate_smiles(decoder, latent_vector, start_token, char_to_index, index_to_char, max_length=77, end_token='&'):
    """Greedily decode a character sequence from a latent vector.

    Args:
        decoder: callable ``decoder(input_sequence, latent_vector)`` that
            returns scores of shape ``(1, steps, vocab_size)``.
        latent_vector: NumPy array or tensor; a 1-D vector is given a
            leading batch dimension.
        start_token: character that seeds the generation.
        char_to_index: mapping from character to vocabulary index.
        index_to_char: mapping from vocabulary index to character.
        max_length: maximum number of characters generated after the start
            token.
        end_token: character that stops the generation once predicted.

    Returns:
        The generated string, including the start token and, when it was
        produced, the end token.
    """
    # Convert numpy ndarray to PyTorch tensor if necessary.
    if isinstance(latent_vector, np.ndarray):
        latent_vector = torch.tensor(latent_vector).float()

    # Ensure the tensor is in the expected shape: (1, latent_dim).
    if latent_vector.dim() == 1:
        latent_vector = latent_vector.unsqueeze(0)

    vocab_size = len(char_to_index)

    def one_hot_step(index):
        """Return a (1, 1, vocab_size) one-hot tensor for ``index``."""
        step = torch.zeros(1, 1, vocab_size).float()
        step[0, 0, index] = 1
        return step

    # The running input starts with the START token only.
    input_sequence = one_hot_step(char_to_index[start_token])
    generated_sequence = [start_token]

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for _ in range(max_length):
            output = decoder(input_sequence, latent_vector)

            # Greedy choice at the last position of the prefix.
            predicted_idx = output[0, -1].argmax().item()
            predicted_char = index_to_char[predicted_idx]
            generated_sequence.append(predicted_char)

            if predicted_char == end_token:
                break

            # The decoder call restarts from the latent vector every time,
            # so the whole prefix is fed back in. Passing only the newest
            # token would discard everything generated before it.
            input_sequence = torch.cat(
                [input_sequence, one_hot_step(predicted_idx)], dim=1
            )

    return ''.join(generated_sequence)


In [None]:
def smiles_to_tensor(smiles, char_to_index, max_length=77):
    """One-hot encode ``smiles`` into a ``(max_length, vocab_size)`` float tensor.

    Row ``i`` gets a 1 in the column of the i-th character; rows past the
    end of the string stay all-zero. No start, end or padding tokens are
    added here.
    """
    vocab_size = len(char_to_index)
    one_hot = torch.zeros(max_length, vocab_size).float()
    for position, symbol in enumerate(smiles):
        column = char_to_index[symbol]
        one_hot[position, column] = 1
    return one_hot

def tensor_to_smiles(tensor, index_to_char, end_token='&'):
    """Convert per-position vocabulary scores of one sequence into a string.

    Args:
        tensor: scores whose last dimension is the vocabulary; the highest
            score at each position selects the character.
        index_to_char: mapping from vocabulary index to character.
        end_token: decoding stops at the first occurrence of this
            character, which is not included in the result.

    Returns:
        The decoded characters up to, but excluding, ``end_token``.
    """
    # argmax over the vocabulary, flattened to one index per position.
    # reshape(-1) also covers a single-position input, where squeeze()
    # would produce a 0-d tensor that cannot be iterated.
    indices = tensor.argmax(dim=-1).reshape(-1).tolist()

    smiles = []
    for idx in indices:
        char = index_to_char[idx]
        if char == end_token:
            break
        smiles.append(char)
    return ''.join(smiles)

def encode_decode_smiles(encoder, decoder, smiles, char_to_index, index_to_char, max_length=77):
    """Round-trip one SMILES string through ``encoder`` and ``decoder``.

    The string is one-hot encoded, given a batch dimension, encoded to a
    latent vector, decoded with the same one-hot tensor as decoder input,
    and the decoder output is converted back into a string.
    """
    # One-hot tensor with a leading batch dimension of 1.
    batch = smiles_to_tensor(smiles, char_to_index, max_length).unsqueeze(0)

    latent = encoder(batch)
    decoded = decoder(batch, latent)

    # Drop the batch dimension again before mapping scores to characters.
    return tensor_to_smiles(decoded.squeeze(), index_to_char)

# Usage: reconstruct one already-tokenised SMILES string with the trained model.
# NOTE(review): `char_to_index` and `index_to_char` must exist at module
# scope here - confirm they are exported from the preprocessing step.
smiles_string = "$CN1CC(C2=NCCN2)Oc2c1ccc1ccccc21&^^^^^^^^^^^^^^^^^"
reconstructed = encode_decode_smiles(
    encoder=model.encoder,
    decoder=model.decoder,
    smiles=smiles_string,
    char_to_index=char_to_index,
    index_to_char=index_to_char,
)
print(reconstructed)


$CN1CC(C2=NCCN2)Oc2c1ccc1ccccc21
