#  Dependencies and Paths

In [1]:
#  Mount Google Drive into the Colab filesystem; the later cells change into
#  a directory below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
#  Make the unzipped repository the working directory: every later path
#  ('./genetic_algorithm/...', './functions/...', './datasets/...') is relative to it.
#  NOTE(review): this absolute Drive path is specific to one account layout.
%cd '/content/drive/MyDrive/transformers/gitas/GA_Transformer-main(2).zip (Unzipped Files)/GA_Transformer-main'
!ls

/content/drive/MyDrive/transformers/gitas/GA_Transformer-main(2).zip (Unzipped Files)/GA_Transformer-main
datasets   genetic_algorithm  LICENSE.txt  README.md	     to_do.txt
functions  images	      model	   requirements.txt


In [3]:
!pip install transformers==4.9.1 ruamel.yaml

Collecting transformers==4.9.1
  Downloading transformers-4.9.1-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 7.3 MB/s 
[?25hCollecting ruamel.yaml
  Downloading ruamel.yaml-0.17.16-py3-none-any.whl (109 kB)
[K     |████████████████████████████████| 109 kB 63.8 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 54.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 59.9 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 60.6 MB/s 
Collecting ruamel.yaml.clib>=0.1.2
  Downloading ruamel.yaml.clib-0.

In [4]:
import random
import os
import copy
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.cuda.amp import autocast, GradScaler
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
import ruamel.yaml

In [47]:
config_name = './genetic_algorithm/settings.yaml'

#  Parse the GA settings file.
#  The exception class lives on the ruamel.yaml package, not on the YAML()
#  instance bound to `yaml`, so it is referenced through the package here.
#  A parse failure is printed and then re-raised: every line below indexes
#  `config`, so carrying on without it would only fail later with a less
#  helpful NameError.
yaml = ruamel.yaml.YAML()
with open(config_name, 'r') as stream:
    try:
        config = yaml.load(stream)
    except ruamel.yaml.error.YAMLError as exc:
        print(exc)
        raise

#  New directory for current genetic algorithm
#  (used below as a string prefix: file names are appended to it directly)
directory = config['Paths']['iteration_folder']

#  Path to scored aptamers
aptamerList = config['Paths']['path_to_initial_aptamers']
aptamerListAll = config['Paths']['path_to_all_aptamers']
#  Path to PyTorch alBERT model
path_to_model = config['Paths']['path_to_model']

#  How many sequences we want to have in a list
#  (forwarded as --l to breeder.py and dominance_score.py)
apt_len = config['Parameters']['aptamer_len']

#  Breed file handed to pairing.py in the first GA iteration.
#  NOTE(review): hard-coded rather than derived from `directory` - confirm both
#  point at the same folder.
aptamerList_iter = './datasets/ga_interim_data/Albumin/breed_1.csv'

Stage I
Load the model and create a DataLoader for the later GA stage

In [6]:
class CustomDataset(Dataset):
    """
    Sequence-pair dataset that tokenizes one DataFrame row per item.

    Args:
        data: pandas DataFrame with 'Sequence1' and 'Sequence2' columns, plus a
            'Label' column when `with_labels` is true.
        maxlen: every tokenized pair is padded / truncated to this length.
        with_labels: when true, each item also carries the row's label.
        bert_model: name or path passed to AutoTokenizer.from_pretrained.

    Rows are addressed by position (0 .. len(data) - 1), so the frame does not
    need a default RangeIndex - a filtered or re-indexed frame works as well.
    """

    def __init__(self, data, maxlen, with_labels=True, bert_model='albert-base-v2'):
        self.data = data  # pandas dataframe
        self.tokenizer = AutoTokenizer.from_pretrained(bert_model, return_dict=False)
        self.maxlen = maxlen
        self.with_labels = with_labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # Positional lookup: the sampler hands out positions, not index labels.
        # Going column-first keeps each column's own dtype for the scalar.
        sent1 = str(self.data['Sequence1'].iloc[index])
        sent2 = str(self.data['Sequence2'].iloc[index])

        # Tokenize the pair of sentences to get token ids, attention masks and token type ids
        encoded_pair = self.tokenizer(sent1, sent2,
                                      padding='max_length',  # Pad to max_length
                                      truncation=True,  # Truncate to max_length
                                      max_length=self.maxlen,
                                      return_tensors='pt')  # Return torch.Tensor objects

        # The tokenizer returns a leading batch dimension of 1; drop it so the
        # DataLoader can stack items itself.
        token_ids = encoded_pair['input_ids'].squeeze(0)  # tensor of token ids
        attn_masks = encoded_pair['attention_mask'].squeeze(0)  # "0" for padded values, "1" otherwise
        token_type_ids = encoded_pair['token_type_ids'].squeeze(0)  # "0" for 1st sentence tokens, "1" for 2nd

        if self.with_labels:  # True if the dataset has labels
            label = self.data['Label'].iloc[index]
            return token_ids, attn_masks, token_type_ids, label
        return token_ids, attn_masks, token_type_ids

In [7]:
class Model(nn.Module):
    """
    Pretrained encoder with a single-logit classification head.

    Args:
        bert_model: name or path passed to AutoModel.from_pretrained.
        freeze_bert: when true, the encoder weights are frozen and only the
            classification layer stays trainable.
    """

    def __init__(self, bert_model="albert-base-v2", freeze_bert=False):
        super(Model, self).__init__()
        self.bert_layer = AutoModel.from_pretrained(bert_model, return_dict=False)

        # Size the head from the loaded encoder instead of assuming 768, so
        # wider checkpoints get a matching classifier. 768 (albert-base-v2)
        # stays as the fallback when the config does not expose hidden_size.
        #  More information on available models can be found at https://huggingface.co/transformers/pretrained_models.html
        hidden_size = getattr(self.bert_layer.config, 'hidden_size', 768)

        # Freeze model layers and only train the classification layer weights
        if freeze_bert:
            for p in self.bert_layer.parameters():
                p.requires_grad = False

        # Putting Classification layer on top of BERT
        self.cls_layer = nn.Linear(hidden_size, 1)
        self.dropout = nn.Dropout(p=0.1)

    @autocast()  # Mixed precision
    def forward(self, input_ids, attn_masks, token_type_ids):
        '''
        Inputs:
            -input_ids : Tensor  containing token ids
            -attn_masks : Tensor containing attention masks to be used to focus on non-padded values
            -token_type_ids : Tensor containing token type ids to be used to identify sentence1 and sentence2
        Returns:
            Logits produced by the classification layer from the pooled output
            (one value per input row).
        '''

        # Feeding the inputs to the BERT-based model to obtain contextualized representations.
        # The encoder was loaded with return_dict=False, so it returns a tuple.
        cont_reps, pooler_output = self.bert_layer(input_ids, attn_masks, token_type_ids)

        # Feeding to the classifier layer the last layer hidden-state of the [CLS] token further processed by a
        # Linear Layer and a Tanh activation. The Linear layer weights were trained from the sentence order prediction (ALBERT) or next sentence prediction (BERT)
        # objective during pre-training.
        logits = self.cls_layer(self.dropout(pooler_output))

        return logits

In [8]:
def set_seed(seed):
    """
    Seed every random number source used here (Python, NumPy, PyTorch CPU/CUDA),
    export PYTHONHASHSEED and switch cuDNN to deterministic, non-benchmark mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Each of these takes the seed as its only argument.
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

In [9]:
def get_probs_from_logits(logits):
    """
    Map logits to probabilities with the sigmoid function.

    A trailing axis of size 1 is appended to the input first, and the result
    is returned as a NumPy array detached from the graph and moved to the CPU.
    """
    column = logits.unsqueeze(-1)
    return column.sigmoid().detach().cpu().numpy()

def test_prediction(net, device, aptamerDataFrame, dataloader, with_labels, result_path, iteration):
    """
    Predict the probabilities on a dataset with or without labels and print the result in a file
    """
    net.eval()
    probs_all = []
    nb_iterations = len(dataloader)
    
    with torch.no_grad():
        if with_labels:
            for it, (seq, attn_masks, token_type_ids) in tqdm(enumerate(dataloader), total = nb_iterations):
                seq, attn_masks, token_type_ids = seq.to(device), attn_masks.to(device), token_type_ids.to(device)
                logits = net(seq, attn_masks, token_type_ids)
                probs = get_probs_from_logits(logits.squeeze(-1)).squeeze(-1)
                probs_all += probs.tolist()

                
        else:
            for it, (seq, attn_masks, token_type_ids) in tqdm(enumerate(dataloader), total=nb_iterations):
                seq, attn_masks, token_type_ids = seq.to(device), attn_masks.to(device), token_type_ids.to(device)
                logits = net(seq, attn_masks, token_type_ids)
                probs = get_probs_from_logits(logits.squeeze(-1)).squeeze(-1)
                probs_all += probs.tolist()

                
    df1 = pd.read_csv(aptamerDataFrame)
    probs_all = [round(x) for x in probs_all]
    df2 = pd.DataFrame({'Label': probs_all})
    df = pd.concat([df1, df2], axis=1)
    df.to_csv(result_path)
    
    print("Compared aptamers iteration {} is located in {}".format(iteration, result_path))

In [10]:
#  Inference settings, all read from the 'Model' section of the YAML config:
#    bert_model  - 'albert-base-v2', 'albert-large-v2', 'albert-xlarge-v2' and others
#    maxlen      - maximum length of the tokenized input sentence pair: longer inputs
#                  are truncated, shorter ones are padded
#    bs          - batch size of testing
#    with_labels - whether the dataset rows carry a 'Label' column
bert_model, maxlen, bs, with_labels = (
    config['Model'][key]
    for key in ('model_name', 'max_len', 'batch_size', 'with_labels')
)
#  NOTE(review): this global shadows the builtin iter(); run_GA() keeps its own counter.
iter = 1

Stage III
Apply the genetic algorithm to generate a new population of aptamers

In [51]:
def run_GA():
    iter = 1
    while iter < 51:
      #  Generate N aptamers to have the same 1000 as before deleting inferior
      !python ./genetic_algorithm/breeder.py --p {aptamerList} --o {directory} --l {apt_len} --i {iter}
      
      if iter > 1:
        breedCSV = './datasets/ga_interim_data/Albumin/breed_{}.csv'.format(iter-1)
        %rm $breedCSV

      #  Pair up new batch
      !python ./functions/pairing.py --h {aptamerList_iter} --o {directory} --i {iter}

      #  Call alBERT to compare goodness of sequences
      df_test = pd.read_csv('{}iteration_{}.csv'.format(directory, iter))
      test_set = CustomDataset(df_test, maxlen, with_labels, bert_model)
      data_toModel = DataLoader(test_set, batch_size=bs) #nureadinti data pirma
      test_prediction(net=model, device=device, aptamerDataFrame='{}iteration_{}.csv'.format(directory, iter), dataloader=data_toModel, with_labels=False, result_path='{}predicted_{}.csv'.format(directory, iter), iteration=iter)

      #  Find dominating aptamers and go to step 1 again.
      !python ./functions/dominance_score.py --p {directory + 'predicted_' +str(iter) + '.csv'} --f {directory + 'breed_' + str(iter) + '.csv'} --o {directory + 'top_iter_' + str(iter)}  --i {iter} --l {apt_len}
      #survarkyti kur galunes nera tokios ir tegul patys scriptai tuo rupinasi

      iterationCSV = './datasets/ga_interim_data/Albumin/iteration_{}.csv'.format(iter)
      predictionCSV = './datasets/ga_interim_data/Albumin/predicted_{}.csv'.format(iter)

      aptamerList = directory + 'top_iter_' + str(iter) + '.csv'
      iter += 1
      aptamerList_iter = './datasets/ga_interim_data/Albumin/breed_{}.csv'.format(iter)

      !rm $iterationCSV $predictionCSV 

In [None]:
set_seed(2021)

print("Loading model...")
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = Model(bert_model, freeze_bert=False)
#  map_location remaps the stored tensors onto the selected device, so a
#  checkpoint saved on a GPU still loads when only the CPU is available.
model.load_state_dict(torch.load(path_to_model, map_location=device))
model.to(device)
model.eval()  #  probably not needed: test_prediction() calls net.eval() itself

run_GA()