#  Dependencies and Paths

In [None]:
#  New directory for current genetic algorithm
#  (used as a plain string prefix when building file names, so the trailing '/' is required)
directory = './datasets/runs/test/'

#  Path to scored aptamers
#  aptamerList is the parent list handed to the breeder/pairing scripts via --p;
#  run_GA rebinds it after every generation.
aptamerList = './datasets/training/best_0.csv'
aptamerListAll = './datasets/training/scored_sequences.csv'
#  Path to PyTorch alBERT model
path_to_model = './model/Albert-base-20epoch-val0-4-run2.pt'  

#  How many sequences we want to have in a list
#  (passed as --l to breeder.py and dominance_score.py)
apt_len = 1000

In [None]:
#  Create the output directory for this run
import os  # imported here because this cell comes before the main import cell

try:
    # makedirs also creates missing parent directories ('./datasets/runs/'),
    # and exist_ok=True lets the notebook be re-run against an existing directory.
    os.makedirs(directory, exist_ok=True)
except OSError:
    print("Creation of %s failed." % directory)
else:
    print("Successfully created the directory %s ." % directory)

In [None]:
import random
import os
import copy
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.cuda.amp import autocast, GradScaler
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm

Stage I
Load the model and create a DataLoader for the later GA stage

In [None]:
class CustomDataset(Dataset):
    """
    Dataset of sequence pairs stored in a pandas DataFrame.

    Every item is the pair found in the 'Sequence1' / 'Sequence2' columns,
    tokenized to a fixed length of `maxlen`. When `with_labels` is true the
    value of the 'Label' column is appended to the returned tuple.
    """

    def __init__(self, data, maxlen, with_labels=True, bert_model='albert-base-v2'):
        self.data = data  # pandas dataframe, rows are looked up with .loc[index, column]
        self.tokenizer = AutoTokenizer.from_pretrained(bert_model, return_dict=False)
        self.maxlen, self.with_labels = maxlen, with_labels

    def __len__(self):
        # One item per dataframe row
        return len(self.data)

    def __getitem__(self, index):
        first = str(self.data.loc[index, 'Sequence1'])
        second = str(self.data.loc[index, 'Sequence2'])

        # Tokenize the pair: pad/truncate to maxlen and get torch tensors back
        encoding = self.tokenizer(
            first,
            second,
            padding='max_length',
            truncation=True,
            max_length=self.maxlen,
            return_tensors='pt',
        )

        # Drop the leading axis the tokenizer adds, keeping the order:
        # token ids, attention mask (0 = padding, 1 = real token),
        # token type ids (0 = first sequence, 1 = second sequence)
        features = tuple(
            encoding[key].squeeze(0)
            for key in ('input_ids', 'attention_mask', 'token_type_ids')
        )

        if not self.with_labels:
            return features
        return features + (self.data.loc[index, 'Label'],)

In [None]:
class Model(nn.Module):
    """
    Sentence-pair classifier: a pretrained transformer encoder followed by
    dropout and a linear layer that produces one logit per input pair.
    """

    # Hidden sizes of the checkpoints this notebook was written for.
    # More information on available models can be found at
    # https://huggingface.co/transformers/pretrained_models.html
    _KNOWN_HIDDEN_SIZES = {
        "albert-base-v2": 768,       # 12M parameters
        "albert-large-v2": 1024,     # 17M parameters
        "albert-xlarge-v2": 2048,    # 58M parameters
        "albert-xxlarge-v2": 4096,   # 223M parameters
        "roberta-base": 768,         # 125M parameters
        "distilroberta-base": 768,   # 82M parameters
    }

    def __init__(self, bert_model="albert-base-v2", freeze_bert=False):
        """
        Inputs:
            -bert_model : name of the pretrained checkpoint passed to AutoModel.from_pretrained
            -freeze_bert : if True, the encoder weights are frozen and only the classification layer is trainable
        """
        super(Model, self).__init__()
        self.bert_layer = AutoModel.from_pretrained(bert_model, return_dict=False)

        # Previously an unlisted checkpoint name left hidden_size unbound and crashed
        # below; now the encoder's own config is used as a fallback.
        hidden_size = self._lookup_hidden_size(bert_model, getattr(self.bert_layer, "config", None))

        # Freeze model layers and only train the classification layer weights
        if freeze_bert:
            for p in self.bert_layer.parameters():
                p.requires_grad = False

        # Putting Classification layer on top of BERT
        self.cls_layer = nn.Linear(hidden_size, 1)
        self.dropout = nn.Dropout(p=0.1)

    @staticmethod
    def _lookup_hidden_size(bert_model, config=None):
        """
        Return the encoder output width for `bert_model`.

        Known checkpoint names are resolved from the built-in table; any other
        name falls back to `config.hidden_size`.

        Raises:
            ValueError: the name is not in the table and `config` has no `hidden_size`.
        """
        known = Model._KNOWN_HIDDEN_SIZES.get(bert_model)
        if known is not None:
            return known

        from_config = getattr(config, "hidden_size", None)
        if from_config is None:
            raise ValueError(
                "Cannot determine the hidden size of model %r" % (bert_model,)
            )
        return from_config

    @autocast()  # Mixes precision
    def forward(self, input_ids, attn_masks, token_type_ids):
        '''
        Inputs:
            -input_ids : Tensor  containing token ids
            -attn_masks : Tensor containing attention masks to be used to focus on non-padded values
            -token_type_ids : Tensor containing token type ids to be used to identify sentence1 and sentence2
        Returns:
            -logits : output of the classification layer (last dimension of size 1); no sigmoid is applied here
        '''

        # Feeding the inputs to the BERT-based model to obtain contextualized representations
        cont_reps, pooler_output = self.bert_layer(input_ids, attn_masks, token_type_ids)

        # Feeding to the classifier layer the last layer hidden-state of the [CLS] token further processed by a
        # Linear Layer and a Tanh activation. The Linear layer weights were trained from the sentence order prediction (ALBERT) or next sentence prediction (BERT)
        # objective during pre-training.
        logits = self.cls_layer(self.dropout(pooler_output))

        return logits

In [None]:
def set_seed(seed):
    """ Seed every random number generator used here so that runs are repeatable """
    # PyTorch: CPU generator first, then every CUDA device
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)

    # Ask cuDNN for deterministic kernels and switch off its benchmark mode
    cudnn = torch.backends.cudnn
    cudnn.deterministic, cudnn.benchmark = True, False

    # NumPy and the standard-library generator
    for seed_fn in (np.random.seed, random.seed):
        seed_fn(seed)

    # Exported as a string because environment values must be strings
    os.environ['PYTHONHASHSEED'] = str(seed)

In [None]:
def get_probs_from_logits(logits):
    """
    Map a tensor of logits to probabilities with the sigmoid function.

    A trailing axis of size 1 is appended to `logits` before the sigmoid; the
    result is detached from the graph, moved to the CPU and returned as a
    NumPy array.
    """
    expanded = logits.unsqueeze(-1)
    return expanded.sigmoid().detach().cpu().numpy()

def test_prediction(net, device, aptamerDataFrame, dataloader, with_labels, result_path):
    """
    Predict the probabilities on a dataset with or without labels and print the result in a file

    Inputs:
        -net : model called as net(seq, attn_masks, token_type_ids) that returns logits
        -device : torch device the batches are moved to before the forward pass
        -aptamerDataFrame : path of the CSV file holding the 'Sequence1' and 'Sequence2'
                            columns of the pairs being scored (an already loaded
                            pandas DataFrame is accepted as well)
        -dataloader : DataLoader yielding (token ids, attention masks, token type ids[, labels])
        -with_labels : whether the batches carry a label; labels are not needed for
                       prediction and are ignored either way
        -result_path : path the result CSV is written to

    The written file holds the two sequence columns plus a 'Label' column with the
    predicted probability rounded to 0 or 1.
    """
    net.eval()
    probs_all = []

    with torch.no_grad():
        for batch in tqdm(dataloader, total=len(dataloader)):
            # Only the first three tensors are model inputs; a labelled dataset
            # adds a fourth element that used to break the unpacking.
            seq, attn_masks, token_type_ids = (t.to(device) for t in batch[:3])
            logits = net(seq, attn_masks, token_type_ids)
            probs = get_probs_from_logits(logits.squeeze(-1)).squeeze(-1)
            probs_all += probs.tolist()

    pair_columns = ['Sequence1', 'Sequence2']
    if isinstance(aptamerDataFrame, pd.DataFrame):
        # Fresh 0..n-1 index so the rows line up with the predictions in concat
        df1 = aptamerDataFrame[pair_columns].reset_index(drop=True)
    else:
        df1 = pd.read_csv(aptamerDataFrame, usecols=pair_columns)

    # round() turns each probability into a 0/1 label (exactly 0.5 rounds to 0)
    probs_all = [round(x) for x in probs_all]
    df2 = pd.DataFrame({'Label': probs_all})
    df = pd.concat([df1, df2], axis=1)
    df.to_csv(result_path)

In [None]:
#  Settings used when scoring sequence pairs with the model
bert_model = "albert-base-v2"  #  'albert-base-v2', 'albert-large-v2', 'albert-xlarge-v2' and others
maxlen = 32                    #  maximum length of the tokenized input sentence pair : if greater than "maxlen", the input is truncated and else if smaller, the input is padded
bs = 64                        #  batch size of testing
with_labels = False            #  False: CustomDataset returns only the three input tensors, no 'Label' column is read
iter = 1                       #  generation counter used in file names by run_GA; NOTE(review): shadows the builtin iter()

Stage III
Apply the genetic algorithm to generate a new population of aptamers

In [None]:
def run_GA():

    #  Generate N aptamers to have the same 1000 as before deleting inferior
    !python /content/genetic_algorithm/breeder.py --p {aptamerList} --o {directory} --l {apt_len} --i {iter}

    #  Pair up new batch
    !python /content/functions/pairing.py --p {aptamerList} --o {directory} --i {iter}

    #  Call alBERT to compare goodness of sequences
    df_test = pd.read_csv('{}iteration_{}.csv'.format(directory, iter))
    test_set = CustomDataset(df_test, maxlen, with_labels, bert_model)
    data_toModel = DataLoader(test_set) #nureadinti data pirma
    test_prediction(net=model, device=device, aptamerDataFrame='{}iteration_{}'.format(directory, iter), dataloader=data_toModel, with_labels=False, result_path='{}predicted_{}'.format(directory, iter))


    #  Find dominating aptamers and go to step 1 again.
    !python /content/functions/dominance_score.py --p {directory + 'predicted_' +str(iter)} --f {directory + 'breed_' + str(iter) + '.csv'} --o {directory + 'best_' + str(iter) + '.csv'}  --i {iter} --l {apt_len}
    #survarkyti kur galunes nera tokios ir tegul patys scriptai tuo rupinasi

    aptamerList = directory + 'best_' + iter
    iter += 1

In [None]:
#  Load the trained model once, then run GA generations with it
set_seed(2020)

print("Loading model...")
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# freeze_bert only matters for training, so the default is used for inference
model = Model(bert_model)
# NOTE(review): assumes the file at path_to_model holds a state_dict
# (saved with torch.save(net.state_dict(), ...)) - confirm.
# strict=False tolerates missing/unexpected keys in the checkpoint.
model.load_state_dict(torch.load(path_to_model, map_location=device), strict=False)
model.to(device)
model.eval()  # inference mode: disables dropout

#  TODO: stop on a convergence criterion; as written the loop runs until interrupted
while True:
    run_GA()