In [None]:
import os
import pickle
import torch
import numpy as np
import torch.optim as optim
from transformers import BertConfig, BertModel, BertForMaskedLM

# import matplotlib.pyplot as plt
# from transformers import *

In [None]:
# Load the pickled CDS collection, then show one record and the record count.
data_dir = '/home-3/msomme16@jhu.edu/scratch/shimmer/data/'
CDS_path = os.path.join(data_dir, "CDS_3600.pkl")

# NOTE: unpickling can execute arbitrary code; only open files you trust.
with open(CDS_path, mode='rb') as handle:
    CDS = pickle.load(handle)

print(CDS[10], len(CDS), sep="\n")

In [None]:
# Summary statistics (mean, median, min, max) of the entry lengths in CDS.
lengths = list(map(len, CDS))
meanlen, medlen = np.mean(lengths), np.median(lengths)
minlen, maxlen = np.min(lengths), np.max(lengths)
print(meanlen, medlen, minlen, maxlen)

In [None]:
# import logging
# logging.basicConfig(level=logging.INFO)

In [None]:
# # custom parameters for BERT model
# vocab_size = 12 # Vocabulary size of inputs_ids in BertModel. default=30522
# hidden_size = 768 # Size of the encoder layers and the pooler layer, default=768
# num_hidden_layers = 12 # Number of hidden layers in the Transformer encoder. default=12
# num_attention_heads = 12 # Number of attention heads for each attention layer in the Transformer encoder, default=12
# intermediate_size = 3072 # The size of the “intermediate” (i.e., feed-forward) layer in the Transformer encoder. default=3072
# hidden_act = "gelu" # The non-linear activation function (function or string) in the encoder and pooler. If string, “gelu”, “relu”, “swish” and “gelu_new” are supported. default="gelu"
# hidden_dropout_prob = 0.1 # The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. default=0.1
# attention_probs_dropout_prob = 0.1 # The dropout ratio for the attention probabilities. default=0.1
# max_position_embeddings = 512 # The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). default=512
# type_vocab_size = 2 # 1 # The vocabulary size of the token_type_ids passed into BertModel. default=2
# initializer_range = 0.02 # The sttdev of the truncated_normal_initializer for initializing all weight matrices. default=0.02
# layer_norm_eps = 1e-12 # The epsilon used by LayerNorm. default=1e-12


# config = BertConfig(vocab_size_or_config_json_file=vocab_size,
#                     hidden_size=hidden_size,
#                     num_hidden_layers=num_hidden_layers,
#                     num_attention_heads=num_attention_heads,
#                     intermediate_size=intermediate_size,
#                     hidden_act=hidden_act,
#                     hidden_dropout_prob=hidden_dropout_prob,
#                     attention_probs_dropout_prob=attention_probs_dropout_prob,
#                     max_position_embeddings=max_position_embeddings,
#                     type_vocab_size=type_vocab_size,
#                     initializer_range=initializer_range,
#                     layer_norm_eps=layer_norm_eps)

# model = BertForMaskedLM(config)

# print(model)
# model.to('cuda')

In [None]:
# Hyper-parameters of a deliberately small BERT (library default in parentheses).
vocab_size = 12                      # number of distinct input ids (30522)
hidden_size = 48                     # encoder / pooler width (768)
num_hidden_layers = 3                # Transformer encoder layers (12)
num_attention_heads = 3              # attention heads per encoder layer (12)
intermediate_size = 48 * 4           # feed-forward ("intermediate") width (3072)
hidden_act = "gelu"                  # "gelu", "relu", "swish" or "gelu_new" ("gelu")
hidden_dropout_prob = 0.1            # dropout on embeddings / encoder / pooler dense layers (0.1)
attention_probs_dropout_prob = 0.1   # dropout on the attention probabilities (0.1)
max_position_embeddings = 512        # longest sequence the model can be fed (512)
type_vocab_size = 1                  # number of token_type_ids values (2)
initializer_range = 0.02             # std-dev of the weight initialiser (0.02)
layer_norm_eps = 1e-12               # LayerNorm epsilon (1e-12)

# NOTE(review): `vocab_size_or_config_json_file` is the keyword this notebook was
# written against; confirm that the installed transformers release still accepts it.
bert_settings = dict(
    vocab_size_or_config_json_file=vocab_size,
    hidden_size=hidden_size,
    num_hidden_layers=num_hidden_layers,
    num_attention_heads=num_attention_heads,
    intermediate_size=intermediate_size,
    hidden_act=hidden_act,
    hidden_dropout_prob=hidden_dropout_prob,
    attention_probs_dropout_prob=attention_probs_dropout_prob,
    max_position_embeddings=max_position_embeddings,
    type_vocab_size=type_vocab_size,
    initializer_range=initializer_range,
    layer_norm_eps=layer_norm_eps,
)
config = BertConfig(**bert_settings)

# Randomly initialised masked-LM model built from the config above.
model = BertForMaskedLM(config)

print(model)
model.to('cuda')

In [None]:
# encode data as GPU tensors
# Width of the encoded-sequence tensor built below; it is set equal to the
# position-embedding limit chosen for the model.
max_aa_seq_length = max_position_embeddings


# One-letter residue code -> class id used by tokenize_aa_seq_murphy10.
# Built once at definition time instead of on every call.
# Ids 1-10 are the residue groups; "X" and "B" both map to 0.
_MURPHY10_TABLE = {
    "L": 1, "V": 1, "I": 1, "M": 1,
    "C": 2,
    "A": 3,
    "G": 4,
    "S": 5, "T": 5,
    "P": 6,
    "F": 7, "Y": 7, "W": 7,
    "E": 8, "D": 8, "N": 8, "Q": 8,
    "K": 9, "R": 9,
    "H": 10,
    "X": 0, "B": 0,
}


def tokenize_aa_seq_murphy10(aa_seq):
    """Convert an amino-acid sequence into a list of integer class ids.

    Args:
        aa_seq: iterable of one-letter residue codes (e.g. a ``str``). Only the
            upper-case codes present in ``_MURPHY10_TABLE`` are accepted.

    Returns:
        list[int]: one id per residue, in input order; ``[]`` for empty input.

    Raises:
        KeyError: if a residue is not in the table. The message names the
            offending symbol and its 0-based position.
    """
    tokenized = []
    for position, aa in enumerate(aa_seq):
        try:
            tokenized.append(_MURPHY10_TABLE[aa])
        except KeyError:
            # Same exception type as a plain dict lookup, but with enough
            # context to find the bad residue in a long sequence.
            raise KeyError(
                "unknown amino acid {!r} at position {}".format(aa, position)
            ) from None
    return tokenized

# Tokenise the first 100 sequences and pack them into one zero-padded tensor.
tokens = [tokenize_aa_seq_murphy10(seq) for seq in CDS[:100]]
tokens_tensor = torch.zeros(len(tokens), max_aa_seq_length, dtype=torch.long)
for i, seq_tokens in enumerate(tokens):
    # Sequences longer than the tensor width are truncated.
    # TODO build correctly sized data set and split into train test
    l = min(len(seq_tokens), max_aa_seq_length)
    # One slice assignment per row instead of one tensor write per residue.
    tokens_tensor[i, :l] = torch.tensor(seq_tokens[:l], dtype=torch.long)

tokens_tensor = tokens_tensor.to('cuda')

print(tokens_tensor)
print(tokens_tensor.shape)

In [None]:
# Ordered 80/20 split of the encoded rows.
# TODO do an actual random selection on better data
n_train = int(0.8*len(tokens_tensor))
dtrain, dvalid = tokens_tensor[:n_train], tokens_tensor[n_train:]
print(len(dtrain), len(dvalid), sep="\n")

In [None]:
# Scratch cell: prints a constant and touches no notebook state.
print(1)

In [None]:
# Plain SGD over all model parameters with lr=0.0001 (momentum left disabled).
optimizer = optim.SGD(model.parameters(), lr=0.0001)#, momentum=args.momentum)
# Alternative optimizer, currently disabled:
# optimizer = optim.AdamW(model.parameters())

In [None]:
# Train by masking one residue at a time and scoring only the masked position.
model.train()
# Label vector that is -1 everywhere; only the currently masked slot gets a real label.
loss_mask = (-torch.ones(max_position_embeddings)).long().to('cuda')

np.random.seed(2019)
for i in range(10):
    select_idx = np.random.randint(0, len(dtrain), 1)[0]
    input_ids = dtrain[select_idx].unsqueeze(0)

    for j in range(max_position_embeddings):
        tempval = input_ids[0, j].item()
        if tempval == 0:
            break  # stop training at end of protein sequence

        optimizer.zero_grad()

        # Score only position j, and replace its input with the mask id
        # (the highest vocab number, never present in data).
        loss_mask[j] = tempval
        input_ids[0, j] = vocab_size - 1

        # TODO mask padding 0's for attention
        outputs = model(input_ids, masked_lm_labels=loss_mask)
        loss, prediction_scores = outputs[:2]
        print(i, prediction_scores[0, j].argmax().item(), loss.item())
        loss.backward()
        optimizer.step()

        # Undo the masking so the next position sees the original residue.
        input_ids[0, j] = tempval
        loss_mask[j] = -1

In [None]:
# evaluate model
# Mask one residue at a time in randomly chosen validation rows and record
# whether the model's arg-max prediction recovers it.
model.eval()

np.random.seed(424242)
predseq = []
actualseq = []
# Inference only: no gradients are needed, so skip autograd bookkeeping.
with torch.no_grad():
    for i in range(20):
        select_idx = np.random.randint(0, len(dvalid), 1)[0]
        input_ids = dvalid[select_idx].unsqueeze(0)

        # Only the first 10 positions are probed
        # (the full-length alternative is range(max_position_embeddings)).
        for j in range(10):
            tempval = input_ids[0, j].item()
            if tempval != 0:
                input_ids[0, j] = vocab_size-1  # mask label is the highest vocab number, never present in data
                predicted_aa = torch.argmax(model(input_ids)[0][0, j]).item()  # TODO mask padding 0's for attention
                input_ids[0, j] = tempval  # set masked value back to original value
                predseq.append(predicted_aa)
                actualseq.append(tempval)
                print(i, predicted_aa, tempval)

n_correct = sum(1 for pred, actual in zip(predseq, actualseq) if pred == actual)
n_wrong = len(predseq) - n_correct
if predseq:
    print(n_correct/(n_correct+n_wrong))
else:
    # Every probed position was 0, so there is nothing to average.
    print("no non-zero positions were evaluated")

In [None]:
# Scratch checks: shape of the first model output for one validation row, then
# the arg-max id at position j of the first output for `input_ids`.
# NOTE(review): `input_ids` and `j` are not defined in this cell; they are
# whatever an earlier cell left in the kernel.
print(model(dvalid[2].unsqueeze(0))[0].shape)
print(torch.argmax(model(input_ids)[0][0,j]).item())

In [None]:
def evaluate(model, data, ignore_index=None):
    """Mean per-sequence accuracy of the model's arg-max predictions.

    Each row of ``data`` is fed to ``model`` unmasked (the row is also passed
    as ``masked_lm_labels``), and the arg-max over the last axis of the
    prediction scores is compared with the row itself.

    Args:
        model: callable returning ``(loss, prediction_scores, ...)`` when given
            ``masked_lm_labels``; it is switched to eval mode and left there.
        data: iterable of 1-D id tensors (e.g. a 2-D tensor iterated by row).
        ignore_index: optional id to leave out of the accuracy (e.g. the
            padding id). ``None`` (default) counts every position. Rows that
            consist only of ``ignore_index`` are skipped.

    Returns:
        The mean of the per-row accuracies, or ``nan`` if no row was scored.
    """
    model.eval()
    acc_list = []
    # Pure inference: do not build autograd graphs.
    with torch.no_grad():
        for d in data:
            outputs = model(d.unsqueeze(0), masked_lm_labels=d.unsqueeze(0))
            _, prediction_scores = outputs[:2]

            predicted_index = torch.argmax(prediction_scores, dim=2)
            hits = predicted_index == d
            if ignore_index is None:
                n_possible = len(predicted_index[0])
            else:
                counted = d != ignore_index
                hits = hits & counted
                n_possible = torch.sum(counted).item()
                if n_possible == 0:
                    continue  # nothing to score in this row
            n_correct = torch.sum(hits).item()
            acc_list.append(n_correct / n_possible)

    if not acc_list:
        return float("nan")
    return np.mean(acc_list)

In [None]:
# train model
# Each step feeds one randomly chosen training row to the model with the row
# itself as labels (no position is masked).
batch_size = 1

optimizer = optim.SGD(model.parameters(), lr=0.01)#, momentum=args.momentum)
# optimizer = optim.AdamW(model.parameters())
_max_select = len(dtrain)

# Seed and enter training mode BEFORE the first random draw / forward pass, so
# the warm-up step is reproducible and runs in the same mode as the loop.
np.random.seed(2019)
model.train()

# single step to non-zero weights
optimizer.zero_grad()
select_idx = np.random.randint(0, _max_select, batch_size)
input_ids = dtrain[select_idx]
outputs = model(input_ids, masked_lm_labels=input_ids)
loss, prediction_scores = outputs[:2]
loss.backward()
optimizer.step()

# train
for i in range(100):
    optimizer.zero_grad()
    select_idx = np.random.randint(0, _max_select, batch_size)
    # Index dtrain (not tokens_tensor) so the split being trained on is explicit
    # and matches the warm-up step above.
    input_ids = dtrain[select_idx]
    outputs = model(input_ids, masked_lm_labels=input_ids)
    loss, prediction_scores = outputs[:2]
    print(i, loss.item())
    loss.backward()
    optimizer.step()
    
#     loss.backward(retain_graph=True)



In [None]:
# Accuracy on both splits after training (train first, then validation).
acc_train, acc_valid = evaluate(model, dtrain), evaluate(model, dvalid)
print(acc_train, acc_valid, sep="\n")


In [None]:
# Compare the arg-max prediction with the input for the first validation row.
model.eval()
first_valid_row = dvalid[0].unsqueeze(0)  # add a batch axis; the row doubles as labels
outputs = model(first_valid_row, masked_lm_labels=first_valid_row)
loss, prediction_scores = outputs[:2]

predicted_index = prediction_scores.argmax(dim=2)
print(predicted_index, dvalid[0], sep="\n")

In [None]:
# debug
# Ten SGD steps on the first training row only, printing the loss each step.
model.train()
optimizer = optim.SGD(model.parameters(), lr=0.01)
input_ids = dtrain[0].unsqueeze(0)  # same row every step; inputs double as labels
for i in range(10):
    optimizer.zero_grad()
    outputs = model(input_ids, masked_lm_labels=input_ids)
    loss, prediction_scores = outputs[:2]
    print(i, loss.item())
    loss.backward()
    optimizer.step()



In [None]:
# Accuracy and arg-max predictions for a single encoded row (index 5).
model.eval()
select_idx = 5
input_ids = tokens_tensor[select_idx].unsqueeze(0)
outputs = model(input_ids, masked_lm_labels=input_ids)
loss, prediction_scores = outputs[:2]
predicted_index = prediction_scores.argmax(dim=2)

# evaluate() iterates over rows, so pass the row with its batch axis.
a = evaluate(model, tokens_tensor[select_idx].unsqueeze(0))
print(a, predicted_index, sep="\n")

In [None]:
# evaluate on random data to ensure accuracy metric works
model.eval()

np.random.seed(2019)
noise = np.random.randint(1, 10, 512).reshape(1,-1)
# .long() keeps the ids int64 even where NumPy's default integer is 32-bit.
input_ids = torch.from_numpy(noise).long().to('cuda')

outputs = model(input_ids, masked_lm_labels=input_ids)
loss, prediction_scores = outputs[:2]

predicted_index = torch.argmax(prediction_scores, dim=2)
# Score the noise itself. The previous version scored tokens_tensor[select_idx],
# i.e. real data selected by a variable left over from another cell.
a = evaluate(model, input_ids)
print(a)
print(input_ids)
print(predicted_index)

In [None]:
# fix forward mask
# Run the model on random ids with position 0 hidden from attention, and report
# accuracy over the positions that remain visible.
model.eval()

np.random.seed(2019)
noise = np.random.randint(1, 10, 512).reshape(1,-1)
input_ids = torch.from_numpy(noise).long().to('cuda')
# The previous `-inf` was an undefined name and could not be stored in an
# integer tensor anyway; use id 0 (the padding value) for the hidden slot.
input_ids[0,0] = 0

# 1 = attend, 0 = ignore.
attention_mask = torch.from_numpy(np.ones(512).reshape(1,-1)).float().to('cuda')
attention_mask[0,0] = 0

token_type_ids = torch.from_numpy(np.zeros(512).reshape(1,-1)).long().to('cuda')

# Actually hand both masks to the model (previously built but never passed).
with torch.no_grad():
    outputs = model(input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids,
                    masked_lm_labels=input_ids)
loss, prediction_scores = outputs[:2]

predicted_index = torch.argmax(prediction_scores, dim=2)
# Accuracy of this masked forward pass, restricted to attended positions.
attended = attention_mask > 0
a = (predicted_index == input_ids)[attended].float().mean().item()
print(a)
print(input_ids)
print(predicted_index)

In [None]:
# Drop the notebook's `model` name and ask PyTorch to release cached GPU memory.
# NOTE(review): any later cell that uses `model` must re-create it first.
del model
torch.cuda.empty_cache()

In [None]:
# train model
import torch.optim as optim
import torch.nn.functional as F

def train_epoch(epoch, args, model, device, data_loader, optimizer):
    """Run one optimisation pass over ``data_loader`` with an NLL loss.

    Args:
        epoch: epoch number, used only in the progress line.
        args: object exposing ``log_interval`` (log every that many batches).
        model: module mapping a batch of inputs to the values fed to
            ``F.nll_loss``; it is put into training mode.
        device: device the inputs and targets are moved to.
        data_loader: iterable of ``(data, target)`` batches that also supports
            ``len()`` and has a ``dataset`` attribute supporting ``len()``.
        optimizer: optimizer stepped once per batch.

    Returns:
        None. Progress lines are printed, prefixed with the process id.
    """
    model.train()
    pid = os.getpid()
    for step, (inputs, labels) in enumerate(data_loader):
        optimizer.zero_grad()
        batch_loss = F.nll_loss(model(inputs.to(device)), labels.to(device))
        batch_loss.backward()
        optimizer.step()

        if step % args.log_interval != 0:
            continue

        # Progress is approximated as batches done so far times this batch's size.
        seen = step * len(inputs)
        total = len(data_loader.dataset)
        percent = 100. * step / len(data_loader)
        print('{}\tTrain Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            pid, epoch, seen, total, percent, batch_loss.item()))
            
# Build an SGD optimizer for the current `model`, clear its gradients and dump
# every parameter tensor.
# NOTE(review): the preceding cell runs `del model`, so on a top-to-bottom run
# `model` is undefined here — confirm the intended cell order.
optimizer = optim.SGD(model.parameters(), lr=0.01)#, momentum=args.momentum)
optimizer.zero_grad()
# Disabled forward / loss experiment:
# output = model(tokens_tensor)
# loss = F.nll_loss(output, target.to('cuda'))
print([x for x in model.parameters()])


In [None]:
from transformers import BertTokenizer, BertModel, BertForMaskedLM

# Smoke test: encode an English sentence with the bert-base-uncased tokenizer
# and run it through a freshly initialised BertForMaskedLM(config), using the
# input ids as labels. The model here stays on the CPU (no .to('cuda')).
# NOTE(review): `config` was built earlier with vocab_size = 12, while this
# tokenizer presumably emits ids from a much larger vocabulary — confirm that
# the embedding lookup does not fail, or use the commented from_pretrained line.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model = BertForMaskedLM(config)

input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
outputs = model(input_ids, masked_lm_labels=input_ids)
loss, prediction_scores = outputs[:2]
print(loss)
print(prediction_scores)

In [None]:
# Shape comparison: the current `input_ids`, one encoded row with a batch axis
# added, and a two-row slice of `tokens_tensor`.
print(input_ids.shape)
print(tokens_tensor[0].unsqueeze(0).shape)
print(tokens_tensor[0:2].shape)

In [None]:
from transformers import BertTokenizer, BertModel, BertForMaskedLM

# Forward/backward smoke test of a freshly initialised model on the first two
# rows of tokens_tensor; the inputs double as labels.
model = BertForMaskedLM(config).to('cuda')

input_ids = tokens_tensor[0:2]
outputs = model(input_ids, masked_lm_labels=input_ids)
loss, prediction_scores = outputs[:2]
print(loss, prediction_scores.shape, sep="\n")

# Two more passes, each followed by backward(); no optimizer step is taken.
for i in range(2):
    input_ids = tokens_tensor[0:2]
    outputs = model(input_ids, masked_lm_labels=input_ids)
    loss, prediction_scores = outputs[:2]
    loss.backward()
    print(loss, prediction_scores.shape, sep="\n")

In [None]:
import torch
from transformers import BertTokenizer, BertModel, BertForMaskedLM

# OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows
# import logging
# logging.basicConfig(level=logging.INFO)
# logging.basicConfig(level=logging.NONE)


# Pre-trained WordPiece vocabulary.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Two-sentence example; no [MASK] substitution is made in this cell.
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
tokenized_text = tokenizer.tokenize(text)

# Token strings -> vocabulary ids.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
print(indexed_tokens)

# Sentence A / sentence B indicator: seven 0s followed by seven 1s.
segments_ids = [0] * 7 + [1] * 7

# Wrap both lists in a batch axis of size 1.
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

In [None]:
# Load pre-trained model (weights)
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model.eval()

# If you have a GPU, put everything on cuda
tokens_tensor = tokens_tensor.to('cuda')
model.to('cuda')

# Predict all tokens (segment ids are not passed)
with torch.no_grad():
    outputs = model(tokens_tensor)
    predictions = outputs[0]

# Position inspected first. Defined here so the cell does not rely on a value
# left behind by another cell; 8 is the index the masking cells use.
masked_index = 8

# confirm we were able to predict 'henson', then look at position 11 as well
for position in (masked_index, 11):
    predicted_index = torch.argmax(predictions[0, position]).item()
    predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
    print(predicted_token)

In [None]:
# Print the arg-max token for every position of the encoded example.
for position in range(len(indexed_tokens)):
    predicted_index = predictions[0, position].argmax().item()
    (predicted_token,) = tokenizer.convert_ids_to_tokens([predicted_index])
    print(predicted_token)

In [None]:
# Show which token string the current tokenizer assigns to vocabulary id 103.
predicted_token = tokenizer.convert_ids_to_tokens([103])[0]
print(predicted_token)

In [None]:
# Configure the root logger at INFO level so library log messages are shown.
import logging
logging.basicConfig(level=logging.INFO)

In [None]:
# example tokenization
from transformers import BertTokenizer, BertModel, BertForMaskedLM
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenise the two-sentence example and show the token strings.
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
tokenized_text = tokenizer.tokenize(text)
print(tokenized_text)

# Replace the token at index 8 with [MASK] and show the result.
masked_index = 8
tokenized_text[masked_index] = '[MASK]'
print(tokenized_text)

# Token strings -> ids -> tensor with a batch axis of size 1.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
tokens_tensor = torch.tensor([indexed_tokens])
print(tokens_tensor, tokens_tensor.shape, sep="\n")

In [None]:
# Pre-trained masked-LM weights, in eval mode.
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model.eval()

# Move both the input ids and the model to the GPU (segment ids are not used).
tokens_tensor = tokens_tensor.to('cuda')
model.to('cuda')

# Score every position without tracking gradients.
with torch.no_grad():
    outputs = model(tokens_tensor)
    predictions = outputs[0]

# confirm we were able to predict 'henson'
predicted_index = predictions[0, masked_index].argmax().item()
(predicted_token,) = tokenizer.convert_ids_to_tokens([predicted_index])
print(predicted_token)

In [1]:
# GPT-2

import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Tokenizer and weights come from the same pre-trained checkpoint.
gpt2_checkpoint = 'gpt2-large'
tokenizer = GPT2Tokenizer.from_pretrained(gpt2_checkpoint)
model = GPT2LMHeadModel.from_pretrained(gpt2_checkpoint)

# Evaluation mode deactivates the dropout modules, which is needed for
# reproducible results during evaluation.
model.eval()
model.to('cuda')

To use data.metrics please install scikit-learn. See https://scikit-learn.org/stable/index.html
100%|██████████| 1042301/1042301 [00:00<00:00, 6347012.86B/s]
100%|██████████| 456318/456318 [00:00<00:00, 11644711.41B/s]
100%|██████████| 529/529 [00:00<00:00, 602276.55B/s]
100%|██████████| 3247202234/3247202234 [01:21<00:00, 39843399.03B/s]


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1280)
    (wpe): Embedding(1024, 1280)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2):

In [42]:
# Prompt to complete; swap in any other string to try a different one.
original_text = "Improving on state-of-the-art bacterial gene finding programs is hard, "

# Greedy decoding: append the single most likely next token until the text
# ends with "." or the token budget is used up.
max_new_tokens = 20

text = original_text
# Defined up front so the printing cell works even if no token is generated
# (e.g. when the prompt already ends with ".").
predicted_text = text
for i in range(max_new_tokens):
    if text.endswith("."):
        break  # sentence finished; no need to spin through the remaining steps

    # Encode the text so far and move it to the GPU as a batch of one.
    indexed_tokens = tokenizer.encode(text)
    tokens_tensor = torch.tensor([indexed_tokens]).to('cuda')

    # Score every position without tracking gradients.
    with torch.no_grad():
        outputs = model(tokens_tensor)
        predictions = outputs[0]

    # Take the most likely token after the last position and append it.
    predicted_index = torch.argmax(predictions[0, -1, :]).item()
    predicted_text = tokenizer.decode(indexed_tokens + [predicted_index])

    text = predicted_text

In [43]:
# Show the prompt next to the text produced by the generation cell.
print("Original text:\t\t", original_text)
print("Completed sentence:\t", predicted_text)

Original text:		 Improving on state-of-the-art bacterial gene finding programs is hard, 
Completed sentence:	 Improving on state-of-the-art bacterial gene finding programs is hard, but it's not impossible.
