In [1]:
import os
import pickle
import torch
import numpy as np
import torch.optim as optim
from transformers import BertConfig, BertModel, BertForMaskedLM

# import matplotlib.pyplot as plt
# from transformers import *

To use data.metrics please install scikit-learn. See https://scikit-learn.org/stable/index.html


In [2]:
# load data
data_dir = '/home-3/msomme16@jhu.edu/scratch/shimmer/data/'
CDS_path = os.path.join(data_dir, "CDS_3600.pkl")

with open(CDS_path, 'rb') as f:
    CDS = pickle.load(f)
print(CDS[10])
print(len(CDS))

MKIAFDTVNVYYLPQFIPICDELAKRGHEVKLVCYSNKNKAQAFEQVLLSFGYEFCWVDDDKAARDLYLKEEPDWIFFGNGFSYLDDIHKVSKTAQLGHGIGPKPSYYHKSSTPMTVRFIEGKMRLAKIRELYPNDEFVQVGFSKLDPLFNNTEPGLKYDELGLDKGKPTLLFAPTFNPSSLECFPDDWPSHFTDFNILIKPHTFTYSREAYKNQRKKLKKWAQFNNTYVATETDISLLPFMKDADILISEASSTLFEFVALSKPVIVCNFFKLKWSYRGIFNYRFEKRFGKDNVIYKNIGLHINDYTELRAAIDKQLNNEHLYKEERKNYTQDHVGPTDGKSSLRIVDYIENN
9860046


In [3]:
lengths = [len(s) for s in CDS]
meanlen = np.mean(lengths)
medlen = np.median(lengths)
minlen = np.min(lengths)
maxlen = np.max(lengths)
print(meanlen, medlen, minlen, maxlen)

330.0915760433572 291.0 0 16477


In [None]:
# import logging
# logging.basicConfig(level=logging.INFO)

In [4]:
# custom parameters for BERT model
vocab_size = 11 # Vocabulary size of inputs_ids in BertModel. default=30522
hidden_size = 768 # Size of the encoder layers and the pooler layer, default=768
num_hidden_layers = 12 # Number of hidden layers in the Transformer encoder. default=12
num_attention_heads = 12 # Number of attention heads for each attention layer in the Transformer encoder, default=12
intermediate_size = 3072 # The size of the “intermediate” (i.e., feed-forward) layer in the Transformer encoder. default=3072
hidden_act = "gelu" # The non-linear activation function (function or string) in the encoder and pooler. If string, “gelu”, “relu”, “swish” and “gelu_new” are supported. default="gelu"
hidden_dropout_prob = 0.1 # The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. default=0.1
attention_probs_dropout_prob = 0.1 # The dropout ratio for the attention probabilities. default=0.1
max_position_embeddings = 512 # The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). default=512
type_vocab_size = 2 # 1 # The vocabulary size of the token_type_ids passed into BertModel. default=2
initializer_range = 0.02 # The sttdev of the truncated_normal_initializer for initializing all weight matrices. default=0.02
layer_norm_eps = 1e-12 # The epsilon used by LayerNorm. default=1e-12


config = BertConfig(vocab_size_or_config_json_file=vocab_size,
                    hidden_size=hidden_size,
                    num_hidden_layers=num_hidden_layers,
                    num_attention_heads=num_attention_heads,
                    intermediate_size=intermediate_size,
                    hidden_act=hidden_act,
                    hidden_dropout_prob=hidden_dropout_prob,
                    attention_probs_dropout_prob=attention_probs_dropout_prob,
                    max_position_embeddings=max_position_embeddings,
                    type_vocab_size=type_vocab_size,
                    initializer_range=initializer_range,
                    layer_norm_eps=layer_norm_eps)

# model = BertModel(config)
model = BertForMaskedLM(config)

print(model)
model.to('cuda')


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(11, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(11, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)

In [5]:
# encode data as GPU tensors
max_aa_seq_length = max_position_embeddings


def tokenize_aa_seq_murphy10(aa_seq):
    table = {"L":1,
             "V":1,
             "I":1,
             "M":1,
             "C":2,
             "A":3,
             "G":4,
             "S":5,
             "T":5,
             "P":6,
             "F":7,
             "Y":7,
             "W":7,
             "E":8,
             "D":8,
             "N":8,
             "Q":8,
             "K":9,
             "R":9,
             "H":10,
             "X":0,
             "B":0}
    tokenized = [table[aa] for aa in aa_seq]
    return tokenized

tokens = [tokenize_aa_seq_murphy10(seq) for seq in CDS[:100]]
tokens_tensor = torch.zeros(len(tokens), max_aa_seq_length, dtype=torch.long)
for i in range(len(tokens)):
    l = len(tokens[i]) # scuff way to ensure fit in tensor, TODO build correctly sized data set and split into train test
    if l > max_aa_seq_length:
        l = max_aa_seq_length
    for j in range(l):
        tokens_tensor[i][j] += tokens[i][j]

tokens_tensor = tokens_tensor.to('cuda')

print(tokens_tensor)
print(tokens_tensor.shape)

tensor([[1, 9, 8,  ..., 0, 0, 0],
        [1, 8, 4,  ..., 0, 0, 0],
        [1, 9, 7,  ..., 9, 8, 8],
        ...,
        [1, 5, 5,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 5, 8, 9],
        [1, 6, 1,  ..., 0, 0, 0]], device='cuda:0')
torch.Size([100, 512])


In [6]:
# train model
batch_size = 5

# optimizer = optim.SGD(model.parameters(), lr=0.01)#, momentum=args.momentum)
optimizer = optim.AdamW(model.parameters())

_max_select = len(tokens_tensor)
np.random.seed(2019)
for i in range(1000):
    optimizer.zero_grad()
    input_ids = tokens_tensor[np.random.randint(0, _max_select, batch_size)]
    outputs = model(input_ids, masked_lm_labels=input_ids)
    loss, prediction_scores = outputs[:2]
    print(loss)
    loss.backward()
    optimizer.step()
    
#     loss.backward(retain_graph=True)

tensor(2.5065, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(3.6123, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(5.0721, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.4245, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(5.2004, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(4.2349, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(3.2856, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.7870, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(4.3617, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.4910, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0807, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1377, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.4310, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.5223, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0783, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.0333, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2541, device='cuda:0', grad_fn=<NllLossBackward

KeyboardInterrupt: 

In [None]:
# train model
import torch.optim as optim
import torch.nn.functional as F

def train_epoch(epoch, args, model, device, data_loader, optimizer):
    model.train()  # set to training mode, disappointingly does not actually train the model 
    pid = os.getpid()
    for batch_idx, (data, target) in enumerate(data_loader):
        optimizer.zero_grad()
        output = model(data.to(device))
        loss = F.nll_loss(output, target.to(device))
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            print('{}\tTrain Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                pid, epoch, batch_idx * len(data), len(data_loader.dataset),
                100. * batch_idx / len(data_loader), loss.item()))
            
optimizer = optim.SGD(model.parameters(), lr=0.01)#, momentum=args.momentum)
optimizer.zero_grad()
# output = model(tokens_tensor)
# loss = F.nll_loss(output, target.to('cuda'))
print([x for x in model.parameters()])


In [None]:
from transformers import BertTokenizer, BertModel, BertForMaskedLM

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model = BertForMaskedLM(config)

input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
outputs = model(input_ids, masked_lm_labels=input_ids)
loss, prediction_scores = outputs[:2]
print(loss)
print(prediction_scores)

In [None]:
print(input_ids.shape)
print(tokens_tensor[0].unsqueeze(0).shape)
print(tokens_tensor[0:2].shape)

In [None]:
from transformers import BertTokenizer, BertModel, BertForMaskedLM

# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model = BertForMaskedLM(config)
model.to('cuda')

# input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
# input_ids = tokens_tensor[0].unsqueeze(0)
input_ids = tokens_tensor[0:2]

outputs = model(input_ids, masked_lm_labels=input_ids)
loss, prediction_scores = outputs[:2]
print(loss)
print(prediction_scores.shape)

for i in range(2):
    input_ids = tokens_tensor[0:2]
    outputs = model(input_ids, masked_lm_labels=input_ids)
    loss, prediction_scores = outputs[:2]
    loss.backward()
    print(loss)
    print(prediction_scores.shape)
#     loss.backward(retain_graph=True)

In [None]:
import logging
logging.basicConfig(level=logging.INFO)

In [None]:
# example tokenization
from transformers import BertTokenizer, BertModel, BertForMaskedLM
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
tokenized_text = tokenizer.tokenize(text)
print(tokenized_text)

masked_index = 8
tokenized_text[masked_index] = '[MASK]'
print(tokenized_text)

indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
tokens_tensor = torch.tensor([indexed_tokens])
print(tokens_tensor)
print(tokens_tensor.shape)

In [None]:
# Load pre-trained model (weights)
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model.eval()

# If you have a GPU, put everything on cuda
tokens_tensor = tokens_tensor.to('cuda')
# segments_tensors = segments_tensors.to('cuda')
model.to('cuda')

# Predict all tokens
with torch.no_grad():
#     outputs = model(tokens_tensor, token_type_ids=segments_tensors)
    outputs = model(tokens_tensor)
    predictions = outputs[0]

# confirm we were able to predict 'henson'
predicted_index = torch.argmax(predictions[0, masked_index]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
print(predicted_token)

In [None]:
# GPT-2

import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load pre-trained model tokenizer (vocabulary)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-large')

# Load pre-trained model (weights)
model = GPT2LMHeadModel.from_pretrained('gpt2-large')


# Set the model in evaluation mode to deactivate the DropOut modules
# This is IMPORTANT to have reproducible results during evaluation!
model.eval()
model.to('cuda')

# original_text = "Martin Steinegger is in Peru because he "
# original_text = "The real reason Steven keeps recruiting German postdocs is "
original_text = "The secret to giving a fun and compelling Joint Lab Meeting presentation is "
# original_text = "Johns Hopkins University is"
# original_text = "UC Berkeley is"
# original_text = "Why did Donald Trump "
# original_text = "Finding genes is easy... The secret is "
# original_text = "The reason I want to get a PhD in Biomedical Engineering is "
# original_text = "Computational gene finding is easy, the real secret is "
# original_text = "Computational gene finding is easy, the problem is "
# original_text = "I came to Johns Hopkins University because "
# original_text = "My experiment does not work because "





text = original_text
for i in range(100): # not the best way to iterate, but it works
#     if text[-1] != ".":
        # Encode a text inputs
        indexed_tokens = tokenizer.encode(text)

        # Convert indexed tokens in a PyTorch tensor
        tokens_tensor = torch.tensor([indexed_tokens])

        # If you have a GPU, put everything on cuda
        tokens_tensor = tokens_tensor.to('cuda')

        # Predict all tokens
        with torch.no_grad():
            outputs = model(tokens_tensor)
            predictions = outputs[0]

        # get the predicted next sub-word (in our case, the word 'man')
        predicted_index = torch.argmax(predictions[0, -1, :]).item()
        predicted_text = tokenizer.decode(indexed_tokens + [predicted_index])

        text = predicted_text

In [None]:
print("Original text:\t\t", original_text)
print("Completed sentence:\t", predicted_text)