In [2]:
# Show which GPU this runtime was assigned, with driver/CUDA versions and memory use.
!nvidia-smi

Sun Dec  8 21:55:03 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.33.01    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P8    31W / 149W |      0MiB / 11441MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [4]:
# Mount Google Drive so that data and model files stored there are reachable
# from the runtime's filesystem.
from google.colab import drive

# Where Drive is attached inside the runtime.
DRIVE_MOUNT_POINT = '/content/drive'

# Mounting is interactive: it prompts for an authorization code.
drive.mount(DRIVE_MOUNT_POINT)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [3]:
!pip install biopython
!pip install transformers

Collecting biopython
[?25l  Downloading https://files.pythonhosted.org/packages/96/01/7e5858a1e54bd0bd0d179cd74654740f07e86fb921a43dd20fb8beabe69d/biopython-1.75-cp36-cp36m-manylinux1_x86_64.whl (2.3MB)
[K     |████████████████████████████████| 2.3MB 2.7MB/s 
Installing collected packages: biopython
Successfully installed biopython-1.75
Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/70/1a/364556102943cacde1ee00fdcae3b1615b39e52649eddbf54953e5b144c9/transformers-2.2.1-py3-none-any.whl (364kB)
[K     |████████████████████████████████| 368kB 2.7MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/14/3d/efb655a670b98f62ec32d66954e1109f403db4d937c50d779a75b9763a29/sentencepiece-0.1.83-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 50.2MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/1f/8e/ed5364a06a9ba720fddd9820155cc57300d28f5f43a6f

In [5]:
import os
import sys
import pickle
import torch
import numpy as np
import torch.optim as optim
# from transformers import BertConfig, BertModel, BertForMaskedLM
from transformers import GPT2Config, GPT2Model, GPT2LMHeadModel
from Bio import SeqIO
from Bio.Data import CodonTable

from IPython.display import clear_output

In [0]:
# Read the pickled gene ORFs and fake ORFs from Drive, keeping only the
# sequences that are longer than 40 residues.
d = "drive/My Drive/Colab Notebooks/smaug/data"
gene_ORF_path = os.path.join(d, "ecoli_MG1655_geneORFs.pkl")
fake_ORF_path = os.path.join(d, "ecoli_MG1655_fakeORFs.pkl")
# Alternative dataset:
# gene_ORF_path = os.path.join(d, "Staphylococcus_geneORFs.pkl")
# fake_ORF_path = os.path.join(d, "Staphylococcus_fakeORFs.pkl")


def _load_long_sequences(pickle_path):
    """Unpickle a list of sequences and return those with more than 40 items.

    NOTE(review): pickle.load executes arbitrary code from the file; only use
    this on pickles you created yourself.
    """
    with open(pickle_path, 'rb') as handle:
        sequences = pickle.load(handle)
    return [seq for seq in sequences if len(seq) > 40]


gene_aa_filtered = _load_long_sequences(gene_ORF_path)
# The fake ORFs were already length-filtered when they were created, so this
# filter is only a safeguard for them.
fake_aa = _load_long_sequences(fake_ORF_path)

In [0]:
# Integer code for every accepted amino-acid letter. Code 0 is shared by the
# symbols the original author marked "get rid of these" (X, B and "*").
# Built once at definition time instead of on every call.
_AA_TO_INT = {
    "L": 1,
    "V": 2,
    "I": 3,
    "M": 4,
    "C": 5,
    "A": 6,
    "G": 7,
    "S": 8,
    "T": 9,
    "P": 10,
    "F": 11,
    "Y": 12,
    "W": 13,
    "E": 14,
    "D": 15,
    "N": 16,
    "Q": 17,
    "K": 18,
    "R": 19,
    "H": 20,
    "X": 0,  # get rid of these
    "B": 0,
    "*": 0,
}


def tokenize_aa_seq(aa_seq):
    """Convert amino acid letters to integers. Can also use murphy's reduced aa alphabet later.

    Args:
        aa_seq: iterable of single-character amino-acid symbols (e.g. a str).

    Returns:
        1-D ``torch.long`` tensor with one integer code per symbol. An empty
        input yields an empty ``torch.long`` tensor; without the explicit
        dtype, ``torch.tensor([])`` would be inferred as float32.

    Raises:
        KeyError: if a symbol is not in the lookup table.
    """
    return torch.tensor([_AA_TO_INT[aa] for aa in aa_seq], dtype=torch.long)

In [8]:
# Split long ORFs into fixed-length, overlapping shingles.

# TODO: this was done in time crunch, should not be done like this. Redo
SHINGLE_STEP = 10
SHINGLE_LENGTH = 20 # shingle length should be the same as trained model window size


def shingle_sequence(seq, shingle_length=SHINGLE_LENGTH, shingle_step=SHINGLE_STEP):
    """Cut one sequence into tokenized shingles.

    Windows of `shingle_length` are taken from the front of the sequence,
    advancing by `shingle_step` while more than `shingle_length + shingle_step`
    symbols remain. One final right-aligned window is then added so the tail
    of the sequence is covered.

    Args:
        seq: sliceable sequence of amino-acid symbols.
        shingle_length: number of symbols per shingle.
        shingle_step: offset between the starts of consecutive shingles.

    Returns:
        List of 1-D tensors produced by `tokenize_aa_seq`. They all have length
        `shingle_length` provided `len(seq) >= shingle_length`; the upstream
        length filter (> 40) guarantees that for the default settings.
    """
    shingles = []
    while True:
        shingles.append(tokenize_aa_seq(seq[:shingle_length]))
        if len(seq) > shingle_length + shingle_step:
            seq = seq[shingle_step:]
        else:
            break
    # Right-aligned window over what is left of the sequence.
    shingles.append(tokenize_aa_seq(seq[-shingle_length:]))
    return shingles


def shingle_dataset(sequences, shingle_length=SHINGLE_LENGTH, shingle_step=SHINGLE_STEP):
    """Shingle every sequence and flatten the result for submission to the GPU.

    Args:
        sequences: list of amino-acid sequences.
        shingle_length: number of symbols per shingle.
        shingle_step: offset between the starts of consecutive shingles.

    Returns:
        Tuple ``(shingled, stacked, counts, flat)``:
        shingled - list (per sequence) of lists of shingle tensors;
        stacked  - list (per sequence) of 2-D tensors, one row per shingle;
        counts   - number of shingles per sequence, which is what maps rows of
                   `flat` back to the sequence they came from;
        flat     - all shingles concatenated along dim 0.
    """
    shingled = []
    for i, seq in enumerate(sequences):
        if i % 1000 == 0:
            print(i)  # coarse progress indicator
        shingled.append(shingle_sequence(seq, shingle_length, shingle_step))
    stacked = [torch.stack(x, dim=0) for x in shingled]
    counts = [len(x) for x in stacked]
    flat = torch.cat(stacked, dim=0)
    return shingled, stacked, counts, flat


# Genes (ORFs) and fake ORFs (ARFs) go through the same pipeline. The per-set
# shingle counts are kept under separate names so the ORF bookkeeping is no
# longer overwritten by the ARF pass.
ORF_shingled, ORF_combined, ORF_combined_shape, ORF_flat = shingle_dataset(gene_aa_filtered)
ARF_shingled, ARF_combined, ARF_combined_shape, ARF_flat = shingle_dataset(fake_aa)

# Legacy names: as before, these end up describing the ARF set.
combined = ARF_combined
combined_shape = ARF_combined_shape

0
1000
2000
3000
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000


In [9]:
# Build the labelled shingle dataset and split it 80/20 into training and
# validation sets.
# NOTE(review): the split is made over individual shingles, so overlapping
# shingles cut from the same ORF can land in both the training and the
# validation set. Confirm this leakage is acceptable for the reported metrics.
data_X = torch.cat((ORF_flat, ARF_flat))
# data_X = torch.cat((ORF_flat, ARF_flat)).type(torch.long)
data_y = torch.cat((torch.ones(len(ORF_flat), dtype=int), torch.zeros(len(ARF_flat), dtype=int))) # 1=gene, 0=arf

# Fixed seed so the index draws below are repeatable.
np.random.seed(2019)
# select_idx = np.random.randint(0, len(data_X), int(0.8*len(data_X)))
# Draw 80% of the row indices without replacement for training.
select_idx = np.random.choice(range(len(data_X)), size=int(0.8*len(data_X)), replace=False)
dtrain_X = data_X[select_idx]
# Labels are reshaped to a single column, matching the classifier's one-node output.
dtrain_y = data_y[select_idx].reshape(-1,1)

# Boolean mask that is True for every row NOT drawn for training.
inverse_select_idx = np.ones(len(data_X), dtype=bool)
inverse_select_idx[select_idx] = 0
dvalid_X = data_X[inverse_select_idx]
dvalid_y = data_y[inverse_select_idx].reshape(-1,1)

# The mask keeps the original row order (all genes first, then all ARFs), so
# shuffle the validation rows, using the same permutation for X and y.
rand_idx = np.random.choice(range(len(dvalid_X)), size=int(len(dvalid_X)), replace=False)
dvalid_X = dvalid_X[rand_idx]
dvalid_y = dvalid_y[rand_idx]

# Sanity check: the training and validation sizes must add up to the total.
print(len(data_X), len(dtrain_X), len(dvalid_X), len(dtrain_X)+len(dvalid_X))

289739 231791 57948 289739


In [0]:
# Teach the new head how to classify genes

In [0]:
# from transformers import BertForSequenceClassification

# examplemodel = BertForSequenceClassification.from_pretrained("bert-base-uncased")
# print(examplemodel)

In [16]:
class GeneBinaryClassifier(torch.nn.Module):
    """Pretrained GPT2 model with its last layer swapped for a binary classification head.

    The network loaded from `pretrained_path` is stripped of its final child
    module. Two linear layers are added on top of what remains: a square
    bias-free projection followed by a single-output classifier. The output is
    a raw logit per input sequence (no sigmoid is applied here).
    """

    def __init__(self, pretrained_path, hidden_dim):
        """Load the pretrained network and attach the new head.

        Args:
            pretrained_path: directory passed to `GPT2LMHeadModel.from_pretrained`.
            hidden_dim: width of the hidden states fed to the head
                (presumably the pretrained model's embedding size).
        """
        super().__init__()
        pretrained = GPT2LMHeadModel.from_pretrained(pretrained_path)
        pretrained.config.output_hidden_states = False

        # Keep every child module except the last one and chain the survivors.
        self.removed = list(pretrained.children())[:-1]
        self.model = torch.nn.Sequential(*self.removed)

        # TODO: freezing all pretrained layers (requires_grad = False on
        # self.model.parameters()) empirically kills the model almost
        # entirely - why?

        # Fully connected layer. No activation is applied after it; Sigmoid and
        # Tanh were tried, similar to BERT:
        # https://github.com/google-research/bert/issues/43
        self.fc = torch.nn.Linear(hidden_dim, hidden_dim, bias=False)

        # TODO: compare BCEWithLogitsLoss on 1 node to Cross Entropy on 2 nodes
        self.classifier = torch.nn.Linear(hidden_dim, 1)

    def forward(self, X):
        """Return one logit per row of `X`.

        Only the hidden state of the last token is used (similar to BERT taking
        only the first token).
        """
        hidden_states, _ = self.model(X)
        last_token_state = hidden_states[:, -1]
        projected = self.fc(last_token_state)
        return self.classifier(projected)

# Directory of the pretrained language model to build on. The pretraining
# section's savemodel() writes to this same path via save_pretrained.
# modeldir = "drive/My Drive/Colab Notebooks/smaug/data/models/ecoli_trivial_length40_overlap_20"
modeldir = "drive/My Drive/Colab Notebooks/smaug/data/models/ecoli_trivial_length20_overlap10"

# Width of the hidden states handed to the classifier head. It matches the
# embedding size of the loaded model (the printout shows Embedding(21, 16)).
hidden_dim = 16
# Build the classifier and move it to the GPU; a CUDA device is required.
genemodel = GeneBinaryClassifier(modeldir, hidden_dim).to("cuda")
print(genemodel)

GeneBinaryClassifier(
  (model): Sequential(
    (0): GPT2Model(
      (wte): Embedding(21, 16)
      (wpe): Embedding(20, 16)
      (drop): Dropout(p=0, inplace=False)
      (h): ModuleList(
        (0): Block(
          (ln_1): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
          (attn): Attention(
            (c_attn): Conv1D()
            (c_proj): Conv1D()
            (attn_dropout): Dropout(p=0, inplace=False)
            (resid_dropout): Dropout(p=0, inplace=False)
          )
          (ln_2): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
          (mlp): MLP(
            (c_fc): Conv1D()
            (c_proj): Conv1D()
            (dropout): Dropout(p=0, inplace=False)
          )
        )
        (1): Block(
          (ln_1): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
          (attn): Attention(
            (c_attn): Conv1D()
            (c_proj): Conv1D()
            (attn_dropout): Dropout(p=0, inplace=False)
            (resid_dropout): Dropout(

In [0]:
# AdamW with default hyper-parameters over every parameter of the classifier
# (the pretrained layers are not frozen, so they are fine-tuned too).
optimizer = optim.AdamW(genemodel.parameters())

# optimizer = optim.SGD(genemodel.parameters(), lr=0.001, momentum=0.9)

# Binary cross-entropy computed on raw logits: the classifier outputs one
# unsquashed score per example and the sigmoid is applied inside the loss.
# criterion = torch.nn.CrossEntropyLoss()
# criterion = torch.nn.BCELoss()
criterion = torch.nn.BCEWithLogitsLoss()

# NOTE(review): this scheduler is created, but the training loop in this
# notebook never calls exp_lr_scheduler.step(), so the learning rate is never
# decayed. Confirm whether that is intended.
exp_lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

In [0]:
def savemodel(model):
    """Pickle `model` to the "guillotine" model directory on Google Drive.

    Drive is (re)mounted first. The target directory is created if it does not
    exist yet, so a long training run cannot fail at its first checkpoint
    because of a missing folder.

    NOTE(review): `torch.save(model, f)` pickles the whole module object, so
    loading it later requires the `GeneBinaryClassifier` class to be defined.
    Saving `model.state_dict()` would be more portable, but would change the
    file format for existing readers.
    NOTE(review): a later cell defines a zero-argument `savemodel()` that
    shadows this function once that cell has been run.

    Args:
        model: the torch module to serialize.
    """
    # Imported here so the function is self-contained in a Colab runtime.
    from google.colab import drive

    # This will prompt for authorization.
    drive.mount('/content/drive')

    # modeldir = "drive/My Drive/Colab Notebooks/smaug/data/models/ecoli_trivial_length40_overlap20_guillotine"
    modeldir = "drive/My Drive/Colab Notebooks/smaug/data/models/ecoli_trivial_length20_overlap10_guillotine"
    # open(..., "wb") does not create missing parent directories.
    os.makedirs(modeldir, exist_ok=True)
    p = os.path.join(modeldir, "model")
    with open(p, "wb") as f:
        torch.save(model, f)
    # model.config.save_pretrained(modeldir)
    # List the directory as a visual confirmation that the file was written.
    print(os.listdir(modeldir))

In [23]:
# Fine-tune the classifier (pretrained body + new head) on the gene/ARF labels.
# The loop bound is effectively infinite: training is stopped by interrupting
# the kernel (the recorded output ends in a KeyboardInterrupt).
genemodel.train()

batch_size = 2**9

# Redundant with the zero_grad() at the top of each iteration; kept as-is.
optimizer.zero_grad()
for i in range(int(1e10)):
    # Checkpoint at iterations 100, 600, 1100, ...
    if i%500==100:
        savemodel(genemodel)
    optimizer.zero_grad()

    # Random mini-batch, sampled WITH replacement. No seed is set in this cell,
    # so the sequence of batches continues from the current numpy RNG state.
    select_idx = np.random.randint(0, len(dtrain_X), batch_size)

#     input_ids = dtrain[select_idx].unsqueeze(0) # singleton
    input_ids = dtrain_X[select_idx].to('cuda')
    # BCEWithLogitsLoss needs float targets; the labels are stored as integers.
    input_labels = dtrain_y[select_idx].type(torch.float32).to('cuda')

    # One raw logit per shingle.
    outputs = genemodel(input_ids)

    loss = criterion(outputs, input_labels)

    loss.backward()
    optimizer.step()

    # Overwrite the previous progress line instead of flooding the output.
    clear_output(wait=True)
    print("Iteration:", i, "; Loss:", loss.item())

Iteration: 114 ; Loss: 0.3317720293998718


KeyboardInterrupt: ignored

In [15]:
# Spot-check the last training batch: dtypes, predicted probabilities and the
# matching labels. This relies on `outputs` and `input_labels` left behind by
# the most recent iteration of the training loop.
print(outputs.dtype)
print(input_labels.dtype)
# print(-torch.log(torch.sigmoid(outputs[:10])))
# Sigmoid turns the raw logits into probabilities of the "gene" class.
print(torch.sigmoid(outputs[:10]))
# print(outputs[:10])

print(input_labels[:10])

torch.float32
torch.float32
tensor([[9.8483e-01],
        [8.5497e-03],
        [1.2257e-04],
        [9.9845e-01],
        [4.7623e-05],
        [1.3472e-04],
        [9.9974e-01],
        [3.8654e-05],
        [6.7991e-04],
        [9.8174e-01]], device='cuda:0', grad_fn=<SigmoidBackward>)
tensor([[1.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [1.]], device='cuda:0')


In [0]:
import seaborn as sns
from sklearn import metrics
import matplotlib.pyplot as plt 

def plot_ROC(y_true, y_pred_score):
    """Draw the ROC curve for binary predictions in a new figure and show it.

    Args:
        y_true: true binary labels, as accepted by `metrics.roc_curve`.
        y_pred_score: predicted scores for the positive class.
    """
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(y_true, y_pred_score)
    area = metrics.auc(false_pos_rate, true_pos_rate)
    line_width = 2

    fig, ax = plt.subplots()
    ax.plot(false_pos_rate, true_pos_rate, color='darkorange',
            lw=line_width, label='ROC curve (area = %0.2f)' % area)
    # Diagonal reference line for a chance-level classifier.
    ax.plot([0, 1], [0, 1], color='navy', lw=line_width, linestyle='--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver operating characteristic example')
    ax.legend(loc="lower right")
    plt.show()

def plot_hist(y_true, y_pred_score):
    """Overlay histograms of the predicted scores for genes and for ARFs.

    Scores are split by true label (1 = gene, 0 = ARF) and binned on a shared
    grid of 100 edges spanning [0, 1]. An earlier seaborn KDE version of this
    plot was dropped in favour of plain histograms.

    Args:
        y_true: true binary labels, aligned with `y_pred_score`.
        y_pred_score: predicted scores, indexable by a boolean mask.
    """
    groups = [
        ("Gene", y_pred_score[y_true==1]),
        ("ARF", y_pred_score[y_true==0]),
    ]

    bin_edges = np.linspace(0, 1, 100)
    for group_label, group_scores in groups:
        plt.hist(group_scores, bin_edges, alpha=0.5, label=group_label)
    plt.legend(loc='upper right')
    plt.show()

In [1]:
# evaluate new head model
def evaluate(model, X, y, BATCH_SIZE):
    """Score `X` with `model` in batches, then plot the ROC curve and score histogram.

    Inference runs under `torch.no_grad()`, so no autograd graph is built or
    kept for the collected outputs. Batches are moved to whatever device the
    model's parameters live on, rather than a hard-coded "cuda".

    Args:
        model: torch module returning one logit per input row. It is switched
            to eval mode and left in that mode.
        X: tensor of inputs, sliced along dim 0.
        y: true binary labels aligned with `X`, handed to the plotting helpers.
        BATCH_SIZE: number of rows per forward pass.
    """
    model.eval()
    device = next(model.parameters()).device

    batch_outputs = []
    with torch.no_grad():
        for i in range(0, len(X), BATCH_SIZE):
            if i%1000==0:
                print(i)  # coarse progress indicator
            out = model(X[i:i+BATCH_SIZE].to(device)).to("cpu")
            batch_outputs.append(out)
    outputs = torch.cat(batch_outputs, dim=0)

    # Convert logits to probabilities of the positive class.
    y_pred = torch.sigmoid(outputs).numpy()
    plot_ROC(y, y_pred)
    plot_hist(y, y_pred)
    
# Evaluate on the first `total` training examples, two rows per forward pass.
# This needs `genemodel`, `dtrain_X` and `dtrain_y` from the earlier cells; the
# recorded NameError comes from running this cell before they were defined.
BATCH_SIZE = 2
total = 1000
evaluate(genemodel, dtrain_X[:total], dtrain_y[:total], BATCH_SIZE)
# evaluate(genemodel, dvalid_X[:total], dvalid_y[:total], BATCH_SIZE)

NameError: ignored

In [0]:
|

In [0]:
# Custom parameters for a deliberately tiny GPT2 model. The trailing comment on
# each line is the value it replaces (the library default).
vocab_size = 21 # token ids 0..20 produced by tokenize_aa_seq
max_position_embeddings = 20 # 1024
# Fixed typo: this was `n_ctw`, which is not a GPT2Config parameter, so the
# intended context size never reached the config and the default was used.
n_ctx = max_position_embeddings # 1024
n_embd = 16 # 768
n_layer = 8 # 12
n_head = 8 # 12
resid_pdrop = 0 # 0.1
embd_pdrop = 0 # 0.1
attn_pdrop = 0 # 0.1
layer_norm_epsilon = 1e-5 # 1e-5


config = GPT2Config(vocab_size_or_config_json_file=vocab_size,
                    n_positions=max_position_embeddings,
                    n_ctx=n_ctx,
                    n_embd=n_embd,
                    n_layer=n_layer,
                    n_head=n_head,
                    resid_pdrop=resid_pdrop,
                    embd_pdrop=embd_pdrop,
                    attn_pdrop=attn_pdrop,
                    layer_norm_epsilon=layer_norm_epsilon)

# Freshly initialised (untrained) language model built from the config above.
model = GPT2LMHeadModel(config)

# print(model)
# Move the model to the GPU; as the last expression, this also displays it.
model.to('cuda')

In [0]:
# Load the precomputed shingle array (length 20, overlap 10) from Drive.
d = "drive/My Drive/Colab Notebooks/smaug/data"
shingle_path = os.path.join(d, "ecoli_MG1655_shingles_length20_overlap10.npy")

# np.load opens the path itself. The previous surrounding `with open(...)`
# created a second file handle that was never used, so it has been removed.
ecoli_shingles = np.load(shingle_path)#[:2056]

In [0]:
# Copy the whole shingle array onto the GPU once, so that mini-batches can be
# indexed directly on the device during training.
# NOTE(review): the tensor keeps the dtype of the .npy file; the embedding
# lookup presumably needs an integer (long) dtype. Confirm the stored dtype.
tokens_tensor = torch.tensor(ecoli_shingles).to('cuda')


# Alias used by the pretraining loop.
dtrain = tokens_tensor

In [0]:
# AdamW with default hyper-parameters for the language-model pretraining.
# NOTE(review): this rebinds the global `optimizer` that the classifier section
# also uses. After running this cell, `optimizer` no longer updates `genemodel`.
optimizer = optim.AdamW(model.parameters())

In [0]:
def savemodel():
    """Save the global `model` with `save_pretrained` to its Google Drive directory.

    Drive is (re)mounted first. The target directory is created if missing, so
    neither `save_pretrained` nor the directory listing fails on a fresh Drive.

    NOTE(review): this zero-argument definition shadows the earlier
    `savemodel(model)` used by the classifier training loop. After this cell
    runs, `savemodel(genemodel)` raises a TypeError until the earlier cell is
    re-run. Consider giving the two functions distinct names.
    """
    # Imported here so the function is self-contained in a Colab runtime.
    from google.colab import drive

    # This will prompt for authorization.
    drive.mount('/content/drive')

    modeldir = "drive/My Drive/Colab Notebooks/smaug/data/models/ecoli_trivial_length20_overlap10"
    # Make sure the directory exists before writing into it and listing it.
    os.makedirs(modeldir, exist_ok=True)
    model.save_pretrained(modeldir)
    # List the directory as a visual confirmation that the files were written.
    print(os.listdir(modeldir))

In [0]:
# Pretrain the GPT2 language model on random mini-batches of shingles.
# The same tensor is passed as input and as labels, and the model returns the
# language-modelling loss as its first output. No tokens are masked here,
# despite what the earlier wording of this comment suggested.
model.train()

batch_size = 2**14

# Redundant with the zero_grad() at the top of each iteration; kept as-is.
optimizer.zero_grad()
# Fixed seed so the sequence of sampled mini-batches is repeatable.
np.random.seed(42424)
for i in range(1000000):
    # Checkpoint at iterations 10, 510, 1010, ...
    if i%500==10:
        savemodel()
    optimizer.zero_grad()

    # Random mini-batch, sampled WITH replacement.
    select_idx = np.random.randint(0, len(dtrain), batch_size)

#     input_ids = dtrain[select_idx].unsqueeze(0) # singleton
    input_ids = dtrain[select_idx]


    outputs = model(input_ids, labels=input_ids)
    # First two outputs: the scalar loss and the per-position vocabulary scores.
    loss, prediction_scores = outputs[:2]

    loss.backward()
    optimizer.step()

    # Overwrite the previous progress output instead of flooding the cell.
    clear_output(wait=True)
    print("Loss:", loss.item())

    # Qualitative check on example 10 of the batch: for positions 10..19, print
    # the arg-max prediction made at position k-1 next to the actual token at
    # position k.
    # NOTE(review): the k-1 offset presumably reflects the model shifting the
    # labels internally (scores at k-1 predict token k). Confirm against the
    # transformers GPT2LMHeadModel documentation; this may answer the TODO below.
    for k in range(10, 20):
#         print(i, torch.argmax(prediction_scores[0,k-1]).item(), input_ids[0,k].item(), "\t", loss.item()) #TODO figure out why GPT2 only offsets sometimes
        print(i, "\t", torch.argmax(prediction_scores[10,k-1]).item(), "\t", input_ids[10,k].item(), "\t")
#         print(prediction_scores[0])
    
