In [1]:
import numpy
import torch
from transformers import BertModel, BertTokenizer

import re
import gc
import os
import pandas as pd
import requests
from tqdm.auto import tqdm

In [15]:
# Load the ProtBert tokenizer and encoder weights from the "Rostlab/prot_bert"
# checkpoint. do_lower_case=False is passed so the tokenizer does not lowercase
# its input (the protein sequences used in this notebook are upper-case letters).
model_name = "Rostlab/prot_bert"
tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=False)
model = BertModel.from_pretrained(model_name)
# Run a garbage-collection pass after loading. Because this is the last
# expression of the cell, its return value (the number of objects collected)
# is what the cell displays.
gc.collect()

Some weights of the model checkpoint at Rostlab/prot_bert were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


7315

In [17]:
# Check whether PyTorch can see a CUDA device; the cell displays the boolean.
torch.cuda.is_available()

False

In [18]:
# Report the model size in millions of parameters (truncated to an integer).
print("Number of model parameters is: " + str(int(sum(p.numel() for p in model.parameters())/1000000)) + " Million")
# Use the first GPU when one is available, otherwise the CPU. The device string
# must name a device type: a bare '0' is rejected by torch.device, so the
# original expression would have raised on any machine that has a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
# The model is only used for inference here; eval() switches off dropout.
model = model.eval()
# Optional: run in half precision on GPU (left disabled).
# if torch.cuda.is_available():
#   model = model.half()

Number of model parameters is: 419 Million


In [19]:
def embed_seq(sample, shift_left = 1, shift_right = -1):
    """Embed one sample with the module-level ``tokenizer`` and ``model``.

    The sample is encoded as a batch of one (``is_split_into_words=True``),
    run through the model without gradient tracking, and the first element of
    the model output is taken as the per-token embedding matrix.

    Parameters
    ----------
    sample
        The input handed to the tokenizer inside a one-element list.
        NOTE(review): presumably a list of single residues - confirm with callers.
    shift_left : int, default 1
        Number of leading token rows to drop (presumably the [CLS] token).
    shift_right : int, default -1
        End index of the kept rows; -1 drops the last row (presumably [SEP]).

    Returns
    -------
    numpy.ndarray
        The FIRST remaining row only, i.e. a single 1-D vector.
        NOTE(review): confirm that a single-token vector, rather than all rows
        or a pooled vector, is what callers expect.
    """
    with torch.no_grad():
        encoded = tokenizer.batch_encode_plus(
            [sample],
            add_special_tokens=True,
            padding=True,
            is_split_into_words=True,
            return_tensors="pt",
        )
        input_ids = encoded['input_ids'].to(device)
        token_states = model(input_ids=input_ids)[0]

    # Batch item 0 -> NumPy array with one row per token.
    token_matrix = token_states[0].detach().cpu().numpy()
    trimmed = token_matrix[shift_left:shift_right]
    return trimmed[0, :]

In [20]:
import torch
from transformers import AutoTokenizer, AutoModel, pipeline
import re
import numpy as np
import os
import requests
from tqdm.auto import tqdm

In [21]:
# Load the same "Rostlab/prot_bert" checkpoint through the Auto* classes; these
# objects are handed to the feature-extraction pipeline in the next cell.
# NOTE(review): this rebinds the globals `tokenizer` and `model`, replacing the
# objects created in the earlier BertTokenizer/BertModel cell (including the
# model that was moved to `device` and put in eval mode). embed_seq reads these
# globals, so it will use the reloaded objects once this cell has run.
tokenizer = AutoTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)
model = AutoModel.from_pretrained("Rostlab/prot_bert")

Some weights of the model checkpoint at Rostlab/prot_bert were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [22]:
# Wrap the model and tokenizer in a Hugging Face 'feature-extraction' pipeline.
# get_target_feats calls `fe` on a space-separated sequence string.
fe = pipeline('feature-extraction', model=model, tokenizer=tokenizer)

In [45]:
import requests, sys

def get_protinfo(gene, info = 'sequence', timeout = 30):
    """Fetch one field of the first EBI Proteins "coordinates" record for a gene.

    Parameters
    ----------
    gene : str
        Gene name to query (e.g. ``'HRH1'``).
    info : str, default 'sequence'
        Key to read from the first record of the JSON response.
    timeout : float, default 30
        Seconds to wait for the server before the request is abandoned.

    Returns
    -------
    The value stored under ``info`` in the first returned record.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    IndexError
        If the API returns no record for ``gene``.
    KeyError
        If the first record has no ``info`` field.

    Notes
    -----
    Only the first of up to 100 returned records is used.
    NOTE(review): confirm that the first hit is the intended organism/isoform.
    """
    requestURL = "https://www.ebi.ac.uk/proteins/api/coordinates"
    # Let requests build the query string so the gene name is URL-encoded.
    query = {"offset": 0, "size": 100, "gene": gene}
    r = requests.get(
        requestURL,
        params=query,
        headers={"Accept": "application/json"},
        timeout=timeout,
    )

    # Raises requests.HTTPError on an error status. The sys.exit() that used to
    # follow this call could never be reached, so it has been dropped.
    r.raise_for_status()

    data = r.json()
    if not data:
        # Same exception type as indexing an empty list, but with a usable message.
        raise IndexError("No protein coordinate record found for gene %r" % (gene,))
    return data[0][info]

def get_target_feats(genes, print_seq = False, pooling = 'first'):
    """Build a ProtBert feature vector for each gene.

    For every gene the sequence is fetched with ``get_protinfo``, split into
    space-separated residues, and passed to the module-level feature-extraction
    pipeline ``fe``. The token rows for the residues (skipping the first row and
    everything after the last residue, presumably [CLS] and [SEP]) are then
    reduced according to ``pooling``.

    Parameters
    ----------
    genes : iterable of str
        Gene names to process.
    print_seq : bool, default False
        Also print each fetched sequence.
    pooling : {'first', 'mean', 'flatten'}, default 'first'
        How the per-residue embeddings are turned into one vector.

        * ``'first'`` - embedding of the first residue only. This is exactly
          what the function returned before this parameter existed: the old
          code measured the length of ``seq[seq_num]``, which is a single
          character of the spaced string, so its slice always covered one
          residue. NOTE(review): this is very likely not what was intended;
          consider switching callers to ``'mean'`` or ``'flatten'``.
        * ``'mean'`` - average over all residues (fixed length).
        * ``'flatten'`` - all residue embeddings concatenated (length grows
          with the sequence).

    Returns
    -------
    dict
        Maps each gene name to a 1-D ``numpy.ndarray``.

    Raises
    ------
    ValueError
        If ``pooling`` is not one of the supported modes.
    """
    if pooling not in ('first', 'mean', 'flatten'):
        raise ValueError(
            "pooling must be 'first', 'mean' or 'flatten', got %r" % (pooling,)
        )

    prot_feats = {}
    for gene in genes:
        seq = get_protinfo(gene)
        print ('Get the sequence of gene %s' % gene)
        if print_seq:
            print (seq)

        # ProtBert preprocessing maps the rare residues U, Z, O and B to X.
        seq = re.sub(r"[UZOB]", "X", seq)
        n_residues = len(seq)

        # The tokenizer expects residues separated by single spaces.
        spaced_seq = ' '.join(seq)
        embedding = np.array(fe(spaced_seq))

        # The pipeline output for one string has a leading batch axis of size 1;
        # row 0 of the token axis is skipped and n_residues rows are kept.
        residue_emb = embedding[0][1:n_residues + 1]

        if pooling == 'first':
            seq_emd = residue_emb[:1]
        elif pooling == 'mean':
            seq_emd = residue_emb.mean(axis=0)
        else:
            seq_emd = residue_emb
        prot_feats[gene] = seq_emd.reshape(-1)
    return prot_feats

In [47]:
gene = 'HRH1'
prot_feats = get_target_feats([gene], print_seq = True)
# np.save does not create missing directories, so make sure 'data/' exists.
os.makedirs('data', exist_ok=True)
# prot_feats is a dict, so NumPy pickles it into a 0-d object array. Reload it
# with np.load(path, allow_pickle=True).item().
np.save('data/%s_feats.npy' % gene, prot_feats)

Get the sequence of gene HRH1
LHYSLHNALLGVSLGLLSLLTIIMNLLVLYAVKKEKTLHTVGNLYIVSLSVADLIVGTTVMPLNLMYLLEDEWSLGRAVCQFWLIMDYVASTASIFSLFILCLDRYRSVRQPLKYLKYRTRGKASLMISGAWLLSMMWIIPILGWRSFTHVDLKPEEENKCDTDFRFVTWFKVITAVFNFYVPSILMLWFYTHIYLAVRQHLRDRERIIHPADSFGENENGGNAPSSKSLGNETEVSLKQLKKDRLLDQNTLAQTYSLEDGEKTKSASFRTHRKIGVKCQQTSLLSMTTKRLRMARRGKTCSLSPEEGQPGPELPLSQSSAPQDMACSGGNNENKHQASLNECHVTVTNSVSGVCGISPVSDVQRYTDVLCNSYDPSQALPWPEEGVEDTRIDSDNGVTLKQAWHRFIDQSRHRIQSLRIHKEHKAAKQLGFIIAAFLLCWIPYFIVFMVMAFCPECVHHDLHMFTIWLGYINSTLNPFIYPLCNGNFKRVFKNILKINL
