<a href="https://colab.research.google.com/github/rishi15-t/Bio-Bert-Emebddings/blob/master/bio_bert_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torch
from torch.utils.data import DataLoader, SequentialSampler

def _concat_batch_outputs(batch_outputs):
    """Concatenate a list of per-batch model outputs along the batch dimension.

    Each element of ``batch_outputs`` must have the same structure: either a
    tensor, or a (possibly nested) tuple/list of tensors. Tensors found at the
    same position are joined with ``torch.cat(..., dim=0)``; containers are
    rebuilt as plain tuples.

    Raises:
        TypeError: if an element is neither a tensor nor a tuple/list.
    """
    first = batch_outputs[0]
    if torch.is_tensor(first):
        return torch.cat(batch_outputs, dim=0)
    if hasattr(first, "to_tuple"):
        # Presumably a dict-like output container from a newer transformers
        # release; flatten it so it can be merged position by position.
        batch_outputs = [item.to_tuple() for item in batch_outputs]
        first = batch_outputs[0]
    if isinstance(first, (tuple, list)):
        return tuple(
            _concat_batch_outputs(list(group)) for group in zip(*batch_outputs)
        )
    raise TypeError(
        "Cannot concatenate model outputs of type {}".format(type(first).__name__)
    )


def GetTextEmbeddings_Bert(dataset, batch_size = 32):
    """Run the BioBERT model over ``dataset['Disease Class']`` and return its outputs.

    The texts are tokenized by ``TextTransformation_Bert`` (fixed length 16),
    fed through ``BertClassifier(gen_embeddings=True)`` in eval mode without
    gradient tracking, and the outputs of ALL batches are returned.

    Args:
        dataset: mapping / DataFrame exposing a ``'Disease Class'`` column of texts.
        batch_size: number of texts per forward pass.

    Returns:
        The model output covering every row of ``dataset`` in order. When the
        data fits in a single batch the model output is returned unchanged;
        otherwise the per-batch outputs are concatenated along dimension 0
        (nested tuples are merged element-wise). Tensors stay on the device
        the model ran on.

    Raises:
        ValueError: if ``dataset['Disease Class']`` is empty.

    NOTE(review): only the token ids are forwarded to the model, so padded
    positions are not masked out; confirm whether ``masked_ids`` should be
    passed as well.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    input_ids = TextTransformation_Bert(dataset['Disease Class'], max_input_length=16)
    sampler = SequentialSampler(input_ids)
    dataloader = DataLoader(input_ids, sampler=sampler, batch_size=batch_size)

    model = BertClassifier(gen_embeddings = True)
    model.to(device)
    model.eval()

    # Keep every batch; previously each iteration overwrote the result, so
    # only the final batch was ever returned.
    batch_outputs = []
    with torch.no_grad():
        for num, batch_data in enumerate(dataloader):
            print(num)
            indexed_tokens, segment_ids, masked_ids = batch_data
            batch_outputs.append(model(indexed_tokens.to(device)))

    if not batch_outputs:
        raise ValueError("dataset['Disease Class'] is empty; nothing to embed")
    if len(batch_outputs) == 1:
        # Single batch: hand back the model output untouched.
        return batch_outputs[0]
    return _concat_batch_outputs(batch_outputs)

In [1]:
# Mount Google Drive at /content/drive; the CSV loaded later in this notebook
# is read from under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import nltk
from torch.utils.data import Dataset
from transformers import AutoTokenizer

class TextTransformation_Bert(Dataset) :
    """Dataset that turns raw texts into fixed-length BERT input tensors.

    Each item is a tuple ``(indexed_tokens, segment_ids, masked_ids)`` of 1-D
    integer tensors of length ``max_input_length``:

    * ``indexed_tokens`` - vocabulary ids of ``[CLS] + word pieces + [SEP]``,
      right-padded with 0.
    * ``segment_ids`` - 1 for every real token, 0 for padding.
    * ``masked_ids`` - 1 for every real token, 0 for padding.

    NOTE(review): ``segment_ids`` is 1 for real tokens; single-sentence BERT
    inputs conventionally use 0 there - confirm before feeding it to a model
    as ``token_type_ids``.

    Args:
        text_list: indexable collection of strings (list, pandas Series, ...).
        max_input_length: length every sequence is padded / truncated to.
    """

    def __init__(self, text_list, max_input_length = 512):

        self.text_list = text_list
        self.indexed_tokens = []
        self.segment_ids = []
        self.masked_ids = []
        self.max_input_length = max_input_length
        # Loaded on first use and then reused, instead of being re-created
        # for every single item.
        self.tokenizer = None

    def _get_tokenizer(self):
        """Return the shared tokenizer, loading it the first time it is needed."""
        if getattr(self, "tokenizer", None) is None:
            self.tokenizer = AutoTokenizer.from_pretrained("monologg/biobert_v1.1_pubmed")
        return self.tokenizer

    def GetIndexedTokens(self, text):
        """Tokenize ``text`` and store its vocabulary ids in ``self.indexed_tokens``.

        The word pieces are cut to ``max_input_length - 2`` BEFORE the special
        tokens are added, so a long text still ends with ``[SEP]`` instead of
        having it chopped off by the later truncation step.
        """
        tokenizer = self._get_tokenizer()
        room_for_pieces = max(self.max_input_length - 2, 0)
        tokenized_text = tokenizer.tokenize(text)[:room_for_pieces]
        tokenized_text = ["[CLS]"] + tokenized_text + ["[SEP]"]
        self.indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    def GetSegmentIds(self) :
        """Set ``self.segment_ids`` to one 1 per entry of ``self.indexed_tokens``."""
        self.segment_ids = [1] * len(self.indexed_tokens)

    def GetMaskedIds(self) :
        """Set ``self.masked_ids`` to one 1 per entry of ``self.indexed_tokens``."""
        self.masked_ids = [1] * len(self.indexed_tokens)

    def Padding(self) :
        """Pad with 0 (or truncate) all three id lists to ``max_input_length``."""
        missing = self.max_input_length - len(self.indexed_tokens)
        if missing > 0 :
            padding = [0] * missing
            self.indexed_tokens += padding
            self.segment_ids += padding
            self.masked_ids += padding
        else :
            # Only reachable with extra tokens when max_input_length < 2.
            del self.indexed_tokens[self.max_input_length:]
            del self.segment_ids[self.max_input_length:]
            del self.masked_ids[self.max_input_length:]

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.text_list)

    def __getitem__(self, idx):
        """Return ``(indexed_tokens, segment_ids, masked_ids)`` tensors for item ``idx``.

        ``idx`` is treated as a position: pandas objects are read through
        ``.iloc`` so a non-default index does not break the lookup.
        """
        texts = self.text_list
        text = texts.iloc[idx] if hasattr(texts, "iloc") else texts[idx]

        self.GetIndexedTokens(text)
        self.GetSegmentIds()
        self.GetMaskedIds()
        self.Padding()

        self.indexed_tokens = torch.tensor(self.indexed_tokens)
        self.segment_ids = torch.tensor(self.segment_ids)
        self.masked_ids = torch.tensor(self.masked_ids)

        return self.indexed_tokens, self.segment_ids , self.masked_ids

In [5]:
from transformers import BertModel, BertConfig
from torch import nn
from transformers import AutoTokenizer
from transformers import AutoModelWithLMHead

class BertClassifier(nn.Module):
    """BioBERT wrapper usable as an embedding extractor or as a classifier head.

    Modes (chosen by the constructor flags):

    * ``gen_embeddings=True, use_pooled_output=False`` - ``forward`` returns the
      full output of the underlying ``BertModel`` (hidden states enabled).
    * ``gen_embeddings=True, use_pooled_output=True`` - ``forward`` returns
      element ``[1]`` of the ``BertModel`` output.
    * ``gen_embeddings=False`` - ``forward`` treats its first argument as
      precomputed embeddings of width ``input_size`` and runs them through
      ``Linear -> ReLU -> Dropout -> Linear``; raw logits are returned in
      training mode and sigmoid-activated scores in eval mode.

    Args:
        hidden_layer_size: width of the hidden linear layer.
        input_size: width of the embeddings fed to the classifier head.
        num_labels: number of output units.
        dropout: dropout probability applied after the hidden activation.
        gen_embeddings: run the BERT encoder instead of the classifier head.
        use_pooled_output: in embedding mode, return only output element ``[1]``.
    """

    def __init__(self, hidden_layer_size = 512, input_size = 768, num_labels = 23, dropout = 0.1, gen_embeddings = False, use_pooled_output = False):

        super(BertClassifier, self).__init__()
        self.num_labels = num_labels
        self.input_size = input_size
        self.gen_embeddings = gen_embeddings
        self.hidden_layer_size = hidden_layer_size
        self.use_pooled_output = use_pooled_output

        # In embedding mode the encoder is built once here, so that `.to(device)`
        # and `.eval()` on this module also apply to it.
        self._encoder_ready = False
        if self.gen_embeddings:
            self._load_encoder()
        else:
            self.base_model = AutoModelWithLMHead.from_pretrained("monologg/biobert_v1.1_pubmed")

        self.hidden_layer = nn.Linear(self.input_size, self.hidden_layer_size)
        self.hidden_activation = nn.ReLU()

        self.output_layer = nn.Linear(self.hidden_layer_size, self.num_labels)
        self.output_activation = nn.Sigmoid()

        self.dropout = nn.Dropout(p=dropout)

    def _load_encoder(self):
        """Replace ``self.base_model`` with a ``BertModel`` that outputs hidden states."""
        config = BertConfig.from_pretrained("monologg/biobert_v1.1_pubmed")
        config.output_hidden_states = True
        self.base_model = BertModel.from_pretrained("monologg/biobert_v1.1_pubmed", config=config)
        self._encoder_ready = True

    def _set_base_model_trainable(self, trainable):
        """Set ``requires_grad`` on every parameter of ``self.base_model``."""
        for param in self.base_model.parameters():
            param.requires_grad = trainable

    def forward(self, indexed_tokens, segment_ids=None, masked_ids=None):
        """Run the encoder (embedding mode) or the classifier head.

        Args:
            indexed_tokens: token ids in embedding mode, otherwise the
                embeddings to classify.
            segment_ids: optional token type ids (embedding mode only).
            masked_ids: optional attention mask (embedding mode only).
        """
        if self.gen_embeddings:
            if not getattr(self, "_encoder_ready", False):
                # `gen_embeddings` was switched on after construction: build the
                # encoder once and align it with the input device / current mode.
                self._load_encoder()
                self.base_model.to(indexed_tokens.device)
                self.base_model.train(self.training)
            # Keywords keep the mask and the segment ids from being swapped.
            output = self.base_model(
                indexed_tokens,
                attention_mask=masked_ids,
                token_type_ids=segment_ids,
            )
            if self.use_pooled_output:
                return output[1]
            return output

        embeddings = indexed_tokens

        logits1 = self.hidden_layer(embeddings)
        activation1 = self.hidden_activation(logits1)

        dropped = self.dropout(activation1)

        logits2 = self.output_layer(dropped)
        if self.training:
            return logits2
        return self.output_activation(logits2)

    def freeze_base_model(self):
        """Stop gradient updates for every parameter of the base model."""
        self._set_base_model_trainable(False)

    def unfreeze_base_model(self):
        """Re-enable gradient updates for every parameter of the base model."""
        self._set_base_model_trainable(True)

In [4]:
# Install the Hugging Face `transformers` package (the recorded output below shows 2.4.1).
# NOTE(review): this cell sits after cells that already import `transformers`, so on a
# fresh runtime it has to be run first; the version is also not pinned - consider pinning.
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ee/fc/bd726a15ab2c66dc09306689d04da07a3770dad724f0883f0a4bfb745087/transformers-2.4.1-py3-none-any.whl (475kB)
[K     |▊                               | 10kB 18.2MB/s eta 0:00:01[K     |█▍                              | 20kB 1.8MB/s eta 0:00:01[K     |██                              | 30kB 2.3MB/s eta 0:00:01[K     |██▊                             | 40kB 1.7MB/s eta 0:00:01[K     |███▍                            | 51kB 1.9MB/s eta 0:00:01[K     |████▏                           | 61kB 2.2MB/s eta 0:00:01[K     |████▉                           | 71kB 2.4MB/s eta 0:00:01[K     |█████▌                          | 81kB 2.6MB/s eta 0:00:01[K     |██████▏                         | 92kB 2.9MB/s eta 0:00:01[K     |██████▉                         | 102kB 2.8MB/s eta 0:00:01[K     |███████▋                        | 112kB 2.8MB/s eta 0:00:01[K     |████████▎                       | 122kB 2.8M

In [0]:
# Load the effect-category table from the mounted Drive; GetTextEmbeddings_Bert
# reads its 'Disease Class' column.
import pandas as pd
dataset = pd.read_csv('/content/drive/My Drive/BioBert/bio-decagon-effectcategories.csv')

In [9]:
# Run the BioBERT model over the dataset's 'Disease Class' column (default batch_size=32).
results = GetTextEmbeddings_Bert(dataset)

HBox(children=(IntProgress(value=0, description='Downloading', max=361, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=435780013, style=ProgressStyle(description_…




HBox(children=(IntProgress(value=0, description='Downloading', max=213450, style=ProgressStyle(description_wid…


0


In [10]:
# Display the raw model output (a tuple of tensors, per the recorded output below).
results

(tensor([[[ 0.3303,  0.1576, -0.4849,  ..., -0.0889,  0.0125,  0.0308],
          [-0.1575,  0.0612, -0.2163,  ...,  0.5560, -0.4811,  0.4393],
          [ 0.3083,  0.5124, -0.5316,  ...,  0.8872, -0.1127,  0.4777],
          ...,
          [-0.2826, -0.0590,  0.1738,  ..., -0.1438,  0.6185, -0.1558],
          [-0.3556, -0.0079,  0.1817,  ..., -0.1531,  0.6737, -0.1463],
          [-0.3416, -0.1726,  0.0983,  ..., -0.2009,  0.6931, -0.1744]],
 
         [[ 0.2428,  0.0507, -0.2883,  ..., -0.1774,  0.0476, -0.1719],
          [ 0.0132,  0.2049, -0.3541,  ...,  0.2511, -0.1991,  0.2735],
          [-0.2431,  0.2626, -0.4073,  ...,  0.1045,  0.0156, -0.2432],
          ...,
          [-0.3555, -0.0873,  0.1860,  ..., -0.1570,  0.6212, -0.2552],
          [-0.3911, -0.1334,  0.1647,  ..., -0.1608,  0.6698, -0.2377],
          [-0.3630, -0.2482,  0.1210,  ..., -0.1897,  0.6985, -0.3113]],
 
         [[ 0.2540, -0.0075, -0.3811,  ..., -0.3075,  0.1695, -0.2956],
          [ 0.0474,  0.1713,

In [11]:
# Length of the third output element; presumably the per-layer hidden states, since
# the model is configured with output_hidden_states=True - TODO confirm.
len(results[2])

13

In [12]:
# Shape of the first output element; in the recorded run the middle dimension (16)
# matches the max_input_length used by GetTextEmbeddings_Bert.
results[0].shape

torch.Size([5, 16, 768])

In [13]:
# Average the last entry of the last output element over dim 1, which collapses the
# length-16 dimension in the recorded run and leaves one 768-wide vector per row.
results[-1][-1].mean(1).shape

torch.Size([5, 768])

In [0]:
# Load the pretrained checkpoint on its own (outside BertClassifier) so its
# architecture can be inspected in the next cell.
base_model = AutoModelWithLMHead.from_pretrained("monologg/biobert_v1.1_pubmed")

In [0]:
# Show the module tree of the loaded model.
base_model

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr