<a href="https://colab.research.google.com/github/rishi15-t/Bio-Bert-Emebddings/blob/master/bio_bert_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torch
from torch.utils.data import DataLoader, SequentialSampler

def _merge_batch_outputs(batch_outputs):
    """Concatenate per-batch model outputs along the batch dimension.

    Tensors are joined with ``torch.cat(dim=0)``. Tuples/lists (and objects
    exposing ``to_tuple()``) are merged element-wise and returned as tuples,
    so nested structures such as a tuple of per-layer tensors are preserved.
    """
    first = batch_outputs[0]
    if torch.is_tensor(first):
        return torch.cat(batch_outputs, dim=0)
    if hasattr(first, "to_tuple"):
        batch_outputs = [part.to_tuple() for part in batch_outputs]
        first = batch_outputs[0]
    if isinstance(first, (tuple, list)):
        return tuple(_merge_batch_outputs(list(group)) for group in zip(*batch_outputs))
    raise TypeError("Cannot merge model outputs of type %s" % type(first).__name__)


def GetTextEmbeddings_Bert(dataset, batch_size = 32):
    """Run every 'Disease Class' entry of ``dataset`` through BertClassifier.

    Args:
        dataset: Mapping/DataFrame with a 'Disease Class' column of strings.
        batch_size: Number of texts sent through the model per forward pass.

    Returns:
        The model output covering ALL rows of ``dataset``. When the data fits
        in a single batch the model's output object is returned unchanged;
        otherwise the per-batch outputs are concatenated along dimension 0
        and returned as a (possibly nested) tuple with the same layout.

    Raises:
        ValueError: if ``dataset`` yields no rows.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    input_ids = TextTransformation_Bert(dataset['Disease Class'], max_input_length=16)
    sampler = SequentialSampler(input_ids)
    dataloader = DataLoader(input_ids, sampler=sampler, batch_size=batch_size)

    model = BertClassifier(gen_embeddings = True)
    model.to(device)
    model.eval()

    batch_outputs = []
    with torch.no_grad():
        for indexed_tokens, _segment_ids, _masked_ids in dataloader:
            # NOTE(review): only the token ids are forwarded, as before; the
            # segment/mask tensors produced by the dataset are not used here.
            batch_outputs.append(model(indexed_tokens.to(device)))

    if not batch_outputs:
        raise ValueError("dataset contains no rows to embed")
    if len(batch_outputs) == 1:
        # Single batch: hand back the model output untouched.
        return batch_outputs[0]
    # Previously only the final batch survived the loop; merge them all.
    return _merge_batch_outputs(batch_outputs)

In [1]:
# Mount Google Drive at /content/drive so the CSV loaded later in this
# notebook is reachable from the Colab runtime (prompts for authorization).
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [4]:
import nltk
from torch.utils.data import Dataset
from transformers import AutoTokenizer

class TextTransformation_Bert(Dataset) :
    """Dataset that converts raw strings into fixed-length BioBERT inputs.

    Each item is a triple of 1-D tensors of length ``max_input_length``:
    ``(indexed_tokens, segment_ids, masked_ids)``. Real tokens carry 1 in the
    segment and mask vectors; padding positions carry 0 in all three.
    """

    PRETRAINED_NAME = "monologg/biobert_v1.1_pubmed"

    def __init__(self, text_list, max_input_length = 512):
        """Store the texts and the target sequence length.

        Args:
            text_list: Iterable of strings (e.g. a list or a pandas Series).
            max_input_length: Length every encoded sequence is padded or
                truncated to.
        """
        # Materialise to a list so __getitem__ indexes by position; a pandas
        # Series would otherwise be looked up by label and fail on a
        # non-default index.
        self.text_list = list(text_list)
        self.indexed_tokens = []
        self.segment_ids = []
        self.masked_ids = []
        self.max_input_length = max_input_length
        self._tokenizer = None

    def _get_tokenizer(self):
        """Return the tokenizer, loading it on first use and caching it."""
        tokenizer = getattr(self, "_tokenizer", None)
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained(self.PRETRAINED_NAME)
            self._tokenizer = tokenizer
        return tokenizer

    def GetIndexedTokens(self, text):
        """Tokenize ``text``, wrap it in [CLS]/[SEP], and store the token ids."""
        # The tokenizer used to be re-loaded for every single item; reuse one.
        tokenizer = self._get_tokenizer()
        tokenized_text = ["[CLS]"] + list(tokenizer.tokenize(text)) + ["[SEP]"]
        self.indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    def GetSegmentIds(self) :
        """Build the segment-id vector (one entry per token, all 1).

        Segment ids are used to break down paragraphs (each paragraph being
        one input) into sentences that are treated separately by BERT.
        """
        # NOTE(review): single-sentence BERT inputs conventionally use segment
        # id 0; the value 1 is kept here to preserve existing behaviour.
        self.segment_ids = [1] * len(self.indexed_tokens)

    def GetMaskedIds(self) :
        """Build the mask vector (one entry per token, all 1).

        Masked ids let BERT know which tokens are actual information and
        which are padding.
        """
        self.masked_ids = [1] * len(self.indexed_tokens)

    def Padding(self) :
        """Pad with zeros, or truncate, all three vectors to max_input_length."""
        # NOTE(review): truncation drops the trailing [SEP] of over-long inputs.
        limit = self.max_input_length
        for sequence in (self.indexed_tokens, self.segment_ids, self.masked_ids):
            del sequence[limit:]                              # no-op when short enough
            sequence.extend([0] * (limit - len(sequence)))    # no-op when already full

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.text_list)

    def __getitem__(self, idx):
        """Encode the text at position ``idx``.

        Returns:
            Tuple ``(indexed_tokens, segment_ids, masked_ids)`` of tensors,
            each of length ``max_input_length``.
        """
        text = self.text_list[idx]

        self.GetIndexedTokens(text)
        self.GetSegmentIds()
        self.GetMaskedIds()
        self.Padding()

        self.indexed_tokens = torch.tensor(self.indexed_tokens)
        self.segment_ids = torch.tensor(self.segment_ids)
        self.masked_ids = torch.tensor(self.masked_ids)

        return self.indexed_tokens, self.segment_ids , self.masked_ids

In [0]:
from transformers import BertModel, BertConfig
from torch import nn
from transformers import AutoTokenizer
from transformers import AutoModelWithLMHead

class BertClassifier(nn.Module):
    """BioBERT encoder with a small feed-forward classification head.

    Two modes, selected by ``gen_embeddings``:

    * ``gen_embeddings=True``: ``forward`` runs the BioBERT encoder on token
      ids and returns either its pooled output (``use_pooled_output=True``)
      or the full encoder output, which indexes as
      ``(last layer, pooled output, per-layer hidden states)``.
    * ``gen_embeddings=False``: ``forward`` treats its first argument as
      already-computed embeddings and runs only the classification head
      (Linear -> ReLU -> Dropout -> Linear). Raw logits are returned while
      training; sigmoid-activated scores are returned in eval mode.
    """

    PRETRAINED_NAME = "monologg/biobert_v1.1_pubmed"

    def __init__(self, hidden_layer_size = 512, input_size = 768, num_labels = 23, dropout = 0.1, gen_embeddings = False, use_pooled_output = False):
        """Build the encoder and the classification head.

        Args:
            hidden_layer_size: Width of the head's hidden layer.
            input_size: Size of the embeddings fed to the head.
            num_labels: Number of output units.
            dropout: Dropout probability applied after the hidden layer.
            gen_embeddings: If True, ``forward`` returns encoder outputs
                instead of classification scores.
            use_pooled_output: With ``gen_embeddings``, return only the
                pooled output rather than the full encoder output.
        """
        super().__init__()
        self.num_labels = num_labels
        self.input_size = input_size
        self.gen_embeddings = gen_embeddings
        self.hidden_layer_size = hidden_layer_size
        self.use_pooled_output = use_pooled_output

        # Load the plain encoder once, with hidden states enabled. Doing this
        # here (instead of inside forward) means the weights follow
        # .to(device)/.eval() and are not re-loaded on every call; a masked-LM
        # wrapper would also not expose a pooled output at index 1.
        config = BertConfig.from_pretrained(self.PRETRAINED_NAME)
        config.output_hidden_states = True
        self.base_model = BertModel.from_pretrained(self.PRETRAINED_NAME, config=config)

        self.hidden_layer = torch.nn.Linear(self.input_size, self.hidden_layer_size)
        self.hidden_activation = torch.nn.ReLU()

        self.output_layer = torch.nn.Linear(self.hidden_layer_size, self.num_labels)
        self.output_activation = torch.nn.Sigmoid()

        self.dropout = nn.Dropout(p=dropout)

    def forward(self, indexed_tokens, segment_ids=None, masked_ids=None):
        """Encode token ids or classify precomputed embeddings.

        Args:
            indexed_tokens: Token ids when ``gen_embeddings`` is True,
                otherwise embeddings of width ``input_size``.
            segment_ids: Optional token-type ids for the encoder.
            masked_ids: Optional attention mask for the encoder.

        Returns:
            Encoder output (or its pooled element) in embedding mode;
            otherwise logits (training) or sigmoid scores (eval).
        """
        if self.gen_embeddings:
            # Pass by keyword: positionally, the encoder's second argument is
            # the attention mask, so segment ids and mask used to be swapped.
            encoder_output = self.base_model(
                indexed_tokens,
                attention_mask=masked_ids,
                token_type_ids=segment_ids,
            )
            if self.use_pooled_output:
                return encoder_output[1]
            return encoder_output

        hidden = self.hidden_activation(self.hidden_layer(indexed_tokens))
        logits = self.output_layer(self.dropout(hidden))
        if self.training:
            return logits
        return self.output_activation(logits)

    def freeze_base_model(self):
        """Disable gradient updates for every encoder parameter."""
        for param in self.base_model.parameters():
            param.requires_grad = False

    def unfreeze_base_model(self):
        """Re-enable gradient updates for every encoder parameter."""
        # parameters() yields tensors; named_parameters() yields (name, tensor)
        # tuples, on which setting requires_grad raises AttributeError.
        for param in self.base_model.parameters():
            param.requires_grad = True

In [0]:
import pandas as pd

# Read the effect-categories CSV from the mounted Drive and keep only the
# first five rows so the embedding run below stays small.
dataset = pd.read_csv('/content/drive/My Drive/BioBert/bio-decagon-effectcategories.csv').head(5)

In [68]:
# Embed the rows loaded above; `results` is whatever the model returned.
results = GetTextEmbeddings_Bert(dataset)

0


In [63]:
# Element 0: output of the 12th (last) BERT layer,
# shaped [#samples, #tokens + #padding elements, #hidden neurons].
results[0].shape

torch.Size([5, 16, 768])

In [64]:
# Element 1: pooled output corresponding to the embedding of the [CLS] token,
# shaped [#samples, #hidden neurons].
(results[1]).shape

torch.Size([5, 768])

In [73]:
# Element 2: the output of each BERT layer; index 12 selects one of them,
# shaped like element 0.
(results[2][12]).shape

torch.Size([5, 16, 768])

In [66]:
# Number of elements in the model output.
# NOTE(review): the saved output of this cell (2) comes from an earlier
# execution than the cell that reads results[2]; presumably it predates
# enabling hidden states. Re-run top to bottom to refresh it.
len(results)

2

In [0]:
# Load the BioBERT checkpoint with its language-modelling head, standalone,
# so its architecture can be inspected in the next cell.
base_model = AutoModelWithLMHead.from_pretrained("monologg/biobert_v1.1_pubmed")

In [0]:
# Display the module tree of the loaded model.
base_model

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr