In [1]:
!pip install transformers

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_latest_p36/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
!pip install sklearn_crfsuite

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_latest_p36/bin/python -m pip install --upgrade pip' command.[0m


### Set the Parameters

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils import data
import os
import numpy as np
from collections import OrderedDict
from sklearn.metrics import classification_report
from sklearn_crfsuite.utils import flatten

from transformers import AutoTokenizer, AutoModelWithLMHead, AutoModel

In [4]:
import os
# Show the kernel's working directory; the relative "./data/..." paths used below resolve against it.
os.getcwd()

'/home/ec2-user/SageMaker/BIO_NER'

#### Load the data

In [5]:
import numpy as np 
from torch.utils import data 
import torch 



class HParams:
    """Hyper-parameters, tag vocabulary and tokenizer for one NER corpus.

    Args:
        vocab_type: key into ``VOCAB_DICT`` ('bc5cdr' or 'bionlp3g').
            An unknown key raises ``KeyError``.

    Attributes:
        VOCAB_DICT: corpus key -> tuple of tags; '<PAD>' is always index 0.
        VOCAB: the tag tuple selected by ``vocab_type``.
        tag2idx / idx2tag: tag <-> position-in-``VOCAB`` lookups.
        batch_size, lr, n_epochs: training settings.
        tokenizer: tokenizer loaded from the BioBERT checkpoint.
        device: 'cuda' when CUDA is available, otherwise 'cpu'.
    """

    def __init__(self, vocab_type):
        # BioNLP13CG entity types; the tag set is <PAD>, every B- tag,
        # every I- tag (same order), then O.
        bionlp_entities = (
            'Amino_acid',
            'Anatomical_system',
            'Cancer',
            'Cell',
            'Cellular_component',
            'Developing_anatomical_structure',
            'Gene_or_gene_product',
            'Immaterial_anatomical_entity',
            'Multi-tissue_structure',
            'Organ',
            'Organism',
            'Organism_subdivision',
            'Organism_substance',
            'Pathological_formation',
            'Simple_chemical',
            'Tissue',
        )
        begin_tags = tuple('B-' + entity for entity in bionlp_entities)
        inside_tags = tuple('I-' + entity for entity in bionlp_entities)

        self.VOCAB_DICT = {
            'bc5cdr': ('<PAD>', 'B-Chemical', 'O', 'B-Disease', 'I-Disease', 'I-Chemical'),
            'bionlp3g': ('<PAD>',) + begin_tags + inside_tags + ('O',),
        }
        self.VOCAB = self.VOCAB_DICT[vocab_type]

        self.tag2idx = {}
        self.idx2tag = {}
        for position, tag in enumerate(self.VOCAB):
            self.tag2idx[tag] = position
            self.idx2tag[position] = tag

        self.batch_size = 128
        self.lr = 0.0001
        self.n_epochs = 30

        self.tokenizer = AutoTokenizer.from_pretrained("monologg/biobert_v1.0_pubmed_pmc")
        if torch.cuda.is_available():
            self.device = 'cuda'
        else:
            self.device = 'cpu'


class NerDataset(data.Dataset):
    """Token-classification dataset read from a CoNLL-style file.

    Each non-blank line holds a word in its first whitespace-separated field
    and its tag in the last one; sentences are separated by one or more blank
    (or whitespace-only) lines.

    Args:
        path: path of the file to read.
        vocab_type: corpus key forwarded to ``HParams`` (selects the tag set).
    """

    def __init__(self, path, vocab_type):
        self.hp = HParams(vocab_type)

        # Read through a context manager so the handle is always closed.
        with open(path) as fin:
            raw = fin.read()

        sents = []
        tags_li = []
        words, tags = [], []

        def _flush():
            # Close the sentence being accumulated, wrapping it in the BERT
            # boundary tokens; their tags are <PAD> (never scored).
            sents.append(["[CLS]"] + words + ["[SEP]"])
            tags_li.append(["<PAD>"] + tags + ["<PAD>"])

        for line in raw.splitlines():
            fields = line.split()
            if fields:
                words.append(fields[0])
                tags.append(fields[-1])
            elif words:
                # Blank line: sentence boundary. Runs of blank lines and
                # whitespace-only lines are tolerated instead of raising
                # IndexError, and never create an empty sentence.
                _flush()
                words, tags = [], []
        if words:
            _flush()

        self.sents, self.tags_li = sents, tags_li

    def __len__(self):
        """Number of sentences."""
        return len(self.sents)

    def __getitem__(self, idx):
        """Return one sentence as word-piece ids aligned with tag ids.

        Returns:
            (words, x, is_heads, tags, y, seqlen) where
            words / tags are the space-joined sentence and tag strings,
            x is the list of word-piece ids,
            is_heads marks with 1 the first piece of every word (0 otherwise),
            y is the list of tag ids (non-head pieces get the <PAD> id),
            seqlen is ``len(y)``.
        """
        words, tags = self.sents[idx], self.tags_li[idx] # words, tags: string list
        tokenizer = self.hp.tokenizer

        # We give credits only to the first piece.
        x, y = [], [] # list of ids
        is_heads = [] # list. 1: the token is the first piece of a word
        for w, t in zip(words, tags):
            if w in ("[CLS]", "[SEP]"):
                tokens = [w]
            else:
                # A word can tokenize to nothing (e.g. when it consists only of
                # characters the tokenizer strips). Keep one slot for it so
                # ids, tags and head flags stay aligned.
                tokens = tokenizer.tokenize(w) or [tokenizer.unk_token]
            xx = tokenizer.convert_tokens_to_ids(tokens)

            is_head = [1] + [0]*(len(tokens) - 1)

            t = [t] + ["<PAD>"] * (len(tokens) - 1)  # <PAD>: no decision
            yy = [self.hp.tag2idx[each] for each in t]  # (T,)

            x.extend(xx)
            is_heads.extend(is_head)
            y.extend(yy)

        assert len(x)==len(y)==len(is_heads), f"len(x)={len(x)}, len(y)={len(y)}, len(is_heads)={len(is_heads)}"

        # seqlen
        seqlen = len(y)

        # to string
        words = " ".join(words)
        tags = " ".join(tags)
        return words, x, is_heads, tags, y, seqlen


def pad(batch):
    '''Collate samples into one batch, right-padding id lists with 0 to the longest sample.

    Each sample is the 6-tuple (words, x, is_heads, tags, y, seqlen).
    Returns (words, x, is_heads, tags, y, seqlens) where x and y are
    LongTensors padded with 0 and the remaining fields are plain lists
    holding the per-sample values unchanged.
    '''
    words = [sample[0] for sample in batch]
    is_heads = [sample[2] for sample in batch]
    tags = [sample[3] for sample in batch]
    seqlens = [sample[-1] for sample in batch]
    maxlen = np.array(seqlens).max()

    def _right_pad(field):
        # 0 is the fill value for both token ids and tag ids
        padded = []
        for sample in batch:
            seq = sample[field]
            padded.append(seq + [0] * (maxlen - len(seq)))
        return padded

    x = torch.LongTensor(_right_pad(1))
    y = torch.LongTensor(_right_pad(-2))

    return words, x, is_heads, tags, y, seqlens

#### Define the Model

In [6]:
import torch
import torch.nn as nn

class Net(nn.Module):
    """Frozen BioBERT encoder followed by a 2-layer BiLSTM and a linear tag classifier.

    Args:
        vocab_len: number of output classes (size of the tag vocabulary).
        device: device the inputs are moved to in ``forward``.
    """

    def __init__(self, vocab_len, device = 'cpu'):
        super().__init__()
        self.bert = AutoModel.from_pretrained("monologg/biobert_v1.0_pubmed_pmc")
        self.bert.eval()
        self.rnn = nn.LSTM(bidirectional=True, num_layers=2, input_size=768, hidden_size=768//2, batch_first=True)
        self.fc = nn.Linear(768, vocab_len)
        self.device = device

    def train(self, mode=True):
        """Switch the LSTM/classifier mode but keep the encoder in eval mode.

        ``forward`` runs the encoder under ``torch.no_grad()`` as a fixed
        feature extractor, and ``__init__`` puts it in eval mode. Without this
        override a plain ``model.train()`` would flip it back to train mode
        and re-enable its dropout.
        """
        super().train(mode)
        self.bert.eval()
        return self

    def forward(self, x, y):
        '''
        x: (N, T). int64 token ids, right-padded with 0
        y: (N, T). int64 tag ids

        Returns
        logits: (N, T, VOCAB)
        y: the input tag ids, moved to self.device
        y_hat: (N, T) argmax of logits
        '''
        x = x.to(self.device)
        y = y.to(self.device)

        # pad() fills token ids with 0; mask those positions so the encoder
        # does not attend to padding.
        attention_mask = (x != 0).long()

        with torch.no_grad():
            # Index instead of tuple-unpacking: works whether the encoder
            # returns a tuple of any length or a ModelOutput object.
            encoded_layers = self.bert(x, attention_mask=attention_mask)[0]
        enc, _ = self.rnn(encoded_layers)
        logits = self.fc(enc)
        y_hat = logits.argmax(-1)
        return logits, y, y_hat

### Train the model

In [7]:
### Train the model
def train(model, iterator, optimizer, criterion):
    """Run one pass over ``iterator``, taking one optimizer step per batch.

    Every batch is the 6-tuple (words, x, is_heads, tags, y, seqlens).
    ``model(x, y)`` must return (logits, y, y_hat); logits and y are flattened
    to (N*T, VOCAB) and (N*T,) before being handed to ``criterion``.
    Prints the first sentence and its tags for the first batch, and the loss
    every 10th step. Returns nothing.
    """
    model.train()
    for step, (words, x, is_heads, tags, y, seqlens) in enumerate(iterator):
        optimizer.zero_grad()
        logits, gold, _ = model(x, y)  # logits: (N, T, VOCAB), gold: (N, T)

        n_classes = logits.shape[-1]
        loss = criterion(logits.view(-1, n_classes), gold.view(-1))
        loss.backward()
        optimizer.step()

        if step == 0:
            # one-off look at the first sample of the epoch
            print("=====sanity check======")
            print("words:", words[0])
            print("\n")
            print("tags:", tags[0])
            print("\n")

        if step % 10 == 0:  # monitoring
            print(f"step: {step}, loss: {loss.item()}")

def eval(model, iterator, f):
    """Score ``model`` on ``iterator`` and print a per-tag classification report.

    Only the first word-piece of each word is scored, and the leading [CLS]
    and trailing [SEP] positions are dropped. Tag names come from the
    module-level ``hp``.

    Args:
        model: callable as ``model(x, y)`` returning (logits, y, y_hat).
        iterator: yields batches (words, x, is_heads, tags, y, seqlens).
        f: not used by this function; kept so existing callers keep working.

    Returns:
        (precision, recall, f1): the report's support-weighted averages.
    """
    model.eval()

    Is_heads, Y, Y_hat = [], [], []
    with torch.no_grad():
        for words, x, is_heads, tags, y, seqlens in iterator:
            _, _, y_hat = model(x, y)  # y_hat: (N, T)

            Is_heads.extend(is_heads)
            Y.extend(y.cpu().numpy().tolist())
            Y_hat.extend(y_hat.cpu().numpy().tolist())

    y_pred = []
    y_true = []
    for is_heads, y_hat, y in zip(Is_heads, Y_hat, Y):
        # zip() stops at the unpadded is_heads, which also discards batch padding
        pred = [hat for head, hat in zip(is_heads, y_hat) if head == 1][1:-1]
        gold = [tag for head, tag in zip(is_heads, y) if head == 1][1:-1]
        # Per-sentence check; re-flattening every accumulated sentence on each
        # iteration made this loop quadratic.
        assert len(pred)==len(gold), f"len(pred)={len(pred)}, len(gold)={len(gold)}"
        y_pred.extend(pred)
        y_true.extend(gold)

    # Pin the label ids so target_names always line up with them. Without
    # `labels`, sklearn infers the set from the data, which shifts or breaks
    # the names when a tag never occurs or when <PAD> (id 0) is predicted.
    labels = list(hp.tag2idx.values())[1:]
    target_names = list(hp.tag2idx.keys())[1:]

    print(classification_report(y_true, y_pred, labels=labels, target_names=target_names))

    final_report = classification_report(y_true, y_pred, labels=labels, target_names=target_names, output_dict=True)
    weighted = final_report["weighted avg"]

    return weighted['precision'], weighted['recall'], weighted['f1-score']

def save_the_results(model, iterator, f):
    """Write per-token predictions plus summary metrics to a results file.

    The output path is ``f`` with a ``.P<precision>_R<recall>_F<f1>`` suffix.
    It holds one ``word gold_tag predicted_tag`` line per word (blank line
    after each sentence), followed by accuracy / precision / recall / f1
    lines. Only the first word-piece of each word is scored; [CLS] and [SEP]
    are skipped. Tag names come from the module-level ``hp``.

    Results are assembled in memory and written once, so no intermediate file
    at ``f`` is created, re-read or removed.

    Args:
        model: callable as ``model(x, y)`` returning (logits, y, y_hat).
        iterator: yields batches (words, x, is_heads, tags, y, seqlens).
        f: base path of the results file.

    Returns:
        (precision, recall, f1): the report's support-weighted averages.
    """
    model.eval()

    Words, Is_heads, Tags, Y_hat = [], [], [], []
    with torch.no_grad():
        for words, x, is_heads, tags, y, seqlens in iterator:
            _, _, y_hat = model(x, y)  # y_hat: (N, T)

            Words.extend(words)
            Is_heads.extend(is_heads)
            Tags.extend(tags)
            Y_hat.extend(y_hat.cpu().numpy().tolist())

    ## collect the result lines and the id sequences used for the metrics
    lines = []
    y_true, y_pred = [], []
    for words, is_heads, tags, y_hat in zip(Words, Is_heads, Tags, Y_hat):
        y_hat = [hat for head, hat in zip(is_heads, y_hat) if head == 1]
        preds = [hp.idx2tag[hat] for hat in y_hat]
        tokens, golds = words.split(), tags.split()
        assert len(preds)==len(tokens)==len(golds)
        for w, t, p in zip(tokens[1:-1], golds[1:-1], preds[1:-1]):
            lines.append(f"{w} {t} {p}\n")
            y_true.append(hp.tag2idx[t])
            y_pred.append(hp.tag2idx[p])
        lines.append("\n")

    ## calc metric
    # Pin the label ids so target_names always line up with them (sklearn
    # otherwise infers the label set from the data).
    labels = list(hp.tag2idx.values())[1:]
    target_names = list(hp.tag2idx.keys())[1:]

    print(classification_report(y_true, y_pred, labels=labels, target_names=target_names))

    final_report = classification_report(y_true, y_pred, labels=labels, target_names=target_names, output_dict=True)
    weighted = final_report['weighted avg']
    precision, recall, f1 = weighted['precision'], weighted['recall'], weighted['f1-score']

    # Token accuracy is computed directly: the report dict does not always
    # carry an 'accuracy' entry.
    n_scored = len(y_true)
    accuracy = sum(t == p for t, p in zip(y_true, y_pred)) / n_scored if n_scored else 0.0

    final = f + ".P%.2f_R%.2f_F%.2f" %(precision, recall, f1)
    with open(final, 'w') as fout:
        fout.write("".join(lines) + "\n")

        fout.write(f"accuracy={accuracy}\n")
        fout.write(f"precision={precision}\n")
        fout.write(f"recall={recall}\n")
        fout.write(f"f1={f1}\n")

    return precision, recall, f1

In [11]:
### Combine devel and train file to get more data for training

# Read both splits.
with open("./data/BioNLP13CG-IOB/train.tsv") as fp:
    train_data = fp.read()

with open("./data/BioNLP13CG-IOB/devel.tsv") as fp:
    dev_data = fp.read()

# Sentences are separated by a blank line (NerDataset splits on it). Join the
# two files with exactly one blank line so the last train sentence and the
# first devel sentence cannot fuse into one, whatever trailing/leading
# newlines each file happens to have.
train_data = train_data.strip() + "\n\n" + dev_data.strip() + "\n"

with open('./data/BioNLP13CG-IOB/train_dev.tsv', 'w') as fp:
    fp.write(train_data)

In [None]:
# The second argument selects the tag vocabulary (here the BioNLP13CG tag set).
train_dataset = NerDataset("./data/BioNLP13CG-IOB/train_dev.tsv", 'bionlp3g')
# NOTE(review): the per-epoch evaluation below runs on test.tsv.
eval_dataset = NerDataset("./data/BioNLP13CG-IOB/test.tsv", 'bionlp3g')
hp = HParams('bionlp3g')

# Define model
model = Net(vocab_len = len(hp.VOCAB), device=hp.device)
model.to(hp.device)  # hp.device is 'cuda' exactly when CUDA is available
model.train()

train_iter = data.DataLoader(dataset=train_dataset,
                            batch_size=hp.batch_size,
                            shuffle=True,
                            num_workers=4,
                            collate_fn=pad)
eval_iter = data.DataLoader(dataset=eval_dataset,
                            batch_size=hp.batch_size,
                            shuffle=False,
                            num_workers=4,
                            collate_fn=pad)

optimizer = optim.Adam(model.parameters(), lr = hp.lr)
# Tag id 0 is <PAD> (non-head word pieces and batch padding): excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)

# Create the checkpoint directory once, before the loop.
checkpoint_dir = 'checkpoints_bionlp3g'
os.makedirs(checkpoint_dir, exist_ok=True)

for epoch in range(1, hp.n_epochs+1):
    train(model, train_iter, optimizer, criterion)
    print(f"=========eval at epoch={epoch}=========")
    fname = os.path.join(checkpoint_dir, str(epoch))
    precision, recall, f1 = eval(model, eval_iter, fname)
    print(f"weighted precision={precision:.4f}, recall={recall:.4f}, f1={f1:.4f}")
    torch.save(model.state_dict(), f"{fname}.pt")

words: [CLS] In all cases , the somatostatin receptors were localized in veins , particularly in the smooth - muscle cell layer . [SEP]


tags: <PAD> O O O O O B-Gene_or_gene_product I-Gene_or_gene_product O O O B-Multi-tissue_structure O O O O B-Tissue I-Tissue I-Tissue I-Tissue I-Tissue O <PAD>


step: 0, loss: 3.5331735610961914
step: 10, loss: 2.2306199073791504
step: 20, loss: 1.429896593093872
step: 30, loss: 1.1847739219665527


  _warn_prf(average, modifier, msg_start, len(result))


                                   precision    recall  f1-score   support

                     B-Amino_acid       0.00      0.00      0.00        62
              B-Anatomical_system       0.00      0.00      0.00        17
                         B-Cancer       0.00      0.00      0.00       924
                           B-Cell       0.00      0.00      0.00      1013
             B-Cellular_component       0.00      0.00      0.00       180
B-Developing_anatomical_structure       0.00      0.00      0.00        17
           B-Gene_or_gene_product       0.00      0.00      0.00      2520
   B-Immaterial_anatomical_entity       0.00      0.00      0.00        31
         B-Multi-tissue_structure       0.00      0.00      0.00       303
                          B-Organ       0.00      0.00      0.00       156
                       B-Organism       0.00      0.00      0.00       518
           B-Organism_subdivision       0.00      0.00      0.00        39
             B-Organism_

In [None]:
# Inspect the batch size the training DataLoader was built with (it was passed hp.batch_size).
train_iter.batch_size

In [None]:
from transformers import AutoTokenizer, AutoModel

# Load the BioBERT tokenizer on its own for ad-hoc inspection.
tokenizer = AutoTokenizer.from_pretrained("monologg/biobert_v1.0_pubmed_pmc")

# NOTE(review): this rebinds the module-level name `model`, which the training cell
# assigned to the `Net` instance; after this cell runs, `model` is the bare encoder.
model = AutoModel.from_pretrained("monologg/biobert_v1.0_pubmed_pmc")


In [None]:
# Inspect the vocabulary size reported by the tokenizer loaded in the previous cell.
tokenizer.vocab_size