In [1]:
!pip install transformers

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_latest_p36/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
!pip install sklearn_crfsuite

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_latest_p36/bin/python -m pip install --upgrade pip' command.[0m


### Set the Parameters

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils import data
from new_model import Net
from data_load import NerDataset, pad, HParams
import os
import numpy as np
import parameters
from collections import OrderedDict
from sklearn.metrics import classification_report

from transformers import AutoTokenizer, AutoModelWithLMHead, AutoModel

In [10]:
import os
os.getcwd()

'/home/ec2-user/SageMaker/SolvingAlmostAnythingWithBert/biobert_ner'

#### Load the data

In [11]:
import numpy as np 
from torch.utils import data 
import parameters
import torch 
from pytorch_pretrained_bert import BertTokenizer


class HParams:
    """Label vocabulary, training constants and tokenizer for one NER corpus.

    Args:
        vocab_type: key into ``VOCAB_DICT`` ('bc5cdr' or 'bionlp3g'); any
            other value raises ``KeyError``.

    Attributes set on the instance:
        VOCAB_DICT / VOCAB: all known tag sets / the tag tuple selected by ``vocab_type``.
        tag2idx / idx2tag: tag string <-> position in ``VOCAB`` ('<PAD>' is index 0).
        batch_size, lr, n_epochs: fixed training constants.
        tokenizer: ``AutoTokenizer`` loaded from "monologg/biobert_v1.0_pubmed_pmc".
        device: 'cuda' when ``torch.cuda.is_available()``, otherwise 'cpu'.
    """

    def __init__(self, vocab_type):
        # BioNLP entity types; the tag set is every B- tag followed by every I- tag.
        bionlp_entities = (
            'Amino_acid', 'Anatomical_system', 'Cancer', 'Cell',
            'Cellular_component', 'Developing_anatomical_structure',
            'Gene_or_gene_product', 'Immaterial_anatomical_entity',
            'Multi-tissue_structure', 'Organ', 'Organism',
            'Organism_subdivision', 'Organism_substance',
            'Pathological_formation', 'Simple_chemical', 'Tissue',
        )
        self.VOCAB_DICT = {
            'bc5cdr': ('<PAD>', 'O', 'B-Chemical', 'B-Disease', 'I-Disease', 'I-Chemical'),
            'bionlp3g': (
                ('<PAD>', 'O')
                + tuple('B-' + entity for entity in bionlp_entities)
                + tuple('I-' + entity for entity in bionlp_entities)
            ),
        }
        self.VOCAB = self.VOCAB_DICT[vocab_type]
        self.tag2idx = {tag: index for index, tag in enumerate(self.VOCAB)}
        self.idx2tag = dict(enumerate(self.VOCAB))

        self.batch_size = 128
        self.lr = 0.0001
        self.n_epochs = 30

        self.tokenizer = AutoTokenizer.from_pretrained("monologg/biobert_v1.0_pubmed_pmc")
        if torch.cuda.is_available():
            self.device = 'cuda'
        else:
            self.device = 'cpu'


class NerDataset(data.Dataset):
    """Sentence-level NER dataset read from a one-token-per-line file.

    The file holds one sentence per blank-line separated entry. On every
    non-blank line the first whitespace-separated field is the word and the
    last field is its tag. Each sentence is wrapped in "[CLS]" / "[SEP]",
    which are labelled "<PAD>".

    Args:
        path: path of the annotated text file.
        vocab_type: key passed to ``HParams`` to select the tag vocabulary.
    """

    def __init__(self, path, vocab_type):
        self.hp = HParams(vocab_type)
        # Context manager so the file handle is closed after reading.
        with open(path) as fin:
            instances = fin.read().strip().split('\n\n')
        sents = []
        tags_li = []
        for entry in instances:
            # Three or more consecutive newlines leave blank lines (or a fully
            # empty entry) behind after the '\n\n' split; skip them instead of
            # failing on `line.split()[0]`.
            rows = [line.split() for line in entry.splitlines() if line.strip()]
            if not rows:
                continue
            words = [row[0] for row in rows]
            tags = [row[-1] for row in rows]
            sents.append(["[CLS]"] + words + ["[SEP]"])
            tags_li.append(["<PAD>"] + tags + ["<PAD>"])
        self.sents, self.tags_li = sents, tags_li

    def __len__(self):
        """Number of sentences."""
        return len(self.sents)

    def __getitem__(self, idx):
        """Return ``(words, x, is_heads, tags, y, seqlen)`` for sentence ``idx``.

        words / tags: space-joined strings (including [CLS]/[SEP] and their <PAD> tags).
        x: token ids of all word pieces.
        is_heads: 1 for the first piece of each word, 0 for the remaining pieces.
        y: tag ids; only the first piece of a word carries the word's tag,
           the other pieces get the "<PAD>" id.
        seqlen: number of word pieces (``len(x) == len(y) == len(is_heads)``).
        """
        words, tags = self.sents[idx], self.tags_li[idx]  # words, tags: string list

        tokenizer = self.hp.tokenizer
        x, y = [], []  # list of ids
        is_heads = []  # list. 1: the token is the first piece of a word
        for w, t in zip(words, tags):
            if w in ("[CLS]", "[SEP]"):
                tokens = [w]
            else:
                # If the tokenizer yields no pieces for a word, the word would
                # still get one label and one head flag but no id, breaking the
                # alignment checked below; represent it by the unknown token.
                tokens = tokenizer.tokenize(w) or [tokenizer.unk_token]
            xx = tokenizer.convert_tokens_to_ids(tokens)

            is_head = [1] + [0] * (len(tokens) - 1)

            piece_tags = [t] + ["<PAD>"] * (len(tokens) - 1)  # <PAD>: no decision
            yy = [self.hp.tag2idx[each] for each in piece_tags]  # (T,)

            x.extend(xx)
            is_heads.extend(is_head)
            y.extend(yy)

        assert len(x)==len(y)==len(is_heads), f"len(x)={len(x)}, len(y)={len(y)}, len(is_heads)={len(is_heads)}"

        # seqlen
        seqlen = len(y)

        # to string
        words = " ".join(words)
        tags = " ".join(tags)
        return words, x, is_heads, tags, y, seqlen


def pad(batch):
    '''Collate samples into a batch, right-padding ids to the longest sample.

    Each sample is the 6-tuple produced by ``NerDataset.__getitem__``:
    (words, x, is_heads, tags, y, seqlen).

    Returns (words, x, is_heads, tags, y, seqlens) where ``x`` and ``y`` are
    LongTensors padded with 0 to the largest ``seqlen``; the other fields are
    plain per-sample lists and are not padded.
    '''
    words = [sample[0] for sample in batch]
    is_heads = [sample[2] for sample in batch]
    tags = [sample[3] for sample in batch]
    seqlens = [sample[-1] for sample in batch]
    maxlen = np.array(seqlens).max()

    def right_pad(field):
        # 0: <pad>
        return [sample[field] + [0] * (maxlen - len(sample[field])) for sample in batch]

    x = torch.LongTensor(right_pad(1))
    y = torch.LongTensor(right_pad(-2))
    return words, x, is_heads, tags, y, seqlens

#### Define the Model

In [12]:
import torch
import torch.nn as nn
from pytorch_pretrained_bert import BertModel

class Net(nn.Module):
    """BioBERT encoder followed by a 2-layer BiLSTM and a linear tag classifier.

    The encoder is used as a fixed feature extractor: its forward pass runs
    under ``torch.no_grad()`` and it is kept in eval mode (no dropout) even
    while the rest of the network is in training mode.

    Args:
        vocab_len: number of output classes (size of the tag vocabulary).
        device: device the inputs are moved to in ``forward``. The module
            itself is NOT moved; the caller must place it on the same device.
    """

    def __init__(self, vocab_len, device = 'cpu'):
        super().__init__()
        self.bert = AutoModel.from_pretrained("monologg/biobert_v1.0_pubmed_pmc")
        self.bert.eval()
        # Bidirectional with hidden_size 768//2, so the LSTM output width is 768 again.
        self.rnn = nn.LSTM(bidirectional=True, num_layers=2, input_size=768, hidden_size=768//2, batch_first=True)
        self.fc = nn.Linear(768, vocab_len)
        self.device = device

    def train(self, mode=True):
        """Switch the trainable layers between train/eval; the encoder stays in eval mode.

        ``nn.Module.train`` propagates ``mode`` to every child, which would
        undo the ``self.bert.eval()`` from ``__init__`` and enable the
        encoder's dropout while it is only used for no-grad feature extraction.
        """
        super().train(mode)
        self.bert.eval()
        return self

    def forward(self, x, y):
        '''
        x: (N, T). int64
        y: (N, T). int64

        Returns
        logits: (N, T, VOCAB)
        y: the input labels, moved to ``self.device``
        y_hat: (N, T) argmax of ``logits`` over the class axis
        '''
        x = x.to(self.device)
        y = y.to(self.device)

        with torch.no_grad():
            # Index 0 is the per-token hidden state. Indexing works whether the
            # encoder returns a plain tuple or a dict-like output object, whereas
            # tuple-unpacking a dict-like output would iterate over its keys.
            encoded_layers = self.bert(x)[0]
        enc, _ = self.rnn(encoded_layers)
        logits = self.fc(enc)
        y_hat = logits.argmax(-1)
        return logits, y, y_hat

### Train the model

In [13]:
### Train the model
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator``.

    Args:
        model: module called as ``model(x, y)`` and returning ``(logits, y, y_hat)``.
        iterator: yields batches shaped like the output of ``pad``:
            (words, x, is_heads, tags, y, seqlens).
        optimizer: optimizer stepped once per batch.
        criterion: loss called with logits flattened to (N*T, VOCAB) and labels
            flattened to (N*T,).

    Prints the first sample of the first batch as a sanity check and the loss
    every 10 steps. Returns nothing.
    """
    model.train()
    for step, batch in enumerate(iterator):
        words, x, _, tags, y, _ = batch

        optimizer.zero_grad()
        logits, y, _ = model(x, y)  # logits: (N, T, VOCAB), y: (N, T)

        n_classes = logits.shape[-1]
        flat_logits = logits.view(-1, n_classes)  # (N*T, VOCAB)
        flat_labels = y.view(-1)  # (N*T,)

        loss = criterion(flat_logits, flat_labels)
        loss.backward()
        optimizer.step()

        if step == 0:
            print("=====sanity check======")
            print("words:", words[0])
            print("\n")
            print("tags:", tags[0])
            print("\n")

        if step % 10 == 0:  # monitoring
            print(f"step: {step}, loss: {loss.item()}")

def eval(model, iterator, f, hparams=None):
    """Evaluate ``model`` on ``iterator`` and write predictions plus metrics to disk.

    Note: the name shadows the ``eval`` builtin; it is kept because other
    cells call it under this name.

    Args:
        model: module called as ``model(x, y)`` and returning ``(logits, y, y_hat)``.
        iterator: yields batches shaped like the output of ``pad``.
        f: path of a temporary result file. The final file is named
            ``f + ".P<precision>_R<recall>_F<f1>"`` and ``f`` itself is removed.
        hparams: object providing ``tag2idx`` / ``idx2tag``. Defaults to the
            notebook-level ``hp``.

    Returns:
        (precision, recall, f1): the support-weighted averages from
        ``classification_report`` over all tags except '<PAD>'.
    """
    if hparams is None:
        hparams = hp  # notebook-level HParams instance
    model.eval()

    Words, Is_heads, Tags, Y_hat = [], [], [], []
    with torch.no_grad():
        for batch in iterator:
            words, x, is_heads, tags, y, _ = batch

            _, _, y_hat = model(x, y)  # y_hat: (N, T)

            Words.extend(words)
            Is_heads.extend(is_heads)
            Tags.extend(tags)
            Y_hat.extend(y_hat.cpu().numpy().tolist())

    ## gets results and save
    # Gold/predicted ids are collected while writing, so the file does not
    # have to be re-read (and re-opened) to compute the metrics.
    y_true, y_pred = [], []
    with open(f, 'w') as fout:
        for words, is_heads, tags, y_hat in zip(Words, Is_heads, Tags, Y_hat):
            # Keep only the prediction made on the first piece of every word.
            y_hat = [hat for head, hat in zip(is_heads, y_hat) if head == 1]
            preds = [hparams.idx2tag[hat] for hat in y_hat]
            tokens, gold = words.split(), tags.split()
            assert len(preds)==len(tokens)==len(gold)
            # [1:-1] drops the [CLS] / [SEP] positions.
            for w, t, p in zip(tokens[1:-1], gold[1:-1], preds[1:-1]):
                fout.write(f"{w} {t} {p}\n")
                y_true.append(hparams.tag2idx[t])
                y_pred.append(hparams.tag2idx[p])
            fout.write("\n")

    ## calc metric
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    # Every tag except '<PAD>' (index 0). `labels` is passed explicitly so the
    # report always has one row per name: without it the call raises when a
    # tag never occurs or when the model predicts '<PAD>' for a word.
    target_names = list(hparams.tag2idx.keys())[1:]
    labels = [hparams.tag2idx[name] for name in target_names]

    print(classification_report(y_true, y_pred, labels=labels, target_names=target_names))

    final_report = classification_report(y_true, y_pred, labels=labels, target_names=target_names, output_dict=True)

    weighted = final_report['weighted avg']
    precision, recall, f1 = weighted['precision'], weighted['recall'], weighted['f1-score']
    # Computed directly: the report only carries an 'accuracy' entry when the
    # given labels cover every value present in the data.
    accuracy = float((y_true == y_pred).mean()) if y_true.size else 0.0

    # The metrics of THIS evaluation go into the filename.
    final = f + ".P%.2f_R%.2f_F%.2f" %(precision, recall, f1)
    with open(f, "r") as fin:
        result = fin.read()
    with open(final, 'w') as fout:
        fout.write(f"{result}\n")

        fout.write(f"accuracy={accuracy}\n")
        fout.write(f"precision={precision}\n")
        fout.write(f"recall={recall}\n")
        fout.write(f"f1={f1}\n")

    os.remove(f)
    return precision, recall, f1

In [None]:
# BC5CDR corpus in IOB format; 'bc5cdr' selects the matching tag vocabulary.
train_dataset = NerDataset("./../../MTL-Bioinformatics-2016/data/BC5CDR-IOB/train.tsv", 'bc5cdr')
eval_dataset = NerDataset("./../../MTL-Bioinformatics-2016/data/BC5CDR-IOB/test.tsv", 'bc5cdr')
hp = HParams('bc5cdr')

# Model: placed on the GPU when one is available, then switched to training mode.
model = Net(vocab_len=len(hp.VOCAB), device=hp.device)
if torch.cuda.is_available():
    model.cuda()
model.train()

# Both loaders share everything except the dataset and the shuffle flag.
loader_kwargs = dict(batch_size=hp.batch_size, num_workers=4, collate_fn=pad)
train_iter = data.DataLoader(dataset=train_dataset, shuffle=True, **loader_kwargs)
eval_iter = data.DataLoader(dataset=eval_dataset, shuffle=False, **loader_kwargs)

optimizer = optim.Adam(model.parameters(), lr=hp.lr)
# ignore_index=0: positions labelled '<PAD>' (index 0) do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)

# One training pass, one evaluation and one saved state dict per epoch.
for epoch in range(1, hp.n_epochs + 1):
    train(model, train_iter, optimizer, criterion)
    print(f"=========eval at epoch={epoch}=========")
    if not os.path.exists('checkpoints'):
        os.makedirs('checkpoints')
    fname = os.path.join('checkpoints', str(epoch))
    precision, recall, f1 = eval(model, eval_iter, fname)
    torch.save(model.state_dict(), f"{fname}.pt")

words: [CLS] Previous reports have suggested that pain associated with the injection of lidocaine is related to the acidic pH of the solution . [SEP]


tags: <PAD> O O O O O B-Disease O O O O O B-Chemical O O O O O O O O O O <PAD>


step: 0, loss: 1.8316820859909058


In [None]:
train_iter.batch_size

In [None]:
from transformers import AutoTokenizer, AutoModel

# Stand-alone tokenizer / encoder for inspection only. The encoder gets its own
# name so that `model` (the Net trained in the training cell) is not overwritten.
BIOBERT_CHECKPOINT = "monologg/biobert_v1.0_pubmed_pmc"

tokenizer = AutoTokenizer.from_pretrained(BIOBERT_CHECKPOINT)

bert_model = AutoModel.from_pretrained(BIOBERT_CHECKPOINT)


In [None]:
tokenizer.vocab_size

In [None]:
train_dataset = NerDataset("./../../MTL-Bioinformatics-2016/data/BC5CDR-IOB/train.tsv", 'bc5cdr')  # here bc5cdr is dataset type
hp = HParams('bc5cdr')
# Net.forward moves its inputs to hp.device but does not move the module, so
# the weights have to be placed on that device here (a no-op on CPU).
model = Net(vocab_len = len(hp.VOCAB), device=hp.device).to(hp.device)
# Small batches: this loader is only used for the single-batch shape checks below.
train_iter = data.DataLoader(dataset=train_dataset,
                            batch_size=4,
                            shuffle=True,
                            num_workers=4,
                            collate_fn=pad)

In [None]:
# Minimal CrossEntropyLoss example: random scores for 3 samples over 5 classes
# against random integer targets in [0, 5).
n_samples, n_classes = 3, 5
loss = nn.CrossEntropyLoss()

input = torch.randn(n_samples, n_classes, requires_grad=True)
print(input.shape)

target = torch.empty(n_samples, dtype=torch.long).random_(n_classes)
print(target.shape)

output = loss(input, target)

In [None]:
# Optimizer and loss for whatever `model` is currently bound in the kernel.
optimizer = optim.Adam(model.parameters(), lr = hp.lr)
# optimizer = optim.SGD(model.parameters(), lr=0.0001, momentum=0.9)
# ignore_index=0: targets equal to 0 ('<PAD>', also the fill value used by pad()) are skipped by the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)
len(train_iter)

In [None]:
# Debug probe: run the forward pass on the first batch only. `logits` and `y`
# are left in the kernel on purpose; the following cells inspect their shapes.
for i, batch in enumerate(train_iter):
    #print(batch[0],batch[4])
    words, x, is_heads, tags, y, seqlens = batch
    _y = y # for monitoring
    optimizer.zero_grad()
    logits, y, _ = model(x, y)  # Net.forward returns (logits, y moved to the model's device, argmax predictions)
    break  # only the first batch is needed

In [None]:
logits.shape

In [None]:
logits.shape[-1]

In [None]:
logits.view(-1,6).shape

In [None]:
logits = logits.view(-1, logits.shape[-1])
logits.shape

In [None]:
y = y.view(-1)
y.shape

In [None]:
hp.VOCAB

In [None]:
f = os.path.join('checkpoints', '10.P0.97_R0.97_F0.97')
f

In [None]:
# Read gold / predicted tag ids back from a result file written by eval().
# Only the first MAX_LINES lines of the file are inspected.
MAX_LINES = 2001

with open(f, 'r') as fin:  # closed as soon as the text has been read
    result_lines = fin.read().splitlines()[:MAX_LINES]

y_true = []
y_pred = []
for line in result_lines:
    fields = line.split()
    # Token lines are "<word> <gold tag> <predicted tag>". Sentence separators
    # and the trailing "accuracy=..." style metric lines have fewer fields and
    # are skipped.
    if len(fields) == 3:
        y_true.append(hp.tag2idx[fields[1]])
        y_pred.append(hp.tag2idx[fields[2]])

In [None]:

# Report over every tag except '<PAD>' (index 0). `labels` is passed explicitly
# so the rows always line up with `target_names`, even when a tag does not
# occur in this sample of lines.
target_names = list(hp.tag2idx.keys())[1:]
labels = [hp.tag2idx[name] for name in target_names]
print(classification_report(y_true, y_pred, labels=labels, target_names=target_names, output_dict=True))

In [None]:
final_report = classification_report(y_true, y_pred, target_names=target_names,output_dict=True)

In [None]:
print(final_report["accuracy"])
print(final_report["weighted avg"]['precision'])
print(final_report["weighted avg"]['recall'])
print(final_report["weighted avg"]['f1-score'])


In [None]:
len(y_pred)

In [None]:
y_pred = np.array(y_pred)
y_pred[y_pred > 1].shape

In [None]:
# Build the array from the gold labels themselves (not from y_pred).
# Indices above 1 are entity tags: 0 is '<PAD>' and 1 is 'O'.
y_true = np.array(y_true)
y_true[y_true > 1].shape

In [None]:
# Collect predictions for the whole evaluation set (same loop as in eval()).
model.eval()  # inference mode, consistent with eval()
Words, Is_heads, Tags, Y, Y_hat = [], [], [], [], []
with torch.no_grad():
    for batch in eval_iter:
        words, x, is_heads, tags, y, seqlens = batch

        _, _, y_hat = model(x, y)  # y_hat: (N, T)

        Words.extend(words)
        Is_heads.extend(is_heads)
        Tags.extend(tags)
        Y.extend(y.numpy().tolist())
        Y_hat.extend(y_hat.cpu().numpy().tolist())
        
        

In [None]:
def _head_labels(label_ids, head_flags):
    """Labels at word-initial pieces only, without the first and last ([CLS]/[SEP]) positions."""
    kept = [label for flag, label in zip(head_flags, label_ids) if flag == 1]
    return kept[1:-1]


# One list of label ids per sentence, for predictions and gold labels alike.
y_pred = [_head_labels(pred_ids, heads) for heads, pred_ids in zip(Is_heads, Y_hat)]
y_true = [_head_labels(gold_ids, heads) for heads, gold_ids in zip(Is_heads, Y)]
            






In [None]:
len(y_pred)

In [None]:
len(y_true)

In [None]:
from sklearn_crfsuite.metrics import flat_classification_report

In [None]:
# FIXME: `X` is not defined in any visible cell of this notebook; on a fresh kernel this raises NameError.
print(X)

In [None]:

    
def _flatten_labels(seqs):
    """Concatenate per-sentence label lists (or an already flat sequence) into one 1-D int array."""
    parts = [np.atleast_1d(np.asarray(seq, dtype=np.int64)) for seq in seqs]
    return np.concatenate(parts) if parts else np.zeros(0, dtype=np.int64)


# The boolean masks below need flat arrays, while y_true / y_pred may hold one
# list per sentence at this point.
y_true_flat = _flatten_labels(y_true)
y_pred_flat = _flatten_labels(y_pred)

is_correct = y_true_flat == y_pred_flat
is_labelled = y_true_flat > 0  # everything except '<PAD>' (index 0)

total_sample = int(is_labelled.sum())
total_num_correct = int((is_correct & is_labelled).sum())

print(f"Total num of samples:{total_sample}")
print(f"Total num correctly predicted:{total_num_correct}")
print("\n")

# Explicit guard instead of catching ZeroDivisionError, which NumPy scalar
# division does not raise.
precision = total_num_correct / total_sample if total_sample else 1.0

is_entity = y_true_flat > 1  # indices above 1 are entity tags ('O' is 1)
sample_with_entity = int(is_entity.sum())
entity_correct = int((is_correct & is_entity).sum())

print(f"Total samples with real entity:{sample_with_entity}")
print(f"entity_correct: {entity_correct}")

In [None]:
# Flatten before comparing: `==` on two Python lists gives a single bool, which has no .sum().
(np.hstack(y_pred) == np.hstack(y_true)).sum()

In [None]:
list(hp.tag2idx.keys())