In [1]:
!pip install transformers
!pip install nltk
!pip install stanza
!pip install scispacy==0.3.0
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_core_sci_sm-0.3.0.tar.gz
!pip install pytorch-crf

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/81/91/61d69d58a1af1bd81d9ca9d62c90a6de3ab80d77f27c5df65d9a2c1f5626/transformers-4.5.0-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.2MB 6.7MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/08/cd/342e584ee544d044fb573ae697404ce22ede086c9e87ce5960772084cad0/sacremoses-0.0.44.tar.gz (862kB)
[K     |████████████████████████████████| 870kB 29.3MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 37.7MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.44-cp37-none-any.whl size=886084 sha256=b7632

In [2]:
import os
import numpy as np
import pandas as pd

import stanza
import spacy
import re
import en_core_sci_sm

from transformers import BertTokenizer, BertModel
# from transformers import BertForTokenClassification, AdamW

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from torchcrf import CRF

import random
import itertools

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
train_path = "/content/drive/MyDrive/MeasEval/final_data/train/text/"
val_path = "/content/drive/MyDrive/MeasEval/final_data/dev/text/"
model_save_path = '/content/drive/MyDrive/MeasEval/scibert_crf_quant_ent_2iter.pt'

testtext_path = '/content/drive/MyDrive/MeasEval/final_data/testtext/'

In [6]:
typemap = {"Quantity": "QUANT",
           "MeasuredEntity": "ME", 
           "MeasuredProperty": "MP", 
           "Qualifier": "QUAL"}

In [7]:
tokenizer = BertTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=227845.0, style=ProgressStyle(descripti…




In [8]:
nlp = en_core_sci_sm.load()
def sen_split(text):
  doc = nlp(text)
  sen = [s.text for s in doc.sents]
  return sen

In [9]:
def length(text):
    l = 0
    for i in range(len(text)):
        if text[i] != '#':
            l += 1
    return l

In [10]:
stanza.download('en')
nlp2 = stanza.Pipeline(lang='en', processors='tokenize', tokenize_no_ssplit=True)

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.2.0.json: 128kB [00:00, 26.3MB/s]                    
2021-04-12 16:19:35 INFO: Downloading default packages for language: en (English)...
Downloading http://nlp.stanford.edu/software/stanza/1.2.0/en/default.zip: 100%|██████████| 411M/411M [01:21<00:00, 5.06MB/s]
2021-04-12 16:21:02 INFO: Finished downloading models and saved to /root/stanza_resources.
2021-04-12 16:21:02 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |

2021-04-12 16:21:02 INFO: Use device: gpu
2021-04-12 16:21:02 INFO: Loading: tokenize
2021-04-12 16:21:21 INFO: Done loading processors!


In [11]:
def test_preprocess(text):
    sen_lis = sen_split(text=text)
    pre =[]
    len_p = 0
    for sente in sen_lis:
        sen = nlp2(sente)
        sentok=[]
        temp = []
        for word in sen.sentences[0].words[:]:
            rng = re.findall(r'\d+', word.misc)
            bert_tok = tokenizer.tokenize(word.text)
            sentok.extend(bert_tok)
        if len(sentok) > 255:
            sentok = sentok[:255]
        pre.append(sentok)

    test = []
    # print('pre')
    for val in pre:
        # lab = np.zeros(256)
        tok_arr = np.zeros(256)
        att_mask = np.zeros(256)
        sen_tok = []
        tok_arr[0] = 102
        att_mask[0] = 1
        for tok in val:
            sen_tok.append(tokenizer.convert_tokens_to_ids(tok))
        # print(len(val))
        # print(val)
        for i in range(len(val)):
            # lab[i+1] = val[1][i]
            tok_arr[i+1] = sen_tok[i]
            att_mask[i+1] = 1
        test.append([tok_arr,att_mask])
        # print('end')
    return test

In [12]:
tokenizer = BertTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
model = BertModel.from_pretrained('allenai/scibert_scivocab_uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=385.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442221694.0, style=ProgressStyle(descri…




In [13]:
class BERT_Arch(nn.Module):

    def __init__(self, bert, embed_dim, hidden_dim, drop_prob, n_layers, out_dim):
      
      super(BERT_Arch, self).__init__()
      self.bert = bert 
      self.dropout = nn.Dropout(drop_prob)
      self.fc1 = nn.Linear(2*embed_dim,out_dim)
      self.w1 = nn.Linear(embed_dim, embed_dim)
      self.w2 = nn.Linear(embed_dim, embed_dim)
      #self.bilstm = nn.LSTM(embed_dim, hidden_dim,  bidirectional=True, batch_first=True)
      self.softmax = nn.LogSoftmax(dim = 2)
      self.crf = CRF(3, batch_first=True)  
      self.tanh = nn.Tanh()

    #define the forward pass
    def forward(self, sent_id, mask_val, labels=None):
      x = self.bert(sent_id, attention_mask=mask_val)
      x = x.last_hidden_state
      x = self.tanh(x)
      cls = x[:,0,:]
      cls = cls.unsqueeze(1).repeat(1, 256, 1)
      cls = self.w1(cls)
      x = self.w2(x)
      x = torch.cat([x,cls], dim = 2)
      #x,_ = self.bilstm(x)
      x = self.dropout(x)
      x = self.fc1(x)
      mask_val = mask_val.type(torch.uint8)
      logit = self.softmax(x)
      if labels is not None:
          loss = -self.crf(logit, labels, mask=mask_val, reduction='mean')
          return loss
      else:
          prediction = self.crf.decode(x, mask=mask_val)
          return prediction

In [14]:
bert_model = BERT_Arch(model, 768, 64, 0.1, 1,3)
bert_model = bert_model.to(device)

In [15]:
# load pretrained to use directly
model_save_path = '/content/drive/MyDrive/MeasEval/scibert_crf_quant_ent_2iter.pt'

bert_model = torch.load(model_save_path)

In [16]:
# read files

testtextset = {}
# for fileset in testtext_path:
for fn in os.listdir(testtext_path):
    with open(os.path.join(testtext_path,fn)) as textfile:
        text = textfile.read() #.splitlines()
        #print(fn[:-4])
        testtextset[fn[:-4]] = text


In [17]:
#pre-process, pred and 

quant_ent ={}
for docId, text in testtextset.items():
    print(docId)
    if docId == 'S0925443913001385-1319':
        continue
    quant_ent[docId] = []
    annotSet = 1
    # text = testtextset[docId]
    test_info = test_preprocess(text)
    # print(test_info)
    x_test_id = np.zeros((0,256))
    x_test_mask = np.zeros((0,256))
    for test in test_info:
        x_test_id = np.vstack((x_test_id, test[0]))
        x_test_mask = np.vstack((x_test_mask, test[1]))
    
    # print(len(x_test_id))
    test_data = TensorDataset(torch.from_numpy(x_test_id), torch.from_numpy(x_test_mask))
    # print(len(test_data))
    batch_size = len(test_info)
    # print(len(test_info))
    test_loader = DataLoader(test_data, shuffle=False, batch_size = batch_size)
    # print(len(test_loader))
    bert_model.zero_grad()
    bert_model.eval()
    for seq, mask in test_loader:
        # print(seq)
        # print(mask)
        y_pred = bert_model(seq.long().to(device), mask.long().to(device))
        # print(y_pred)
        sen_lis = sen_split(text=text)
        len_p =0
        k = 0
        for sente in sen_lis:
            # tokenized_sen = tokenizer.tokenize(sente)
            sen = nlp2(sente)
            tokenized_sen=[]
            for word in sen.sentences[0].words[:]:
                rng = re.findall(r'\d+', word.misc)
                bert_tok = tokenizer.tokenize(word.text)
                tokenized_sen.extend(bert_tok)
            if len(tokenized_sen)>255:
                tokenized_sen = tokenized_sen[:255]
            # print((tokenized_sen))
            # print(len(y_pred[k]))
            y_out = np.zeros(len(y_pred[k]))
            prev_2 = 0
            for i in range(len(y_out)):
                if y_pred[k][i] == 2:
                    y_out[i]=1
                    prev_2 = 1
                elif y_pred[k][i] == 1 and prev_2 == 1:
                    y_out[i]=1
                else:
                    prev_2 = 0
                    y_out[i]=0

            prop_word = []
            prev_word = ''
            y_new = []
            yp = 0
            i = 1
            
            for word in tokenized_sen:
                if len(word) == length(word) and i == 1:
                    prev_word = word
                    yp = y_out[i]
                elif len(word) == length(word):
                    prop_word.append(prev_word)
                    y_new.append(yp)
                    prev_word = word
                    yp = y_out[i]
                # elif 
                else :
                    prev_word = prev_word + word[2:]
                i+=1
            prop_word.append(prev_word)
            y_new.append(yp)
            prop_y = np.array(y_new)
            # print(prop_word)
            # print(y_new)
            sen = nlp2(sente)
            j = 0
            oldy = 0
            qt = ''
            so =0
            eo =0
            il = {}
            for z, w in enumerate(sen.sentences[0].words[:]):
                if prop_y[j] == 1 and j == 0:
                    j+=1
                    oldy = 1
                    qt += w.text
                    rng = re.findall(r'\d+', w.misc)
                    so = int(rng[0]) + len_p
                    continue
                elif prop_y[j] == 1 and oldy == 0:
                    j+=1
                    oldy =1
                    qt += w.text
                    rng = re.findall(r'\d+', w.misc)
                    so = int(rng[0]) + len_p
                    continue
                elif prop_y[j] == 1 and oldy == 1 and j != len(prop_y) and z != len(sen.sentences[0].words[:])-1:
                    j+=1
                    oldy =1
                    qt += ' '+w.text
                    continue
                elif prop_y[j] == 1 and (j == len(prop_y) or z == len(sen.sentences[0].words[:])-1):
                    qt += ' '+w.text
                    rng = re.findall(r'\d+', w.misc)
                    eo = int(rng[1]) + len_p
                    il['annotSet'] = annotSet
                    il['annotType'] = 'Quantity'
                    il['startoffset'] = so
                    il['endOffset'] = eo
                    il['annotId'] = str(docId) + str('<')+str(annotSet)+ str('>')
                    il['text'] = qt
                    il['other'] = '{"unit": N/A}'
                    quant_ent[docId].append(il)
                    annotSet += 1
                    so =0
                    eo = 0
                    qt = ''
                    oldy = 0
                    j+=1
                    continue
                elif prop_y[j] == 0 and oldy ==1:
                    rng = re.findall(r'\d+', w.misc)
                    eo = int(rng[0]) -1 +len_p
                    il['annotSet'] = annotSet
                    il['annotType'] = 'Quantity'
                    il['startoffset'] = so
                    il['endOffset'] = eo
                    il['annotId'] = str(docId) + str('<')+str(annotSet)+ str('>')
                    il['text'] = qt
                    il['other'] = '{"unit": N/A}'
                    quant_ent[docId].append(il)
                    annotSet += 1
                    so =0
                    eo = 0
                    qt = ''
                    oldy = 0
                    j+=1
                    continue
                else:
                    j+=1
                    oldy = 0
                    continue
                # if docId == 'S0025322712001600-2584':
                #     print(prop_y)
                #     print(w)
                #     print(len(prop_y))
                #     print(len(sen.sentences[0].words[:]))
                #     print(oldy)
                #     print(j)
                #     print(z)
                #     print(prop_y[j-1])
                #     print(qt)
                #     print(so)
                #     print(eo)
            len_p += len(sente) + 1
            k+=1

S2213671113001306-1385
S0301010413004096-767
S0960148113005727-1203
S0960148113005727-855
S0032063313003218-5269
S2213158213000582-1279
S2213158213000582-1041
S0032063312002437-593
S1359645413009816-1712
S1750583613004192-1126
S0378383913001567-7073
S0378112713005288-1800
S2213158213000582-1390
S0167610513001001-1566
S0167278913001450-12425
S0301010413004096-693
S2213671113000738-667
S0038071711004354-2389
S0257897213007573-574
S0016236113008041-3257
S2213158213000582-1050
S0167610513001001-1769
S0960148113004989-2841
S0927024813002961-1357
S0032386113005454-2886
S037842901300244X-1654
S0168945213001805-4574
S0019103512002801-1781
S0967064513002774-1376
S0025322712001600-2584
S1873506114000075-1104
S0925443913001385-1621
S037842901300244X-1427
S0925443913001385-1683
S0019103512002801-2075
S0378112713005288-1916
S0019103512003995-2579
S2213671113000921-756
S0925443913001385-849
S0022459611006116-1200
S2213158213000582-766
S2213158213000582-1469
S0378383912000130-1041
S1873506114000075-6

In [18]:
# making tsv files
header = "docId\tannotSet\tannotType\tstartOffset\tendOffset\tannotId\ttext\tother"
subdir = "/content/drive/MyDrive/MeasEval/final_data/sub/"
for docid, ils in quant_ent.items():
    if len(ils) > 0:        
        # print(docid)
        # print(il['annotSet'])
        iss = []
        for il in ils:
            qs = (str(docid) + '\t' + str(il['annotSet']) + '\tQuantity\t' + str(il['startoffset']) + '\t' + str(il['endOffset']) + '\t' + str(il['annotId']) + '\t' + str(il['text']) + '\t' + str(il['other']))
            iss.append(qs)
        iset = set(iss)
        iss = list(iset)
        iss.sort()
        sub = open(subdir+docid + ".tsv", "w")
        sub.write(header+"\n")
        for isl in iss:
            print(isl)
            sub.write(isl+'\n')
        sub.close()
        print(docid, 'written')
    else :
        print(docid, 'nothing to write')

S2213671113001306-1385	1	Quantity	310	323	S2213671113001306-1385<1>	beyond 1 week	{"unit": N/A}
S2213671113001306-1385 written
S0301010413004096-767	3	Quantity	257	274	S0301010413004096-767<3>	2.5 g ) was loaded	{"unit": N/A}
S0301010413004096-767	5	Quantity	412	417	S0301010413004096-767<5>	1 day	{"unit": N/A}
S0301010413004096-767	6	Quantity	579	590	S0301010413004096-767<6>	5 ± 0.2 K ) .	{"unit": N/A}
S0301010413004096-767	8	Quantity	662	680	S0301010413004096-767<8>	in order to ensure	{"unit": N/A}
S0301010413004096-767	9	Quantity	762	765	S0301010413004096-767<9>	5 K	{"unit": N/A}
S0301010413004096-767 written
S0960148113005727-1203	2	Quantity	402	419	S0960148113005727-1203<2>	% per 3 % increase	{"unit": N/A}
S0960148113005727-1203 written
S0960148113005727-855	2	Quantity	267	276	S0960148113005727-855<2>	to 21 % at	{"unit": N/A}
S0960148113005727-855	4	Quantity	521	523	S0960148113005727-855<4>	% .	{"unit": N/A}
S0960148113005727-855 written
S0032063313003218-5269	2	Quantity	265	267	S0