In [None]:
!pip install sklearn-crfsuite
!pip install quantities
!pip install transformers
!pip install nltk
!pip install cookiecutter
!pip install stanza
!pip install scispacy==0.3.0
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_core_sci_sm-0.3.0.tar.gz
!pip install pytorch-crf

Subtask-1

In [None]:
import spacy
import glob
import numpy as np
import pandas as pd
from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from quantities import units as u

# Locations of the MeasEval annotation files (tsv) and paragraph texts (text)
# for the training and evaluation splits.
data_path = "drive/MyDrive/MeasEval-main/data/train/tsv"
textdata_path = "drive/MyDrive/MeasEval-main/data/train/text"
test_data_path = "drive/MyDrive/MeasEval-main/data/eval/tsv"
test_textdata_path = "drive/MyDrive/MeasEval-main/data/eval/text"


# Lower-cased unit symbols and names, consulted by the `is_unit` features.
units = ['%','‰']
nlp = spacy.load('en_core_web_sm')
stop_words = nlp.Defaults.stop_words

for key, val in u.__dict__.items():
    # Keep only module attributes that have the same type as the unit `u.l`.
    if isinstance(val, type(u.l)):
        # Both the attribute name and the unit's own name are candidates.
        # The duplicate check is done on the lower-cased form that is actually
        # stored, so the same entry is never appended twice.
        for candidate in (key.lower(), val.name.lower()):
            if candidate not in units and candidate not in stop_words:
                units.append(candidate)

print(units)

def _token_window_features(entry, prefix, with_unit_and_stop):
    """Build the feature dict for one token tuple, with every key prefixed.

    `entry` is one element of a sentence list: a tuple indexed as
    (word, postag, label, lemma, dep, tag, is_stop, shape, token), where
    `token` exposes a `like_num` attribute. `prefix` is '' for the current
    token or an offset marker such as '-1:'. When `with_unit_and_stop` is
    true the `word.is_unit` and `word.is_stop()` features are added as well.
    """
    word = entry[0]
    postag = entry[1]
    lemma = entry[3]
    token = entry[8]

    feats = {
        prefix + 'word.lower()': word.lower(),
        prefix + 'word.lemma': lemma,
        prefix + 'word.shape': entry[7],
        prefix + 'word.dep': entry[4],
        prefix + 'word.like_num': token.like_num,
        prefix + 'word.istitle()': word.istitle(),
        prefix + 'word.isupper()': word.isupper(),
        prefix + 'word.isdigit()': word.isdigit(),
        prefix + 'postag': postag,
        prefix + 'postag[:2]': postag[:2],
    }
    if with_unit_and_stop:
        # `units` is the module-level unit vocabulary.
        feats[prefix + 'word.is_unit'] = (word in units) or (lemma in units)
        feats[prefix + 'word.is_stop()'] = entry[6]
    return feats


def word2features(sent, i):
    """Return the CRF feature dict for token `i` of `sent`.

    Features describe the current token and its neighbours at offsets -2, -1,
    +1 and +2. Only the current token and the +/-1 neighbours carry the
    `is_unit` and `is_stop()` features. Missing neighbours are signalled with
    the boolean markers 'BOS' (no -1), 'SBOS' (no -2) and 'SEOS' (no +2).

    NOTE(review): there is no marker for a missing +1 neighbour, and 'bais1'
    is a second constant feature next to 'bias' (looks like a typo). Both are
    kept as-is so the feature set stays unchanged.
    """
    features = {
        'bias': 1.0,
        'bais1': 1.0,
    }
    features.update(_token_window_features(sent[i], '', True))

    if i > 0:
        features.update(_token_window_features(sent[i-1], '-1:', True))
    else:
        features['BOS'] = True

    if i > 1:
        features.update(_token_window_features(sent[i-2], '-2:', False))
    else:
        features['SBOS'] = True

    if i < len(sent)-2:
        features.update(_token_window_features(sent[i+2], '+2:', False))
    else:
        features['SEOS'] = True

    if i < len(sent)-1:
        features.update(_token_window_features(sent[i+1], '+1:', True))

    return features


def sent2features(sent):
    """Return one `word2features` dict per token position of `sent`, in order."""
    feature_dicts = []
    for position in range(len(sent)):
        feature_dicts.append(word2features(sent, position))
    return feature_dicts

def sent2labels(sent):
    """Return the label (third field) of every 9-field token tuple in `sent`."""
    collected = []
    for _word, _postag, label, _lemma, _dep, _tag, _is_stop, _shape, _token in sent:
        collected.append(label)
    return collected

def sent2tokens(sent):
    """Return the last (ninth) field of every 9-field token tuple in `sent`.

    NOTE(review): the ninth field is what gets returned, not the word string
    in the first field — confirm this is the intended value.
    """
    collected = []
    for _word, _postag, _label, _lemma, _dep, _tag, _is_stop, _shape, token in sent:
        collected.append(token)
    return collected


# Build one token sequence per paragraph file, labelled B/I/O for Quantity
# spans, and split the sequences into train_sents / test_sents.
nlp = spacy.load("en_core_web_sm")
sents = []
train_sents = []
test_sents = []
labels = []
features = []

# Training files come first; everything after index `train_file_size` is
# from the evaluation split.
file_list = glob.glob(textdata_path+"/*")
train_file_size = len(file_list)
file_list+=glob.glob(test_textdata_path+"/*")
it = 0

test_file_list=[]
for files in file_list:
    is_train_file = it < train_file_size
    # The annotation file shares the text file's name, with a .tsv extension.
    tsv_dir = data_path if is_train_file else test_data_path
    tsv_file = tsv_dir+files[files.rfind('/'):-3]+"tsv"
    it+=1
    try:
        tsv_data = pd.read_csv(tsv_file, sep='\t')
    except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError):
        # No usable annotations for this paragraph: skip it.
        print('file not found!')
        continue

    with open(files) as file_obj:
        s = file_obj.read()
    tokens = nlp(s)
    sent = []            # each sent contains each text file as an element

    # Map each Quantity start offset to its end offset (columns 3 and 4 of
    # the tsv). The first row seen for a start offset wins.
    quant_ends = {}
    for k in range(tsv_data.shape[0]):
        if tsv_data.iloc[k, 2] == 'Quantity':
            start = int(tsv_data.iloc[k, 3])
            if start not in quant_ends:
                quant_ends[start] = int(tsv_data.iloc[k, 4])

    ind = 0        # character offset of the current token in `s`
    quant_end = 0  # end offset of the most recently opened quantity span
    for token in tokens:
        word = token.text
        label = 'O'

        ind = s.find(word,ind)
        if ind in quant_ends:
            # Token starts exactly where a quantity span starts.
            quant_end = quant_ends[ind]
            label = 'B'
        elif ind < quant_end:
            # Token starts inside the span opened earlier.
            label = 'I'
        ind = ind + len(word)
        sent.append((word,token.pos_,label,token.lemma_,token.dep_,token.tag_,token.is_stop,token.shape_,token))
    sents.append(sent)

    if is_train_file:
        train_sents.append(sent)
    else:
        # Keep the file path aligned with its entry in test_sents.
        test_file_list.append(files)
        test_sents.append(sent)

# Per-document feature dict sequences and their label sequences.
X_train = list(map(sent2features, train_sents))
y_train = list(map(sent2labels, train_sents))

X_test = list(map(sent2features, test_sents))
y_test = list(map(sent2labels, test_sents))

# L-BFGS training with both L1 (c1) and L2 (c2) regularisation.
crf_settings = {
    'algorithm': 'lbfgs',
    'c1': 0.1,
    'c2': 0.1,
    'max_iterations': 100,
    'all_possible_transitions': True,
}
crf = sklearn_crfsuite.CRF(**crf_settings)
crf.fit(X_train, y_train)

# Score only the non-'O' labels.
labels = list(crf.classes_)
if 'O' in labels:
    labels.remove('O')

y_pred = crf.predict(X_test)
# Preview the predictions for (at most) the first two test documents.
for preview in y_pred[:2]:
    print(preview)
test_sents = list(test_sents)

sc = metrics.flat_f1_score(y_test, y_pred,average='weighted', labels=labels)
print(sc)
# Order labels by everything after their first character, then by that
# first character.
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)

print(metrics.flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3))
# state features

from collections import Counter
def print_state_features(state_features):
    """Print one 'weight label attribute' line per ((attr, label), weight) item."""
    line_template = "%0.6f %-8s %s"
    for key, weight in state_features:
        attr, label = key
        print(line_template % (weight, label, attr))

# Rank the learned state features by weight and show both ends of the ranking.
ranked_state_features = Counter(crf.state_features_).most_common()

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(30))

print("\nTop negative:")
print_state_features(ranked_state_features[-30:])
#print(len(X_test))

Creating the files in specified location

In [4]:
import csv
import os

# Create (or truncate) one submission tsv per test document and write the
# header row into it. The next cell appends the predicted rows.
n=len(y_pred)
submission_dir = "drive/MyDrive/MeasEval-main/eval/submission/"
header = ['docId','annotSet','annotType','startOffset','endOffset','annotId','text','other']
for i in range(n):
    text_path = test_file_list[i]
    # Debug aid: show the predicted tags for one specific document.
    if 'S0012821X13007309-1649' in text_path:
        print(y_pred[i])
    # Document id = text file name without directory and extension.
    doc_id = os.path.splitext(os.path.basename(text_path))[0]
    with open(submission_dir+doc_id+".tsv", 'w', newline='', encoding='utf8') as csvFile:
        csv.writer(csvFile,delimiter='\t').writerow(header)

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'I', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


Writing the results of subtask-1 to the submission files

In [5]:
import csv
import os

# Append one "Quantity" row per predicted span to each document's tsv.
n=len(y_pred)
default="{\"unit\": \"kg\"}"   # placeholder written to the 'other' column
max_annot = []                 # highest annotation number used per document
submission_dir = "drive/MyDrive/MeasEval-main/eval/submission/"
for i in range(n):
    annt=1
    max_annot.append(0)
    text_path = test_file_list[i]
    # Debug aid: show the predicted tags for one specific document.
    if 'S0012821X13007309-1649' in text_path:
        print(y_pred[i])
    with open(text_path) as file_obj:
        s=file_obj.read()
    # Document id = text file name without directory and extension.
    doc_id = os.path.splitext(os.path.basename(text_path))[0]

    words = [entry[0] for entry in test_sents[i]]
    predictions=y_pred[i]

    # Character offset of every token, found by scanning the text once from
    # left to right (each search starts after the previous token).
    starts = []
    cursor = 0
    for w in words:
        found = s.find(w, cursor)
        starts.append(found)
        cursor = found + len(w)

    with open(submission_dir+doc_id+".tsv", 'a', newline='', encoding='utf8') as csvFile:
        csvWriter = csv.writer(csvFile,delimiter='\t')
        j=0
        while j<len(predictions):
            if predictions[j]!="B":
                j+=1
                continue
            # A span runs from the "B" tag up to (not including) the next "O".
            # NOTE(review): a second "B" directly after the first is merged
            # into the same span — confirm this is intended.
            k=j+1
            while k<len(predictions) and predictions[k]!="O":
                k+=1
            startoffset=starts[j]
            endoffset=starts[k-1]+len(words[k-1])
            csvWriter.writerow([doc_id,annt,"Quantity",startoffset,endoffset,"T1-"+str(annt),s[startoffset:endoffset],default])
            max_annot[i] = max(max_annot[i],annt)
            annt+=1
            j=k
# The header row is written when the files are created, so it is not appended
# again here (previously a second header ended up at the bottom of the last file).

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'I', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


67

For testing

In [None]:
# Show the current working directory; the relative paths below depend on it.
!pwd
# NOTE(review): `!cd` presumably runs in its own subshell and does not change
# the notebook's working directory — the commands below still pass paths
# relative to the original directory, so this line looks like a no-op.
!cd drive/MyDrive/MeasEval-main/eval
# Install the evaluation script's requirements.
!pip install -r drive/MyDrive/MeasEval-main/eval/requirements.txt
# Run the evaluation script on the submission folder (-s) against the gold
# tsv folder (-g), both given relative to the -i directory.
!python drive/MyDrive/MeasEval-main/eval/measeval-eval.py -i drive/MyDrive/MeasEval-main/ -s eval/submission/ -g data/eval/tsv/


In [1]:
# Mount Google Drive at /content/drive; presumably this is what makes the
# `drive/MyDrive/...` paths used throughout the notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Subtask-3

In [52]:
import os
import io
import glob
import numpy as np
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel, BertForPreTraining
from sklearn.model_selection import train_test_split
import stanza
import spacy
import re
import en_core_sci_sm
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertModel
from transformers import BertForTokenClassification, AdamW
import torch
import torch.nn as nn
from transformers import BertForTokenClassification, AdamW
from torchcrf import CRF


In [53]:
# Subtask-3 trains on the training annotations but reads its test
# annotations from the submission folder produced in subtask-1.
_measeval_root = "drive/MyDrive/MeasEval-main"
data_path = _measeval_root + "/data/train/tsv"
textdata_path = _measeval_root + "/data/train/text"
test_data_path = _measeval_root + "/eval/submission" # the one obtained in subtask-1 above
test_textdata_path = _measeval_root + "/data/eval/text"

checking the device. GPU should be used for further code

In [54]:
# Use CUDA when torch reports it as available, otherwise the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
device = torch.device(device)

cuda


In [55]:
# WordPiece tokenizer loaded from the 'allenai/scibert_scivocab_uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')

splitting sentence into bert tokens and storing their corresponding entity and property spans

In [None]:
def sen_split(text):
  """Return the text of each sentence found by the module-level `nlp` pipeline."""
  sentences = []
  for span in nlp(text).sents:
    sentences.append(span.text)
  return sentences

# For every Quantity annotation, build a record describing the sentence that
# contains it, plus the MeasuredEntity / MeasuredProperty spans of the same
# annotation set when they lie in that sentence.
#
# Record layout:
#   [sentence text, quantity span, entity span, property span, annotId,
#    sentence start offset, file stem, file path, annotSet]
# Spans are (start, end) offsets relative to the sentence; (-1, -1) = absent.
file_list = glob.glob(textdata_path+"/*")
train_file_size = len(file_list)
file_list+=glob.glob(test_textdata_path+"/*")
it = 0
tuples = []  #stores the (sentence,quantity,measured entity,measured property) tuples (if they exist)
test_tuples = [] #for the test set
nlp1 = en_core_sci_sm.load()
quantity_count = 0  # Quantity rows seen (named so the builtin `sum` is not shadowed)

for files in file_list:
    is_train_file = it < train_file_size
    # The annotation file shares the text file's name, with a .tsv extension.
    tsv_dir = data_path if is_train_file else test_data_path
    tsv_file = tsv_dir+files[files.rfind('/'):-3]+"tsv"
    it+=1
    s_file_name = files[files.rfind('/')+1:-4]
    print(tsv_file)
    try:
        tsv_data = pd.read_csv(tsv_file, sep='\t')
    except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError):
        # No usable annotations for this paragraph: skip it.
        print('file not found!')
        continue

    # Row columns by position: 1=annotSet, 2=annotType, 3=startOffset,
    # 4=endOffset, 5=annotId.
    tsv_list = tsv_data.values
    with open(files) as file_obj:
        s = file_obj.read()

    for row in tsv_list:
        if row[2]=='Quantity':
            quantity_count+=1

    doc = nlp1(s)
    for span in doc.sents:
        sent = span.text
        # Exact character offset of the sentence within the file, so that
        # irregular whitespace between sentences cannot shift later offsets.
        ind = span.start_char
        sent_end = ind+len(sent)
        for i in range(0,len(tsv_list)):
            row = tsv_list[i]
            if not (row[2]=='Quantity' and row[3]>=ind and row[4]<=sent_end):
                continue
            val = [
                sent,
                (row[3]-ind,row[4]-ind),
                (-1,-1),
                (-1,-1),
                row[5],
                ind,
                s_file_name,
                files,
                row[1],
            ]
            # Look at this row and the ones after it for the entity/property
            # of the same annotation set that also lies in this sentence.
            for j in range(i,len(tsv_list)):
                other = tsv_list[j]
                in_sentence = other[3]>=ind and other[4]<=sent_end
                if in_sentence and other[1]==row[1]:
                    if other[2]=='MeasuredEntity':
                        val[2] = (other[3]-ind,other[4]-ind)
                    if other[2]=='MeasuredProperty':
                        val[3] = (other[3]-ind,other[4]-ind)
            if is_train_file:
                tuples.append(val)
            else:
                test_tuples.append(val)
for tup in test_tuples:
  print(tup)
print(len(tuples))
print(quantity_count)

Storing the measured entity labels

In [None]:
#only for measured entity
nlp = spacy.load('en_core_web_sm')

def token_len(s):
  """Return the number of characters in `s` that are not '#'."""
  return len(s) - s.count('#')

def _build_examples(source_tuples, span_slot, echo_file=False):
  """Turn sentence records into [tokens, labels, ...metadata] examples.

  Each record in `source_tuples` is indexed as: 0 sentence text, 1 quantity
  span, `span_slot` target span, 4 annotId, 5 sentence offset, 6 file stem,
  7 file path, 8 annotSet. The sentence is split with the module-level `nlp`
  pipeline and `tokenizer`; every WordPiece gets label 1 when its character
  position lies within the target span (end offset inclusive, as before) and
  0 otherwise. A "$" token with label 0 is inserted before and after the run
  of quantity WordPieces. When `echo_file` is true the record's file path is
  printed.
  """
  examples = []
  for tup in source_tuples:
    sent = tup[0]
    quant_start, quant_end = tup[1]
    span_start, span_end = tup[span_slot]
    if echo_file:
      print(tup[7])

    quan_labels = []
    ent_labels = []
    sent_tok = []
    ind = 0  # character position in `sent` of the piece being labelled
    for token in nlp(sent):
      word = token.text
      ind = sent.find(word,ind)
      bert_tokens = tokenizer.tokenize(word)
      sent_tok.extend(bert_tokens)
      for t in bert_tokens:
        quan_labels.append(1 if quant_start <= ind <= quant_end else 0)
        ent_labels.append(1 if span_start <= ind <= span_end else 0)
        ind+=token_len(t)

    last = len(quan_labels) - 1
    for i, in_quantity in enumerate(quan_labels):
      if in_quantity != 1:
        continue
      # First piece of the quantity run: opening marker goes in front of it.
      if i == 0 or quan_labels[i-1] == 0:
        ent_labels.insert(i, 0)
        sent_tok.insert(i,"$")
      # Last piece of the run: closing marker goes after it. The +2 accounts
      # for the opening marker that was inserted earlier.
      if i == last or quan_labels[i+1] == 0:
        ent_labels.insert(i+2, 0)
        sent_tok.insert(i+2,"$")
    examples.append([sent_tok, ent_labels,tup[4],tup[5],tup[6],tup[7],tup[8]])
  return examples

# Slot 2 of each record holds the measured-entity span.
pretrain = _build_examples(tuples, 2)
validation = _build_examples(test_tuples, 2, echo_file=True)
for ele in validation:
  print((ele[0]))

Storing the labels for measured property spans (skip the cell below when you want to run for measured entity)

In [None]:
#only for measured property
nlp = spacy.load('en_core_web_sm')

def token_len(s):
  """Return the number of characters in `s` that are not '#'."""
  return len(s) - s.count('#')

def _build_examples(source_tuples, span_slot, echo_file=False):
  """Turn sentence records into [tokens, labels, ...metadata] examples.

  Each record in `source_tuples` is indexed as: 0 sentence text, 1 quantity
  span, `span_slot` target span, 4 annotId, 5 sentence offset, 6 file stem,
  7 file path, 8 annotSet. The sentence is split with the module-level `nlp`
  pipeline and `tokenizer`; every WordPiece gets label 1 when its character
  position lies within the target span (end offset inclusive, as before) and
  0 otherwise. A "$" token with label 0 is inserted before and after the run
  of quantity WordPieces. When `echo_file` is true the record's file path is
  printed.
  """
  examples = []
  for tup in source_tuples:
    sent = tup[0]
    quant_start, quant_end = tup[1]
    span_start, span_end = tup[span_slot]
    if echo_file:
      print(tup[7])

    quan_labels = []
    ent_labels = []
    sent_tok = []
    ind = 0  # character position in `sent` of the piece being labelled
    for token in nlp(sent):
      word = token.text
      ind = sent.find(word,ind)
      bert_tokens = tokenizer.tokenize(word)
      sent_tok.extend(bert_tokens)
      for t in bert_tokens:
        quan_labels.append(1 if quant_start <= ind <= quant_end else 0)
        ent_labels.append(1 if span_start <= ind <= span_end else 0)
        ind+=token_len(t)

    last = len(quan_labels) - 1
    for i, in_quantity in enumerate(quan_labels):
      if in_quantity != 1:
        continue
      # First piece of the quantity run: opening marker goes in front of it.
      if i == 0 or quan_labels[i-1] == 0:
        ent_labels.insert(i, 0)
        sent_tok.insert(i,"$")
      # Last piece of the run: closing marker goes after it. The +2 accounts
      # for the opening marker that was inserted earlier.
      if i == last or quan_labels[i+1] == 0:
        ent_labels.insert(i+2, 0)
        sent_tok.insert(i+2,"$")
    examples.append([sent_tok, ent_labels,tup[4],tup[5],tup[6],tup[7],tup[8]])
  return examples

# Slot 3 of each record holds the measured-property span.
pretrain = _build_examples(tuples, 3)
validation = _build_examples(test_tuples, 3, echo_file=True)
for ele in validation:
  print((ele))

In [58]:
# Encode the training examples as fixed-width (256) id / mask / label rows.
# Position 0 holds the special token id 102; sentence tokens follow from 1.
# NOTE(review): this id was previously described as [SEP]; for the SciBERT
# vocabulary it appears to be [CLS] — confirm with tokenizer.cls_token_id.
max_len = 256
train = []
# Arrays are allocated once (float64, as before) instead of being grown by
# one row per example.
x_train_id = np.zeros((len(pretrain),max_len))
x_train_mask = np.zeros((len(pretrain),max_len))
y_train = np.zeros((len(pretrain),max_len))
for row, val in enumerate(pretrain):
  # Sentences that do not fit are truncated to the available positions.
  n_tok = min(len(val[1]), max_len-1)
  padding = [0]*(max_len-1-n_tok)
  tok_arr = [102] + list(tokenizer.convert_tokens_to_ids(val[0][:n_tok])) + padding
  att_mask = [1]*(n_tok+1) + padding
  lab = [0] + list(val[1][:n_tok]) + padding
  x_train_id[row] = tok_arr
  x_train_mask[row] = att_mask
  y_train[row] = lab
  train.append([tok_arr,att_mask, lab])
print(x_train_id.shape[0])

883


In [None]:
# Encode the validation examples as fixed-width (256) id / mask / label rows.
# Position 0 holds the special token id 102; sentence tokens follow from 1.
# NOTE(review): this id was previously described as [SEP]; for the SciBERT
# vocabulary it appears to be [CLS] — confirm with tokenizer.cls_token_id.
max_len = 256
val_data = []
# Arrays are allocated once (float64, as before) instead of being grown by
# one row per example.
x_val_id = np.zeros((len(validation),max_len))
x_val_mask = np.zeros((len(validation),max_len))
y_val = np.zeros((len(validation),max_len))
for row, val in enumerate(validation):
  # Sentences that do not fit are truncated to the available positions.
  n_tok = min(len(val[1]), max_len-1)
  padding = [0]*(max_len-1-n_tok)
  tok_arr = [102] + list(tokenizer.convert_tokens_to_ids(val[0][:n_tok])) + padding
  att_mask = [1]*(n_tok+1) + padding
  lab = [0] + list(val[1][:n_tok]) + padding
  x_val_id[row] = tok_arr
  x_val_mask[row] = att_mask
  y_val[row] = lab
  # Validation rows go into val_data (they used to be added to `train`).
  val_data.append([tok_arr,att_mask, lab])
# Preview one attention mask when there are enough rows to index it.
if len(x_val_mask) > 2:
  print(x_val_mask[2])
print(x_val_id.shape[0])

In [60]:
# Relabel the first token of every labelled run: a 1 directly preceded by a 0
# becomes 2. Only 1s are rewritten, so the "previous is 0" test is unaffected
# by earlier rewrites and the whole update can be done in one vectorised step
# (which also works when y_train has no rows).
span_starts = (y_train[:, :-1] == 0) & (y_train[:, 1:] == 1)
y_train[:, 1:][span_starts] = 2

In [61]:
# Relabel the first token of every labelled run: a 1 directly preceded by a 0
# becomes 2. Only 1s are rewritten, so the "previous is 0" test is unaffected
# by earlier rewrites and the whole update can be done in one vectorised step
# (which also works when y_val has no rows).
span_starts = (y_val[:, :-1] == 0) & (y_val[:, 1:] == 1)
y_val[:, 1:][span_starts] = 2

In [62]:
# Wrap the (ids, mask, labels) arrays of each split as tensors in one dataset.
train_data = TensorDataset(*map(torch.from_numpy, (x_train_id, x_train_mask, y_train)))
val_data = TensorDataset(*map(torch.from_numpy, (x_val_id, x_val_mask, y_val)))

In [None]:
batch_size = 24
# Both loaders keep dataset order (shuffle=False).
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=False)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)
# Preview the labels of the third example of every validation batch.
for _ids, _mask, batch_labels in val_loader:
  print(batch_labels[2])

In [64]:
# Load the tokenizer and the encoder from the same
# 'allenai/scibert_scivocab_uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
model = BertModel.from_pretrained('allenai/scibert_scivocab_uncased')

Bert Architecture 

In [72]:
class BERT_Arch(nn.Module):
    """BERT encoder followed by a linear tagging head and a CRF layer.

    Every token's hidden state is concatenated with a projection of the
    first position's hidden state before being mapped to `out_dim` tag scores.
    """

    def __init__(self, bert, embed_dim, hidden_dim, drop_prob, n_layers, out_dim):
        """Create the tagging head.

        bert:       encoder whose output exposes `last_hidden_state`.
        embed_dim:  size of the encoder's hidden states.
        hidden_dim: unused; kept so existing calls keep working.
        drop_prob:  dropout probability applied before the output layer.
        n_layers:   unused; kept so existing calls keep working.
        out_dim:    number of tags (output layer width and CRF size).
        """
        super(BERT_Arch, self).__init__()
        self.bert = bert
        self.dropout = nn.Dropout(drop_prob)
        self.fc1 = nn.Linear(2*embed_dim,out_dim)
        self.w1 = nn.Linear(embed_dim, embed_dim)
        self.w2 = nn.Linear(embed_dim, embed_dim)
        self.softmax = nn.LogSoftmax(dim = 2)
        # The CRF is sized from out_dim so it always matches fc1's output.
        self.crf = CRF(out_dim, batch_first=True)
        self.tanh = nn.Tanh()

    #define the forward pass
    def forward(self, sent_id, mask_val, labels=None):
        """Return the CRF loss when `labels` is given, else decoded tag paths.

        sent_id:  token ids passed to the encoder.
        mask_val: attention mask; also used (as uint8) as the CRF mask.
        labels:   gold tag ids; when None the model decodes instead.
        """
        x = self.bert(sent_id, attention_mask=mask_val)
        x = x.last_hidden_state
        x = self.tanh(x)
        # Project the first position's state and broadcast it over the actual
        # sequence length of this batch.
        cls = self.w1(x[:,0,:])
        cls = cls.unsqueeze(1).expand(-1, x.size(1), -1)
        x = self.w2(x)
        x = torch.cat([x,cls], dim = 2)
        x = self.dropout(x)
        x = self.fc1(x)
        mask_val = mask_val.type(torch.uint8)
        logit = self.softmax(x)
        if labels is not None:
            # The CRF returns a log-likelihood; negate it to get a loss.
            loss = -self.crf(logit, labels, mask=mask_val, reduction='mean')
            return loss
        # Decode from the same scores that are used for training.
        prediction = self.crf.decode(logit, mask=mask_val)
        return prediction

In [73]:
# Arguments in order: encoder, embed_dim=768, hidden_dim=64, drop_prob=0.1,
# n_layers=1, out_dim=3.
bert_model = BERT_Arch(model, 768, 64, 0.1, 1,3)
# Move the model to the selected device.
bert_model = bert_model.to(device)

In [None]:
# Show the model's module structure.
print(bert_model)

In [74]:
# Adam over all model parameters (encoder included) with learning rate 1e-5.
optimizer = torch.optim.Adam(bert_model.parameters(), lr=1e-5)

Training the model 

In [None]:
epochs = 10
for e in range(epochs):

  # ---- training pass ----
  bert_model.train()
  train_loss=0.0
  train_seen=0
  for i, (seq, mask, y) in enumerate(train_loader):
    bert_model.zero_grad()
    loss = bert_model(seq.long().to(device), mask.long().to(device), y.long().to(device))
    # Weight each batch by its real size so a short final batch is not
    # over-counted in the epoch average.
    train_loss += loss.item()*seq.size(0)
    train_seen += seq.size(0)
    loss.backward()
    # Clip the gradient norm to 1.0 before the update.
    torch.nn.utils.clip_grad_norm_(bert_model.parameters(), 1.0)
    optimizer.step()
    if(i%5==0):
      print("Epoch-{}/{} Iterations-{} loss-{}".format(e+1,epochs,i+1,loss.item()))

  # ---- validation pass (no gradients needed) ----
  bert_model.eval()
  val_loss=0.0
  val_seen=0
  with torch.no_grad():
    for seq, mask, y in val_loader:
      loss = bert_model(seq.long().to(device), mask.long().to(device), y.long().to(device))
      val_loss += loss.item()*seq.size(0)
      val_seen += seq.size(0)

  # Report the mean loss per example; max(..., 1) avoids dividing by zero
  # when a loader is empty.
  print("Epoch-{}/{} train_loss-{} Val_loss-{}".format(e+1,epochs,train_loss/max(train_seen,1),val_loss/max(val_seen,1)))

Creates measured entity entries in the output files

In [None]:
#for measured entity
import csv

def remove_hash(s):
  """Return `s` with every '#' character removed."""
  return s.replace('#', '')

submission_dir = "drive/MyDrive/MeasEval-main/eval/submission/"
k1 = 0          # index in `validation` of the first example of the batch
annot = 1       # running annotation number, restarted for every file
print('hi')
file_name = ''
bert_model.eval()
for seq, mask, y in val_loader:
    # Inference only: no gradients are needed.
    with torch.no_grad():
        y_pred = bert_model(seq.long().to(device), mask.long().to(device))
    print(y_pred)
    # val_loader is not shuffled, so batch row r belongs to validation[k1+r].
    for i in range(k1,min(len(validation),k1+len(y_pred))):
        tags = y_pred[i-k1]
        ind = validation[i][3]          # sentence start offset in the file
        s_file_name = validation[i][4]
        previous_file = file_name
        file_name = validation[i][5]
        annot_set = validation[i][6]
        if(previous_file!=file_name):
            print('annot is 1')
            annot = 1
        # Offsets are searched in the case-folded text because the tokens
        # come from an uncased tokenizer.
        with open(file_name,'r') as fl:
            s = fl.read().casefold()
        tsv_path = submission_dir+s_file_name+".tsv"
        with open(tsv_path,'r', newline='', encoding='utf8') as f2:
            s2 = f2.read()
        startOffset = -1
        endOffset = -1
        pos = ind
        cnt = 0
        in_flag = 0

        for words in validation[i][0]:
            # Skip the "$" markers and bracketed special tokens such as [UNK].
            if(words == '$'or (words[0]=='[' and words[-1]==']')):
                cnt+=1
                continue
            out = remove_hash(words)
            # +1 because position 0 of the model input is the special token.
            tag = int(tags[cnt+1])
            if (tag>0 and in_flag==0):
                in_flag = 1
                startOffset = s.find(out,pos)
            elif (tag==0 and in_flag==1):
                endOffset = s.find(out,pos)
                print('endOffset: ',startOffset,endOffset,pos,out)
                in_flag = 0
                print('len(s2): ',len(s2))
            cnt+=1
            # Advance the search position past this piece; if it cannot be
            # found, or is found too far ahead, fall back to a bounded step.
            pos1 = pos
            pos = s.find(out,pos)
            if(pos==-1):
                pos = ind
            if(pos>pos1+1):
                pos=pos1+1
            pos+=len(out)
        if (endOffset<startOffset):
            # The span was still open at the end of the sentence.
            endOffset = pos
        # len(s2)>80: only add rows to files that hold more than the header.
        if(startOffset+1>0 and startOffset<endOffset and len(s2)>80):
            with open(tsv_path, 'a', newline='', encoding='utf8') as csvFile:
                csvWriter = csv.writer(csvFile,delimiter='\t')
                csvWriter.writerow([s_file_name,annot_set,"MeasuredEntity",startOffset,endOffset,'T2-'+str(annot),s[startOffset:endOffset],"{\"HasQuantity\": \"T1-"+str(annot_set)+"\"}"])
            annot+=1

    k1+=len(y_pred)

For measured property

In [None]:
#for measured property
import csv

def remove_hash(s):
  """Return `s` with every '#' character removed."""
  return s.replace('#', '')

submission_dir = "drive/MyDrive/MeasEval-main/eval/submission/"
k1 = 0          # index in `validation` of the first example of the batch
annot = 1       # running annotation number, restarted for every file
print('hi')
file_name = ''
bert_model.eval()
for seq, mask, y in val_loader:
    # Inference only: no gradients are needed.
    with torch.no_grad():
        y_pred = bert_model(seq.long().to(device), mask.long().to(device))
    print(y_pred)
    # val_loader is not shuffled, so batch row r belongs to validation[k1+r].
    for i in range(k1,min(len(validation),k1+len(y_pred))):
        tags = y_pred[i-k1]
        ind = validation[i][3]          # sentence start offset in the file
        s_file_name = validation[i][4]
        previous_file = file_name
        file_name = validation[i][5]
        annot_set = validation[i][6]
        if(previous_file!=file_name):
            print('annot is 1')
            annot = 1
        # Offsets are searched in the case-folded text because the tokens
        # come from an uncased tokenizer.
        with open(file_name,'r') as fl:
            s = fl.read().casefold()
        tsv_path = submission_dir+s_file_name+".tsv"
        with open(tsv_path,'r', newline='', encoding='utf8') as f2:
            s2 = f2.read()
        startOffset = -1
        endOffset = -1
        pos = ind
        cnt = 0
        in_flag = 0

        for words in validation[i][0]:
            # Skip the "$" markers and bracketed special tokens such as [UNK].
            if(words == '$'or (words[0]=='[' and words[-1]==']')):
                cnt+=1
                continue
            out = remove_hash(words)
            # +1 because position 0 of the model input is the special token.
            tag = int(tags[cnt+1])
            if (tag>0 and in_flag==0):
                in_flag = 1
                startOffset = s.find(out,pos)
            elif (tag==0 and in_flag==1):
                endOffset = s.find(out,pos)
                in_flag = 0
            cnt+=1
            # Advance the search position past this piece; if it cannot be
            # found, or is found too far ahead, fall back to a bounded step.
            pos1 = pos
            pos = s.find(out,pos)
            if(pos==-1):
                pos = ind
            if(pos>pos1+1):
                pos=pos1+1
            pos+=len(out)
        if (endOffset<startOffset):
            # The span was still open at the end of the sentence.
            endOffset = pos
        # len(s2)>80: only add rows to files that hold more than the header.
        if(startOffset+1>0 and startOffset<endOffset and len(s2)>80):
            with open(tsv_path, 'a', newline='', encoding='utf8') as csvFile:
                csvWriter = csv.writer(csvFile,delimiter='\t')
                csvWriter.writerow([s_file_name,annot_set,"MeasuredProperty",startOffset,endOffset,'T3-'+str(annot),s[startOffset:endOffset],"{\"HasQuantity\": \"T1-"+str(annot_set)+"\"}"])
            annot+=1

    k1+=len(y_pred)