In [None]:
#### Clone the training data repository (git creates a "training-data" directory in the current working directory)
!git clone https://github.com/ncg-task/training-data

In [None]:
#### Clone the validation (trial) data repository (git creates a "trial-data" directory in the current working directory)
!git clone https://github.com/ncg-task/trial-data.git

In [None]:
#### Load the required packages and setting the seed
import os
import copy
import random
import numpy as np
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import time
seed_val = 66
#Seed every random number generator this notebook relies on (Python, NumPy, PyTorch CPU and CUDA) with the same value
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
  _seed_fn(seed_val)

In [None]:
#### Training dataset reading
input_dir = "/content/training-data/" #Point it to the directory where your Training Data is present
list_of_folders = ["query_wellformedness", "passage_re-ranking", "part-of-speech_tagging", 
         "sentence_compression", "sentiment_analysis", "temporal_information_extraction", 
         "phrase_grounding", "text_generation", "text-to-speech_synthesis", 
         "smile_recognition", "topic_models", "question_generation", 
         "relation_extraction", "paraphrase_generation", "question_similarity", 
         "question_answering", "sentence_classification", "prosody_prediction", 
         "semantic_role_labeling", "text_summarization", "semantic_parsing", 
         "sarcasm_detection", "natural_language_inference", "negation_scope_resolution"] #List all the folders present in the directory
input_stanza_list = [] #Stores individual lines from Stanza_out.txt file
input_sent_num_list = [] #Stores individual lines from sentences.txt file
input_entity_list = [] #Stores the list of all phrases from entities.txt
file_name_list = [] #Stores the name of individual files
total_phrases_truth = 0
#NOTE(review): the three input_* lists are indexed in parallel by later cells, so an article
#directory that lacks one of the three files would silently misalign them.
for fls in list_of_folders:
  count=0
  #os.listdir returns entries in arbitrary order; sorting keeps the data order reproducible across runs
  for i in sorted(os.listdir(input_dir + fls + '/')):
    count=count+1
    article_dir = input_dir + fls + '/' + str(i)
    for files in os.listdir(article_dir):
      file_path = article_dir + '/' + files
      if files.endswith("Stanza-out.txt"):
        with open(file_path, "r") as stanza_file: #Context manager closes the handle even on errors
          stanza_lines = stanza_file.read().lower()
        #Only truly empty lines are dropped (before stripping), so the line positions that
        #sentences.txt refers to stay exactly as they were.
        input_stanza_list.append([x.strip() for x in stanza_lines.splitlines() if x])
      if files.endswith("sentences.txt"):
        with open(file_path, "r") as sentence_file:
          sentence_num_list = [x.strip() for x in sentence_file.read().splitlines()]
        #Strip first and then drop blanks, so a whitespace-only line cannot reach int()
        input_sent_num_list.append([int(x) for x in sentence_num_list if x])
      if files.endswith("entities.txt"):
        with open(file_path, "r") as entities_file:
          entities_list = [x.strip() for x in entities_file.read().lower().splitlines()]
        entities_list = [x for x in entities_list if x] #Whitespace-only lines are not phrases
        input_entity_list.append(entities_list)
        total_phrases_truth = total_phrases_truth + len(entities_list)
    file_name_list.append(fls + '/' + str(i))
  print("completed",fls,total_phrases_truth, count)


In [None]:
#### Validation dataset reading
val_input_dir = "/content/trial-data/" #Point it to the directory where your Validation Data is present
val_list_of_folders = ["machine-translation", "named-entity-recognition", "question-answering",
         "relation-classification", "text-classification"] #List all the folders present in the directory
val_input_stanza_list = [] #Stores individual lines from Stanza_out.txt file
val_input_sent_num_list = [] #Stores individual lines from sentences.txt file
val_input_entity_list = [] #Stores the list of all phrases from entities.txt
val_file_name_list = [] #Stores the name of individual files
val_total_phrases_truth = 0
#NOTE(review): the three val_input_* lists are indexed in parallel by later cells, so an article
#directory that lacks one of the three files would silently misalign them.
for fls in val_list_of_folders:
  count=0
  #os.listdir returns entries in arbitrary order; sorting keeps the data order reproducible across runs
  for i in sorted(os.listdir(val_input_dir + fls + '/')):
    count=count+1
    article_dir = val_input_dir + fls + '/' + str(i)
    for files in os.listdir(article_dir):
      file_path = article_dir + '/' + files
      if files.endswith("Stanza-out.txt"):
        with open(file_path, "r") as stanza_file: #Context manager closes the handle even on errors
          stanza_lines = stanza_file.read().lower()
        #Only truly empty lines are dropped (before stripping), so the line positions that
        #sentences.txt refers to stay exactly as they were.
        val_input_stanza_list.append([x.strip() for x in stanza_lines.splitlines() if x])
      if files.endswith("sentences.txt"):
        with open(file_path, "r") as sentence_file:
          sentence_num_list = [x.strip() for x in sentence_file.read().splitlines()]
        #Strip first and then drop blanks, so a whitespace-only line cannot reach int()
        val_input_sent_num_list.append([int(x) for x in sentence_num_list if x])
      if files.endswith("entities.txt"):
        with open(file_path, "r") as entities_file:
          entities_list = [x.strip() for x in entities_file.read().lower().splitlines()]
        entities_list = [x for x in entities_list if x] #Whitespace-only lines are not phrases
        val_input_entity_list.append(entities_list)
        val_total_phrases_truth = val_total_phrases_truth + len(entities_list)
    val_file_name_list.append(fls + '/' + str(i))
  print("completed",fls,val_total_phrases_truth,count)

In [None]:
#### Builds the space-separated BILUO label string for a phrase made of n words
def make_substring(n):
  """Return the BILUO labels for an n-word phrase as one space-separated string.

  0 words give '', 1 word gives 'U', 2 words give 'B L', and longer phrases
  give 'B', then n-2 times 'I', then 'L'.
  """
  short_phrases = {0: '', 1: 'U', 2: 'B L'}
  if n in short_phrases:
    return short_phrases[n]
  return 'B ' + 'I ' * (n - 2) + 'L'

In [None]:
#### Here we replace contribution sentences in Training data with BILUO labels using the helper function make_substring  
task2_in = [] #Stores the contribution sentences
task2_label = [] #Stores the BILUO labelling of contribution sentences

for i in range(len(input_stanza_list)): #Process each file for labels
  entity_list = [j.split('\t') for j in input_entity_list[i]] #Split the entities line
  entity_list.sort(key=lambda x: (int(x[0]),int(x[1]))) #Phrases arrangement in ascending order according to their sentence numbers and their starting character numbers
  sent_num_list = sorted(input_sent_num_list[i]) #Sorted copy of the sentence numbers
  sent_list = [input_stanza_list[i][x-1] for x in sent_num_list] #Contribution sentences (sentence numbers are 1-based line positions)

  #Whitespace tokens of every contribution sentence, keyed by sentence number
  sent_tokens_by_num = {num: sent.split() for num, sent in zip(sent_num_list, sent_list)}

  for n ,ind_s ,ind_e, ph in entity_list: #BILUO Label Formation using phrases stored in entity_list
    tokens = sent_tokens_by_num.get(int(n))
    ph_tokens = ph.split()
    if tokens is None or not ph_tokens: #Phrase of a non-contribution sentence, or an empty phrase
      continue
    width = len(ph_tokens)
    #Replace the first run of whole tokens equal to the phrase. Matching whole tokens
    #(instead of a raw substring) stops a phrase from being replaced inside a longer word.
    #Tokens that were already labelled are upper case and the text is lower case, so they never match.
    for start in range(len(tokens) - width + 1):
      if tokens[start:start + width] == ph_tokens:
        tokens[start:start + width] = make_substring(width).split()
        break

  #Built from sent_num_list (not from the dictionary values) so that the labels stay aligned
  #one-to-one with sent_list even when a sentence number is listed twice
  sent_label_list = [' '.join(sent_tokens_by_num[num]) for num in sent_num_list]
  task2_in.append(sent_list)
  task2_label.append(sent_label_list)

In [None]:
#### Here we replace contribution sentences in Validation data with BILUO labels using the helper function make_substring  
val_task2_in = [] #Stores the contribution sentences
val_task2_label = [] #Stores the BILUO labelling of contribution sentences
for i in range(len(val_input_stanza_list)): #Process each file for labels
  val_entity_list = [j.split('\t') for j in val_input_entity_list[i]] # Split the entities line
  val_entity_list.sort(key=lambda x: (int(x[0]),int(x[1]))) #Phrases arrangement in ascending order according to their sentence numbers and their starting character numbers
  val_sent_num_list = sorted(val_input_sent_num_list[i]) #Sorted copy of the sentence numbers
  val_sent_list = [val_input_stanza_list[i][x-1] for x in val_sent_num_list] #Contribution sentences (sentence numbers are 1-based line positions)

  #Whitespace tokens of every contribution sentence, keyed by sentence number
  val_sent_tokens_by_num = {num: sent.split() for num, sent in zip(val_sent_num_list, val_sent_list)}

  for n ,ind_s ,ind_e, ph in val_entity_list: #BILUO Label Formation using phrases stored in val_entity_list
    tokens = val_sent_tokens_by_num.get(int(n))
    ph_tokens = ph.split()
    if tokens is None or not ph_tokens: #Phrase of a non-contribution sentence, or an empty phrase
      continue
    width = len(ph_tokens)
    #Replace the first run of whole tokens equal to the phrase. Matching whole tokens
    #(instead of a raw substring) stops a phrase from being replaced inside a longer word.
    #Tokens that were already labelled are upper case and the text is lower case, so they never match.
    for start in range(len(tokens) - width + 1):
      if tokens[start:start + width] == ph_tokens:
        tokens[start:start + width] = make_substring(width).split()
        break

  #Built from val_sent_num_list (not from the dictionary values) so that the labels stay aligned
  #one-to-one with val_sent_list even when a sentence number is listed twice
  val_sent_label_list = [' '.join(val_sent_tokens_by_num[num]) for num in val_sent_num_list]
  val_task2_in.append(val_sent_list)
  val_task2_label.append(val_sent_label_list)

In [None]:
#### Every word of the Training label strings that is not one of B/I/L/U becomes the "O" token
for file_labels in task2_label:
  for sent_pos, label_line in enumerate(file_labels):
    relabelled = label_line
    for word in label_line.split():
      if word in ('B', 'I', 'L', 'U'):
        continue
      #Replace the first remaining occurrence of this word, left to right
      relabelled = relabelled.replace(word, 'O', 1)
    file_labels[sent_pos] = relabelled

In [None]:
#### Every word of the Validation label strings that is not one of B/I/L/U becomes the "O" token
for val_file_labels in val_task2_label:
  for sent_pos, label_line in enumerate(val_file_labels):
    relabelled = label_line
    for word in label_line.split():
      if word in ('B', 'I', 'L', 'U'):
        continue
      #Replace the first remaining occurrence of this word, left to right
      relabelled = relabelled.replace(word, 'O', 1)
    val_file_labels[sent_pos] = relabelled

In [None]:
#### Helper Functions

def argmax(vec):
    """Return, as a Python int, the index of the largest entry along dim 1 of a one-row tensor."""
    best_index = torch.max(vec, 1)[1]
    return best_index.item()

#The function uses SciBERT tokenizer to prepare the sentence to be fed into SciBERT 
def prepare_sequence(seq):
    """Encode a list of words with the module-level `tokenizer`.

    A word that the tokenizer splits into several sub-tokens is represented by
    its first sub-token only (same as in NER paper); all other words are kept
    unchanged. The resulting words are joined with single spaces and encoded
    with return_tensors="pt".

    The list passed in is left untouched; earlier the words were overwritten
    in place, which changed the caller's list.
    """
    first_pieces = []
    for word in seq:
        pieces = tokenizer.tokenize(word)
        first_pieces.append(pieces[0] if len(pieces) > 1 else word)
    return tokenizer(" ".join(first_pieces), return_tensors="pt")


def log_sum_exp(vec):
    """Numerically stable log(sum(exp(vec))) of a one-row tensor, used by the forward algorithm.

    The largest entry is subtracted before exponentiating and added back afterwards.
    """
    peak = vec[0, argmax(vec)]
    shifted = vec - peak.view(1, -1).expand(1, vec.size()[1])
    return peak + torch.log(torch.exp(shifted).sum())

In [None]:
#### Define the Model 

class BiLSTM_CRF(nn.Module):
    """Sequence tagger: SciBERT hidden states -> BiLSTM -> linear layer -> CRF.

    `tag_to_ix` maps tag names to indices and must contain the module-level
    START_TAG and STOP_TAG names. `embedding_dim` is the LSTM input size and
    `hidden_dim` the total BiLSTM output size (hidden_dim // 2 per direction).

    All work tensors are created on the device of the model parameters or of
    the incoming tensors, so the class does not rely on a global `device`.
    """

    def __init__(self, tag_to_ix, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)

        #Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        #Pretrained SciBert downloaded from allenai
        self.modell = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased")

        #Matrix of transition parameters. Entry i,j is the score of transitioning to i from j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        #These two statements enforce the constraint that we never transfer to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return a fresh random (h0, c0) pair for the BiLSTM on the model's device.

        NOTE(review): the state is random on every call, also in eval mode, so
        predictions are not strictly deterministic - confirm this is intended.
        """
        target = self.transitions.device
        #Sampled on the CPU and then moved, so the random number stream matches the previous behaviour
        return (torch.randn(2, 1, self.hidden_dim // 2).to(target),
                torch.randn(2, 1, self.hidden_dim // 2).to(target))

    def _forward_alg(self, feats):
        """Forward algorithm: log of the partition function over all tag paths.

        `feats` holds one row of emission scores per token. Returns a 0-dim tensor.
        """
        init_alphas = torch.full((1, self.tagset_size), -10000., device=feats.device)
        #START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        forward_var = init_alphas

        #Iterate through the sentence
        for feat in feats:
            #scores[next_tag, prev_tag] = alpha[prev_tag] + transition(prev -> next) + emission[next_tag].
            #All next tags are handled in one tensor operation instead of a Python loop.
            scores = forward_var + self.transitions + feat.view(-1, 1)
            #The forward variable of each tag is the log-sum-exp over the previous tags
            forward_var = torch.logsumexp(scores, dim=1).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = torch.logsumexp(terminal_var, dim=1).squeeze(0)
        return alpha

    def _get_lstm_features(self, sentence):
        """Run the encoded sentence through SciBERT and the BiLSTM and map the result into tag space.

        `sentence` is the tokenizer output for a single sentence; one row of
        tag scores is returned per input id.
        """
        outputs = self.modell(**sentence, output_hidden_states = True)
        #Index 2 of the output is read as the per-layer hidden states and entry 12 of those is used
        #(presumably the last encoder layer - TODO confirm); [0] selects the only sequence in the batch
        scibert_out = ((outputs[2][12])[0]).view(len(sentence["input_ids"][0]), 1, -1)
        self.hidden = self.init_hidden()
        lstm_out, self.hidden = self.lstm(scibert_out, self.hidden)
        lstm_out = lstm_out.view(len(sentence["input_ids"][0]), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        """Return the score (shape (1,)) of the given tag sequence for the emissions `feats`."""
        score = torch.zeros(1, device=feats.device)
        start = torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long, device=tags.device)
        tags = torch.cat([start, tags])
        for i, feat in enumerate(feats):
            #Transition into tag i+1 from tag i, plus the emission score of tag i+1
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        """Viterbi decoding: return (best path score, best tag index sequence)."""
        backpointers = []

        #Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000., device=feats.device)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        #forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            #scores[next_tag, prev_tag] is the viterbi variable of prev_tag plus the score of moving
            #from prev_tag to next_tag. Emission scores are added afterwards because the max over
            #the previous tags does not depend on them.
            scores = forward_var + self.transitions
            best_scores, best_prev = torch.max(scores, dim=1)
            backpointers.append(best_prev.tolist())
            forward_var = (best_scores + feat).view(1, -1)

        #Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = torch.max(terminal_var, 1)[1].item()
        path_score = terminal_var[0][best_tag_id]

        #Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        #Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  #Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        """The loss function: log partition function minus the score of the gold tag sequence."""
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):
        """Return (score, tag index sequence) of the best path for the encoded sentence."""
        #Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        #Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

In [None]:
#### Model hyperparameters and the names of the two special CRF tags
START_TAG = "<START>"
STOP_TAG = "<STOP>"
EMBEDDING_DIM = 768 #Input size of the LSTM; SciBERT encodes every token of a sentence as a 768-dimensional vector.
HIDDEN_DIM = 200 #Total output size of the BiLSTM (each direction uses HIDDEN_DIM // 2 units)

In [None]:
#### Lookup tables between the BILUO labels (plus the two special CRF tags) and their index numbers
_tag_order = ("B", "I", "L", "U", "O", START_TAG, STOP_TAG)
tag_to_ix = {tag: index for index, tag in enumerate(_tag_order)}
ix_to_tag = dict(enumerate(_tag_order))

In [None]:
#### Allenai SciBERT: install the transformers library and load the SciBERT tokenizer
#AutoModel is imported here as well because the BiLSTM_CRF class loads the SciBERT weights with it
!pip install transformers
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")

In [None]:
#### Initialize the Model and Optimizer
device = 'cuda' if torch.cuda.is_available() else 'cpu'
#The model is moved to its device BEFORE the optimizer is created, as the torch.optim
#documentation recommends, so the optimizer is built on the parameters that are actually trained.
model = BiLSTM_CRF(tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM).to(device)
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

In [None]:
#### Phrase-level comparison of predicted label sequences (listp) against ground truth label sequences (listg)

def calc_eval(listp, listg):
  """Return (correct, total_p) for parallel lists of label sequences.

  total_p counts every predicted 'U' or 'B', i.e. every predicted phrase.
  correct counts a 'U' predicted where the truth is 'U' (outside an open
  phrase), and a phrase opened by a matching 'B' that stays in agreement
  with the truth until a matching 'L' closes it.
  """
  correct = 0
  total_p = 0
  for pred_seq, gold_seq in zip(listp, listg):
    inside_phrase = False #True while a phrase opened by a matching 'B' is still in agreement
    for pred, gold in zip(pred_seq, gold_seq):
      if pred in ('U', 'B'):
        total_p += 1
      if not inside_phrase:
        if pred == 'U' and gold == 'U':
          correct += 1
        elif pred == 'B' and gold == 'B':
          inside_phrase = True
      elif pred == 'L' and gold == 'L':
        correct += 1
        inside_phrase = False
      elif pred != gold:
        inside_phrase = False #Disagreement inside the phrase: it can no longer be counted
  return correct, total_p

In [None]:
#### Precision, recall and F1 score of the model on the training data (phrase counting is done by calc_eval)
def training_eval():
  """Predict labels for every training sentence and print precision, recall and F1 score.

  Reads the notebook globals task2_in, task2_label, total_phrases_truth,
  model, device and ix_to_tag; prints its results and returns nothing.
  """
  #task2_out_label stores the prediction labels made by the model
  task2_out_label = []
  for file_sentences in task2_in:
    file_predictions = []
    for sentence in file_sentences:
      with torch.no_grad():
        encoded = prepare_sequence(sentence.split()).to(device)
        predicted_ids = model(encoded)[1]
        file_predictions.append([ix_to_tag[t] for t in predicted_ids])
    task2_out_label.append(file_predictions)
  print(len(task2_out_label))

  true_pos = 0 #All correct predictions
  total_ph_pred = 0 #All phrases prediction made(BILU)
  for file_idx, file_labels in enumerate(task2_label):
    #The truth is padded with "O" on both sides to line up with the two extra predicted positions
    padded_truth = [["O"] + labels.split() + ["O"] for labels in file_labels]
    file_true_pos, file_total_ph_pred = calc_eval(task2_out_label[file_idx], padded_truth)
    true_pos += file_true_pos
    total_ph_pred += file_total_ph_pred
    if file_idx % 20 == 0:
      print("Each example",file_idx,file_true_pos,file_total_ph_pred)
  print(true_pos,total_ph_pred,total_phrases_truth)

  #Every ratio falls back to 0 when its denominator is 0
  precision = true_pos/total_ph_pred if total_ph_pred != 0 else 0
  recall = true_pos/total_phrases_truth if total_phrases_truth != 0 else 0
  F1score = 2 * precision*recall/(precision+recall) if (precision + recall) != 0 else 0
  print("Precision is {} and recall is {} and F1 Score is {}".format(precision,recall,F1score))

In [None]:
#### Precision, recall and F1 score of the model on the validation data (phrase counting is done by calc_eval)
def validation_eval():
  """Predict labels for every validation sentence and print precision, recall and F1 score.

  Reads the notebook globals val_task2_in, val_task2_label,
  val_total_phrases_truth, model, device and ix_to_tag; prints its results
  and returns nothing.
  """
  #val_task2_out_label stores the prediction labels made by the model
  val_task2_out_label = []
  for file_sentences in val_task2_in:
    file_predictions = []
    for sentence in file_sentences:
      with torch.no_grad():
        encoded = prepare_sequence(sentence.split()).to(device)
        predicted_ids = model(encoded)[1]
        file_predictions.append([ix_to_tag[t] for t in predicted_ids])
    val_task2_out_label.append(file_predictions)
  print(len(val_task2_out_label))

  val_true_pos = 0  #All correct predictions
  val_total_ph_pred = 0 #All phrases prediction made(BILU)
  for file_idx, file_labels in enumerate(val_task2_label):
    #The truth is padded with "O" on both sides to line up with the two extra predicted positions
    padded_truth = [["O"] + labels.split() + ["O"] for labels in file_labels]
    val_file_true_pos, val_file_total_ph_pred = calc_eval(val_task2_out_label[file_idx], padded_truth)
    val_true_pos += val_file_true_pos
    val_total_ph_pred += val_file_total_ph_pred
    print("Each example",file_idx,"true pos",val_file_true_pos,"phrase in pred",val_file_total_ph_pred)
  print(val_true_pos,val_total_ph_pred,val_total_phrases_truth)

  #Every ratio falls back to 0 when its denominator is 0
  val_precision = val_true_pos/val_total_ph_pred if val_total_ph_pred != 0 else 0
  val_recall = val_true_pos/val_total_phrases_truth if val_total_phrases_truth != 0 else 0
  val_F1score = 2 * val_precision*val_recall/(val_precision+val_recall) if (val_precision + val_recall) != 0 else 0
  print("Precision is {} and recall is {} and F1 Score is {}".format(val_precision,val_recall,val_F1score))

In [None]:
#### The MAIN TRAINING CELL

#Sanity check before training: predictions for the first training sentence next to its ground truth
with torch.no_grad():
  precheck_sent = prepare_sequence(task2_in[0][0].split()).to(device) #Tokenize the sentence before passing it into SciBERT
  first_gold_ids = [tag_to_ix[t] for t in task2_label[0][0].split()]
  #The ground truth is framed by index 4 (the "O" entry of tag_to_ix) on both sides
  precheck_tags = torch.tensor([4] + first_gold_ids + [4], dtype=torch.long).to(device)
  print(precheck_tags)
  print(model(precheck_sent),'\n',precheck_tags)
  print("checkpoint passed")


for epoch in range(1): #Decide the number of epochs
  start = time.time() #Starting time measured
  model.train() #Model in training mode

  #One optimizer step per sentence
  for file_idx, file_sentences in enumerate(task2_in):
    for sent_idx, sentence in enumerate(file_sentences):
      model.zero_grad() #Zero gradients

      sentence_in = prepare_sequence(sentence.split()).to(device)
      gold_ids = [tag_to_ix[t] for t in task2_label[file_idx][sent_idx].split()]
      targets = torch.tensor([4] + gold_ids + [4], dtype=torch.long).to(device)

      loss = model.neg_log_likelihood(sentence_in, targets)
      loss.backward()
      optimizer.step()

  end = time.time() #Ending time measured

  #Saving the Model, note that it will be very heavy to store(approx 400 Mb)
  if epoch%1 == 0:
    torch.save(model,"/content/mod" + str(epoch) + ".pt")

  print("The epoch completed is",epoch, "and time", end-start)

  #Metrics and validation loss
  if epoch%1 ==0:
    model.eval() #In evaluation mode
    training_eval() #Training metrics for every epoch
    validation_eval() #Validation metrics for every epoch
    val_total_loss = 0.
    with torch.no_grad():
      for val_file_idx, val_file_sentences in enumerate(val_task2_in):
        for val_sent_idx, val_sentence in enumerate(val_file_sentences):
          val_sentence_in = prepare_sequence(val_sentence.split()).to(device)
          val_gold_ids = [tag_to_ix[t] for t in val_task2_label[val_file_idx][val_sent_idx].split()]
          val_targets = torch.tensor([4] + val_gold_ids + [4], dtype=torch.long).to(device)
          loss = model.neg_log_likelihood(val_sentence_in, val_targets)
          val_total_loss += loss.item()
    print("validation loss after epoch",epoch," is", val_total_loss)


#Simple check: predictions for the first training sentence after training
with torch.no_grad():
  precheck_sent = prepare_sequence(task2_in[0][0].split()).to(device)
  print(model(precheck_sent))

In [None]:
#### Load the saved model and put it into evaluation mode 
#map_location lets a checkpoint that was saved on a GPU be restored on a CPU-only runtime.
#The file holds a whole pickled model (not just weights), so weights_only has to be switched
#off on PyTorch versions that have the flag. Unpickling can run arbitrary code: only load
#checkpoints that you created yourself.
try:
  model = torch.load("/content/mod0.pt", map_location=device, weights_only=False)
except TypeError: #PyTorch releases that predate the weights_only argument
  model = torch.load("/content/mod0.pt", map_location=device)
model.eval()

In [None]:
#### Test dataset reading

#Point it to the directory where your Test Data is present
test_input_dir = "/content/drive/MyDrive/sub1_ph2/"

test_list_of_folders = ["constituency_parsing","coreference_resolution",
                   "data-to-text_generation","dependency_parsing",
                   "document_classification","entity_linking",
                   "face_alignment","face_detection", "hypernym_discovery",
                   "natural_language_inference"]

test_input_stanza_list = [] #Stores individual lines from Stanza_out.txt file
test_input_sent_num_list = [] #Stores individual lines from sentences.txt file
test_input_entity_list = [] #Stores the list of all phrases from entities.txt
test_file_name_list = [] #Stores the name of individual files
test_total_phrases_truth = 0
Capital_test_input_stanza_list = [] #Stores individual lines from Stanza_out.txt file in its original case(not lowered case)

#NOTE(review): the stanza and sentence lists are indexed in parallel by later cells, so an
#article directory that lacks one of the two files would silently misalign them.
for fls in test_list_of_folders:
  count=0
  #os.listdir returns entries in arbitrary order; sorting keeps the file order reproducible across runs
  for i in sorted(os.listdir(test_input_dir + fls + '/')):
    count=count+1
    article_dir = test_input_dir + fls + '/' + str(i)
    for files in os.listdir(article_dir):
      if files.endswith("Stanza-out.txt"):
        print(article_dir)
        with open(article_dir + '/' + files, "r") as stanza_file: #Context manager closes the handle even on errors
          Capital_stanza_lines = stanza_file.read()
        #Only truly empty lines are dropped, so the line positions that sentences.txt refers to are unchanged
        Capital_test_input_stanza_list.append([x for x in Capital_stanza_lines.splitlines() if x])
        test_input_stanza_list.append([x for x in Capital_stanza_lines.lower().splitlines() if x])
      if files.endswith("sentences.txt"):
        #Open the file that was actually matched rather than a hard-coded name
        with open(article_dir + '/' + files, "r") as sentence_file:
          sentence_num_list = [x.strip() for x in sentence_file.read().splitlines()]
        #Strip first and then drop blanks, so a whitespace-only line cannot reach int()
        test_input_sent_num_list.append([int(x) for x in sentence_num_list if x])

    test_file_name_list.append(fls + '/' + str(i))
  print("completed",fls,count)

#Check test data
print("Test no. of examples for each stanza,sentences and entities")
print(len(test_input_sent_num_list),len(test_file_name_list))

print("print one example to show all files")
print(len(test_input_sent_num_list[0]),test_file_name_list[0])


In [None]:
#### Collect the contribution sentences of every test file, lower-cased and in their Original Case
import copy
test_task2_in = [] #Stores the contribution sentences
Capital_test_task2_in = [] #Stores the contribution sentences in Original Case

for file_idx, file_sent_nums in enumerate(test_input_sent_num_list): #Process each file
  test_sent_num_list = sorted(file_sent_nums) #Sorted copy of the sentence numbers
  #Sentence numbers are 1-based positions in the stanza line lists
  test_sent_list = [test_input_stanza_list[file_idx][num-1] for num in test_sent_num_list]
  Capital_test_sent_list = [Capital_test_input_stanza_list[file_idx][num-1] for num in test_sent_num_list]

  test_task2_in.append(test_sent_list)
  Capital_test_task2_in.append(Capital_test_sent_list)

In [None]:
#### Per test file, a dictionary from sentence number to sentence and its inverse (sentence to number)

list_of_dict_for_number_to_sentence = []
for file_idx in range(len(test_input_stanza_list)): #Process each file
  test_sent_num_list = sorted(test_input_sent_num_list[file_idx]) #Sorted copy of the sentence numbers
  #Sentence numbers are 1-based positions in the stanza line list
  test_sent_list = [test_input_stanza_list[file_idx][num-1] for num in test_sent_num_list]
  test_sent_dict_list = dict(zip(test_sent_num_list, test_sent_list)) #Dictionary of sentence number and strings
  list_of_dict_for_number_to_sentence.append(test_sent_dict_list)

#This is the mapper list of dictionaries (sentence string -> sentence number)
list_of_dict_for_sentence_to_number = [
  {sentence: number for number, sentence in number_to_sentence.items()}
  for number_to_sentence in list_of_dict_for_number_to_sentence
]

In [None]:
#### Predict the label sequence of every test contribution sentence

test_task2_out_label = [] #Stores the Labels predicted

for file_sentences in test_task2_in:
  file_predictions = []
  for sentence in file_sentences:
    with torch.no_grad():
      encoded = prepare_sequence(sentence.split()).to(device)
      predicted_ids = model(encoded)[1] #Second element of the model output is the tag index sequence
      file_predictions.append([ix_to_tag[t] for t in predicted_ids])
  test_task2_out_label.append(file_predictions)
print(len(test_task2_out_label))

In [None]:
#### For reference: sizes and first entries of the test inputs and of the predictions
for sentences_by_file in (test_task2_in, Capital_test_task2_in):
  print(len(sentences_by_file))
  print(len(sentences_by_file[0]))
  print(sentences_by_file[0])
  print()
print(test_file_name_list[0])
print()
print(len(test_task2_out_label))
print(len(test_task2_out_label[0]))
print((test_task2_out_label[0]))
print()
#Third sentence of the first file: its text, its number of predicted labels and its number of words
third_sentence = test_task2_in[0][2]
print(third_sentence)
print(len(test_task2_out_label[0][2]))
print(len(third_sentence.split()))

In [None]:
#### Here we convert the predicted labels into phrases and write them into entities.txt file along with its starting and ending character number

def _biluo_to_spans(words, labels):
  """Turn parallel lists of words and BILUO labels into (start, end, phrase) tuples.

  Offsets are character positions in " ".join(words); end is exclusive.
  'U' yields a one-word phrase. 'B' opens a phrase, 'I' extends it and 'L'
  closes and yields it. An 'I' or 'L' without an open phrase starts one at
  that word. 'B', 'U' and any other label drop an unfinished phrase, so every
  phrase is a contiguous stretch of the sentence and its offsets match its text.
  """
  spans = []
  phrase_words = [] #Words of the multi-word phrase that is currently open
  phrase_start = 0 #Character offset of the first word of the open phrase
  offset = 0 #Character offset of the current word
  for word, label in zip(words, labels):
    if label == "U": #That means its a single phrase
      phrase_words = []
      spans.append((offset, offset + len(word), word))
    elif label == "B":
      phrase_words = [word] #A new phrase always starts from scratch
      phrase_start = offset
    elif label == "I" or label == "L":
      if not phrase_words:
        phrase_start = offset
      phrase_words.append(word)
      if label == "L": #We have reached the end of phrase, so store it now
        spans.append((phrase_start, offset + len(word), " ".join(phrase_words)))
        phrase_words = []
    else:
      phrase_words = [] #A word outside any phrase discards an unfinished phrase
    offset += len(word) + 1 #The word plus the single space that follows it
  return spans

for i,file in enumerate(test_task2_in):

  print(test_file_name_list[i]) #Print the current file name 

  #test_task2_in[i] was built from the sorted sentence numbers, so position j in this list is
  #the number of sentence j. Looking the number up by sentence text would return the same
  #number for every copy of a repeated sentence.
  sentence_numbers = sorted(test_input_sent_num_list[i])

  with open(test_input_dir + test_file_name_list[i] + "/entities.txt", "w") as f1: #Closed even if an error occurs

    for j,sent in enumerate(file):

      biluo_list = (test_task2_out_label[i][j])[1:-1] #The start and end labels should be ignored as they correspond to CLS and SEP token respectively.
      respective_sentence = Capital_test_task2_in[i][j].split()
      sentence_number = sentence_numbers[j]

      if(len(respective_sentence) != len(biluo_list)):
        print("Length mismatch in the sentence and BILUO sequence")
        continue

      for start_of_words, end_of_words, phrase in _biluo_to_spans(respective_sentence, biluo_list):
        f1.write(str(sentence_number) + "\t" + str(start_of_words) + "\t" + str(end_of_words) + "\t" + phrase + "\n")

  print("done")