## This script preprocesses corpora in the format of Sherlock, SFU, and Bioscope. The Data class is taken from NegBERT by Khandelwal et al. and is adjusted to extract three vectors per sentence: tokens, cue labels, scope labels. The script puts them in a nested list and then writes them to json files.


In [None]:
# Imports needed
import os, re, torch, html, json, math, random

Upload the datasets to your Google Drive first.
The cell below mounts your Google Drive so that this notebook can read them.

In [None]:
# Mount Google Drive at /content/gdrive; the corpus paths used further down
# all live below this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [None]:
# Annotation type to extract. The BioScope and SFU parsers keep only <cue> tags
# whose text contains this string, and the Sherlock (starsem) corpus is parsed
# only when it is 'negation'.
TASK = 'negation'

## Data Preprocessing

In [None]:
class Data:
    """Parse a negation/speculation corpus into (tokens, cue labels, scope labels) triples.

    Adapted from the ``Data`` class of NegBERT (Khandelwal et al.). After
    construction, ``the_big_data_list`` holds one tuple
    ``(tokens, cue_labels, scope_labels)`` per annotated cue, so a sentence
    with several cues appears once per cue. Sentences without any cue are
    parsed but do not end up in ``the_big_data_list``.

    Cue labels: 0 = affix cue (starsem only), 1 = single-word cue,
    2 = token of a multi-word cue, 3 = not a cue.
    Scope labels: 1 = token inside the scope of the cue, 0 = otherwise.
    """

    def __init__(self, file, dataset_name='sfu', frac_no_cue_sents=1.0, task=None):
        """Read ``file`` and fill ``self.the_big_data_list``.

        file: The path of the data file (for 'sfu': the corpus root directory,
            whose dot-free entries are listed as sub-directories of review files).
        dataset_name: The name of the dataset to be preprocessed.
            Values supported: sfu, bioscope, starsem.
        frac_no_cue_sents: The fraction of cue-less sentences sampled into the
            cue-detection part of the parsers' result. That part is not stored
            on the object, so this does not change ``the_big_data_list``.
        task: Annotation type to extract (e.g. 'negation'). ``None`` (default)
            uses the module-level ``TASK`` setting.

        Raises ValueError for an unsupported ``dataset_name``.
        """
        self.the_big_data_list = []
        if dataset_name not in ('bioscope', 'sfu', 'starsem'):
            raise ValueError("Supported Dataset types are:\n\tbioscope\n\tsfu\n\tstarsem")
        if task is None:
            task = TASK

        if dataset_name == 'bioscope':
            scope_part = self._bioscope(file, task, frac_no_cue_sents)[1]
        elif dataset_name == 'sfu':
            scope_part = ([], [], [])
            for dir_name in os.listdir(file):
                # Only dot-free entries are treated as category folders.
                if '.' in dir_name:
                    continue
                dir_path = os.path.join(file, dir_name)
                for f_name in os.listdir(dir_path):
                    review = self._sfu_review(os.path.join(dir_path, f_name), task, frac_no_cue_sents)
                    for merged, new in zip(scope_part, review[1]):
                        merged.extend(new)
        elif task == 'negation':
            scope_part = self._starsem(file, frac_no_cue_sents)[1]
        else:
            # The starsem corpus is only parsed for the negation task.
            scope_part = ([], [], [])

        self.the_big_data_list.extend(zip(*scope_part))

    @staticmethod
    def _sample_no_cue(no_cue_data, frac_no_cue_sents):
        """Randomly pick the requested fraction of cue-less [tokens, labels] pairs."""
        chosen = random.sample(no_cue_data, k=int(frac_no_cue_sents * len(no_cue_data)))
        return [pair[0] for pair in chosen], [pair[1] for pair in chosen]

    @staticmethod
    def _read_markup(f_path):
        """Return the file content split into alternating text and ``<...>`` tag pieces."""
        pieces = []
        with open(f_path, encoding='utf-8') as xml_file:
            for line in xml_file:
                pieces += re.split("(<.*?>)", html.unescape(line))
        return pieces

    @staticmethod
    def _read_conll_sentences(lines):
        """Yield each blank-line-delimited sentence as a list of column lists."""
        rows = []
        for line in lines:
            fields = line.strip().split()
            if fields:
                rows.append(fields)
            elif rows:
                yield rows
                rows = []
            # A blank line with no pending rows (leading/doubled blank) is skipped.
        if rows:
            yield rows

    @staticmethod
    def _store_sentence(sentence, cue, scope, cue_part, scope_part, no_cue_data):
        """File a finished XML sentence under the cue, scope or no-cue collection.

        ``cue`` and ``scope`` map an annotation id to the token positions it covers.
        """
        if not cue:
            no_cue_data.append([sentence, [3] * len(sentence)])
            return
        sentence_cues = [3] * len(sentence)
        cue_part[0].append(sentence)
        cue_part[1].append(sentence_cues)
        for cue_id, positions in cue.items():
            instance_cues = [3] * len(sentence)
            # Exactly one token -> normal cue (1); otherwise multi-word cue (2).
            value = 1 if len(positions) == 1 else 2
            for pos in positions:
                sentence_cues[pos] = value
                instance_cues[pos] = value
            instance_scope = [0] * len(sentence)
            for pos in scope.get(cue_id, ()):
                instance_scope[pos] = 1
            scope_part[0].append(sentence)
            scope_part[1].append(instance_cues)
            scope_part[2].append(instance_scope)

    @staticmethod
    def _starsem(f_path, frac_no_cue_sents=1.0):
        """Parse a *SEM 2012 (Sherlock) style column file.

        Column 4 is the word; from column 8 on there is one (cue, scope, event)
        triple per cue of the sentence. A first row with exactly 8 columns marks
        a sentence without cues.
        Returns ``[(sentences, cue_labels), (scope_sents, scope_cues, scope_labels)]``.
        """
        cue_sents, cue_labels = [], []
        scope_sents, scope_cues, scope_labels = [], [], []
        no_cue_data = []

        with open(f_path, encoding='utf-8') as raw_data:
            for rows in Data._read_conll_sentences(raw_data):
                words = [row[3] for row in rows]

                # sentence WITHOUT negation
                if len(rows[0]) == 8:
                    no_cue_data.append([words, [3] * len(words)])
                    continue

                # sentence WITH negation: 1 or more cue column triples
                num_cues = (len(rows[0]) - 7) // 3
                cue_types = []  # label per token
                cue_ids = []    # index of the cue column that set the label, -1 if none
                scope = [[] for _ in range(num_cues)]
                for row in rows:
                    cue_type, cue_id = 3, -1
                    for i in range(num_cues):
                        in_scope_field = row[8 + 3 * i] != '_'
                        if row[7 + 3 * i] != '_':  # cue field is active
                            # A cue token that is also in scope is an affix cue.
                            cue_type = 0 if in_scope_field else 1
                            cue_id = i
                        scope[i].append(1 if in_scope_field else 0)
                    cue_types.append(cue_type)
                    cue_ids.append(cue_id)

                # A cue column that marks several tokens is a multi-word cue.
                for i in range(num_cues):
                    positions = [idx for idx, cid in enumerate(cue_ids) if cid == i]
                    if len(positions) > 1:
                        for idx in positions:
                            cue_types[idx] = 2

                # One scope instance per cue, showing only that cue's labels.
                for i in range(num_cues):
                    scope_cues.append([ctype if cid == i else 3
                                       for ctype, cid in zip(cue_types, cue_ids)])
                    scope_sents.append(words)
                    scope_labels.append(scope[i])
                cue_labels.append(cue_types)
                cue_sents.append(words)

        no_cue_sents, no_cue_labels = Data._sample_no_cue(no_cue_data, frac_no_cue_sents)
        return [(cue_sents + no_cue_sents, cue_labels + no_cue_labels),
                (scope_sents, scope_cues, scope_labels)]

    @staticmethod
    def _bioscope(f_path, task, frac_no_cue_sents=1.0):
        """Parse a BioScope XML file.

        Cues are linked to their scope through matching ``ref``/``id`` attribute
        values of the ``<cue>`` and ``<xcope>`` tags.
        Returns ``[(sentences, cue_labels), (scope_sents, scope_cues, scope_labels)]``.
        """
        cue_part = ([], [])
        scope_part = ([], [], [])
        no_cue_data = []
        sentence, cue, scope = [], {}, {}
        in_cue, in_scope = [], []  # stacks of currently open cue / xcope ids
        word_num = 0
        in_sentence = False

        for token in Data._read_markup(f_path):
            if token == '':
                continue
            elif '<sentence' in token:
                in_sentence = True
            elif '<cue' in token:
                if task in token:
                    ref = re.split('(ref=".*?")', token)[1][4:]
                    in_cue.append(ref)
                    cue[ref] = []
            elif '</cue' in token:
                in_cue = in_cue[:-1]
            elif '<xcope' in token:
                scope_id = re.split('(id=".*?")', token)[1][3:]
                in_scope.append(scope_id)
                scope[scope_id] = []
            elif '</xcope' in token:
                in_scope = in_scope[:-1]
            elif '</sentence' in token:
                Data._store_sentence(sentence, cue, scope, cue_part, scope_part, no_cue_data)
                sentence, cue, scope = [], {}, {}
                in_cue, in_scope = [], []
                word_num = 0
                in_sentence = False
            elif '<' not in token and in_sentence:
                words = token.split()
                sentence += words
                positions = [word_num + k for k in range(len(words))]
                # Text inside a cue is recorded for the cue only, not for the scope.
                if in_cue:
                    for open_id in in_cue:
                        cue[open_id] += positions
                elif in_scope:
                    for open_id in in_scope:
                        scope[open_id] += positions
                word_num += len(words)

        no_cue_sents, no_cue_labels = Data._sample_no_cue(no_cue_data, frac_no_cue_sents)
        return [(cue_part[0] + no_cue_sents, cue_part[1] + no_cue_labels), scope_part]

    @staticmethod
    def _sfu_review(f_path, task, frac_no_cue_sents=1.0):
        """Parse one SFU Review Corpus XML file.

        Words are wrapped in ``<W>`` tags; an ``<xcope>`` names the cue ids it
        belongs to through the ``SRC`` attribute of its ``<ref>`` child.
        Returns ``[(sentences, cue_labels), (scope_sents, scope_cues, scope_labels)]``.
        """
        cue_part = ([], [])
        scope_part = ([], [], [])
        no_cue_data = []
        sentence, cue, scope = [], {}, {}
        in_cue, in_scope = [], []  # open cue ids / lists of cue ids of open scopes
        word_num = 0
        in_word = False

        for token in Data._read_markup(f_path):
            if token == '':
                continue
            elif token == '<W>':
                in_word = True
            elif token == '</W>':
                in_word = False
                word_num += 1
            elif '<cue' in token:
                if task in token:
                    cue_id = int(re.split('(ID=".*?")', token)[1][4:-1])
                    in_cue.append(cue_id)
                    cue[cue_id] = []
            elif '</cue' in token:
                in_cue = in_cue[:-1]
            elif '<xcope' in token:
                continue
            elif '</xcope' in token:
                in_scope = in_scope[:-1]
            elif '<ref' in token:
                sources = [int(i) for i in re.split('(SRC=".*?")', token)[1][5:-1].split(' ')]
                in_scope.append(sources)
                for source in sources:
                    scope[source] = []
            elif '</SENTENCE' in token:
                Data._store_sentence(sentence, cue, scope, cue_part, scope_part, no_cue_data)
                sentence, cue, scope = [], {}, {}
                in_cue, in_scope = [], []
                word_num = 0
                in_word = False
            elif '<' not in token and in_word:
                for open_id in in_cue:
                    cue[open_id].append(word_num)
                for sources in in_scope:
                    for source in sources:
                        scope[source].append(word_num)
                sentence.append(token)

        no_cue_sents, no_cue_labels = Data._sample_no_cue(no_cue_data, frac_no_cue_sents)
        return [(cue_part[0] + no_cue_sents, cue_part[1] + no_cue_labels), scope_part]


In [None]:
# Parse every raw corpus from the mounted Drive into a Data object; the parsed
# instances are available as <object>.the_big_data_list.
bioscope_full_papers_data = Data('/content/gdrive/My Drive/multilingual_BERT_negations/data_raw/bioscope/full_papers.xml', dataset_name='bioscope')
sfu_data = Data('/content/gdrive/My Drive/multilingual_BERT_negations/data_raw/SFU_Review_Corpus_Negation_Speculation', dataset_name='sfu')
bioscope_abstracts_data = Data('/content/gdrive/My Drive/multilingual_BERT_negations/data_raw/bioscope/abstracts.xml', dataset_name='bioscope')
# The Sherlock-format files are only loaded for the negation task, so the
# names below do not exist when TASK is set to anything else.
if TASK == 'negation':
    sherlock_train_data = Data('/content/gdrive/My Drive/multilingual_BERT_negations/data_raw/starsem/SEM-2012-SharedTask-CD-SCO-training-09032012.txt', dataset_name='starsem')
    sherlock_dev_data = Data('/content/gdrive/My Drive/multilingual_BERT_negations/data_raw/starsem/SEM-2012-SharedTask-CD-SCO-dev-09032012.txt', dataset_name='starsem')
    sherlock_test_gold_cardboard_data = Data('/content/gdrive/My Drive/multilingual_BERT_negations/data_raw/starsem/SEM-2012-SharedTask-CD-SCO-test-cardboard-GOLD.txt', dataset_name='starsem')
    sherlock_test_gold_circle_data = Data('/content/gdrive/My Drive/multilingual_BERT_negations/data_raw/starsem/SEM-2012-SharedTask-CD-SCO-test-circle-GOLD.txt', dataset_name='starsem')
    # Extra file parsed with the starsem reader; it is not added to any of the
    # combined lists built in the following cells.
    french_other = Data('/content/gdrive/My Drive/multilingual_BERT_negations/data_raw/CAS_sherlock_one_scope.txt', dataset_name='starsem')

In [None]:
# Every parsed instance in one list: BioScope (full papers, abstracts), SFU,
# then the four Sherlock splits. Requires the sherlock_* objects, i.e. TASK == 'negation'.
all_data = bioscope_full_papers_data.the_big_data_list + bioscope_abstracts_data.the_big_data_list + sfu_data.the_big_data_list + sherlock_train_data.the_big_data_list + sherlock_dev_data.the_big_data_list + sherlock_test_gold_cardboard_data.the_big_data_list + sherlock_test_gold_circle_data.the_big_data_list

In [None]:
# Both BioScope sub-corpora (full papers followed by abstracts).
Bioscope =  bioscope_full_papers_data.the_big_data_list + bioscope_abstracts_data.the_big_data_list

In [None]:
# All four Sherlock splits: training, dev, and the two gold test sets.
Sherlock = sherlock_train_data.the_big_data_list + sherlock_dev_data.the_big_data_list + sherlock_test_gold_cardboard_data.the_big_data_list + sherlock_test_gold_circle_data.the_big_data_list

In [None]:
# The SFU instances (same list object as sfu_data.the_big_data_list, not a copy).
SFU = sfu_data.the_big_data_list

In [None]:
# Number of instances parsed from the BioScope full papers.
len(bioscope_full_papers_data.the_big_data_list)

376

In [None]:
# Number of instances parsed from the BioScope abstracts.
len(bioscope_abstracts_data.the_big_data_list)

1719

In [None]:
# Number of instances in the combined BioScope list.
len(Bioscope)

2095

In [None]:
# Number of instances in the SFU list.
len(SFU)

3528

In [None]:
# Number of instances in the combined Sherlock list.
len(Sherlock)

1421

In [None]:
# Total number of instances over all corpora.
len(all_data)

7044

In [None]:
# Destination of the plain-text dump of all instances.
outfile = '/content/gdrive/My Drive/multilingual_BERT_negations/data/all_ENG_sent_cue_scope.txt'

In [None]:
# Dump every instance as a seven-line record: running number, sentence text,
# cue label list, scope label list, cue words, in-scope words, empty line.
count_sents = 0
hash_values = set()
with open(outfile, 'w', encoding='utf8') as outf:
  for count_sents, (words, cue_labels, scope) in enumerate(all_data, start=1):
    aligned = list(zip(words, scope, cue_labels))
    # Tokens labelled 1 in the scope vector.
    scope_tokens = [word for word, in_scope, _ in aligned if in_scope == 1]
    # Tokens whose cue label is anything but 3 ("not a cue").
    cue = ' '.join(word for word, _, cue_label in aligned if cue_label != 3).strip()
    record = [
      str(count_sents),
      ' '.join(words),
      str(cue_labels),
      str(scope),
      cue,
      ' '.join(scope_tokens),
    ]
    outf.write('\n'.join(record) + '\n\n')

count_sents

In [None]:
# Show each SFU instance on three lines (tokens, cue labels, scope labels),
# followed by an empty line.
for tokens, cue_labels, scope_labels in sfu_data.the_big_data_list:
  print(tokens, cue_labels, scope_labels, sep='\n', end='\n\n')

# Choose file to write to.

In [None]:
# Serialise the instances of all corpora to ENG.json.
outfile = '/content/gdrive/My Drive/multilingual_BERT_negations/data/ENG.json'
with open(outfile, 'w') as outf:
  outf.write(json.dumps(all_data))

In [None]:
# Serialise the combined Sherlock instances to SHERLOCK.json.
# NOTE(review): this cell used to dump `ENG_FR`, a name that is never defined
# in this notebook (NameError). `Sherlock` matches the target file name and
# the sibling SFU/BIOSCOPE cells; confirm this is the intended content.
outfile = '/content/gdrive/My Drive/multilingual_BERT_negations/data/SHERLOCK.json'
with open(outfile, 'w') as outf:
  json.dump(Sherlock, outf)

In [None]:
# Serialise the SFU instances to SFU.json.
outfile = '/content/gdrive/My Drive/multilingual_BERT_negations/data/SFU.json'
with open(outfile, 'w') as outf:
  outf.write(json.dumps(sfu_data.the_big_data_list))

In [None]:
# Serialise the combined BioScope instances to BIOSCOPE.json.
outfile = '/content/gdrive/My Drive/multilingual_BERT_negations/data/BIOSCOPE.json'
with open(outfile, 'w') as outf:
  outf.write(json.dumps(Bioscope))