In [0]:
# Installations needed
# NOTE(review): no versions are pinned, so a fresh run installs whatever is latest at
# that time; consider pinning to the releases this notebook was developed against.
!pip install transformers
!pip install knockknock

In [0]:
# Upgrade ipykernel to the newest available release (-U).
# NOTE(review): the notebook does not say why this is needed, and the runtime presumably
# has to be restarted before the upgrade takes effect — confirm.
!pip install -U ipykernel

In [0]:
# Imports needed
import os, re, torch, html, tempfile, copy, json, math, shutil, tarfile, tempfile, sys, random, pickle
from keras.preprocessing.sequence import pad_sequences
from transformers import BertTokenizer, BertConfig, BertModel, WordpieceTokenizer
from transformers.file_utils import cached_path
import numpy as np
import pandas as pd
from tqdm import tqdm

Upload the datasets to Google Drive.
The next cell mounts your Google Drive so that this notebook can read them.

In [0]:
# Mount Google Drive under /content/gdrive so the dataset can be read by path.
# Running this cell prompts for an authorization code.
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


# Class Definitions:
## Data Preprocessing

In [0]:
# Location, on the mounted Drive, of the annotated corpus file that is passed to starsem().
sherlock_data = '/content/gdrive/My Drive/multilingual_BERT_negations/data/CAS_sherlock_full.txt'

In [0]:
def _starsem_read_token(tokens, num_cues, cue_types, cue_ids, scopes):
    """Record the cue and scope annotations carried by one token line.

    Appends one entry to ``cue_types`` and ``cue_ids`` and one entry to each of
    the ``num_cues`` lists in ``scopes``.  Column ``7 + 3*i`` of ``tokens`` is the
    cue field of cue ``i`` and column ``8 + 3*i`` its scope field; ``'_'`` means
    "empty".
    """
    cue_types.append(3)   # Generally not a cue; overwritten below if a cue field is active.
    cue_ids.append(-1)    # -1 = belongs to no cue.
    for i in range(num_cues):
        in_scope = tokens[8 + 3 * i] != '_'
        if tokens[7 + 3 * i] != '_':  # Cue field is active
            # Cue and scope on the same token -> affix (0). Otherwise a normal cue (1),
            # which is promoted to multi-word (2) once the whole sentence has been read.
            cue_types[-1] = 0 if in_scope else 1
            cue_ids[-1] = i
        scopes[i].append(1 if in_scope else 0)


def starsem(f_path, cue_sents_only=False, frac_no_cue_sents = 1.0):
    """Parse a whitespace-separated, blank-line-delimited negation corpus file.

    Each non-empty line describes one token; the word itself is column 3.  A
    sentence whose first line has exactly 8 columns carries no cue.  Otherwise
    the columns after the first 7 come in groups of three per cue, of which the
    first is the cue field and the second the scope field.

    Cue labels: 0 = affix, 1 = normal cue, 2 = part of a multi-word cue,
    3 = not a cue.  Scope labels: 1 = inside the scope of the cue, 0 = outside.

    Args:
        f_path: path of the corpus file (read as UTF-8).
        cue_sents_only: accepted for interface compatibility; not used here.
        frac_no_cue_sents: fraction of the cue-less sentences to keep; they are
            drawn with ``random.sample``, so their order is random.

    Returns:
        ``[starsem_cues, starsem_scopes]`` where
        ``starsem_cues = (sentences, cue_labels)`` lists the sentences with cues
        followed by the sampled cue-less sentences, and
        ``starsem_scopes = (scope_sents, scope_cues, data_scope)`` holds one
        entry per (sentence, cue) pair: the tokens, the cue labels restricted to
        that cue, and the scope labels of that cue.
    """
    data = []            # sentences that contain at least one cue
    labels = []          # their per-token cue labels
    scope_sents = []
    scope_cues = []
    data_scope = []
    cue_only_data = []   # [sentence, labels] pairs for the cue-less sentences

    # The context manager closes the file even if a malformed line raises.
    with open(f_path, encoding='utf-8') as raw_data:
        for line in raw_data:
            tokens = line.strip().split()
            if not tokens:
                # Leading or repeated blank separator lines carry no sentence.
                continue

            sentence = [tokens[3]]
            if len(tokens) == 8:  # This sentence has no cues
                label = [3]
                # The inner loop advances the same file iterator up to the blank
                # line that ends the sentence.
                for line in raw_data:
                    tokens = line.strip().split()
                    if not tokens:
                        break
                    sentence.append(tokens[3])
                    label.append(3)
                cue_only_data.append([sentence, label])
                continue

            # The sentence has 1 or more cues.
            num_cues = (len(tokens) - 7) // 3
            scope = [[] for _ in range(num_cues)]
            cue_types = []  # the real labels
            cue_ids = []    # which cue each token belongs to, for multi-word detection
            _starsem_read_token(tokens, num_cues, cue_types, cue_ids, scope)
            for line in raw_data:
                tokens = line.strip().split()
                if not tokens:
                    break
                sentence.append(tokens[3])
                _starsem_read_token(tokens, num_cues, cue_types, cue_ids, scope)

            # A cue that spans more than one token is a multi-word cue.
            for i in range(num_cues):
                members = [idx for idx, cue_id in enumerate(cue_ids) if cue_id == i]
                if len(members) > 1:
                    for idx in members:
                        cue_types[idx] = 2

            # One scope example per cue: only that cue's tokens keep their label.
            for i in range(num_cues):
                scope_cues.append([cue_type if cue_id == i else 3
                                   for cue_type, cue_id in zip(cue_types, cue_ids)])
                scope_sents.append(sentence)
                data_scope.append(scope[i])
            labels.append(cue_types)
            data.append(sentence)

    cue_only_samples = random.sample(cue_only_data, k=int(frac_no_cue_sents*len(cue_only_data)))
    cue_only_sents = [i[0] for i in cue_only_samples]
    cue_only_cues = [i[1] for i in cue_only_samples]
    starsem_cues = (data+cue_only_sents,labels+cue_only_cues)
    starsem_scopes = (scope_sents, scope_cues, data_scope)
    return [starsem_cues, starsem_scopes]

In [0]:
# Parse the corpus once: starsem_cues holds (sentences, cue labels) and
# starsem_scopes holds (sentences, per-cue labels, scope labels).
starsem_cues, starsem_scopes = starsem(sherlock_data)

In [0]:
scope_sents, scope_cues, data_scope = starsem_scopes
# Materialise the triples as a list: a bare zip() iterator is exhausted after a single
# pass, so re-running any cell that consumes scope_data would silently see no data.
scope_data = list(zip(scope_sents, scope_cues, data_scope))

In [0]:
# Inspect one scope example: its tokens, its cue labels (3 = not a cue,
# 2 = part of a multi-word cue) and its scope labels (1 = in scope).
print(scope_sents[3])
print(scope_cues[3])
print('Scope labels:', data_scope[3])

["l'", 'examen', 'endoscopique', 'bronchique', 'ne', 'révèle', 'aucune', 'anomalie', '.']
[3, 3, 3, 3, 2, 3, 2, 3, 3]
Scope labels: [0, 0, 0, 0, 0, 1, 0, 1, 0]


Tokenize with BERT and convert the input to tensors

In [0]:
# Name of the pretrained BERT checkpoint whose tokenizer is loaded below.
SCOPE_MODEL = 'bert-base-uncased'
# Lower-case the text before tokenizing; must agree with the checkpoint's casing:
# if 'bert-base-multilingual-cased' do_lower_case=False
do_lower_case = True
# Length that every id / label / mask sequence is padded or truncated to.
MAX_LEN = 128
# How cues are marked in the scope input built by preprocess_data: 'replace' or 'augment'.
method = 'augment'

In [0]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook still runs on a runtime without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

In [0]:
# Load the BERT tokenizer that belongs to SCOPE_MODEL; downloaded files are cached
# in the local 'bert_tokenizer' directory.
tokenizer = BertTokenizer.from_pretrained(SCOPE_MODEL, do_lower_case=do_lower_case, cache_dir='bert_tokenizer')

HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…




In [0]:
# ####################################
# collect example of BERT tokenization
sents = [" ".join(sent) for sent in scope_sents]
# The lower-cased copy used to be computed and thrown away. Keep it under its own
# name (so `sents` is untouched) and only lower-case when do_lower_case is set.
example_sents = [sent.lower() for sent in sents] if do_lower_case else sents

# Show how each word of one sentence is split into sub-word pieces.
for word in example_sents[1].split():
  subwords = tokenizer.tokenize(word)
  print(word, subwords)

# au ['au']
# niveau ['niveau']
# thoracique ['th', '##ora', '##ci', '##que']


In [0]:
def preprocess_data(zipped, tokenizer_obj):
    """Turn (sentence, cue labels, scope labels) triples into sub-word level inputs.

    Stage 1 splits every word with ``tokenizer_obj._tokenize`` and copies the
    word's scope and cue label to each resulting piece; the first piece of a
    word gets true-token mask 1 and the remaining pieces get 0.  The text is
    lower-cased first when the module-level ``do_lower_case`` equals True.

    Stage 2 marks the cues according to the module-level ``method``:

    * ``'replace'``: every cue piece is replaced by ``[unused{cue+1}]``;
      labels and masks are passed through unchanged.
    * ``'augment'``: an ``[unused{cue+1}]`` marker (scope label 0) is inserted
      in front of every cue piece.  For the first piece of a run of cue pieces
      the marker gets mask 1 and the piece mask 0; for the following pieces of
      the run the marker gets mask 0 and the piece keeps its own mask.

    Args:
        zipped: iterable of ``(words, cue_labels, scope_labels)`` items.
        tokenizer_obj: object providing ``_tokenize(word) -> list of pieces``.

    Returns:
        ``(final_sentences, final_labels, final_masks)``, three parallel lists
        of per-sentence lists.

    Raises:
        ValueError: if ``method`` is neither ``'replace'`` nor ``'augment'``.
    """
    examples = list(zipped)
    word_lists = [example[0] for example in examples]
    cue_lists = [example[1] for example in examples]
    scope_lists = [example[2] for example in examples]

    # Work on whitespace-joined text so that lower-casing is applied per sentence.
    joined = [" ".join(words) for words in word_lists]
    if do_lower_case == True:
        joined = [text.lower() for text in joined]

    # ---- Stage 1: sub-word tokenization with label propagation ----
    mytexts, mylabels, mycues, mymasks = [], [], [], []
    for text, scope_tags, cue_tags in zip(joined, scope_lists, cue_lists):
        pieces, piece_scopes, piece_cues, piece_masks = [], [], [], []
        for word, scope_tag, cue_tag in zip(text.split(), scope_tags, cue_tags):
            for position, piece in enumerate(tokenizer_obj._tokenize(word)):
                # only the first piece of a word counts as a true token
                piece_masks.append(1 if position == 0 else 0)
                # scope and cue labels are shared by all pieces of the word
                piece_scopes.append(scope_tag)
                piece_cues.append(cue_tag)
                pieces.append(piece)
        mymasks.append(piece_masks)
        mytexts.append(pieces)
        mylabels.append(piece_scopes)
        mycues.append(piece_cues)

    # ---- Stage 2: mark the cues in the token sequence ----
    if method == 'replace':
        final_sentences = [
            [token if cue == 3 else f'[unused{cue+1}]' for token, cue in zip(sent, cues)]
            for sent, cues in zip(mytexts, mycues)
        ]
        return final_sentences, mylabels, mymasks

    if method != 'augment':
        raise ValueError("Supported methods for scope detection are:\nreplace\naugment")

    final_sentences, final_labels, final_masks = [], [], []
    for sent, cues, labels, masks in zip(mytexts, mycues, mylabels, mymasks):
        out_tokens, out_labels, out_masks = [], [], []
        inside_cue_run = False
        for token, cue, label, mask in zip(sent, cues, labels, masks):
            if cue == 3:
                # ordinary token: copy through and close any open cue run
                inside_cue_run = False
                out_tokens.append(token)
                out_labels.append(label)
                out_masks.append(mask)
                continue

            if inside_cue_run:
                # later piece of the run: marker is masked out, piece keeps its mask
                marker_mask, token_mask = 0, mask
            else:
                # first piece of the run: the marker stands in as the true token
                marker_mask, token_mask = 1, 0
                inside_cue_run = True

            out_tokens.extend([f'[unused{cue+1}]', token])
            out_labels.extend([0, label])   # the marker itself is never in scope
            out_masks.extend([marker_mask, token_mask])
        final_sentences.append(out_tokens)
        final_labels.append(out_labels)
        final_masks.append(out_masks)

    return final_sentences, final_labels, final_masks

In [0]:
final_sentences, final_labels, final_masks = preprocess_data(scope_data, tokenizer)
# An empty result usually means scope_data was a zip iterator that an earlier run of
# this cell already consumed; rebuild scope_data and run again.
assert final_sentences, "preprocess_data returned no examples"
# Every sub-word token must carry exactly one scope label and one true-token mask.
assert all(len(s) == len(l) == len(m)
           for s, l, m in zip(final_sentences, final_labels, final_masks))
# Show a small sample instead of dumping the whole corpus into the cell output.
final_sentences[:3]

In [0]:
# ##############################
# collect vocab from data

# Flatten the corpus into one stream of sub-word tokens.
sentences = [t for s in final_sentences for t in s]

# Tally how often each token occurs.
data_vocab = {}
for token in sentences:
  data_vocab[token] = data_vocab.get(token, 0) + 1

# Print the tokens from most to least frequent (ties keep first-seen order).
for token, freq in sorted(data_vocab.items(), key=lambda entry: entry[1], reverse=True):
  print(token, freq)

In [0]:
# Map every sub-word token to its vocabulary id through the tokenizer's public API
# (rather than the private _convert_token_to_id), then pad with 0 / truncate each
# row at the end so that all rows have length MAX_LEN.
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in final_sentences],
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post").tolist()

In [0]:
# Scope labels padded / truncated at the end to MAX_LEN; padding positions get label 0.
tags = pad_sequences(final_labels, maxlen=MAX_LEN, value=0, padding="post", dtype="long", truncating="post").tolist()

In [0]:
# True-token masks padded / truncated at the end to MAX_LEN; padding positions get 0.
finalest_masks = pad_sequences(final_masks, maxlen=MAX_LEN, value=0, padding='post', dtype='long', truncating='post').tolist()

In [0]:
# 1.0 where the input id is greater than 0 and 0.0 elsewhere (0 is the padding value used above).
attention_masks = [[float(i>0) for i in ii] for ii in input_ids]

## INPUT for BERT

In [0]:
# Inspect everything that was built for a single example (index x): the marked-up
# sub-word tokens, their ids, scope labels, true-token masks and attention masks.
x = 3
print(final_sentences[x])
print(len(final_sentences[x]))
print()
print("input_ids: ", input_ids[x])
print("scope ids: ", tags[x])
print("true token masks: ", finalest_masks[x])
print('attention masks: ', attention_masks[x])

['l', "'", 'exam', '##en', 'end', '##os', '##co', '##pi', '##que', 'bro', '##nch', '##ique', '[unused3]', 'ne', 'rev', '##ele', '[unused3]', 'au', '[unused3]', '##cu', '[unused3]', '##ne', 'an', '##oma', '##lie', '.']
26

input_ids:  [1048, 1005, 11360, 2368, 2203, 2891, 3597, 8197, 4226, 22953, 12680, 7413, 4, 11265, 7065, 12260, 4, 8740, 4, 10841, 4, 2638, 2019, 9626, 8751, 1012, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
scope ids:  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [0]:
def revert_ids_to_tokens(token_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True,
                         special_ids=(1, 2, 3, 4)):
  """Decode a sequence of vocabulary ids back into a readable string.

  Uses the module-level ``tokenizer``.

  Args:
      token_ids: iterable of integer vocabulary ids.
      skip_special_tokens: drop the tokenizer's special tokens from the result.
      clean_up_tokenization_spaces: post-process the text with
          ``tokenizer.clean_up_tokenization``.
      special_ids: ids removed before decoding.  In this notebook's example
          output id 4 is ``[unused3]``; presumably 1-4 are the cue-marker
          tokens ``[unused0]``-``[unused3]`` — confirm against the vocabulary.

  Returns:
      The decoded text.
  """
  ids_to_decode = [token_id for token_id in token_ids if token_id not in special_ids]
  filtered_tokens = tokenizer.convert_ids_to_tokens(ids_to_decode, skip_special_tokens=skip_special_tokens)
  # `filtered_tokens` holds token strings, so they must be compared with the special
  # token strings; comparing against all_special_ids (ints) could never match.
  special_tokens = tokenizer.all_special_tokens

  # To avoid mixing byte-level and unicode for byte-level BPT
  # we need to build string separatly for added tokens and byte-level tokens
  # cf. https://github.com/huggingface/transformers/issues/1133
  sub_texts = []
  current_sub_text = []
  for token in filtered_tokens:
      if skip_special_tokens and token in special_tokens:
          continue
      if token in tokenizer.added_tokens_encoder:
          # flush the pending run of regular tokens, then keep the added token verbatim
          if current_sub_text:
              sub_texts.append(tokenizer.convert_tokens_to_string(current_sub_text))
              current_sub_text = []
          sub_texts.append(token)
      else:
          current_sub_text.append(token)
  if current_sub_text:
      sub_texts.append(tokenizer.convert_tokens_to_string(current_sub_text))
  text = " ".join(sub_texts)

  if clean_up_tokenization_spaces:
      return tokenizer.clean_up_tokenization(text)
  return text


In [0]:
# token_ids_todecode only exists inside revert_ids_to_tokens, so it is undefined here
# on a fresh kernel; decode the ids of the example inspected above (index x) instead.
print(revert_ids_to_tokens(input_ids[x]))

l'examen endoscopique bronchique ne revele aucune anomalie.
