In [1]:
import csv
import pandas as pd
import re
import numpy as np
import pickle
import torch
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def categorical(prob, n_samples):
    """
    Draw `n_samples` indices from a categorical distribution.

    Sampling uses the inverse-CDF method: a uniform draw is compared with the
    cumulative sum of `prob` and the first category whose cumulative mass
    exceeds the draw is selected.

    Parameters
    ----------
    prob : torch.Tensor
        1-D tensor of non-negative weights, one per category. The weights do
        not have to sum exactly to 1: the draws are scaled by the total mass.
    n_samples : int
        Number of indices to draw.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_samples,) holding the sampled category indices.
    """
    cum_prob = torch.cumsum(prob, dim=-1)
    # Scale the uniform draws by the total mass. Comparing raw draws in [0, 1)
    # against a cumulative sum that ends below 1 (rounding, unnormalised
    # weights) would leave some rows without any True value, and argmax would
    # then silently return category 0.
    r = torch.rand(n_samples, 1, device=cum_prob.device) * cum_prob[-1]
    # Broadcasting (n_samples, 1) against (n_categories,) avoids materialising
    # n_samples copies of the probability vector.
    # argmax finds the index of the first True value in the last axis.
    samples = torch.argmax((cum_prob > r).int(), dim=-1)
    # .cpu() is a no-op for CPU tensors and makes .numpy() valid otherwise.
    return samples.cpu().numpy()

In [3]:
def clean_data(data):
    """Return `data` without its last element when that element is a single space."""
    ends_with_space = len(data) > 0 and data[-1] == ' '
    return data[:-1] if ends_with_space else data

In [4]:
def remove_punctuation(word):
    """
    Split `word` into a list of pieces, isolating punctuation marks.

    - a leading ':', ',' or '.' is peeled off as its own piece;
    - '!', '(', ')' and ';' are isolated wherever they occur;
    - a trailing '.', ',' or ':' is split off when the word contains none of
      the marks of the previous rule.
    """
    soft_marks = (':', ',', '.')
    hard_marks = ('!', '(', ')', ';')

    if len(word) <= 1:
        return [word] if word else []

    first = word[0]
    if first in soft_marks or first in hard_marks:
        # Peel the leading mark and process the remainder.
        return [first] + remove_punctuation(word[1:])

    # Position of the first hard mark, or len(word) when there is none.
    cut = next((pos for pos, char in enumerate(word) if char in hard_marks), len(word))
    if cut < len(word):
        return [word[:cut], word[cut]] + remove_punctuation(word[cut + 1:])

    if word[-1] in soft_marks:
        return [word[:-1], word[-1]]
    return [word]

In [5]:
#auxiliary function for the custom tokenizer
def aux(word):
    """
    Rewrite the numbers contained in `word` in scientific notation ("{:e}").

    `word` is expected to be a run of digits possibly mixed with '.', ',' and
    ':'. The rules, applied in order, are:

    1. several '.' (then several ','): when the first two separators are
       4 characters apart they are read as thousands separators and removed;
       otherwise the word is split on the separator (date, phone number,
       list), each field is converted on its own and the separators are kept;
    2. a ':' splits the word into two parts converted independently (time);
    3. a single ',' is read as a decimal comma.

    An empty string is returned unchanged, so that empty fields produced by
    the splitting rules (e.g. "10::30") do not crash the conversion.

    Raises
    ------
    ValueError
        If what remains after applying the rules cannot be parsed by `float`.
    """
    if word == '':
        # Nothing to format (empty field between two separators).
        return word

    for separator in ('.', ','):
        if word.count(separator) > 1:
            indices = [index for index, char in enumerate(word) if char == separator]
            if indices[1] - indices[0] == 4: #it's a number
                return aux(word.replace(separator, ''))
            #it's a date, a phone number or a list: convert each field separately
            return separator.join(aux(piece) for piece in word.split(separator))

    before, colon, after = word.partition(':')
    if colon: #it's probably a time (10:30)
        return aux(before) + ':' + aux(after)

    if ('.' in word) and (',' in word):
        # Diagnostic: such a word is not parseable after the replacement below,
        # so it is shown before float() raises.
        print(word)
    if ',' in word:
        word = word.replace(',', '.')

    return "{:e}".format(float(word))

In [5]:
def aux(word):
    """
    Identity version of the number formatter: returns `word` unchanged.

    NOTE(review): this redefinition reuses the name of the earlier `aux`
    (scientific-notation formatter) and replaces it once this cell has run.
    The recorded demo outputs further down show scientific notation, so they
    appear to have been produced with the earlier definition — confirm which
    one is meant to be active.
    """
    return word

In [6]:
def custom_tokenizer(word):    
    """
    Rewrite `word` so that the numbers it contains are delimited from text.

    The word is processed recursively from left to right. Runs of digits
    (possibly containing '.', ',' or ':') are passed to `aux`; depending on
    what surrounds a number, a space, a hyphen or nothing is inserted between
    the number and the neighbouring text (see the inline comments for each
    rule). The rules are tried in order and the first one that matches wins.

    The result is a single string. The dataset cells call `.split()` on it, so
    only the inserted spaces create additional tokens.

    Parameters
    ----------
    word : str
        One token of the original text.

    Returns
    -------
    str
        The rewritten token.
    """
    if word == '':
        return word    

    if re.search(r'o2|Cl2|cl2', word):
        #some chemistry notations
        # NOTE: every "2" of the word is replaced by this hard-coded string;
        # `aux` is not involved here.
        return re.sub(r"2", "-2.000000e+00", word)
                
    # leading punctuation is kept as is and the rest is processed
    if (word[0] in ['.', ',']):
        return word[0] + custom_tokenizer(word[1:])
    
    if re.search(r"\d", word[0]): #the word starts with a digit
        idx=0
        while (idx<len(word)) and (re.search(r"(\d|\.|,|:)", word[idx])):
            idx += 1 #delimitation of the number
        # a separator ending the run is punctuation, not part of the number
        if word[idx-1] in [',', '.', ':']:
            return aux(word[:idx-1]) + word[idx-1] + custom_tokenizer(word[idx:])
        if (idx == len(word)-1) and (word[idx:] in ['h', 'H', 'q', 'Q']):
            #probably time (hours) or medical term (ex 22q)
            return aux(word[:idx]) + "-" + word[idx]
            
        if (idx<len(word)) and (re.search(r"\A(h|H|q|Q)\d+", word[idx:])):
            #probably time (hours and minutes) or medical term (ex 22q11)
            return aux(word[:idx]) + "-" + word[idx] + "-" + custom_tokenizer(word[idx+1:])    
            
        if (idx<len(word)) and (word[idx:] in ['j', 'e', 'er', 'ème', 'eme', 'aire', 'ième', 'ieme', 'ier', 'nd', 'nde', 'ière']):
            #probably rank
            return aux(word[:idx]) + "-" + word[idx:]    
        if (idx<len(word)) and (re.search(r"[a-z]|\+", word[idx])):
            #probably number + unit (12mm) or operation (ex: 36+2)
            return aux(word[:idx]) + " " + custom_tokenizer(word[idx:])    
        if (idx<len(word)) and (re.search(r"[A-Z]", word[idx])):
            #probably medical code (2P2)
            return aux(word[:idx]) + "-" + custom_tokenizer(word[idx:])    
        # no special rule: the number is simply followed by the processed rest
        return aux(word[:idx]) + custom_tokenizer(word[idx:])
    
    search = re.search(r"[A-Za-z]*m\d", word) #looks for m2 or cm3 etc.
    if search:
        # search.end()-1 is the index of the single digit following the 'm'
        return custom_tokenizer(word[:search.start()]) + word[search.start():search.end()-1] + '-' + aux(word[search.end()-1]) + custom_tokenizer(word[search.end():])

    if re.search(r"\A\-[A-Za-z]", word): #an item within a list
        return "- " + custom_tokenizer(word[1:])
        
    idx = 0
    while (idx<len(word)) and (not re.search(r"(\d)", word[idx])):
        idx += 1 #delimitation of the not number subsequence
    # from here on, idx is the index of the first digit (or len(word) if none)
    if (idx<len(word)) and (word[idx-1] in ['-', '/', '%']):
        #probably a unit (g/2jour), or a negative number or a range of values
        return word[:idx] + custom_tokenizer(word[idx:])

    if (idx<len(word)) and (re.search(r"[A-Z]", word[idx-1])):
        #probably a medical code G2P2
        return word[:idx] + "-" + custom_tokenizer(word[idx:])

    if idx<len(word):
        return word[:idx] + " " + custom_tokenizer(word[idx:])
    
    # no digit left: word[idx:] is empty and the word is returned as is
    return word[:idx] + custom_tokenizer(word[idx:])

In [7]:
# A number glued to a lowercase unit is separated from it by a space.
custom_tokenizer("123.32mmgH")

'1.233200e+02 mmgH'

In [8]:
# Slash-separated date: each numeric field is handled on its own, slashes are kept.
custom_tokenizer("14/04/2002")

'1.400000e+01/4.000000e+00/2.002000e+03'

In [9]:
# Dot-separated digit groups (phone-number style) are passed to aux as one run.
custom_tokenizer("01.49.56.98.30")

'1.000000e+00.4.900000e+01.5.600000e+01.9.800000e+01.3.000000e+01'

In [10]:
# Medical code: hyphens are inserted between uppercase letters and numbers.
custom_tokenizer("G2P2")

'G-2.000000e+00-P-2.000000e+00'

In [11]:
# Number followed by a suffix from the rank list ('j'): joined with a hyphen.
custom_tokenizer("2j")

'2.000000e+00-j'

In [12]:
# Hours and minutes: the 'h' is isolated between two hyphens.
custom_tokenizer("8h30")

'8.000000e+00-h-3.000000e+01'

In [13]:
# Number followed by a single trailing 'H': joined with a hyphen.
custom_tokenizer("4H")

'4.000000e+00-H'

In [14]:
# Same rule as hours and minutes, applied to a medical term with 'q'.
custom_tokenizer("22q11")

'2.200000e+01-q-1.100000e+01'

In [15]:
# Number + compound unit: digits following an 'm' (cm2, m3) are split off with a hyphen.
custom_tokenizer("32g/cm/cm2/m3")

'3.200000e+01 g/cm/cm-2.000000e+00/m-3.000000e+00'

In [16]:
# Colon-separated time: the whole run, colons included, is passed to aux.
custom_tokenizer("10:30:00")

'1.000000e+01:3.000000e+01:0.000000e+00'

In [17]:
# Ordinal suffix from the rank list: joined to the number with a hyphen.
custom_tokenizer("2ème")

'2.000000e+00-ème'

In [18]:
# Chemistry-style notation: handled by the dedicated 'o2|Cl2|cl2' rule.
custom_tokenizer("Spo2")

'Spo-2.000000e+00'

In [19]:
# Unit exponents on both sides of a slash.
custom_tokenizer('86mm2/m2')

'8.600000e+01 mm-2.000000e+00/m-2.000000e+00'

In [20]:
# Operation: the '+' ends up surrounded by spaces.
custom_tokenizer("36+2")

'3.600000e+01 + 2.000000e+00'

In [21]:
# A leading '-' followed by a letter is treated as a list bullet and split off.
custom_tokenizer("-G2P2")

'- G-2.000000e+00-P-2.000000e+00'

In [22]:
# Range with a percent sign: '-' and '%' stay attached to the numbers.
custom_tokenizer("100-115%")

'1.000000e+02-1.150000e+02%'

### Tokenize numbers

In [23]:
def _load_pickle(path):
    """Load and return the object pickled in the file `path`."""
    # NOTE(security): pickle.load can execute arbitrary code; only use it on
    # files that come from a trusted source.
    with open(path, "rb") as fp:   # Unpickling
        return pickle.load(fp)


def _retokenize_sample(sample):
    """
    Apply `custom_tokenizer` to every token of one sample and realign labels.

    Each token of ``sample['tokens']`` is rewritten by `custom_tokenizer` and
    split on whitespace. The first piece keeps the label of the original
    token; every additional piece receives label 0. A token that produces no
    piece at all (empty or whitespace-only) is dropped together with its
    label, so that 'tokens' and 'classes' keep the same length.

    Assumes ``sample['classes']`` is 1-D with one label per token — TODO confirm.

    Returns a new dict with the keys 'tokens', 'classes' and 'extracted_from'.
    """
    new_tokens = []
    new_classes = np.copy(sample['classes'])
    cursor = 0  # position in new_classes of the label of the current token
    for word in sample['tokens']:
        pieces = custom_tokenizer(word).split()
        if not pieces:
            # The token disappears: remove its label as well, otherwise every
            # following label would be shifted by one.
            new_classes = np.delete(new_classes, cursor)
            continue
        if len(pieces) > 1:
            # Labels of the extra pieces go right after the original label.
            new_classes = np.insert(new_classes, cursor + 1, [0] * (len(pieces) - 1))
        new_tokens.extend(pieces)
        cursor += len(pieces)

    return {'tokens': new_tokens, 'classes': new_classes, 'extracted_from': sample['extracted_from']}


test_ds = _load_pickle("test")
val_ds = _load_pickle("val")
train_ds = _load_pickle("train")

tokenized_test_ds = [_retokenize_sample(sample) for sample in test_ds]
tokenized_val_ds = [_retokenize_sample(sample) for sample in val_ds]
tokenized_train_ds = [_retokenize_sample(sample) for sample in train_ds]

In [73]:
# Write each re-tokenized split to its own pickle file.
for out_path, split_ds in (
    ("tokenized_test_2", tokenized_test_ds),
    ("tokenized_val_2", tokenized_val_ds),
    ("tokenized_train_2", tokenized_train_ds),
):
    with open(out_path, "wb") as fp:   #Pickling
        pickle.dump(split_ds, fp)

# Creation of the ComNumDataset

In [14]:
import random

# Size of the pool of integers from which the compared numbers are drawn.
len_dataset = 200_000
candidate_numbers = list(range(len_dataset))
# Receives the generated {'text': ..., 'label': ...} examples.
dataset = []

In [15]:
# Build the comparison dataset: every round produces one "est supérieur à" and
# one "est inférieur à" sentence from four distinct numbers of the pool, with
# the truth value of the sentence as label.
#
# All numbers are drawn in a single sample without replacement. This follows
# the same distribution as repeatedly sampling a pair and removing it from the
# pool, but it is linear instead of quadratic (list.remove scans the list) and
# it leaves `candidate_numbers` untouched, so the cell can be re-run.
pair_rng = random.Random(0)  # dedicated seeded generator: same dataset on every run
n_rounds = len_dataset // 4
drawn_numbers = pair_rng.sample(candidate_numbers, k=4 * n_rounds)

dataset = []  # rebuilt from scratch so that re-running the cell does not duplicate examples
for k in range(n_rounds):
    first, second, third, fourth = drawn_numbers[4 * k:4 * k + 4]

    text = aux(str(first/1000)) + ' est supérieur à ' + aux(str(second/1000)) + '.'
    dataset.append({'text': text, 'label': first > second})

    text = aux(str(third/1000)) + ' est inférieur à ' + aux(str(fourth/1000)) + '.'
    dataset.append({'text': text, 'label': third < fourth})

In [17]:
# Number of generated examples (two are appended per loop iteration).
len(dataset)

100000

In [18]:
# Random train/validation split: each example goes to the validation set with
# probability 0.2, otherwise to the training set.
split_rng = np.random.default_rng(0)  # seeded so that the split is reproducible
rand = split_rng.random(len(dataset))
mask_arr = (rand < 0.2)
mlm_val_ds = [example for example, in_val in zip(dataset, mask_arr) if in_val]
mlm_train_ds = [example for example, in_val in zip(dataset, mask_arr) if not in_val]

In [19]:
# Sizes of the training and validation parts of the random split.
print(len(mlm_train_ds), len(mlm_val_ds))

79855 20145


In [20]:
# Write the validation and training parts to their pickle files.
for out_path, split_ds in (("mlm_val_2", mlm_val_ds), ("mlm_train_2", mlm_train_ds)):
    with open(out_path, "wb") as fp:   #Pickling
        pickle.dump(split_ds, fp)

# Creation of the unlabeled tokenized dataset for MLM task

In [228]:
 # Load the MLM splits and apply the custom tokenizer to every sample.
# NOTE(security): pickle.load can execute arbitrary code; only use it on files
# that come from a trusted source.
def _tokenize_mlm_split(samples):
    """
    Return a tokenized copy of every sample of `samples`.

    Each sample is shallow-copied and its 'tokens' entry is replaced by the
    flat list of pieces obtained by passing every token through
    `custom_tokenizer` and splitting the result on whitespace. All other
    entries are kept as they are.
    """
    tokenized_samples = []
    for sample in samples:
        tokenized_sample = dict(sample)
        tokenized_sample['tokens'] = [
            piece
            for word in sample['tokens']
            for piece in custom_tokenizer(word).split()
        ]
        tokenized_samples.append(tokenized_sample)
    return tokenized_samples


with open("mlm_val", "rb") as fp:   # Unpickling
    val_ds = pickle.load(fp)

with open("mlm_train", "rb") as fp:   # Unpickling
    train_ds = pickle.load(fp)

mlm_tokenized_val_ds = _tokenize_mlm_split(val_ds)
mlm_tokenized_train_ds = _tokenize_mlm_split(train_ds)
# Validation samples first, then training samples (the same dict objects).
mlm_tokenized_global_ds = mlm_tokenized_val_ds + mlm_tokenized_train_ds

In [229]:
# Write the tokenized validation, training and combined sets to pickle files.
for out_path, split_ds in (
    ("mlm_tokenized_val", mlm_tokenized_val_ds),
    ("mlm_tokenized_train", mlm_tokenized_train_ds),
    ("mlm_tokenized_global", mlm_tokenized_global_ds),
):
    with open(out_path, "wb") as fp:   #Pickling
        pickle.dump(split_ds, fp)

In [230]:
# Total number of tokenized MLM samples (validation + training).
len(mlm_tokenized_global_ds)

25554

In [231]:
# Sizes of the tokenized training and validation sets.
print(len(mlm_tokenized_train_ds), len(mlm_tokenized_val_ds))

20442 5112
