# Preprocess for casimedicos CONLL files

This notebook preprocesses the CONLL files from `casimedicos` in order to normalize them to a more common CONLL format, where each sentence is grouped together (no blank lines in between words of the same sentence, only between sentences), and each paragraph (defined by the reset of the index counter) is separated by two blank lines. It also tokenizes (separates) common punctuation symbols: `'"!?(),;:[]><=.`

In [None]:
import re

from pathlib import Path
from tqdm.auto import tqdm

In [None]:
# Directory holding the input `<split>.conll` files; the converted
# `<split>_revisited.conll` files are written next to them.
DIR = Path('../data/casimedicos/')
# Names of the dataset splits to convert (one input file per split).
SPLITS = ['train', 'dev', 'test']

In [None]:
# A number (integer or decimal) immediately followed by a sentence-final dot,
# e.g. "12." or "3.5.".
_NUMBER_WITH_TRAILING_DOT_RE = re.compile(r"[0-9]+(?:\.[0-9]+)?\.")

# Punctuation characters that become tokens of their own. The capturing group
# makes `re.split` keep the delimiters in its result.
_PUNCTUATION_SPLIT_RE = re.compile(r'([,?:;~!#$%^&*()\[\]"\'><=.-])')


def process_line(line):
    """Parse one line of the input CONLL file and tokenize its word.

    A non-blank line must contain exactly five whitespace-separated columns:
    index, token, two ignored columns, and label.

    Args:
        line: A raw line read from the input file (trailing newline allowed).

    Returns:
        A dict with keys ``'idx'`` (int index from the first column),
        ``'label'`` (string from the last column) and ``'tokens'`` (list of
        strings the original token was split into). For a blank line,
        ``'idx'`` and ``'label'`` are ``None`` and ``'tokens'`` is empty.

    Raises:
        ValueError: If a non-blank line does not have exactly five columns or
            its first column is not an integer.
    """
    stripped = line.strip()
    if not stripped:
        # The line is empty
        return {'idx': None, 'label': None, 'tokens': []}

    idx, token, _, _, label = stripped.split()
    idx = int(idx)

    if _NUMBER_WITH_TRAILING_DOT_RE.fullmatch(token):
        # A number followed by a dot that may end a sentence: keep the number
        # (including any decimal part) intact and detach only the final dot.
        tokens = [token[:-1], "."]
    elif token.replace('.', '', 1).isdigit():
        # An integer or a float: the dot is a decimal separator, keep as is.
        tokens = [token]
    else:
        # Anything else is split around punctuation; drop the empty strings
        # `re.split` yields between adjacent delimiters.
        tokens = [tk for tk in _PUNCTUATION_SPLIT_RE.split(token) if tk != '']

    return {
        'idx': idx,
        'label': label,
        'tokens': tokens
    }

In [None]:
def _write_tokens(entry, start, out):
    """Write one output row per token of *entry*, numbering from *start*.

    Every token obtained from the same source line reuses that line's label.
    NOTE(review): this means a split 'B-*' token yields several 'B-*' rows;
    confirm whether downstream consumers expect 'I-*' for the continuation.

    Returns the index to use for the next token of the paragraph.
    """
    for idx, token in enumerate(entry['tokens'], start=start):
        print(f"{idx}\t{token}\t_\t_\t{entry['label']}", file=out)
    return start + len(entry['tokens'])


# Convert every split: read `<split>.conll`, write `<split>_revisited.conll`.
for split in SPLITS:
    src_path = DIR / f'{split}.conll'
    dst_path = DIR / f'{split}_revisited.conll'
    # Explicit UTF-8 so non-ASCII text does not depend on the platform's
    # locale-derived default encoding.
    with open(src_path, 'rt', encoding='utf-8') as fhi, \
            open(dst_path, 'wt', encoding='utf-8') as fho:
        # The loop works one line behind: a line is written only once the next
        # non-empty line is known, because the separator that follows it
        # depends on that next line.
        last_line = None
        paragraph_lines = 0  # Index of the next token within the current paragraph
        for line in tqdm(fhi):
            new_line = process_line(line)
            if not new_line['tokens']:
                continue  # Omit empty lines

            if last_line is None:
                last_line = new_line
                continue  # We cannot do anything without at least 2 lines

            paragraph_lines = _write_tokens(last_line, paragraph_lines, fho)

            if new_line['idx'] < last_line['idx']:
                # The source index went backwards: a new paragraph starts.
                paragraph_lines = 0
                print('', end='\n\n', file=fho)  # Two blank lines end a paragraph
            elif (
                last_line['tokens'][-1] == '.'
                and last_line['label'] != 'O'
                and last_line['label'] != new_line['label']
            ):
                # A dot closes the previous line, its label is not 'O' and it
                # differs from the next line's label (e.g. not two equal 'I-*'
                # labels): treat it as the end of a sentence.
                print('', file=fho)  # One blank line ends a sentence
            last_line = new_line

        # Flush the line still pending after the input is exhausted.
        if last_line is not None:
            _write_tokens(last_line, paragraph_lines, fho)
            print('', file=fho)