# Preprocess for PICO files

This notebook processes the files that result from running the [`build_data.py`](https://github.com/bepnye/EBM-NLP/blob/master/models/lstm-crf/build_data.py) script from the [EBM-NLP](https://github.com/bepnye/EBM-NLP/tree/master) repository. It takes the column format built by that script and converts it into a format compatible with the one we use here.

In [None]:
from pathlib import Path
from tqdm.auto import tqdm

In [None]:
# Directory holding the `<split>.txt` inputs; the `<split>.conll` outputs are
# written to the same place.
INPUT_DIR = Path('../data/pico/')

# Dataset splits to convert, one input file per entry.
SPLITS = [
    'train',
    'test',
    'dev',
]

# Raw tag found in the input files -> label name used in the output.
# 'O' marks tokens that do not belong to any labelled span.
LABELS_MAP = {
    'N': 'O',
    '1_i': 'Interventions',
    '1_o': 'Outcomes',
    '1_p': 'Participants',
}

In [None]:
# Convert every split from the three-column input (`token POS label`) into a
# tab-separated, BIO-tagged `<split>.conll` file written next to the input.
for split in SPLITS:
    source_path = INPUT_DIR / f'{split}.txt'
    target_path = INPUT_DIR / f'{split}.conll'

    # Pin the encoding so the conversion does not depend on the platform's
    # locale default (assumes the input files are UTF-8).
    with open(source_path, 'rt', encoding='utf-8') as fhi, \
            open(target_path, 'wt', encoding='utf-8') as fho:
        # `last_label` is None until the first token of a paragraph has been
        # seen; this suppresses blank output lines before any content.
        last_label = None
        # Running token index; it restarts only on a new paragraph, not on
        # sentence breaks.
        paragraph_lines = 0

        for line in tqdm(fhi):
            splitted_line = line.strip().split()

            if len(splitted_line) < 3 and last_label is not None:
                # Empty (or too short) line: sentence boundary
                print('', file=fho)
                last_label = 'O'  # This is to avoid continuation of labels across multiple sentences
            elif len(splitted_line) > 3 and last_label is not None:
                # Line with more than three fields (the DOCSTART marker):
                # start a new paragraph. Only one blank line is written here;
                # a second one is present only if the previous sentence was
                # already closed by an empty input line.
                print('', file=fho)
                last_label = None
                paragraph_lines = 0
            elif len(splitted_line) == 3:
                # We have a token line
                token, pos, label = splitted_line
                label = LABELS_MAP[label]  # KeyError on an unknown raw tag
                if label != 'O':
                    # A span begins whenever the label differs from the
                    # previous token's label; otherwise it continues.
                    bio_label = f'B-{label}' if last_label != label else f'I-{label}'
                else:
                    bio_label = label
                print(f"{paragraph_lines}\t{token}\t{pos}\t_\t{bio_label}", file=fho)
                paragraph_lines += 1
                last_label = label