Let us first replicate the information taken at `__init__` by the class `PreparatorTOKCL`.

In [1]:
from smtag.encoder import XMLEncoder
import os
from lxml.etree import fromstring, Element
from smtag.config import Config
from smtag.xml2labels import SourceDataCodes as sd
from smtag.utils import innertext
from smtag.xml2labels import CodeMap

In [2]:
config = Config(
    max_length=512,  # in tokens! # sentence-level: 64, abstracts/full fig captions 512 tokens
    from_pretrained="bert-base-cased",  # leave empty if training a language model from scratch
    model_type="Autoencoder",
    asynchr=True,  # we need ordered examples while async returns results in non deterministic way
    tokenizer="bert-base-cased",  # same checkpoint name as `from_pretrained`
)

In [6]:
source_dir_path = '/app/data/xml/sd_panels/'
dest_dir_path = '/app/data/json/sd_panels/'
max_length = config.max_length
subsets = ["train", "eval", "test"]
tokenizer = config.tokenizer
code_maps = [sd.ENTITY_TYPES, sd.GENEPROD_ROLES, sd.BORING, sd.PANELIZATION]
source_file_path = f"{os.path.join(source_dir_path, subsets[0])}.txt"
source_file_path

'/app/data/xml/sd_panels/train.txt'

For developing and debugging purposes we will select a single line instead of the entire file.

In [78]:
with open(source_file_path) as f:
    lines = f.readlines()

line = lines[9411]

xml_example = fromstring(line)

We test here the `_encode_example` function

In [79]:
xml_encoder = XMLEncoder(xml_example)
inner_text = innertext(xml_encoder.element)


`inner_text` represents the text of a given XML element. Using `code_maps` we can get the label for each character of `inner_text`.

In [80]:
for c, l in zip(inner_text, xml_encoder.encode(code_maps[0])['label_ids']):
    print(f"{c} -> {l}")

( -> None
E -> None
) -> None
  -> None
s -> None
i -> None
R -> None
N -> None
A -> None
  -> None
k -> None
n -> None
o -> None
c -> None
k -> None
d -> None
o -> None
w -> None
n -> None
  -> None
o -> None
f -> None
  -> None
P -> 2
a -> 2
r -> 2
k -> 2
i -> 2
n -> 2
  -> None
w -> None
a -> None
s -> None
  -> None
p -> None
e -> None
r -> None
f -> None
o -> None
r -> None
m -> None
e -> None
d -> None
  -> None
i -> None
n -> None
  -> None
S -> 4
H -> 4
- -> 4
S -> 4
Y -> 4
5 -> 4
Y -> 4
  -> None
c -> None
e -> None
l -> None
l -> None
s -> None
  -> None
f -> None
o -> None
l -> None
l -> None
o -> None
w -> None
e -> None
d -> None
  -> None
b -> None
y -> None
  -> None
2 -> None
5 -> None
  -> None
μ -> None
M -> None
  -> None
A -> None
A -> None
  -> None
f -> None
o -> None
r -> None
  -> None
2 -> None
  -> None
h -> None
o -> None
u -> None
r -> None
s -> None
  -> None
p -> None
r -> None
i -> None
o -> None
r -> None
  -> None
t -> None
o -> None
  -> None
f -> None

The next step is to group the characters into words. At the same time, the labels must be generated in such a way that each word is assigned exactly one label.

We do the same for each of the code maps, with the exception of `panel_start` which gets a bit of a different treatment.


In [81]:
for code_map in code_maps:
    # At this point we have a tag for each character.
    # It is here where I should put chars together into words
    words, label_words = [], []
    xml_encoded = xml_encoder.encode(code_map)
    if code_map.name != "panel_start":
        char_level_labels = xml_encoded['label_ids']
    else:
        char_level_labels = ["O"] * len(xml_encoded['label_ids'])
        offsets = xml_encoded["offsets"]
        for offset in offsets:
            char_level_labels[offset[0]] = "B-PANEL_START"
    print(code_map.name, char_level_labels)


entity_types [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 2, 2, 2, 2, 2, 2, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 4, 4, 4, 4, 4, 4, 4, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 2, 2, 2, 2, 2, None, None, None, None, None, None, None, None, None, None, 2, 2, 2, 2, 2, 2, None, None, None, No

In [82]:
sd.ENTITY_TYPES

<SourceDataCodes.ENTITY_TYPES: CodeMap(name='entity_types', mode='whole_entity', constraints=OrderedDict([(1, {'label': 'SMALL_MOLECULE', 'tag': 'sd-tag', 'attributes': {'entity_type': ['molecule']}}), (2, {'label': 'GENEPROD', 'tag': 'sd-tag', 'attributes': {'entity_type': ['geneprod', 'gene', 'protein']}}), (3, {'label': 'SUBCELLULAR', 'tag': 'sd-tag', 'attributes': {'entity_type': ['subcellular']}}), (4, {'label': 'CELL', 'tag': 'sd-tag', 'attributes': {'entity_type': ['cell']}}), (5, {'label': 'TISSUE', 'tag': 'sd-tag', 'attributes': {'entity_type': ['tissue']}}), (6, {'label': 'ORGANISM', 'tag': 'sd-tag', 'attributes': {'entity_type': ['organism']}}), (7, {'label': 'EXP_ASSAY', 'tag': 'sd-tag', 'attributes': {'category': ['assay']}})]))>

The next step is to go from text on a character level basis to text on a word-level basis. This is done through the method `_from_char_to_token_level_labels`

In [83]:
from typing import List


def _labels_to_iob2(code_map: CodeMap, words: List[str], labels: List) -> List:
    """
    Convert word-level label codes into the label strings of `code_map.iob2_labels`.

    Args:
        code_map (CodeMap): CodeMap whose `name` and `iob2_labels` attributes are read.
        words List[str]:    List of separated words (not used by the conversion itself).
        labels List:        One label per word: the string "O" or a value accepted by `int()`.

    Returns:
        List[str]           One output label per entry of `labels`.

    Notes:
        * For the code map named "panel_start" every word is labelled "O".
        * Otherwise "O" is passed through unchanged. A label that opens a run (first word,
          or different from the previous word's label) is looked up at index `2 * label`;
          a label that repeats the previous one is looked up at index `2 * label - 1`.
        * NOTE(review): presumably the run-opening index holds the "B-" tag and the other
          one the "I-" tag -- confirm against the ordering of `CodeMap.iob2_labels`.
    """
    if code_map.name == "panel_start":
        return ["O" for _ in labels]

    tags = []
    for position, current in enumerate(labels):
        if current == "O":
            tags.append(current)
            continue

        opens_run = position == 0 or labels[position - 1] != current
        offset = 0 if opens_run else 1
        tags.append(code_map.iob2_labels[int(current) * 2 - offset])

    return tags


def _from_char_to_token_level_labels(code_map: CodeMap, text: List[str], labels: List) -> tuple:
    """
    Group characters into words and derive one IOB2 label per word.

    A word is a maximal run of alphanumeric characters. A single space only separates
    words; any other non-alphanumeric character becomes a one-character word of its own.
    Every word takes the label of its first character.

    Args:
        code_map (CodeMap): CodeMap, each specifying the XML-to-code mapping of label codes
                            to specific combinations of tag name and attribute values.
        text List[str]:     The characters inside the text of the XML element
                            (a plain string is iterated the same way).
        labels List:        One label per character of `text`. They will be
                            a mix of integers and None.

    Returns:
        tuple               `(words, iob2_labels)`: the list of words and the list of
                            word-level labels in IOB2 format, both of the same length.
    """

    def _as_label(raw) -> str:
        # None marks "no entity" at character level and is spelled "O" at word level.
        return str(raw).replace("None", "O")

    word_level_words, word_level_labels = [], []
    word = ''
    word_label = None  # label of the first character of the word being built

    for i, char in enumerate(text):
        if char.isalnum():
            if not word:
                # Keep the complete label of the first character. Taking the first
                # character of a concatenated label string would cut a code such
                # as 12 down to "1".
                word_label = _as_label(labels[i])
            word += char
            continue

        # Any non-alphanumeric character closes the word in progress.
        if word:
            word_level_words.append(word)
            word_level_labels.append(word_label)
            word = ''

        # A space is only a separator; every other symbol is kept as its own word.
        if char != " ":
            word_level_words.append(char)
            word_level_labels.append(_as_label(labels[i]))

    # Flush the last word when the text ends with an alphanumeric character;
    # otherwise it would be silently dropped.
    if word:
        word_level_words.append(word)
        word_level_labels.append(word_label)

    word_level_iob2_labels = _labels_to_iob2(code_map, word_level_words, word_level_labels)
    assert len(word_level_words) == len(word_level_iob2_labels), "Length of labels and words not identical!"
    return word_level_words, word_level_iob2_labels


In [84]:
code_map = sd.ENTITY_TYPES
words, iob_labels = _from_char_to_token_level_labels(code_map, inner_text, xml_encoder.encode(code_map)["label_ids"])
for w,l in zip(words, iob_labels):
    print(w, l)

( O
E O
) O
siRNA O
knockdown O
of O
Parkin B-GENEPROD
was O
performed O
in O
SH B-CELL
- I-CELL
SY5Y I-CELL
cells O
followed O
by O
25 O
μM O
AA O
for O
2 O
hours O
prior O
to O
fixation O
and O
immunostaining B-EXP_ASSAY
with O
antibodies O
specific O
to O
TOM20 B-GENEPROD
( O
green O
) O
, O
Tollip B-GENEPROD
( O
red O
) O
and O
Cytochrome B-GENEPROD
c I-GENEPROD
( O
blue O
) O
. O
We O
observed O
Tollip B-GENEPROD
colocalisation B-EXP_ASSAY
with O
TOM20 B-GENEPROD
+ O
ve O
/ O
PDH B-SUBCELLULAR
- O
ve O
MDVs B-SUBCELLULAR
( O
denoted O
by O
arrowheads O
) O
, O
which O
was O
still O
maintained O
following O
Parkin B-GENEPROD
siRNA O
knockdown O
. O


 At this point we have the characters grouped into words. Note that we treat words as consecutive runs of alphanumeric characters; every non-alphanumeric character becomes a separate word of its own. This is fine because the tokenizer assigns a non-alphanumeric character the same id whether it stands alone or is attached to a word.

In [105]:
import json
data = []
with open('/data/json/sd_panels/train.jsonl') as f:
    for line in f:
        data.append(json.loads(line))

In [106]:
for (w, l) in zip(data[215]['words'], data[215]['label_ids']['entity_types']):
    print(w, l)

( O
A O
) O
Representative O
fluorescence B-EXP_ASSAY
images I-EXP_ASSAY
of O
Mock O
, O
SARS B-ORGANISM
- I-ORGANISM
CoV I-ORGANISM
- I-ORGANISM
2 I-ORGANISM
infected O
( O
Ctrl O
) O
, O
and O
Salmeterol B-SMALL_MOLECULE
- O
treated O
wells O
analyzed O
with O
the O
Multiwavelength O
Cell O
Scoring O
application O
in O
MetaXpress O
. O
Grayscales O
of O
the O
images B-EXP_ASSAY
were O
adjusted O
to O
enable O
direct O
comparison O
of O
the O
relative O
levels B-EXP_ASSAY
of O
fluorescence B-EXP_ASSAY
among O
the O
treatments O
: O
Segmentation O
images B-EXP_ASSAY
show O
how O
cells O
were O
segmented O
and O
identified O
as O
spike B-GENEPROD
positive O
. O
Purple O
, O
nuclei B-SUBCELLULAR
; O
cyan O
, O
spike B-GENEPROD
. O
Scale O
bar O
, O
100 O
μm O
. O


In [None]:
from transformers import AutoTokenizer, PreTrainedTokenizerFast

In [None]:
class RobertaTokenizerForListOfStrings:
    """
    Uses tokenizers that work with lists of strings
    to be able to properly run the roberta tokenizer.

    Calling the instance on a batch of pre-split sentences encodes them with
    `tokenizer`, decodes the result back into plain strings and finally
    encodes those strings with the "roberta-base" tokenizer.

    Arguments:
    ----------
    tokenizer: `PreTrainedTokenizerFast`
        Tokenizer used for the intermediate encode/decode round trip. When
        omitted, "bert-base-cased" is loaded on instantiation.
    """
    def __init__(self, tokenizer: PreTrainedTokenizerFast = None):
        self.roberta = AutoTokenizer.from_pretrained("roberta-base", is_pretokenized=True)
        # Load the default lazily: a default built in the signature would be
        # created once at class-definition time and shared by all instances.
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
        self.tokenizer = tokenizer

    def __call__(self, batch):
        """
        Arguments:
        ----------
        batch: list of sentences, each one given as a list of words.

        Returns the output of `self.roberta.batch_encode_plus` on the re-joined sentences.
        """
        encoded_batch = self.tokenizer.batch_encode_plus(batch, is_split_into_words=True)
        # Special tokens are skipped so that only the sentence text is re-encoded.
        batch_plain_string = self.tokenizer.batch_decode(encoded_batch["input_ids"], skip_special_tokens=True)
        roberta_batch = self.roberta.batch_encode_plus(batch_plain_string)
        return roberta_batch
        
        

In [None]:
SENTENCES  = ["This is just a simple test",
              "What about putting a-symbol like this?"]
SENTENCE_SPLIT = [sentence.split() for sentence in SENTENCES]

In [None]:
class_ = RobertaTokenizerForListOfStrings()
assert class_(SENTENCE_SPLIT) == roberta(SENTENCES), f"Something went wrong"

In [None]:
print(SENTENCES[-1])
encoded_batch = bert.batch_encode_plus([SENTENCES[-1].split()], is_split_into_words=True)
bert.batch_decode(encoded_batch["input_ids"], skip_special_tokens=True)


In [None]:
roberta = AutoTokenizer.from_pretrained("roberta-base", is_pretokenized=True, add_prefix_space=True)
bert = AutoTokenizer.from_pretrained("bert-base-cased")

In [None]:
roberta("Wha -t -about- pu-tting a - symbol like this? or here ? { } ")

In [None]:
roberta.decode([0, 14447, 102, 111, 90, 111, 9006, 12, 18829, 12, 90, 2577, 10, 111, 7648, 101, 42, 116, 50, 259, 17487, 25522, 35524, 1437, 2])

In [None]:
roberta("Wha -t -about- pu-tting a - symbol like this? or here ? { } ".split(), is_split_into_words=True)

In [None]:
roberta.decode()

In [None]:
roberta.decode([0, 653, 59, 2057, 10, 12, 7648, 101, 42, 116, 2])

In [None]:
bert("What about putting a- symbol like this?")

In [None]:
bert.decode([101, 1327, 1164, 4518, 170, 118, 5961, 1176, 1142, 136, 102], skip_special_tokens=True)

In [None]:
from datasets import load_dataset, DatasetDict

In [None]:
ds = load_dataset("EMBO/sd-nlp-non-tokenized")

In [None]:
tokenizer = AutoTokenizer.from_pretrained("roberta-base", is_pretokenized=True, add_prefix_space=True)

In [None]:
def _align_labels_with_tokens(labels, word_ids, word_labels=None):
    """
    Expand word-level label ids to one label id per token.

    Args:
        labels:      Label ids, indexed by word position.
        word_ids:    For every token, the index of the word it belongs to,
                     or None for a special token.
        word_labels: Unused. Optional so that both two- and three-argument
                     calls are accepted.

    Returns:
        List with one entry per element of `word_ids`: -100 for special tokens,
        the word's label for the first token of a word, and for the following
        tokens of the same word the word's label with odd ids raised by one.
    """
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id is None:
            # Special token
            current_word = None
            new_labels.append(-100)
        elif word_id != current_word:
            # Start of a new word!
            current_word = word_id
            new_labels.append(labels[word_id])
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

def _shift_label(label):
    """
    Return `label` raised by one when it is odd, unchanged otherwise.

    Odd ids are treated as B-XXX labels and are moved to the following id (I-XXX).
    """
    return label + 1 if label % 2 == 1 else label

def _tokenize_and_align_labels(examples) -> DatasetDict:
    """
    Tokenize a batch of pre-split words and spread the word-level labels over the tokens.

    Uses the notebook-level `tokenizer` and `_align_labels_with_tokens`.

    Args:
        examples: batch of data from a `datasets.DatasetDict`; its 'words' and
                  'labels' entries are read.

    Returns:
        The tokenizer output for `examples['words']` with two extra entries:
        'labels' (one label per token) and 'tag_mask' (0 where the token label
        is 0 and 1 for every other value, -100 included).
    """
    encoded = tokenizer(
        examples['words'],
        truncation=True,
        is_split_into_words=True,
        max_length=512,
    )

    batch_labels = examples['labels']
    aligned_batch, mask_batch = [], []
    for row, row_labels in enumerate(batch_labels):
        token_labels = _align_labels_with_tokens(row_labels, encoded.word_ids(row), batch_labels)
        aligned_batch.append(token_labels)
        mask_batch.append([0 if tag == 0 else 1 for tag in token_labels])

    encoded['labels'] = aligned_batch
    encoded['tag_mask'] = mask_batch

    return encoded


In [None]:
ds['train'][0]

In [None]:
tokenized_data = ds.map(
                _tokenize_and_align_labels,
                batched=True)

In [None]:
# Show, token by token, the word index, the aligned label, the input id and its text
# for one training example.
example = ds['train'][2]
words = example['words']
labels = example['labels']
tokenized_inputs = tokenizer(words,
                             truncation=True,
                             is_split_into_words=True)

# Compute the word ids and the aligned labels once instead of once per printed row.
word_ids = tokenized_inputs.word_ids()
# `_align_labels_with_tokens` declares a third parameter that its body never reads;
# `labels` is passed again only to satisfy the signature.
aligned_labels = _align_labels_with_tokens(labels, word_ids, labels)

for word_id, aligned_label, input_id in zip(word_ids, aligned_labels, tokenized_inputs['input_ids']):
    print(word_id,
          aligned_label,
          input_id,
          tokenizer.decode(input_id))
# new_labels.append(_align_labels_with_tokens(labels, word_ids))
# tag_mask.append([0 if tag == 0 else 1 for tag in new_labels[-1]])
# print(i, tokenized_inputs.word_ids(i), ds['train'][0]['words'][i])


In [None]:
# Fetch the example once; indexing the dataset inside the loop would rebuild the row every time.
tokenized_example = tokenized_data['train'][2]
for label, input_id in zip(tokenized_example["labels"], tokenized_example['input_ids']):
    print(label, tokenizer.decode(input_id))
                        

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", is_pretokenized=True, add_prefix_space=False)
tokenized_data = ds.map(
                _tokenize_and_align_labels,
                batched=True)
# Labels and input ids must come from the same example to line up token by token.
tokenized_example = tokenized_data['train'][2]
for label, input_id in zip(tokenized_example["labels"], tokenized_example['input_ids']):
    print(label, tokenizer.decode(input_id))


In [None]:
for i in range(len(ds['train'][0]["labels"])):
    print(ds['train'][0]["words"][i], ds['train'][0]['labels'][i])


In [None]:
ds['train'][0].keys()

In [None]:
tokenizer = AutoTokenizer.from_pretrained('roberta-base', 
                                          use_fast=True, 
                                          is_pretokenized=True, 
                                          add_prefix_space=True)
enc = tokenizer(ds['train'][0]['words'], is_split_into_words=True)
enc

In [None]:
enc.word_ids()

In [None]:
enc.word_to_tokens(2)

In [7]:
class test1:
    """Toy base class that stores four integer keyword arguments as attributes."""

    def __init__(self, arg_1_1: int = 1, arg_1_2: int = 2, arg_1_3: int = 3, arg_1_4: int = 4):
        # Each argument is kept under an attribute of the same name.
        self.arg_1_1, self.arg_1_2 = arg_1_1, arg_1_2
        self.arg_1_3, self.arg_1_4 = arg_1_3, arg_1_4
        
class test2(test1):
    """Toy subclass: keeps `arg_2` as `self.arg2` and forwards all other keywords to `test1`."""

    def __init__(self, arg_2=10, **kw):
        self.arg2 = arg_2
        # Remaining keyword arguments are handled by the parent initializer.
        super().__init__(**kw)
    
t2 = test2(arg_1_1=3)

In [8]:
t2.arg_1_1

3

In [10]:
from ray.tune import CLIReporter

type(CLIReporter())

ray.tune.progress_reporter.CLIReporter