In [1]:
from datasets import load_dataset
from transformers import AutoModelForTokenClassification, AutoTokenizer

In [2]:
model = AutoModelForTokenClassification.from_pretrained("EMBO/sd-panelization-v2")
tokenizer = AutoTokenizer.from_pretrained("EMBO/sd-panelization-v2",
                                         add_prefix_space=True)

Downloading:   0%|          | 0.00/877 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.24G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/379 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/220k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [3]:
from transformers import pipeline

ds = load_dataset("EMBO/sd-nlp-v2", "PANELIZATION")["train"]

tokenizer = AutoTokenizer.from_pretrained("EMBO/sd-panelization-v2", model_max_length=512)

model = AutoModelForTokenClassification.from_pretrained("EMBO/sd-panelization-v2")

tagger = pipeline(task="token-classification", 
                     model=model, 
                     tokenizer=tokenizer,
                     device=0,
                     aggregation_strategy="none")

tokenizer_kwargs = {'padding':True,'truncation':True,'max_length':512,'return_tensors':'pt'}

Downloading:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

Downloading and preparing dataset source_data_nlp/PANELIZATION to /root/.cache/huggingface/datasets/EMBO___source_data_nlp/PANELIZATION/2.0.0/697847190b4f17eb8b2bf15419fdd4e8cacc32bc57734ea9909016d809b4eb55...


Downloading:   0%|          | 0.00/3.78M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset source_data_nlp downloaded and prepared to /root/.cache/huggingface/datasets/EMBO___source_data_nlp/PANELIZATION/2.0.0/697847190b4f17eb8b2bf15419fdd4e8cacc32bc57734ea9909016d809b4eb55. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
load_dataset("EMBO/sd-nlp-v2", "PANELIZATION")

W1004 13:14:50.943357 139662323529536 builder.py:532] Reusing dataset source_data_nlp (/root/.cache/huggingface/datasets/EMBO___source_data_nlp/PANELIZATION/2.0.0/697847190b4f17eb8b2bf15419fdd4e8cacc32bc57734ea9909016d809b4eb55)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['words', 'text', 'labels', 'tag_mask'],
        num_rows: 2648
    })
    test: Dataset({
        features: ['words', 'text', 'labels', 'tag_mask'],
        num_rows: 307
    })
    validation: Dataset({
        features: ['words', 'text', 'labels', 'tag_mask'],
        num_rows: 312
    })
})

In [5]:
from IPython.display import HTML as html_print

def color_string(s, color='black'):
    return "<text style=background-color:{};weight:b>{}</text>".format(color, s)
def normal_string(s, color='black'):
    return "<text style=color:{};weight:b>{}</text>".format(color, s)

def get_predicted_panels(example):
    generation = tagger(" ".join(example["words"]))
    list_ = []
    for item in generation:
        if item["entity"] == "B-PANEL_START":
            list_.append(item)
    return list_

def get_labeled_panels(example):
    chars = 0
    list_ = []
    for idx, i in enumerate(example["labels"]):
        word = example["words"][idx]
        if i == 1:
            list_.append({
                "index": idx,
                "start": chars
            })
        chars += len(word) + 1
    return list_

In [6]:
def show_example(idx):
    example = ds[idx]

    predictions = get_predicted_panels(example)
    labels = get_labeled_panels(example)
    text = " ".join(example["words"])

    chars = 0
    text = ""
    for word in example["words"]:
        if chars in [d['start'] for d in predictions if 'start' in d] and chars not in [d['start'] for d in labels if 'start' in d]:
            text += color_string(word, color='cyan')
        elif chars in [d['start'] for d in predictions if 'start' in d] and chars in [d['start'] for d in labels if 'start' in d]:
            text += color_string(word, color='lime')
        elif chars not in [d['start'] for d in predictions if 'start' in d] and chars in [d['start'] for d in labels if 'start' in d]:
            text += color_string(word, color='red')
        else:
            text += normal_string(word, color='black')
        text += " "
        chars += len(word) + 1
        
    return text, predictions, labels

    

## Visualizing the panelization task

Below can be shown the prediction of our machine learning model on the figure legend when the panelization task is applied. Color codes are as following:

* <font color='lime'>Green (True positive)</font> - Correct prediction
* <font color='red'>Red (False negative)</font>   - The prediction missed a panel start present in the labelled data
* <font color='cyan'>Cyan (False positive or bad labelling)</font>  - The prediction shows a positive panel not shown in the labelled data
* Black - true negatives :-)

In [42]:
import pandas as pd
ds = load_dataset("EMBO/sd-nlp-v2", "PANELIZATION")
temp_train_ds = ds["train"].to_pandas()
temp_test_ds = ds["test"].to_pandas()
temp_eval_ds = ds["validation"].to_pandas()
ds = ds["test"]

W1004 13:17:57.846580 139662323529536 builder.py:532] Reusing dataset source_data_nlp (/root/.cache/huggingface/datasets/EMBO___source_data_nlp/PANELIZATION/2.0.0/697847190b4f17eb8b2bf15419fdd4e8cacc32bc57734ea9909016d809b4eb55)


  0%|          | 0/3 [00:00<?, ?it/s]

In [41]:
from datasets import Dataset
ds = Dataset.from_pandas(temp_train_ds)

In [51]:
EXAMPLE = 3

In [52]:
ds[EXAMPLE]['text']

'IP3R1 silencing favors frequent changes in direction in immature DCs migrating in micro‐channelsAnalysis of shScramble (gray)‐, shIP3R(1,3)A (blue)‐, shIP3R(2,3)B (red)‐, and shIP3R(1,3)C (green)‐expressing DCs migrating in micro‐channels (n\xa0>\xa0100 cells per condition from 3 independent experiments for shIP3R(1,3)A and shIP3R(1,3)C and two independent experiments for shIP3R(2,3)B).Figure 3Analysis of shScramble (gray)‐, shIP3R(1,3) A (blue)‐, shIP3R(2,3) B (red)‐, and shIP3R(1,3) C (green)‐expressing DCs migrating in micro‐channels (n > 100 cells per condition from 3 independent experiments for shIP3R(1,3) A and shIP3R(1,3) C and two independent experiments for shIP3R(2,3) B).A Velocity fluctuations (ΔV/V0) of immature DCs migrating in micro‐channels. Boxes illustrate 10-90 percentiles of values, and whiskers represent the range of values. P‐values were calculated using a Kruskal-Wallis test. Analysis of shScramble (gray)‐, shIP3R(1,3) A (blue)‐, shIP3R(2,3) B (red)‐, and shIP3R(

In [53]:
text, predictions_transformer, labels = show_example(EXAMPLE)
print(len(tokenizer(ds[EXAMPLE]["words"], is_split_into_words=True)["input_ids"]))
html_print(text)

883


In [None]:
tokenizer.decode(tokenizer(ds[EXAMPLE]["text"])["input_ids"][512:])

In [1576]:
tagger(tokenizer.decode(tokenizer(ds[EXAMPLE]["text"])["input_ids"][512:]))

[]

In [1577]:
tokenizer.decode(tokenizer(ds[EXAMPLE]["text"])["input_ids"][512:])[114:120]

'ed, wi'

In [1578]:
panel_types = {0: "(a)_and_(a,b)",
               1: "(A)",
               2: "Figure XA_and_B",
               3: "(left)_(right)_labels_at_end",
               4: "(A)_labels_at_end",
               5: "Not clear for humans",
               6: "A or A,B",
               7: "No panels",
               8: "a, and a,b",
               9: "A)"}
panel_type_list = [0,0,0,1,2,2,0,1,0,3,0,0,1,4,5,6,2,7,1,1,
                   0,6,0,6,0,1,1,0,0,1,0,7,6,8,0,0,7,6,1,0,
                   7,8,8,1,0,1,0,8,8,9,4,6,7,7,0,7,7,6,9,0,
                   7,6,0,7,0,0,6,6,1,6,7,7,0,1,0,1,0,6,6,0,
                  0]

In [1579]:
total_good_labels = [5,6,2,4,7,5,6,3,3,1,8,3,2,7,2,7,5,1,2,2,
                     2,1,2,3,3,6,2,4,5,1,2,2,7,5,3,2,1,12,2,6,
                     1,3,4,2,0,2,2,7,4,7,3,2,4,4,4,4,1,5,5,4,
                     1,2,2,1,2,2,8,6,3,6,1,1,3,3,2,7,2,6,3,3,
                    9]

In [1580]:
total_labelled_panels = [5,6,2,4,4,5,5,5,3,1,8,3,2,7,3,8,5,1,2,0,
                         2,2,2,3,3,6,2,4,5,1,2,2,7,5,4,2,1,12,3,6,
                         1,2,4,2,0,2,2,4,4,7,3,2,4,4,4,4,1,4,5,4,
                        1,2,2,1,3,2,8,6,3,6,1,1,4,3,2,6,2,6,2,3,
                        7,]

In [1581]:
actual_panel_separations = [5,7,2,4,8,5,6,5,3,2,8,3,2,8,3,8,5,1,2,5,
                            2,2,3,3,3,6,2,4,5,4,2,2,7,5,4,2,1,12,3,6,
                            1,5,4,2,6,2,3,7,8,8,3,2,4,4,2,4,1,8,5,4,
                           1,2,3,3,2,2,8,6,3,5,1,1,6,3,2,7,5,6,3,3,
                           9]

In [1582]:
wrong_defined_labels = [0,1,0,0,3,0,1,0,0,2,0,0,1,0,3,0,0,0,0,6,
                        0,1,3,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,
                        0,4,0,0,0,0,1,3,4,1,0,0,0,0,2,0,0,4,0,0,
                        0,0,1,0,1,0,0,0,3,1,0,0,1,0,0,0,3,0,1,0,
                       2]

In [1583]:
len(panel_type_list), len(total_good_labels),len(total_labelled_panels),len(actual_panel_separations),len(wrong_defined_labels),

(81, 81, 81, 81, 81)

In [1584]:
special_examples = [9,13,14,29,38,46,54,74]
for idx in special_examples[::-1]:
    total_good_labels.pop(idx)
    total_labelled_panels.pop(idx)
    actual_panel_separations.pop(idx)
    wrong_defined_labels.pop(idx)

In [1585]:
len(total_good_labels),len(total_labelled_panels),len(actual_panel_separations),len(wrong_defined_labels),

(73, 73, 73, 73)

In [1586]:
sum(total_good_labels), sum(total_labelled_panels), sum(actual_panel_separations), sum(wrong_defined_labels)

(272, 264, 307, 45)

In [1587]:
print(f"% success with labeled data {100*sum(total_good_labels)/sum(total_labelled_panels)}")
print(f"% success with actual number of panels {100*sum(total_good_labels)/sum(actual_panel_separations)}")
print(f"% wrong defined labels from total panels {100*sum(wrong_defined_labels)/sum(actual_panel_separations)}")
print(f"% wrong defined labels from labels {100*sum(wrong_defined_labels)/sum(total_good_labels)}")

% success with labeled data 103.03030303030303
% success with actual number of panels 88.59934853420195
% wrong defined labels from total panels 14.657980456026058
% wrong defined labels from labels 16.544117647058822


In [1588]:
pattern = [["F","Immunofluorescence"],["H", "Bar"],["J",",","K"]] # Repeat this one
pattern = [["(","d", ")"],["(","e",")"]] 
#pattern = [["figf2Stable"]]
a = ds[EXAMPLE]["words"]
offsets = []
for b in pattern:
    offsets.append([(i, i+len(b)) for i in range(len(a)) if a[i:i+len(b)] == b][0])
offsets

[(229, 232), (316, 319)]

In [1589]:
label_list = ds[EXAMPLE]["labels"]
print(sum(label_list))
for offset in offsets:
    label_list[offset[0]] = 1
# label_list[371] = 0
print(sum(label_list))
temp_train_ds.iloc[EXAMPLE]["labels"] = np.array(label_list)
temp_train_ds.to_pickle("/app/data/modified_panelization_v2_labels_train.pkl")

7
9


In [57]:
pip install tensorflow

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting tensorflow
  Downloading tensorflow-2.10.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (578.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m578.1/578.1 MB[0m [31m99.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting astunparse>=1.6.0
  Downloading astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Collecting libclang>=13.0.0
  Downloading libclang-14.0.6-py2.py3-none-manylinux2010_x86_64.whl (14.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting wrapt>=1.11.0
  Downloading wrap

[0mNote: you may need to restart the kernel to use updated packages.


In [58]:
from transformers import Pipeline
from torch import Tensor
import torch
import tensorflow as tf
import numpy as np

2022-10-04 13:19:56.354102: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-04 13:19:56.582057: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-10-04 13:19:57.478167: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/compat/lib.real:/opt/conda/lib/python3.8/site-packages/torch/lib:/opt/conda/lib/python3.8/site-packages/torch_tensorrt/lib:/usr/local/cuda/compat/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64


In [59]:
from transformers.pipelines.base import ChunkPipeline
from transformers.pipelines.token_classification import (TokenClassificationArgumentHandler, 
                                                         TokenClassificationPipeline, AggregationStrategy )
from transformers.models.auto.modeling_auto import MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING
from typing import Optional, List, Tuple, Union, Any, Dict
from transformers.models.bert.tokenization_bert import BasicTokenizer
from math import ceil

In [66]:
class LongTextTokenClassificationPipeline(ChunkPipeline):
    """
    Named Entity Recognition pipeline using any `ModelForTokenClassification`. See the [named entity recognition
    examples](../task_summary#named-entity-recognition) for more information.
    Strings of any length can be passed. If they exceed `ModelForTokenClassification.config.max_position_embeddings` tokens,
    they will be divided into several parts text that will be passed to the `forward` method.
    The results will then be concatenated together and be sent back.
    *LongTextTokenClassificationPipeline* uses `offsets_mapping` and therefore is available only with `FastTokenizer`.
    The models that this pipeline can use are models that have been fine-tuned on a token classification task. See the
    up-to-date list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=token-classification).
    """    
    default_input_names = "sequences"

    def __init__(self, args_parser=TokenClassificationArgumentHandler(), *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.check_model_type(
            TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING
            if self.framework == "tf"
            else MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING
        )

        self._basic_tokenizer = BasicTokenizer(do_lower_case=False)
        self._args_parser = args_parser
        if not self.tokenizer.is_fast:
            raise TypeError(
            """LongTextTokenClassificationPipeline works only with fast tokenizers.
            Please choose a fast tokenizer."""
            )

    def _sanitize_parameters(
        self,
        ignore_labels=None,
        grouped_entities: Optional[bool] = None,
        ignore_subwords: Optional[bool] = None,
        aggregation_strategy: Optional[AggregationStrategy] = None,
        offset_mapping: Optional[List[Tuple[int, int]]] = None,
        stride: Optional[int] = None,
    ):

        preprocess_params = {}
        if offset_mapping is not None:
            preprocess_params["offset_mapping"] = offset_mapping

        postprocess_params = {}
        if grouped_entities is not None or ignore_subwords is not None:
            if grouped_entities and ignore_subwords:
                aggregation_strategy = AggregationStrategy.FIRST
            elif grouped_entities and not ignore_subwords:
                aggregation_strategy = AggregationStrategy.SIMPLE
            else:
                aggregation_strategy = AggregationStrategy.NONE

            if grouped_entities is not None:
                warnings.warn(
                    "`grouped_entities` is deprecated and will be removed in version v5.0.0, defaulted to"
                    f' `aggregation_strategy="{aggregation_strategy}"` instead.'
                )
            if ignore_subwords is not None:
                warnings.warn(
                    "`ignore_subwords` is deprecated and will be removed in version v5.0.0, defaulted to"
                    f' `aggregation_strategy="{aggregation_strategy}"` instead.'
                )

        if aggregation_strategy is not None:
            if isinstance(aggregation_strategy, str):
                aggregation_strategy = AggregationStrategy[aggregation_strategy.upper()]
            if (
                aggregation_strategy
                in {AggregationStrategy.FIRST, AggregationStrategy.MAX, AggregationStrategy.AVERAGE}
                and not self.tokenizer.is_fast
            ):
                raise ValueError(
                    "Slow tokenizers cannot handle subwords. Please set the `aggregation_strategy` option"
                    'to `"simple"` or use a fast tokenizer.'
                )
            postprocess_params["aggregation_strategy"] = aggregation_strategy
        if ignore_labels is not None:
            postprocess_params["ignore_labels"] = ignore_labels
            
            
        if stride is not None:
            if not isinstance(stride, int): 
                raise TypeError(
                    f"Strides must be of type `int`. {type(stride)} was given."
                )
            postprocess_params["stride"] = stride
            preprocess_params["stride"] = stride
            
        return preprocess_params, {}, postprocess_params
    
    def __call__(self, inputs: Union[str, List[str]], **kwargs):
        """
        Classify each token of the text(s) given as inputs.
        Args:
            inputs (`str` or `List[str]`):
                One or several texts (or one list of texts) for token classification.
        Return:
            A list or a list of list of `dict`: Each result comes as a list of dictionaries (one for each token in the
            corresponding input, or each entity if this pipeline was instantiated with an aggregation_strategy) with
            the following keys:
            - **word** (`str`) -- The token/word classified. This is obtained by decoding the selected tokens. If you
              want to have the exact string in the original sentence, use `start` and `stop`.
            - **score** (`float`) -- The corresponding probability for `entity`.
            - **entity** (`str`) -- The entity predicted for that token/word (it is named *entity_group* when
              *aggregation_strategy* is not `"none"`.
            - **index** (`int`, only present when `aggregation_strategy="none"`) -- The index of the corresponding
              token in the sentence.
            - **start** (`int`, *optional*) -- The index of the start of the corresponding entity in the sentence. Only
              exists if the offsets are available within the tokenizer
            - **end** (`int`, *optional*) -- The index of the end of the corresponding entity in the sentence. Only
              exists if the offsets are available within the tokenizer
        """

        _inputs, offset_mapping = self._args_parser(inputs, **kwargs)
        if offset_mapping:
            kwargs["offset_mapping"] = offset_mapping
            
        return super().__call__(inputs, **kwargs)

    def preprocess(self, sentence, offset_mapping=None, stride=0):
        truncation = False
        
        model_inputs = self.tokenizer(
            sentence,
            return_tensors=None,
            truncation=truncation,
            return_special_tokens_mask=True,
            return_offsets_mapping=self.tokenizer.is_fast,
        )

#         sentence_chunks = self._get_sentence_chunks(model_inputs["input_ids"], stride)
                        
        if offset_mapping:
            model_inputs["offset_mapping"] = offset_mapping
                    
        model_inputs["sentence"] = sentence
        
        idx_lookup = list(range(len(model_inputs["input_ids"])))[1:-1]
        first_token = 0
        bos_token = model_inputs["input_ids"][0]
        eos_token = model_inputs["input_ids"][-1]
        
        chunk_inputs = {}
        
        while first_token < len(idx_lookup):
            start = max(0,first_token-stride)
            end = min(start + self.model.config.max_length - 2, len(idx_lookup))
            
            chunk_inputs["input_ids"] = self._to_tensor(
                [bos_token] + model_inputs["input_ids"][1:-1][start:end] + [eos_token]
                )
            chunk_inputs["token_type_ids"] = self._to_tensor(
                [0] + model_inputs["token_type_ids"][1:-1][start:end] + [0]
                )
            chunk_inputs["attention_mask"] = self._to_tensor(
                [1] + model_inputs["attention_mask"][1:-1][start:end] + [1]
                )
            chunk_inputs["special_tokens_mask"] = self._to_tensor(
                [1] + model_inputs["special_tokens_mask"][1:-1][start:end] + [1]
                )
            chunk_inputs["offset_mapping"] = [(0,0)] + model_inputs["offset_mapping"][1:-1][start:end] + [(0,0)]
            chunk_inputs["chunk_sentence"] = tokenizer.decode(chunk_inputs["input_ids"][0])
            chunk_inputs["sentence"] = sentence
            
            first_token = end
                        
            yield {**chunk_inputs}
            
    def _forward(self, chunk_inputs: Dict[str, Any]) -> List[dict]:
        # Forward
        special_tokens_mask = chunk_inputs.pop("special_tokens_mask")
        offset_mapping = chunk_inputs.pop("offset_mapping", None)
        sentence = chunk_inputs.pop("sentence")
        chunk_sentence = chunk_inputs.pop("chunk_sentence")
        if self.framework == "tf":
            logits = self.model(chunk_inputs.data)[0]
        else:
            logits = self.model(**chunk_inputs)[0]

        return {
            "logits": logits,
            "special_tokens_mask": special_tokens_mask,
            "offset_mapping": offset_mapping,
            "sentence": sentence,
            "chunk_sentence": chunk_sentence,
            **chunk_inputs,
        }
    
    def postprocess(self, model_outputs: List[Dict[str, Any]], 
                    aggregation_strategy=AggregationStrategy.NONE, 
                    ignore_labels=None, 
                    stride=0):
        sentence = model_outputs[0]["sentence"]
        aggregated_tokenizer_outputs = tokenizer(sentence,
            return_tensors=self.framework,
            return_special_tokens_mask=True,
            return_offsets_mapping=self.tokenizer.is_fast,
        )
        input_ids = aggregated_tokenizer_outputs["input_ids"]
        offset_mapping = aggregated_tokenizer_outputs["offset_mapping"]
        special_tokens_mask = aggregated_tokenizer_outputs["special_tokens_mask"]
        
        logits = self._aggregate_chunk_outputs(model_outputs, stride)
        
        if ignore_labels is None:
            ignore_labels = ["O"]
        logits = logits.numpy()
        input_ids = input_ids[0]
        offset_mapping = offset_mapping[0] if offset_mapping is not None else None
        special_tokens_mask = special_tokens_mask[0].numpy()

        maxes = np.max(logits, axis=-1, keepdims=True)
        shifted_exp = np.exp(logits - maxes)
        scores = shifted_exp / shifted_exp.sum(axis=-1, keepdims=True)
        
        pre_entities = self.gather_pre_entities(
            sentence, input_ids, scores, offset_mapping, special_tokens_mask, aggregation_strategy
        )
        grouped_entities = self.aggregate(pre_entities, aggregation_strategy)
        print(grouped_entities)
        # Filter anything that is in self.ignore_labels
        entities = [
            entity
            for entity in grouped_entities
            if entity.get("entity", None) not in ignore_labels
            and entity.get("entity_group", None) not in ignore_labels
        ]
        return entities                
        
    def _to_tensor(self, inputs: List[Any]) -> Union[tf.Tensor, torch.tensor, np.ndarray]:
        if self.framework == "pt":
            return torch.tensor(inputs).unsqueeze(0)
        if self.framework == "tf":
            return tf.reshape(tf.convert_to_tensor(inputs), (1,-1))
        if self.framework == "np":
            return np.array(inputs).reshape(1,-1)

    def _aggregate_chunk_outputs(self, outputs: 
                                 List[Dict[str, Any]], 
                                 stride: int) -> Union[tf.Tensor, torch.tensor, np.ndarray]:
        """
        Change this to numpy or lits to save cuda space
        """
        for iter_, chunk_output in enumerate(outputs):
            is_first = (iter_ == 0)
            is_last = (iter_ == len(outputs)-1)
            if is_first:
                logits = chunk_output["logits"][0][:-1]
            elif is_last:
                logits = self._concat(logits, chunk_output["logits"][0][stride+1:])
            else:
                logits = self._concat(logits, chunk_output["logits"][0][stride+1:-1])
                
        return logits
            
    def _concat(self, 
                 t1: Union[tf.Tensor, torch.tensor, np.ndarray],
                 t2: Union[tf.Tensor, torch.tensor, np.ndarray],
                 axis: int  = 0
                ) -> Union[tf.Tensor, torch.tensor, np.ndarray]:
        if self.framework == "pt":
            concat = torch.concat([t1, t2], axis=axis)
        if self.framework == "tf":
            concat = tf.concat([t1, t2], axis=axis)
        if self.framework == "np":
            concat = np.concatenate([t1, t2], axis=axis)
        return concat
    
    def gather_pre_entities(
        self,
        sentence: str,
        input_ids: np.ndarray,
        scores: np.ndarray,
        offset_mapping: Optional[List[Tuple[int, int]]],
        special_tokens_mask: np.ndarray,
        aggregation_strategy: AggregationStrategy,
    ) -> List[dict]:
        """Fuse various numpy arrays into dicts with all the information needed for aggregation"""
        pre_entities = []
        for idx, token_scores in enumerate(scores):
            # Filter special_tokens, they should only occur
            # at the sentence boundaries since we're not encoding pairs of
            # sentences so we don't have to keep track of those.
            if special_tokens_mask[idx]:
                continue

            word = self.tokenizer.convert_ids_to_tokens(int(input_ids[idx]))
            if offset_mapping is not None:
                start_ind, end_ind = offset_mapping[idx]
                if not isinstance(start_ind, int):
                    if self.framework == "pt":
                        start_ind = start_ind.item()
                        end_ind = end_ind.item()
                    else:
                        start_ind = int(start_ind.numpy())
                        end_ind = int(end_ind.numpy())
                word_ref = sentence[start_ind:end_ind]
                if getattr(self.tokenizer._tokenizer.model, "continuing_subword_prefix", None):
                    # This is a BPE, word aware tokenizer, there is a correct way
                    # to fuse tokens
                    is_subword = len(word) != len(word_ref)
                else:
                    # This is a fallback heuristic. This will fail most likely on any kind of text + punctuation mixtures that will be considered "words". Non word aware models cannot do better than this unfortunately.
                    if aggregation_strategy in {
                        AggregationStrategy.FIRST,
                        AggregationStrategy.AVERAGE,
                        AggregationStrategy.MAX,
                    }:
                        warnings.warn("Tokenizer does not support real words, using fallback heuristic", UserWarning)
                    is_subword = start_ind > 0 and " " not in sentence[start_ind - 1 : start_ind + 1]

                if int(input_ids[idx]) == self.tokenizer.unk_token_id:
                    word = word_ref
                    is_subword = False
            else:
                start_ind = None
                end_ind = None
                is_subword = False

            pre_entity = {
                "word": word,
                "scores": token_scores,
                "start": start_ind,
                "end": end_ind,
                "index": idx,
                "is_subword": is_subword,
            }
            pre_entities.append(pre_entity)
        return pre_entities

    def aggregate(self, pre_entities: List[dict], aggregation_strategy: AggregationStrategy) -> List[dict]:
        if aggregation_strategy in {AggregationStrategy.NONE, AggregationStrategy.SIMPLE}:
            entities = []
            for pre_entity in pre_entities:
                entity_idx = pre_entity["scores"].argmax()
                score = pre_entity["scores"][entity_idx]
                entity = {
                    "entity": self.model.config.id2label[entity_idx],
                    "score": score,
                    "index": pre_entity["index"],
                    "word": pre_entity["word"],
                    "start": pre_entity["start"],
                    "end": pre_entity["end"],
                }
                entities.append(entity)
        else:
            entities = self.aggregate_words(pre_entities, aggregation_strategy)

        if aggregation_strategy == AggregationStrategy.NONE:
            return entities
        return self.group_entities(entities)

    def aggregate_word(self, entities: List[dict], aggregation_strategy: AggregationStrategy) -> dict:
        word = self.tokenizer.convert_tokens_to_string([entity["word"] for entity in entities])
        if aggregation_strategy == AggregationStrategy.FIRST:
            scores = entities[0]["scores"]
            idx = scores.argmax()
            score = scores[idx]
            entity = self.model.config.id2label[idx]
        elif aggregation_strategy == AggregationStrategy.MAX:
            max_entity = max(entities, key=lambda entity: entity["scores"].max())
            scores = max_entity["scores"]
            idx = scores.argmax()
            score = scores[idx]
            entity = self.model.config.id2label[idx]
        elif aggregation_strategy == AggregationStrategy.AVERAGE:
            scores = np.stack([entity["scores"] for entity in entities])
            average_scores = np.nanmean(scores, axis=0)
            entity_idx = average_scores.argmax()
            entity = self.model.config.id2label[entity_idx]
            score = average_scores[entity_idx]
        else:
            raise ValueError("Invalid aggregation_strategy")
        new_entity = {
            "entity": entity,
            "score": score,
            "word": word,
            "start": entities[0]["start"],
            "end": entities[-1]["end"],
        }
        return new_entity

    def aggregate_words(self, entities: List[dict], aggregation_strategy: AggregationStrategy) -> List[dict]:
        """
        Override tokens from a given word that disagree to force agreement on word boundaries.
        Example: micro|soft| com|pany| B-ENT I-NAME I-ENT I-ENT will be rewritten with first strategy as microsoft|
        company| B-ENT I-ENT
        """
        if aggregation_strategy in {
            AggregationStrategy.NONE,
            AggregationStrategy.SIMPLE,
        }:
            raise ValueError("NONE and SIMPLE strategies are invalid for word aggregation")

        word_entities = []
        word_group = None
        for entity in entities:
            if word_group is None:
                word_group = [entity]
            elif entity["is_subword"]:
                word_group.append(entity)
            else:
                word_entities.append(self.aggregate_word(word_group, aggregation_strategy))
                word_group = [entity]
        # Last item
        word_entities.append(self.aggregate_word(word_group, aggregation_strategy))
        return word_entities

    def group_sub_entities(self, entities: List[dict]) -> dict:
        """
        Group together the adjacent tokens with the same entity predicted.
        Args:
            entities (`dict`): The entities predicted by the pipeline.
        """
        # Get the first entity in the entity group
        entity = entities[0]["entity"].split("-")[-1]
        scores = np.nanmean([entity["score"] for entity in entities])
        tokens = [entity["word"] for entity in entities]

        entity_group = {
            "entity_group": entity,
            "score": np.mean(scores),
            "word": self.tokenizer.convert_tokens_to_string(tokens),
            "start": entities[0]["start"],
            "end": entities[-1]["end"],
        }
        return entity_group

    def get_tag(self, entity_name: str) -> Tuple[str, str]:
        if entity_name.startswith("B-"):
            bi = "B"
            tag = entity_name[2:]
        elif entity_name.startswith("I-"):
            bi = "I"
            tag = entity_name[2:]
        else:
            # It's not in B-, I- format
            # Default to I- for continuation.
            bi = "I"
            tag = entity_name
        return bi, tag

    def group_entities(self, entities: List[dict]) -> List[dict]:
        """
        Find and group together the adjacent tokens with the same entity predicted.
        Args:
            entities (`dict`): The entities predicted by the pipeline.
        """

        entity_groups = []
        entity_group_disagg = []

        for entity in entities:
            if not entity_group_disagg:
                entity_group_disagg.append(entity)
                continue

            # If the current entity is similar and adjacent to the previous entity,
            # append it to the disaggregated entity group
            # The split is meant to account for the "B" and "I" prefixes
            # Shouldn't merge if both entities are B-type
            bi, tag = self.get_tag(entity["entity"])
            last_bi, last_tag = self.get_tag(entity_group_disagg[-1]["entity"])

            if tag == last_tag and bi != "B":
                # Modify subword type to be previous_type
                entity_group_disagg.append(entity)
            else:
                # If the current entity is different from the previous entity
                # aggregate the disaggregated entity group
                entity_groups.append(self.group_sub_entities(entity_group_disagg))
                entity_group_disagg = [entity]
        if entity_group_disagg:
            # it's the last entity, add it to the entity groups
            entity_groups.append(self.group_sub_entities(entity_group_disagg))

        return entity_groups


In [73]:
SENTENCE = """Figure 2A. HEK293T cells were transfected with MYC-FOXP3 and FLAG-USP44 encoding expression constructs using Polyethylenimine. 48hrs post-transfection, cells were harvested, lysed, and anti-FLAG or anti-MYC antibody coated beads were used to immunoprecipitate the given labeled protein along with its binding partner. Co-IP' ed proteins were subjected to SDS PAGE followed by immunoblot analysis. Antibodies recognizing FLAG or MYC tags were used to probe for USP44 and FOXP3, respectively. B. Endogenous co-IP of USP44 and FOXP3 in murine iTregs. iTregs were generated as in Fig. 1 from naïve CD4+T cells FACS isolated from pooled suspensions of the lymph node and spleen cells of wild type C57BL/6 mice (n = 2-3 / experiment). iTregs were lysed and key proteins were immunoprecipitated using either anti-USP44 (right panel) or anti-FOXP3 (left panel) antibody. Proteins pulled-down in this experiment were then resolved and analyzed by immunoblot using anti-FOXP3 or anti-USP44 antibodies. C. Endogenous co-IP of USP44 and FOXP3 in murine nTregs. nTregs (CD4+CD25high) isolated by FACS were activated by anti-CD3 and anti-CD28 (1 and 4 ug/ml, respectively) overnight in the presence of IL-2 (100 U/ml). The cells were lysed and proteins were immunoprecipitated using either anti-Foxp3 (left panel) or anti-Usp44 (right panel). Proteins pulled down in this experiment were then resolved and identified with the indicated antibodies. D . Naïve murine CD4+T cells were isolated by FACS from lymph node and spleen cell suspension of USP44fl/fl CD4Cre+ mice and that of their wild type littermates (USP44fl/fl CD4Cre-mice; n = 2-3 / group / experiment) . iTreg cells were generated from these mice as described for Fig. 1 before incubation on a microscope slide pre-coated with poly-L lysine for 1h. Adhered cells were then fixed by PFA for 0.5 followed by blocking with 1% BSA for 1h, then incubation with the specified antibodies. Representative confocal microscopy images (40X) were visualized for endogenous USP44 (red) and FOXP3 Baxter et al (). DAPI was used to visualize cell nuclei (blue); scale bar 50μm."""
model = AutoModelForTokenClassification.from_pretrained("EMBO/sd-ner-v2")
pipe = LongTextTokenClassificationPipeline(task="token-classification", 
                     model=model, 
                     tokenizer=tokenizer,
                     device=0,
                     aggregation_strategy="simple")


outputs = pipe(SENTENCE, stride=50)
outputs

[{'entity_group': 'O', 'score': 0.9999321, 'word': 'figure 2a.', 'start': 0, 'end': 10}, {'entity_group': 'CELL', 'score': 0.9697927, 'word': 'hek293t', 'start': 11, 'end': 18}, {'entity_group': 'O', 'score': 0.9916782, 'word': 'cells were transfected with', 'start': 19, 'end': 46}, {'entity_group': 'GENEPROD', 'score': 0.9749185, 'word': 'myc', 'start': 47, 'end': 50}, {'entity_group': 'O', 'score': 0.99988616, 'word': '-', 'start': 50, 'end': 51}, {'entity_group': 'GENEPROD', 'score': 0.9739129, 'word': 'foxp3', 'start': 51, 'end': 56}, {'entity_group': 'O', 'score': 0.9999181, 'word': 'and', 'start': 57, 'end': 60}, {'entity_group': 'GENEPROD', 'score': 0.97236145, 'word': 'flag', 'start': 61, 'end': 65}, {'entity_group': 'O', 'score': 0.99988174, 'word': '-', 'start': 65, 'end': 66}, {'entity_group': 'GENEPROD', 'score': 0.9444879, 'word': 'usp44', 'start': 66, 'end': 71}, {'entity_group': 'O', 'score': 0.99732995, 'word': 'encoding expression constructs using', 'start': 72, 'end':

[{'entity_group': 'CELL',
  'score': 0.9697927,
  'word': 'hek293t',
  'start': 11,
  'end': 18},
 {'entity_group': 'GENEPROD',
  'score': 0.9749185,
  'word': 'myc',
  'start': 47,
  'end': 50},
 {'entity_group': 'GENEPROD',
  'score': 0.9739129,
  'word': 'foxp3',
  'start': 51,
  'end': 56},
 {'entity_group': 'GENEPROD',
  'score': 0.97236145,
  'word': 'flag',
  'start': 61,
  'end': 65},
 {'entity_group': 'GENEPROD',
  'score': 0.9444879,
  'word': 'usp44',
  'start': 66,
  'end': 71},
 {'entity_group': 'SMALL_MOLECULE',
  'score': 0.9096496,
  'word': 'polyethylenimine',
  'start': 109,
  'end': 125},
 {'entity_group': 'GENEPROD',
  'score': 0.9734517,
  'word': 'flag',
  'start': 190,
  'end': 194},
 {'entity_group': 'GENEPROD',
  'score': 0.9762305,
  'word': 'myc',
  'start': 203,
  'end': 206},
 {'entity_group': 'EXP_ASSAY',
  'score': 0.85167843,
  'word': 'immunoprecipitate',
  'start': 242,
  'end': 259},
 {'entity_group': 'EXP_ASSAY',
  'score': 0.91382194,
  'word': 'co 

In [227]:
len(tokenizer(SENTENCE)["offset_mapping"])

Token indices sequence length is longer than the specified maximum sequence length for this model (982 > 512). Running this sequence through the model will result in indexing errors


KeyError: 'offset_mapping'

In [72]:
tokenizer(["First sentence", "second sentence"], return_special_tokens_mask=True)

{'input_ids': [[2, 2389, 21011, 3], [2, 2702, 21011, 3]], 'token_type_ids': [[0, 0, 0, 0], [0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1], [1, 1, 1, 1]], 'special_tokens_mask': [[1, 0, 0, 1], [1, 0, 0, 1]]}

In [38]:
outputs[1]["logits"].shape

torch.Size([1, 512, 3])

In [39]:
outputs[2]["logits"].shape

torch.Size([1, 62, 3])

In [46]:
outputs[0]["logits"][0,510,:], outputs[0]["input_ids"][0,510]

(tensor([ 8.3776, -3.5248, -5.2105]), tensor(2181))

In [53]:
outputs[1]["logits"][0,50,:], outputs[1]["input_ids"][0,50]

(tensor([ 8.5788, -4.2457, -4.9862]), tensor(2181))

In [55]:
outputs[1]["logits"][0,510,:], outputs[1]["input_ids"][0,510]

(tensor([ 7.9096, -2.7037, -5.1575]), tensor(1816))

In [56]:
outputs[2]["logits"][0,50,:], outputs[2]["input_ids"][0,50]

(tensor([ 8.0859, -2.9973, -4.8152]), tensor(1816))

In [93]:
a = [[1,2]]
b = [[3,4]]

In [95]:
tf.concat([tf.convert_to_tensor(a), tf.convert_to_tensor(b)], axis=1)

<tf.Tensor: shape=(1, 4), dtype=int32, numpy=array([[1, 2, 3, 4]], dtype=int32)>

In [96]:
torch.concat([torch.tensor(a), torch.tensor(b)], axis=1)

tensor([[1, 2, 3, 4]])

In [98]:
np.concatenate([np.array(a), np.array(b)], axis=1)

array([[1, 2, 3, 4]])

In [79]:
offset_mapping = [[(0, 1), (1, 2)], [(0, 1), (1, 2)]]
isinstance(offset_mapping[0], list), len(offset_mapping[0]) > 1, not isinstance(offset_mapping[0][0], tuple)

(True, True, False)

In [80]:
len(offset_mapping)

2

In [81]:
not isinstance(offset_mapping[0][0], tuple)

False