## Inspect CoNLL-2003 data

In [1]:
from datasets import load_dataset

# Load the dataset registered under the name "conll2003" as a reference example;
# the output below shows it arrives with train, validation and test splits.
conll_data = load_dataset("conll2003")

  from .autonotebook import tqdm as notebook_tqdm
Downloading data: 100%|██████████| 1.23M/1.23M [00:03<00:00, 351kB/s]
Downloading data: 100%|██████████| 312k/312k [00:00<00:00, 378kB/s]
Downloading data: 100%|██████████| 283k/283k [00:00<00:00, 386kB/s]
Generating train split: 14041 examples [00:00, 698346.07 examples/s]
Generating validation split: 3250 examples [00:00, 584290.10 examples/s]
Generating test split: 3453 examples [00:00, 797650.04 examples/s]


In [2]:
# Show the DatasetDict summary: split names, feature columns and row counts.
conll_data

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [8]:
# Inspect the feature schema of the train split; the tag columns are sequences
# of ClassLabel values, which is the layout the i2b2 builder below mirrors for
# its `ner_tags` column.
conll_data['train'].features

{'id': Value(dtype='string', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'pos_tags': Sequence(feature=ClassLabel(names=['"', "''", '#', '$', '(', ')', ',', '.', ':', '``', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'], id=None), length=-1, id=None),
 'chunk_tags': Sequence(feature=ClassLabel(names=['O', 'B-ADJP', 'I-ADJP', 'B-ADVP', 'I-ADVP', 'B-CONJP', 'I-CONJP', 'B-INTJ', 'I-INTJ', 'B-LST', 'I-LST', 'B-NP', 'I-NP', 'B-PP', 'I-PP', 'B-PRT', 'I-PRT', 'B-SBAR', 'I-SBAR', 'B-UCP', 'I-UCP', 'B-VP', 'I-VP'], id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)}

In [13]:
# Experiment: load the i2b2 training file with the generic "text" builder.
test_i2b2 = load_dataset("text", data_files="../datasets/i2b2/PHI_Processed_data/train.txt")
# Conclusion: not useful for this data (presumably because the generic loader
# does not separate tokens from their tags -- TODO confirm), hence the custom
# builder defined below.

The dataset-preparation code below is adapted from the Hugging Face CoNLL-2003 loading script: https://huggingface.co/datasets/conll2003/blob/main/conll2003.py

## Preparing dataset

In [14]:
import os

import datasets


# Module-level logger obtained through the `datasets` logging helper; used by
# the builder to report which file is being parsed.
logger = datasets.logging.get_logger(__name__)


# Citation text handed to DatasetInfo by the builder. Placeholder for now.
_CITATION = """\
None right now
"""

# Short description handed to DatasetInfo by the builder.
_DESCRIPTION = """\
i2b2 2006
"""

In [15]:
# Paths of the i2b2 split files, relative to the directory the notebook runs in.
# The builder's _split_generators reads these constants to locate each split.
_TRAINING_FILE = "../datasets/i2b2/PHI_Processed_data/train.txt"
_DEV_FILE = "../datasets/i2b2/PHI_Processed_data/dev.txt"
_TEST_FILE = "../datasets/i2b2/PHI_Processed_data/test.txt"

In [16]:
class i2b2deid2006Config(datasets.BuilderConfig):
    """Builder configuration for the i2b2 2006 de-identification dataset.

    Adds no options of its own; every argument is handed to the parent
    ``datasets.BuilderConfig``.
    """

    def __init__(self, **kwargs):
        """Create the configuration.

        Args:
          **kwargs: keyword arguments passed through unchanged to
            ``datasets.BuilderConfig``.
        """
        super().__init__(**kwargs)

In [21]:
class i2b2deid2006(datasets.GeneratorBasedBuilder):
    """Dataset builder for the i2b2 2006 de-identification (PHI) NER corpus.

    The structure is adapted from the CoNLL-2003 loading script. Every
    generated example carries an ``id``, a list of ``tokens`` and a parallel
    list of ``ner_tags``.
    """

    BUILDER_CONFIGS = [
        i2b2deid2006Config(name="i2b2deid2006", version=datasets.Version("1.0.0"), description="i2b2 deid 2006 dataset"),
    ]

    def __init__(self,
                 *args,
                 cache_dir='./',
                 train_file="train.txt",
                 val_file="dev.txt",
                 test_file="test.txt",
                 ner_tags=("O",   "I-PHONE",   "I-PATIENT",   "I-LOCATION",   "I-ID",   "I-HOSPITAL",   "I-DOCTOR",   "I-DATE",   "B-PHONE",   "B-PATIENT",   "B-LOCATION",   "B-ID",   "B-HOSPITAL",   "B-DOCTOR",   "B-DATE", "B-AGE"),
                 **kwargs):
        """Store the tag set and file names, then initialise the base builder.

        Args:
          *args: positional arguments forwarded to the base builder.
          cache_dir: forwarded to the base builder as ``cache_dir``; defaults
            to the current directory.
          train_file: stored on the instance as ``_train_file``.
          val_file: stored on the instance as ``_val_file``.
          test_file: stored on the instance as ``_test_file``.
          ner_tags: iterable of tag strings used to build the ``ner_tags``
            ClassLabel feature.
          **kwargs: remaining keyword arguments forwarded to the base builder.
        """
        # Attributes are assigned before calling the base constructor,
        # presumably because it invokes _info(), which reads self._ner_tags
        # -- keep this ordering.
        self._ner_tags = ner_tags
        self._train_file = train_file
        self._val_file = val_file
        self._test_file = test_file
        super(i2b2deid2006, self).__init__(*args, cache_dir=cache_dir, **kwargs)

    def _info(self):
        """Return the DatasetInfo describing the feature schema.

        The ClassLabel names are the configured tags in sorted order, so the
        integer label ids follow the alphabetical order of the tag strings,
        not the order in which they were passed to ``__init__``.
        """
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "tokens": datasets.Sequence(datasets.Value("string")),
                    "ner_tags": datasets.Sequence(
                        datasets.features.ClassLabel(
                            names=sorted(list(self._ner_tags))
                        )
                    )
                }
            ),
            supervised_keys=None,
            homepage="",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager=None):
        """Returns SplitGenerators for the train, validation and test files.

        Args:
          dl_manager: download manager supplied by the ``datasets`` framework
            when it calls this hook. It is accepted so that call succeeds but
            is otherwise unused, since the split files are local paths. It
            defaults to ``None`` so that direct calls without arguments keep
            working.
        """
        # NOTE(review): the file names stored by __init__ (self._train_file,
        # self._val_file, self._test_file) are not consulted here; the splits
        # always come from the module-level path constants. Confirm which of
        # the two is intended.
        data_files = {
            "train": _TRAINING_FILE,
            "dev": _DEV_FILE,
            "test": _TEST_FILE,
        }

        return [
            datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": data_files["train"]}),
            datasets.SplitGenerator(name=datasets.Split.VALIDATION, gen_kwargs={"filepath": data_files["dev"]}),
            datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"filepath": data_files["test"]}),
        ]

    def _generate_examples(self, filepath):
        """Yield ``(key, example)`` pairs parsed from one split file.

        The file is read as UTF-8 text with one ``token tag`` pair per line,
        separated by a single space. A blank or whitespace-only line ends the
        current example. Keys are consecutive integers starting at 0.

        Args:
          filepath: path of the split file to read.

        Yields:
          Tuples of the integer key and a dict with ``id`` (the key as a
          string), ``tokens`` and ``ner_tags`` (parallel lists of strings).

        Raises:
          IndexError: if a non-blank line contains no space separator.
        """
        logger.info("⏳ Generating examples from = %s", filepath)
        with open(filepath, encoding="utf-8") as f:
            guid = 0
            tokens = []
            ner_tags = []
            for line in f:
                # Whitespace-only lines count as boundaries too; otherwise they
                # would be parsed as an empty token/tag pair or fail on the
                # tag lookup below.
                if not line.strip():
                    # Consecutive blank lines do not emit empty examples.
                    if tokens:
                        yield guid, {
                            "id": str(guid),
                            "tokens": tokens,
                            "ner_tags": ner_tags,
                        }
                        guid += 1
                        tokens = []
                        ner_tags = []
                else:
                    # i2b2 tokens are space separated
                    splits = line.split(" ")
                    tokens.append(splits[0])
                    # The second field carries the tag; rstrip drops the
                    # trailing newline when the tag is the last field.
                    ner_tags.append(splits[1].rstrip())
            # last example (file did not end with a blank line)
            if tokens:
                yield guid, {
                    "id": str(guid),
                    "tokens": tokens,
                    "ner_tags": ner_tags,
                }