In [1]:
from datasets import load_dataset, load_from_disk
import torch

In [3]:
ds_receipts = load_dataset("./receiptdataset")

Using custom data configuration default


Downloading and preparing dataset receipt_dataset/default to /Users/simon/.cache/huggingface/datasets/receipt_dataset/default/0.0.0/833bfe31b21ca25ef9949979860f6a4ecc110542cf37b7fed0ad630270350823...


0 examples [00:00, ? examples/s]

Dataset receipt_dataset downloaded and prepared to /Users/simon/.cache/huggingface/datasets/receipt_dataset/default/0.0.0/833bfe31b21ca25ef9949979860f6a4ecc110542cf37b7fed0ad630270350823. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [3]:
ds_receipts.save_to_disk("../datasets/ds_receipts_base")

In [154]:
ds_receipts = load_from_disk("../datasets/ds_receipts_base")
ds_receipts

DatasetDict({
    train: Dataset({
        features: ['file_id', 'input_image', 'company', 'date', 'address', 'total'],
        num_rows: 725
    })
})

In [4]:
from transformers import LayoutLMv2FeatureExtractor, LayoutXLMTokenizer, LayoutXLMProcessor

feature_extractor_xlm = LayoutLMv2FeatureExtractor()
tokenizer_xlm = LayoutXLMTokenizer.from_pretrained("microsoft/layoutxlm-base")
#processor_xlm = LayoutXLMProcessor(feature_extractor_xlm, tokenizer_xlm)

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/904 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LayoutLMv2Tokenizer'. 
The class this function is called from is 'LayoutXLMTokenizer'.


In [5]:
def extract_features(examples):
    features_dict = feature_extractor_xlm(examples['input_image'])
    features_dict['pixel_values'] = features_dict['pixel_values'][0]
    return features_dict

In [6]:
num_proc = 8
ds_receipts = ds_receipts.map(extract_features, batched=False, num_proc=num_proc)

In [7]:
num_proc = 8
ds_receipts = ds_receipts.filter(lambda examples: len(examples['words'][0])>0, batched=False, num_proc=num_proc)

In [8]:
ds_receipts.save_to_disk("../datasets/ds_receipts_features")

Flattening the indices:   0%|          | 0/1 [00:00<?, ?ba/s]

In [189]:
ds_receipts = load_from_disk("../datasets/ds_receipts_features")
ds_receipts

DatasetDict({
    train: Dataset({
        features: ['file_id', 'input_image', 'company', 'date', 'address', 'total', 'pixel_values', 'words', 'boxes'],
        num_rows: 717
    })
})

In [10]:
label_names = ['company', 'date', 'address', 'total']
labels = ['O'] + label_names
num_labels = len(labels)
ids_to_labels = {k: v for k, v in enumerate(labels)}
labels_to_ids = {v: k for k, v in enumerate(labels)}

In [11]:
def get_word_labels(examples):
    word_labels = [0]*len(examples['words'][0])
    for label in label_names[1:]:
        key_words = set(examples[label].split())
        for i, word in enumerate(examples['words'][0]):
            if word in key_words:
                word_labels[i] = labels_to_ids[label]
    return {'word_labels': [word_labels]}

In [12]:
num_proc = 8
ds_receipts = ds_receipts.map(get_word_labels, batched=False, num_proc=num_proc)

In [13]:
max_length = 512
def tokenize(examples):
    tokenizer_dict = tokenizer_xlm(
        text=examples['words'],
        word_labels=examples['word_labels'],
        boxes=examples['boxes'],
        is_split_into_words=True,
        padding='max_length',
        max_length=max_length,
        truncation=True,
        return_tensors='pt')
    tokenizer_dict['input_ids'] = tokenizer_dict['input_ids'][0]
    tokenizer_dict['bbox'] = tokenizer_dict['bbox'][0]
    tokenizer_dict['labels'] = tokenizer_dict['labels'][0]
    tokenizer_dict['attention_mask'] = tokenizer_dict['attention_mask'][0]
    return tokenizer_dict

In [14]:
num_proc = 8
ds_receipts = ds_receipts.map(tokenize, batched=False, num_proc=num_proc)

In [15]:
ds_receipts = ds_receipts.rename_column('pixel_values', 'image')
col_names_to_remove = label_names + ['words', 'boxes', 'word_labels', 'file_id', 'input_image']
ds_receipts = ds_receipts.remove_columns(col_names_to_remove)

In [16]:
test_share = 0.1
ds_receipts = ds_receipts['train'].train_test_split(test_share)

In [197]:
ds_receipts.save_to_disk("../datasets/ds_receipts_tokenized")

Flattening the indices:   0%|          | 0/1 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/1 [00:00<?, ?ba/s]

In [198]:
ds_receipts = load_from_disk("../datasets/ds_receipts_tokenized")
ds_receipts['train'].features

{'image': Sequence(feature=Sequence(feature=Sequence(feature=Value(dtype='uint8', id=None), length=-1, id=None), length=-1, id=None), length=-1, id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'bbox': Sequence(feature=Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), length=-1, id=None),
 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [17]:
from datasets import Features, Array2D, Array3D, Sequence, Value
max_length = 512
features = Features({
    'image': Array3D(dtype="uint8", shape=(3, 224, 224)),
    'input_ids': Sequence(feature=Value(dtype="int32"), length=max_length),
    'bbox': Array2D(dtype="int64", shape=(max_length, 4)),
    'labels': Sequence(feature=Value(dtype="int64"), length=max_length),
    'attention_mask': Sequence(feature=Value(dtype="int8"), length=max_length),
})

In [18]:
ds_receipts = ds_receipts.cast(features)

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

TypeError: Couldn't cast array of type
list<item: int32>
to
Sequence(feature=Value(dtype='int32', id=None), length=512, id=None)

In [19]:
ds_receipts.save_to_disk("../datasets/ds_receipts_final")

Flattening the indices:   0%|          | 0/1 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/1 [00:00<?, ?ba/s]

In [8]:
ds_receipts = load_from_disk("../datasets/ds_receipts_final")
ds_receipts['train'].features

{'image': Array3D(shape=(3, 224, 224), dtype='uint8', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=512, id=None),
 'bbox': Array2D(shape=(512, 4), dtype='int64', id=None),
 'labels': Sequence(feature=Value(dtype='int64', id=None), length=512, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=512, id=None)}

In [20]:
ds_receipts.push_to_hub("sibrun/receipts", private=True)

Pushing split train to the Hub.
The repository already exists: the `private` keyword argument will be ignored.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split test to the Hub.
The repository already exists: the `private` keyword argument will be ignored.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

In [203]:
print(ds_receipts['train']['input_ids'][0])

[0, 94536, 92717, 9713, 159, 51671, 335, 16291, 19107, 101800, 1837, 9, 594, 16, 54133, 60614, 162607, 74604, 276, 82302, 201, 11033, 4, 21951, 2975, 11, 1702, 64, 10837, 52063, 9292, 56049, 120689, 714, 4, 1112, 10700, 14318, 22756, 34712, 9344, 152, 5098, 10342, 1530, 2489, 134131, 5, 77153, 2819, 5098, 10342, 1530, 39425, 12975, 159424, 106, 397, 12, 84453, 7709, 4235, 156215, 192616, 101, 5, 101, 212430, 10931, 618, 124090, 12, 6, 161414, 20, 4, 45334, 758, 65619, 31, 7709, 11062, 468, 2819, 86565, 186898, 25512, 292, 74, 953, 64024, 3259, 5098, 6632, 12, 4828, 20266, 73879, 6815, 152, 14045, 36053, 128780, 276, 18148, 20, 19452, 841, 25019, 382, 65040, 97259, 119527, 176830, 787, 2525, 1892, 6463, 78426, 6, 108210, 116, 33867, 6, 82683, 162856, 214, 3145, 20314, 674, 89678, 1126, 33867, 24372, 162856, 214, 6, 82683, 6, 58745, 73879, 4068, 6463, 313, 24338, 11679, 1892, 6463, 159424, 131182, 2022, 18, 110823, 141, 159424, 132, 25656, 16, 87698, 132, 25656, 16, 190, 2203, 91739, 139

In [204]:
print(ds_receipts['train']['labels'][0])

[-100, 0, 0, -100, 0, -100, 0, -100, 0, -100, -100, -100, -100, -100, 0, -100, 0, 3, 3, -100, 3, -100, -100, 0, 0, -100, 0, -100, -100, 0, 0, 0, 0, 3, -100, 3, -100, 0, 0, 0, 0, 0, 0, -100, -100, -100, -100, -100, 0, 0, 0, -100, -100, -100, -100, 0, 0, -100, -100, -100, -100, -100, -100, -100, 0, -100, 0, 0, -100, -100, 0, -100, 0, -100, 0, -100, 0, -100, -100, -100, -100, 0, 0, 0, 0, -100, 0, 0, -100, 0, -100, -100, 0, -100, -100, 0, -100, 0, -100, 0, 0, -100, -100, 0, -100, 0, 0, -100, -100, 0, -100, -100, -100, -100, 0, -100, 0, -100, 0, 0, -100, 0, 0, 4, -100, 0, -100, 0, -100, -100, 0, -100, 0, 0, 0, -100, 4, -100, 0, -100, 0, 0, -100, 0, -100, -100, 0, -100, 0, 0, 0, -100, 0, -100, 0, -100, -100, -100, 0, -100, -100, -100, 0, 0, 0, 0, -100, 0, -100, 0, 0, 0, 0, -100, 0, -100, 0, 0, -100, 0, 0, -100, 0, -100, -100, -100, -100, 0, 0, -100, -100, 0, 0, -100, -100, 0, -100, 0, 0, -100, 0, -100, 0, -100, 0, -100, -100, -100, -100, -100, -100, -100, 0, 0, -100, 2, -100, -100, 0, -100, 

In [205]:
print(ds_receipts['train']['attention_mask'][0])

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 