# Preprocess

> Build LayoutLMv3-ready train and eval datasets from doc-scanner annotation exports.

In [1]:
#| default_exp preprocess

In [2]:
#| hide
from nbdev.showdoc import *

## Input

### Imports

In [3]:
#| export
import pandas as pd
import numpy as np
import os, argparse
from pathlib import Path
from datasets.features import ClassLabel
from transformers import AutoProcessor
from sklearn.model_selection import train_test_split
from datasets import Features, Sequence, ClassLabel, Value, Array2D, Array3D, Dataset
from datasets import Image as Img
from PIL import Image
import warnings
from typing import Union
#warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


### Functions

In [4]:
#| export

def read_text_file(file_path):
    """Return all lines of the text file at `file_path` as a list of strings.

    Each line keeps its trailing newline character, if it has one. The file
    is opened in text mode with the platform's default encoding.
    """
    with open(file_path, mode='r') as handle:
        lines = list(handle)
    return lines

def prepare_examples(examples):
    """Encode a batch of examples with the module-level `processor`.

    The image, token, bounding-box and label columns are looked up in
    `examples` through the module-level `*_column_name` variables. The
    processor is called with truncation enabled and padding set to
    "max_length", and its encoding is returned unchanged.
    """
    batch_images = examples[image_column_name]
    batch_words = examples[text_column_name]
    batch_boxes = examples[boxes_column_name]
    batch_labels = examples[label_column_name]
    return processor(
        batch_images,
        batch_words,
        boxes=batch_boxes,
        word_labels=batch_labels,
        truncation=True,
        padding="max_length",
    )

def get_zip_dir_name(data_directory: Union[str, Path]) -> Union[str, bool]:
    """Return the sub-directory name that prefixes every sub-directory name.

    Args:
        data_directory: Directory whose immediate sub-directories are inspected.

    Returns:
        The name of the sub-directory that all sub-directory names start
        with, or ``False`` when there are no sub-directories or no such
        common name exists.
    """
    data_path = Path(data_directory)
    # iterdir() yields entries in arbitrary order. Sorting makes the result
    # deterministic, and a name that prefixes all the others always sorts first.
    dir_list = sorted(f.name for f in data_path.iterdir() if f.is_dir())
    if not dir_list:
        # No sub-directories at all: nothing to detect (previously an IndexError).
        return False
    zip_dir_name = dir_list[0]
    if all(name.startswith(zip_dir_name) for name in dir_list):
        return zip_dir_name
    return False


def filter_out_unannotated(example):
    """Return True when `example` carries at least one tag other than 'O'.

    Tags are read from ``example['ner_tags']`` and compared against the id
    that the module-level `label2id` mapping assigns to the 'O' label. An
    example with no tags at all yields False.
    """
    for tag in example['ner_tags']:
        if tag != label2id['O']:
            return True
    return False

### Fetching files

In [5]:
#| export
# Share of the examples passed as `test_size` when the dataset is split.
TEST_SIZE = 0.33
# Parent of the working directory; all data paths are built from it.
PROJECT_HOME = Path('..')
INPUT_PATH = PROJECT_HOME / 'data' / 'doc-scanner'
OUTPUT_PATH = PROJECT_HOME / 'data' / 'preprocessed'
# Make sure the output directory (and any missing parents) exists up front.
OUTPUT_PATH.mkdir(exist_ok=True, parents=True)

In [14]:
# Sanity check: print the entries found directly under PROJECT_HOME.
print([entry for entry in PROJECT_HOME.iterdir()])

[Path('../.git'), Path('../.gitignore'), Path('../README.md'), Path('../settings.ini'), Path('../nbs'), Path('../.github'), Path('../LICENSE'), Path('../MANIFEST.in'), Path('../setup.py'), Path('../paper_solver'), Path('../nbdev-template-1.1.1'), Path('../.ipynb_checkpoints'), Path('../.gitconfig'), Path('../.gitattributes'), Path('../paper_solver.egg-info'), Path('../_proc'), Path('../data'), Path('../preprocessed')]


In [31]:
# Sanity check: (input exists, output exists).
tuple(path.exists() for path in (INPUT_PATH, OUTPUT_PATH))

(True, True)

In [7]:
#| export
# Load the three parallel annotation files into `files`:
#   'train'       - one line per token (later split on tabs into word and tag)
#   'train_box'   - one line per token (later split on tabs to get the box)
#   'train_image' - one line per token, ending with the image file name
files = {}
zip_dir_name = get_zip_dir_name(INPUT_PATH)

print('zip_dir_name', zip_dir_name)
if zip_dir_name:
    # The annotation files live in a sub-folder and share its name as a prefix.
    data_path = INPUT_PATH / zip_dir_name
    files['train_box']   = read_text_file(data_path / f'{zip_dir_name}_box.txt')
    files['train_image'] = read_text_file(data_path / f'{zip_dir_name}_image.txt')
    files['train']       = read_text_file(data_path / f'{zip_dir_name}.txt')
else:
    # Fallback: pick the files up from the current working directory by name.
    # `data_path` must be bound in this branch too, because image paths are
    # later built relative to it.
    # NOTE(review): INPUT_PATH may have been the intended directory here - confirm.
    data_path = Path('.')
    for f in data_path.iterdir():
        if f.suffix == '.txt' and 'box' in f.name:
            files['train_box'] = read_text_file(f)
        elif f.suffix == '.txt' and 'image' in f.name:
            files['train_image'] = read_text_file(f)
        elif f.suffix == '.txt' and 'labels' not in f.name:
            files['train'] = read_text_file(f)

# Fail with a clear message instead of a bare KeyError when a file was not found.
missing = [key for key in ('train', 'train_box', 'train_image') if key not in files]
if missing:
    raise FileNotFoundError(
        f'Could not find annotation file(s) for {missing} in {data_path}')

# The three files are indexed in parallel, so they must have the same length.
assert len(files['train']) == len(files['train_box']), \
    'token/tag file and box file have different line counts'
assert len(files['train_box']) == len(files['train_image']), \
    'box file and image file have different line counts'

zip_dir_name 5fe15b06-ee59-4461-9f88-505f3e4b2696


In [8]:
#| export
# Line counts of the loaded files, in the order they were stored in `files`.
print('Length of box, image and txt', [len(lines) for lines in files.values()])

Length of box, image and txt [1415, 1415, 1415]


In [9]:
#| export
# Map each image file name to the indices of the annotation rows that name it.
images = {}
for i, row in enumerate(files['train_image']):
    if row == '\n':
        # Blank lines carry no image name.
        continue
    # The image name is the last tab-separated field of the row.
    image_name = row.split('\t')[-1]
    row_indices = images.setdefault(image_name.replace('\n', ''), [])
    row_indices.append(i)

In [15]:
# Preview the first rows of the loaded files, one column per key of `files`.
# Requires the cell that populates `files` to have been run first.
pd.DataFrame(files).head()

NameError: name 'files' is not defined

In [11]:
# Show, for every image file name, the list of row indices collected for it.
pd.Series(images)

page_12_image_0.jpg    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...
page_11_image_0.jpg    [119, 120, 121, 122, 123, 124, 125, 126, 127, ...
page_10_image_0.jpg    [288, 289, 290, 291, 292, 293, 294, 295, 296, ...
page_9_image_0.jpg     [366, 367, 368, 369, 370, 371, 372, 373, 374, ...
page_8_image_0.jpg     [482, 483, 484, 485, 486, 487, 488, 489, 490, ...
page_7_image_0.jpg     [546, 547, 548, 549, 550, 551, 552, 553, 554, ...
page_6_image_0.jpg     [623, 624, 625, 626, 627, 628, 629, 630, 631, ...
page_5_image_0.jpg     [721, 722, 723, 724, 725, 726, 727, 728, 729, ...
page_4_image_0.jpg     [797, 798, 799, 800, 801, 802, 803, 804, 805, ...
page_3_image_0.jpg     [956, 957, 958, 959, 960, 961, 962, 963, 964, ...
page_2_image_0.jpg     [1062, 1063, 1064, 1065, 1066, 1067, 1068, 106...
page_1_image_0.jpg     [1204, 1205, 1206, 1207, 1208, 1209, 1210, 121...
dtype: object

## Creating dataset from files

In [12]:
#| export
# Build one entry per image: its words, NER tags, bounding-box strings and path.
words, bboxes, ner_tags, image_path = [], [], [], []
for image, rows in images.items():
    # Select exactly the rows recorded for this image. Slicing from the first
    # to the last index would also pull in blank lines or rows of other images
    # whenever the rows of one image are not contiguous.
    token_rows = [files['train'][i] for i in rows]
    box_rows = [files['train_box'][i] for i in rows]
    # Token rows are split on tabs: field 0 is the word, field 1 the tag.
    words.append([row.split('\t')[0].replace('\n', '') for row in token_rows])
    ner_tags.append([row.split('\t')[1].replace('\n', '') for row in token_rows])
    # Box rows are split on tabs: field 1 is the box string.
    bboxes.append([box.split('\t')[1].replace('\n', '') for box in box_rows])
    image_path.append(str(data_path/image))

In [13]:
# Show the image path built for each document.
pd.Series(image_path)

0     ../../papers/annotations/doc-scanner/5fe15b06-...
1     ../../papers/annotations/doc-scanner/5fe15b06-...
2     ../../papers/annotations/doc-scanner/5fe15b06-...
3     ../../papers/annotations/doc-scanner/5fe15b06-...
4     ../../papers/annotations/doc-scanner/5fe15b06-...
5     ../../papers/annotations/doc-scanner/5fe15b06-...
6     ../../papers/annotations/doc-scanner/5fe15b06-...
7     ../../papers/annotations/doc-scanner/5fe15b06-...
8     ../../papers/annotations/doc-scanner/5fe15b06-...
9     ../../papers/annotations/doc-scanner/5fe15b06-...
10    ../../papers/annotations/doc-scanner/5fe15b06-...
11    ../../papers/annotations/doc-scanner/5fe15b06-...
dtype: object

In [14]:
# Show the tag sequence of each document.
pd.Series(ner_tags)

0     [O, O, B-SUB-SUB-Q, I-SUB-SUB-Q, I-SUB-SUB-Q, ...
1     [O, O, B-Q, I-Q, I-Q, I-Q, I-Q, I-Q, I-Q, I-Q,...
2     [O, O, B-SUB-SUB-Q, I-SUB-SUB-Q, I-SUB-SUB-Q, ...
3     [O, O, B-SUB-Q, I-SUB-Q, I-SUB-Q, I-SUB-Q, I-S...
4     [O, O, B-CHART, I-CHART, I-CHART, I-CHART, I-C...
5     [O, O, B-SUB-SUB-Q, I-SUB-SUB-Q, I-SUB-SUB-Q, ...
6     [O, O, B-SUB-SUB-Q, I-SUB-SUB-Q, I-SUB-SUB-Q, ...
7     [O, O, B-SUB-SUB-Q, I-SUB-SUB-Q, I-SUB-SUB-Q, ...
8     [O, O, B-Q, I-Q, I-Q, I-Q, I-Q, I-Q, I-Q, I-Q,...
9     [O, O, B-SUB-Q, I-SUB-Q, I-SUB-Q, I-SUB-Q, I-S...
10    [O, O, B-SUB-SUB-Q, I-SUB-SUB-Q, I-SUB-SUB-Q, ...
11    [O, O, B-SUBJECT NAME, E-SUBJECT NAME, B-Q, E-...
dtype: object

In [15]:
# Show the individual tags of the first document.
pd.Series(ner_tags[0])

0                O
1                O
2      B-SUB-SUB-Q
3      I-SUB-SUB-Q
4      I-SUB-SUB-Q
          ...     
113            I-Q
114            I-Q
115            I-Q
116            I-Q
117            E-Q
Length: 118, dtype: object

### Creating features from raw data

In [16]:
#| export
def _load_rgb_image(path):
    """Open the image at `path` and return an RGB copy, closing the file handle."""
    with Image.open(path) as img:
        return img.convert("RGB")


# Sorted so that label ids are identical on every run. Iterating a set of
# strings directly gives an order that can change between interpreter runs,
# which would make label2id / id2label differ from one run to the next.
labels = sorted(set(tag for ner_tag in ner_tags for tag in ner_tag))
id2label = {idx: label for idx, label in enumerate(labels)}
label2id = {label: idx for idx, label in enumerate(labels)}

dataset_dict = {
    # Explicit strings, matching the string 'id' feature declared below.
    'id': [str(i) for i in range(len(words))],
    'tokens': words,
    # Each box string holds whitespace-separated integers.
    'bboxes': [[list(map(int, bbox.split())) for bbox in doc] for doc in bboxes],
    'ner_tags': [[label2id[tag] for tag in ner_tag] for ner_tag in ner_tags],
    'image': [_load_rgb_image(path) for path in image_path]
}

# Schema of the raw (not yet encoded) dataset.
features = Features({
    'id': Value(dtype='string', id=None),
    'tokens': Sequence(feature=Value(dtype='string', id=None), 
                       length=-1, id=None),
    'bboxes': Sequence(feature=Sequence(feature=Value(dtype='int64', id=None), 
                                        length=-1, id=None), 
                       length=-1, id=None),
    'ner_tags': Sequence(feature=ClassLabel(num_classes=len(labels), 
                                            names=labels, 
                                            names_file=None, id=None),
                         length=-1, id=None),
    'image': Img(decode=True, id=None)
})

In [17]:
#| export
full_data_set = Dataset.from_dict(dataset_dict, features=features)

# Fixed seed so the train/test assignment is the same on every run.
SPLIT_SEED = 42
dataset = full_data_set.train_test_split(test_size=TEST_SIZE, seed=SPLIT_SEED)
# Drop training documents whose tags are all 'O'.
dataset["train"] = dataset["train"].filter(filter_out_unannotated)
# apply_ocr=False: words and boxes are supplied by the caller.
processor = AutoProcessor.from_pretrained(
    "microsoft/layoutlmv3-base", apply_ocr=False)

# Raw columns, removed once the processor output replaces them.
column_names = dataset["train"].column_names
image_column_name = "image"
text_column_name = "tokens"
boxes_column_name = "bboxes"
label_column_name = "ner_tags"

# Schema of the encoded dataset produced by `prepare_examples`.
features = Features({
    'pixel_values': Array3D(dtype="float32", shape=(3, 224, 224)),
    'input_ids': Sequence(feature=Value(dtype='int64')),
    'attention_mask': Sequence(Value(dtype='int64')),
    'bbox': Array2D(dtype="int64", shape=(512, 4)),
    'labels': Sequence(ClassLabel(names=labels)),
})

train_dataset = dataset["train"].map(
    prepare_examples,
    batched=True,
    remove_columns=column_names,
    features=features,
)
eval_dataset = dataset["test"].map(
    prepare_examples,
    batched=True,
    remove_columns=column_names,
    features=features,
)
# NOTE(review): only the train split is switched to torch format; confirm
# whether eval_dataset should be formatted the same way.
train_dataset.set_format("torch")

                                                                                                                                                                                             

## Output

In [29]:
# Show the directory the splits are saved under.
OUTPUT_PATH

Path('../../preprocessed')

In [19]:
#| export
# Save the encoded train/eval splits and the raw split dataset under OUTPUT_PATH.
for split, folder in (
    (train_dataset, 'train_split'),
    (eval_dataset, 'eval_split'),
    (dataset, 'raw_data'),
):
    split.save_to_disk(OUTPUT_PATH/folder)

                                                                                                                                                                                             

In [16]:
#| hide
# Run nbdev's notebook export.
import nbdev
nbdev.nbdev_export()