In [None]:
!pip install datasets
!pip install transformers
!pip install evaluate
!pip install -q datasets seqeval

In [None]:
from datasets import load_dataset

# Load the dataset by its repository id; the "train" and "test" splits are used below.
dataset = load_dataset(path='SotiriosKastanas/try_groto_new')

In [None]:
from transformers import AutoProcessor

# The Auto API resolves the processor class from the checkpoint name
# (LayoutLMv3Processor for this checkpoint).
# OCR is switched off: `prepare_examples` passes the words and boxes from the
# dataset columns explicitly.
processor = AutoProcessor.from_pretrained(
    "microsoft/layoutlmv3-base",
    apply_ocr=False,
)

In [None]:
from datasets.features import ClassLabel

# Names of the raw dataset columns read by `prepare_examples`.
image_column_name = "image"
text_column_name = "tokens"
boxes_column_name = "bboxes"
label_column_name = "ner_tags"

# Schema and column list of the raw training split; `column_names` is later
# passed as `remove_columns` when the splits are mapped.
features = dataset["train"].features
column_names = dataset["train"].column_names

# The label set is hard-coded as the strings "0" .. "22"; it is not derived from
# the dataset's features.
label_list = [str(label_id) for label_id in range(23)]

# Bidirectional lookup tables between integer ids and label strings.
id2label = dict(enumerate(label_list))
label2id = {label: label_id for label_id, label in id2label.items()}
num_labels = len(label_list)

In [None]:
def prepare_examples(examples):
  """Encode a batch of raw examples with the module-level `processor`.

  Args:
    examples: a batch mapping that provides the image, word, box and label
      columns named by the `*_column_name` module constants.

  Returns:
    The encoding produced by `processor`, truncated and padded to max length.
  """
  return processor(
      examples[image_column_name],
      examples[text_column_name],
      boxes=examples[boxes_column_name],
      word_labels=examples[label_column_name],
      truncation=True,
      padding="max_length",
  )

In [None]:
from PIL import Image, ImageFile
# Enable Pillow's global LOAD_TRUNCATED_IMAGES flag so that truncated image files
# can still be loaded. This runs before the dataset is mapped in a later cell.
ImageFile.LOAD_TRUNCATED_IMAGES = True

In [None]:
from datasets import Features, Sequence, ClassLabel, Value, Array2D, Array3D

# Explicit schema for the encoded examples; custom features are needed so that
# `set_format` (called later) works properly on the mapped datasets.
features = Features(
    {
        "pixel_values": Array3D(dtype="float32", shape=(3, 224, 224)),
        "input_ids": Sequence(feature=Value(dtype="int64")),
        "attention_mask": Sequence(feature=Value(dtype="int64")),
        "bbox": Array2D(dtype="int64", shape=(512, 4)),
        "labels": Sequence(feature=Value(dtype="int64")),
    }
)


def _encode_split(split_name):
    """Apply `prepare_examples` to one split, dropping the raw columns."""
    return dataset[split_name].map(
        prepare_examples,
        batched=True,
        remove_columns=column_names,
        features=features,
    )


train_dataset = _encode_split("train")
eval_dataset = _encode_split("test")

In [None]:
import torch

# Have both processed splits return PyTorch tensors. Previously only the training
# split was converted, leaving the evaluation split as plain Python lists that the
# collator had to convert on every evaluation pass.
train_dataset.set_format("torch")
eval_dataset.set_format("torch")


In [None]:
import evaluate

# `datasets.load_metric` is deprecated and no longer available in recent `datasets`
# releases; metrics are loaded through the `evaluate` package (installed in the first
# cell) instead. The returned object is used via `metric.compute(predictions=...,
# references=...)` in `compute_metrics`.
metric = evaluate.load("seqeval")

In [None]:
import numpy as np

# When True, `compute_metrics` returns every metric entry (nested ones flattened)
# instead of only the four overall scores.
return_entity_level_metrics = False

def compute_metrics(p):
    """Turn raw model outputs into metric scores using the module-level `metric`.

    Args:
        p: a pair `(predictions, labels)`; the predicted class is taken with
            `argmax` over axis 2 of `predictions`.

    Returns:
        A dict with "precision", "recall", "f1" and "accuracy" taken from the
        overall results, or - when `return_entity_level_metrics` is True - all
        results with nested dicts flattened into "<key>_<subkey>" entries.

    NOTE(review): `label_list` holds plain numeric strings; if `metric` expects
    IOB-style tags, confirm that these labels are scored as intended.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    # Keep only positions whose gold label is not the ignored index (-100),
    # mapping the surviving ids to their label strings.
    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept = [
            (predicted_id, gold_id)
            for predicted_id, gold_id in zip(predicted_row, gold_row)
            if gold_id != -100
        ]
        true_predictions.append([label_list[predicted_id] for predicted_id, _ in kept])
        true_labels.append([label_list[gold_id] for _, gold_id in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    if not return_entity_level_metrics:
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }

    # Flatten one level of nesting: {"A": {"b": 1}} becomes {"A_b": 1}.
    final_results = {}
    for key, value in results.items():
        if isinstance(value, dict):
            final_results.update({f"{key}_{n}": v for n, v in value.items()})
        else:
            final_results[key] = value
    return final_results

In [None]:
from transformers import LayoutLMv3ForTokenClassification

# Load the pretrained base checkpoint for token classification, passing the
# id <-> label mappings defined earlier in the notebook.
model = LayoutLMv3ForTokenClassification.from_pretrained(
    "microsoft/layoutlmv3-base",
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# Show the processed training split as this cell's output.
train_dataset

In [None]:
from transformers import TrainingArguments, Trainer

# Training configuration: evaluate every 100 steps and reload the best checkpoint
# at the end, ranked by the "f1" value returned from `compute_metrics`.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir="CORRECT_TEST",
    max_steps=13700,
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="steps",
    eval_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

In [None]:
from transformers.data.data_collator import default_data_collator

# Wire the model, training arguments, processed splits and metric function together.
# NOTE(review): newer transformers releases prefer `processing_class=` over
# `tokenizer=` - confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=default_data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    tokenizer=processor,
)

In [None]:
# Run fine-tuning with the Trainer built in the previous cell.
trainer.train()

In [1]:
# NOTE(review): everything below is disabled scratch code kept inside a bare string
# literal, so none of it executes; the cell only evaluates (and displays) the string.
# The text counts and drops rows having a bbox element > 1000 and re-indexes a test
# frame, but it refers to `df_train` / `df_test`, which are not defined anywhere in
# the visible notebook. Consider deleting the cell or promoting it to real code.
'''
count = 0

for index, row in df_train.iterrows():
    bboxes = row['bbox']
    
    bbox_found = False
    
    for bbox in bboxes:
        for element in bbox:
            if element > 1000:
                bbox_found = True
                break  
    
    if bbox_found:
        count += 1

print("Total rows with at least one bbox >1000:", count)





filtered_df = df_train.copy()

for index, row in df_train[::-1].iterrows():
    bboxes = row['bbox']
    
    bbox_found = False
    
    for bbox in bboxes:
        for element in bbox:
            if element > 1000:
                bbox_found = True
                break  
    
    if bbox_found:
        filtered_df = filtered_df.drop(index)

print(filtered_df)

import pandas as pd

df_test.reset_index(drop=True, inplace=True)

df_test.index += 11020

'''

'\ncount = 0\n\nfor index, row in df_train.iterrows():\n    bboxes = row[\'bbox\']\n    \n    bbox_found = False\n    \n    for bbox in bboxes:\n        for element in bbox:\n            if element > 1000:\n                bbox_found = True\n                break  \n    \n    if bbox_found:\n        count += 1\n\nprint("Total rows with at least one bbox >1000:", count)\n\n\n\n\n\nfiltered_df = df_train.copy()\n\nfor index, row in df_train[::-1].iterrows():\n    bboxes = row[\'bbox\']\n    \n    bbox_found = False\n    \n    for bbox in bboxes:\n        for element in bbox:\n            if element > 1000:\n                bbox_found = True\n                break  \n    \n    if bbox_found:\n        filtered_df = filtered_df.drop(index)\n\nprint(filtered_df)\n\nimport pandas as pd\n\ndf_test.reset_index(drop=True, inplace=True)\n\ndf_test.index += 11020\n\n'