In [44]:
#!pip install pdf2image
#!sudo apt-get install poppler-utils
#!pip install datasets
#!pip install pytesseract
#!apt install tesseract-ocr

In [45]:
classes = ['CV', 'invoive']
labels = ['CV', 'invoive']
idx2label = {v: k for v, k in enumerate(labels)}
label2idx = {k: v for v, k in enumerate(labels)}
label2idx

{'CV': 0, 'invoive': 1}

In [42]:
from transformers import LayoutLMTokenizer
import torch
model_path = '/content/drive/MyDrive/dataset/saved_model/'
tokenizer = LayoutLMTokenizer.from_pretrained('/content/drive/MyDrive/dataset/saved_model/')

def encode_example(example, max_seq_length=512, pad_token_box=[0, 0, 0, 0]):
  words = example['words']
  normalized_word_boxes = example['bbox']

  assert len(words) == len(normalized_word_boxes)

  token_boxes = []
  for word, box in zip(words, normalized_word_boxes):
      word_tokens = tokenizer.tokenize(word)
      token_boxes.extend([box] * len(word_tokens))

  # Truncation of token_boxes
  special_tokens_count = 2
  if len(token_boxes) > max_seq_length - special_tokens_count:
      token_boxes = token_boxes[: (max_seq_length - special_tokens_count)]

  # add bounding boxes of cls + sep tokens
  token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]

  encoding = tokenizer(' '.join(words), padding='max_length', truncation=True)
  # Padding of token_boxes up the bounding boxes to the sequence length.
  input_ids = tokenizer(' '.join(words), truncation=True)["input_ids"]
  padding_length = max_seq_length - len(input_ids)
  token_boxes += [pad_token_box] * padding_length
  encoding['bbox'] = token_boxes
  encoding['label'] = label2idx[example['label']]

  assert len(encoding['input_ids']) == max_seq_length
  assert len(encoding['attention_mask']) == max_seq_length
  assert len(encoding['token_type_ids']) == max_seq_length
  assert len(encoding['bbox']) == max_seq_length

  return encoding

In [46]:
from datasets import Features, Sequence, ClassLabel, Value, Array2D
# we need to define the features ourselves as the bbox of LayoutLM are an extra feature
features = Features({
    'input_ids': Sequence(feature=Value(dtype='int64')),
    'bbox': Array2D(dtype="int64", shape=(512, 4)),
    'attention_mask': Sequence(Value(dtype='int64')),
    'token_type_ids': Sequence(Value(dtype='int64')),
    'label': ClassLabel(names=['CV', 'Invoice','Bon','memo']),
    'image_path': Value(dtype='string'),
    'words': Sequence(feature=Value(dtype='string')),
})


encoded_test_dataset = updated_test_dataset.map(lambda example: encode_example(example),
                                      features=features)

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

KeyError: None

In [47]:
updated_test_dataset

Dataset({
    features: ['image_path', 'label', 'words', 'bbox'],
    num_rows: 1
})

In [48]:
import torch
from transformers import LayoutLMForSequenceClassification, LayoutLMTokenizer

# Load the saved model and tokenizer
model_path = '/content/drive/MyDrive/dataset/saved_model/'
model = LayoutLMForSequenceClassification.from_pretrained(model_path)
tokenizer = LayoutLMTokenizer.from_pretrained(model_path)


In [49]:
# Set the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model onto the device
model.to(device)

LayoutLMForSequenceClassification(
  (layoutlm): LayoutLMModel(
    (embeddings): LayoutLMEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (x_position_embeddings): Embedding(1024, 768)
      (y_position_embeddings): Embedding(1024, 768)
      (h_position_embeddings): Embedding(1024, 768)
      (w_position_embeddings): Embedding(1024, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): LayoutLMEncoder(
      (layer): ModuleList(
        (0-11): 12 x LayoutLMLayer(
          (attention): LayoutLMAttention(
            (self): LayoutLMSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True

In [50]:
from pdf2image import convert_from_path
from PIL import Image
import pytesseract

def preprocess_new_pdf_images(pdf_path):
    # Convert PDF to images
    images = convert_from_path(pdf_path)

    # Initialize a list to store preprocessed examples
    preprocessed_examples = []

    # Perform OCR on each image and create examples
    for i, image in enumerate(images):
        # Perform OCR to extract text
        text = pytesseract.image_to_string(image)

        # Extract bounding boxes and words using OCR data frame
        ocr_df = pytesseract.image_to_data(image, output_type='data.frame')
        ocr_df = ocr_df.dropna().reset_index(drop=True)
        words = list(ocr_df.text)
        coordinates = ocr_df[['left', 'top', 'width', 'height']]
        actual_boxes = []
        for _, row in coordinates.iterrows():
            x, y, w, h = tuple(row)
            actual_box = [x, y, x + w, y + h]
            actual_boxes.append(actual_box)

        # Normalize bounding boxes
        width, height = image.size
        normalized_boxes = []
        for box in actual_boxes:
            normalized_box = normalize_box(box, width, height)
            normalized_boxes.append(normalized_box)

        # Create example dictionary
        example = {
            'image_path': pdf_path,  # Store the path to the PDF image
            'words': words,
            'bbox': normalized_boxes
        }

        preprocessed_examples.append(example)

    return preprocessed_examples

# Normalize bounding boxes
def normalize_box(box, width, height):
    return [
        int(1000 * (box[0] / width)),
        int(1000 * (box[1] / height)),
        int(1000 * (box[2] / width)),
        int(1000 * (box[3] / height)),
    ]

# Example usage:
pdf_path = "/content/drive/MyDrive/input/OD124156933233239000.pdf"
preprocessed_data = preprocess_new_pdf_images(pdf_path)
print(preprocessed_data)


[{'image_path': '/content/drive/MyDrive/input/OD124156933233239000.pdf', 'words': ['Tax', 'Invoice', 'Sold', 'By:', 'Shreyash', 'Retail', 'Private', 'Limited', ',', 'Ship-from', 'Address:', 'Sy', 'no', '18/2,18/3,18/4', '19/1,19/2,19/3,20/1,20/2,20/3,21/2,22/2,23/2,23/3,23/4', 'Taverekere', 'venkatapura,', 'hosakote', 'taluk,', 'nadagudi', 'hobli,bangalore', '562122,', 'Bangalore,', 'KARNATAKA,', 'India', '-', '562122,', 'IN-KA', 'GSTIN', '-', '29AAXCS0655F1ZU', 'FSSAI', 'License', 'No', '-', '13321999000230', 'iin', ' ', ' ', ' ', 'Order', 'ID:', '0D124156933233239000', 'Bil|', 'To', 'Order', 'Date:', '13-02-2022', 'Invoice', 'Date:', '13-02-2022', 'Anumula', 'Brahma', 'Chary', 'H.', 'No:136,', 'Marigudem,', 'abbayipalem,', 'korivi', 'road,', 'maripeda', 'bunglow.', 'Ship', 'To', 'Anumula', 'Brahma', 'Chary', 'H.', 'No:136,', 'Marigudem,', 'abbayipalem,', 'korivi', 'road,', 'maripeda', 'bunglow.', '*Keep', 'this', 'invoice', 'and', 'manufacturer', 'box', 'for', ' ', ' ', ' ', ' ', 'PA

In [52]:
def encode_example(example, max_seq_length=512, pad_token_box=[0, 0, 0, 0]):
    words = example['words']
    normalized_word_boxes = example['bbox']

    assert len(words) == len(normalized_word_boxes)

    token_boxes = []
    for word, box in zip(words, normalized_word_boxes):
        word_tokens = tokenizer.tokenize(word)
        token_boxes.extend([box] * len(word_tokens))

    # Truncation of token_boxes
    special_tokens_count = 2
    if len(token_boxes) > max_seq_length - special_tokens_count:
        token_boxes = token_boxes[: (max_seq_length - special_tokens_count)]

    # add bounding boxes of cls + sep tokens
    token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]

    encoding = tokenizer(' '.join(words), padding='max_length', truncation=True)
    # Padding of token_boxes up the bounding boxes to the sequence length.
    input_ids = tokenizer(' '.join(words), truncation=True)["input_ids"]
    padding_length = max_seq_length - len(input_ids)
    token_boxes += [pad_token_box] * padding_length
    encoding['bbox'] = token_boxes

    # Check if 'label' key exists in example dictionary
    if 'label' in example:
        encoding['label'] = label2idx[example['label']]

    assert len(encoding['input_ids']) == max_seq_length
    assert len(encoding['attention_mask']) == max_seq_length
    assert len(encoding['token_type_ids']) == max_seq_length
    assert len(encoding['bbox']) == max_seq_length

    return encoding


In [53]:
# Encode the examples using the tokenizer
encoded_examples = []
for example in preprocessed_data:
    encoded_example = encode_example(example)  # Assuming you have encode_example function defined
    encoded_examples.append(encoded_example)

# Convert the encoded examples to PyTorch tensors
input_ids = torch.tensor([example['input_ids'] for example in encoded_examples], dtype=torch.long).to(device)
bbox = torch.tensor([example['bbox'] for example in encoded_examples], dtype=torch.long).to(device)
attention_mask = torch.tensor([example['attention_mask'] for example in encoded_examples], dtype=torch.long).to(device)
token_type_ids = torch.tensor([example['token_type_ids'] for example in encoded_examples], dtype=torch.long).to(device)

# Run inference using the loaded model
with torch.no_grad():
    outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids)
    predictions = torch.argmax(outputs.logits, dim=-1).cpu().numpy()

# Decode the predictions
predicted_labels = [idx2label[prediction] for prediction in predictions]

# Print the predicted labels
print("Predicted Labels:", predicted_labels)

Predicted Labels: ['invoive']
