In [3]:
# !pip install pdf2image
# !pip install datasets
# !sudo apt-get install poppler-utils
# !pip install pytesseract
# !apt install tesseract-ocr

In [4]:
import os
import pandas as pd
from pdf2image import convert_from_path

In [10]:
import os
import pandas as pd
from pdf2image import convert_from_path

def convert_pdf_to_images(pdf_path, output_directory):
    images = convert_from_path(pdf_path)
    image_paths = []
    for i, image in enumerate(images):
        image_path = os.path.join(output_directory, f'{os.path.basename(pdf_path)}_{i}.jpg')
        image.save(image_path, 'JPEG')
        image_paths.append(image_path)
        #break
    return image_paths

def list_images_and_categories(root_dir, extensions=['.png', '.jpg']):
    image_paths = []
    categories = []
    #print(root_dir)
    tree_split = root_dir.split("/")[-1]

    for root, dirs, files in os.walk(root_dir):
        for file in files:
            if file.endswith('.pdf'):
                pdf_path = os.path.join(root, file)
                #print(pdf_path)
                output_directory = os.path.splitext(pdf_path)[0]  # Use PDF filename as output directory
                if not os.path.exists(output_directory):
                    os.makedirs(output_directory)
                converted_image_paths = convert_pdf_to_images(pdf_path, output_directory)
                #category = converted_image_paths.split(tree_split)[-1]
                image_paths.extend(converted_image_paths)
                category = os.path.basename(root)
                #print(converted_image_paths[0])
                category = converted_image_paths[0].split(tree_split)[-1].strip("/")
                #print(category)
                category = category.split("/")[1]
                #print(category)
                categories.extend([category] * len(converted_image_paths))


    if not image_paths:  # If no JPG or PNG files were found
        raise ValueError("No image files found in the directory.")

    return pd.DataFrame({'image_path': image_paths, 'label': categories})

# Set your root directory where images are stored
root_directory = '/content/drive/MyDrive/layout_dataset_multi_dir

# Create a DataFrame with image paths and categories
data = list_images_and_categories(root_directory)

data



Unnamed: 0,image_path,label
0,/content/drive/MyDrive/layout_dataset_multi_di...,cat1
1,/content/drive/MyDrive/layout_dataset_multi_di...,cat1
2,/content/drive/MyDrive/layout_dataset_multi_di...,cat1
3,/content/drive/MyDrive/layout_dataset_multi_di...,cat1
4,/content/drive/MyDrive/layout_dataset_multi_di...,cat1
5,/content/drive/MyDrive/layout_dataset_multi_di...,cat1
6,/content/drive/MyDrive/layout_dataset_multi_di...,cat2
7,/content/drive/MyDrive/layout_dataset_multi_di...,cat2
8,/content/drive/MyDrive/layout_dataset_multi_di...,cat2
9,/content/drive/MyDrive/layout_dataset_multi_di...,cat2


In [12]:
labels = [label for label in set(data["label"].unique())]
idx2label = {v: k for v, k in enumerate(labels)}
label2idx = {k: v for v, k in enumerate(labels)}
labels,idx2label, label2idx

(['cat3', 'cat4', 'cat1', 'cat2'],
 {0: 'cat3', 1: 'cat4', 2: 'cat1', 3: 'cat2'},
 {'cat3': 0, 'cat4': 1, 'cat1': 2, 'cat2': 3})

In [13]:
from datasets import Dataset
from PIL import Image, ImageDraw, ImageFont
import pytesseract
import numpy as np

def normalize_box(box, width, height):
     return [
         int(1000 * (box[0] / width)),
         int(1000 * (box[1] / height)),
         int(1000 * (box[2] / width)),
         int(1000 * (box[3] / height)),
     ]

def apply_ocr(example):
        # get the image
        image = Image.open(example['image_path'])

        width, height = image.size

        # apply ocr to the image
        ocr_df = pytesseract.image_to_data(image, output_type='data.frame')
        float_cols = ocr_df.select_dtypes('float').columns
        ocr_df = ocr_df.dropna().reset_index(drop=True)
        ocr_df[float_cols] = ocr_df[float_cols].round(0).astype(int)
        ocr_df = ocr_df.replace(r'^\s*$', np.nan, regex=True)
        ocr_df = ocr_df.dropna().reset_index(drop=True)

        # get the words and actual (unnormalized) bounding boxes
        #words = [word for word in ocr_df.text if str(word) != 'nan'])
        words = list(ocr_df.text)
        words = [str(w) for w in words]
        coordinates = ocr_df[['left', 'top', 'width', 'height']]
        actual_boxes = []
        for idx, row in coordinates.iterrows():
            x, y, w, h = tuple(row) # the row comes in (left, top, width, height) format
            actual_box = [x, y, x+w, y+h] # we turn it into (left, top, left+width, top+height) to get the actual box
            actual_boxes.append(actual_box)

        # normalize the bounding boxes
        boxes = []
        for box in actual_boxes:
            boxes.append(normalize_box(box, width, height))

        # add as extra columns
        assert len(words) == len(boxes)
        example['words'] = words
        example['bbox'] = boxes
        return example
data.reset_index(drop=True, inplace=True)
dataset = Dataset.from_pandas(data)
updated_dataset = dataset.map(apply_ocr)

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

In [14]:
data

Unnamed: 0,image_path,label
0,/content/drive/MyDrive/layout_dataset_multi_di...,cat1
1,/content/drive/MyDrive/layout_dataset_multi_di...,cat1
2,/content/drive/MyDrive/layout_dataset_multi_di...,cat1
3,/content/drive/MyDrive/layout_dataset_multi_di...,cat1
4,/content/drive/MyDrive/layout_dataset_multi_di...,cat1
5,/content/drive/MyDrive/layout_dataset_multi_di...,cat1
6,/content/drive/MyDrive/layout_dataset_multi_di...,cat2
7,/content/drive/MyDrive/layout_dataset_multi_di...,cat2
8,/content/drive/MyDrive/layout_dataset_multi_di...,cat2
9,/content/drive/MyDrive/layout_dataset_multi_di...,cat2


In [15]:
updated_dataset

Dataset({
    features: ['image_path', 'label', 'words', 'bbox'],
    num_rows: 25
})

In [16]:
df = pd.DataFrame.from_dict(updated_dataset)
print(len(df["words"][0]))
print(df["words"][0])

117
['PRAVEEN', 'KALLURI', '+21', '7013835206', 'GB', 'Praveenkalluri.95@gmail.com', 'OBJECTIVE', 'To', 'be', 'a', 'part', 'of', 'Organization', 'that', 'provides', 'an', 'atmosphere', 'of', 'mutual', 'growth', 'and', 'benefits,', 'where', '|', 'can', 'show', 'my', 'talents', 'and', 'potential.', 'EXPERIENCE', 'Apr', '2018', '—', 'Aug', '2019', 'Computer', 'Operator', ',Global', 'Enterprises', ',Khammam', 'Jan', '2017', '—', 'Aug', '2017', 'Salesman', ',D-Mart', ',Madhapur', ',Hyderabad', 'STRENGTHS', '>', 'Hardworking', '>', 'Adoptability', '>', 'Work', 'under', 'pressure', 'PERSONAL', 'DETAILS', 'Dob', ':', '09-09-1996', 'Father', 'Name', ':', 'K.Srinivasachary', 'Sex', ':', 'Male', 'Martial', 'Status', ':', 'Single', 'Nationality', ':', 'Indian', 'Launguages', 'Known', ':', 'English', ',', 'Telugu', ',', 'Hindi', 'Suryapet', 'EDUCATION', 'Intermediate,', '(MPC),', '64%,', '2015,', 'M.K', 'Reddy', 'Jr.', 'College,', 'Suryapet', 'SSC,', '92%,', '2012', ',', 'Bharath', 'BalaMandir', 'H

In [17]:
from transformers import LayoutLMTokenizer
import torch

tokenizer = LayoutLMTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")

def encode_example(example, max_seq_length=512, pad_token_box=[0, 0, 0, 0]):
  words = example['words']
  normalized_word_boxes = example['bbox']

  assert len(words) == len(normalized_word_boxes)

  token_boxes = []
  for word, box in zip(words, normalized_word_boxes):
      word_tokens = tokenizer.tokenize(word)
      token_boxes.extend([box] * len(word_tokens))

  # Truncation of token_boxes
  special_tokens_count = 2
  if len(token_boxes) > max_seq_length - special_tokens_count:
      token_boxes = token_boxes[: (max_seq_length - special_tokens_count)]

  # add bounding boxes of cls + sep tokens
  token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]

  encoding = tokenizer(' '.join(words), padding='max_length', truncation=True)
  # Padding of token_boxes up the bounding boxes to the sequence length.
  input_ids = tokenizer(' '.join(words), truncation=True)["input_ids"]
  padding_length = max_seq_length - len(input_ids)
  token_boxes += [pad_token_box] * padding_length
  encoding['bbox'] = token_boxes
  encoding['label'] = label2idx[example['label']]

  assert len(encoding['input_ids']) == max_seq_length
  assert len(encoding['attention_mask']) == max_seq_length
  assert len(encoding['token_type_ids']) == max_seq_length
  assert len(encoding['bbox']) == max_seq_length

  return encoding

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/170 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/606 [00:00<?, ?B/s]

In [18]:
from datasets import Features, Sequence, ClassLabel, Value, Array2D
# we need to define the features ourselves as the bbox of LayoutLM are an extra feature
features = Features({
    'input_ids': Sequence(feature=Value(dtype='int64')),
    'bbox': Array2D(dtype="int64", shape=(512, 4)),
    'attention_mask': Sequence(Value(dtype='int64')),
    'token_type_ids': Sequence(Value(dtype='int64')),
    'label': ClassLabel(names=['CV', 'Invoice','Bon','memo']),
    'image_path': Value(dtype='string'),
    'words': Sequence(feature=Value(dtype='string')),
})

encoded_dataset = updated_dataset.map(lambda example: encode_example(example),
                                      features=features)

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

In [19]:
encoded_dataset.set_format(type='torch', columns=['input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'label'])
dataloader = torch.utils.data.DataLoader(encoded_dataset, batch_size=1, shuffle=True)
batch = next(iter(dataloader))

In [20]:
tokenizer.decode(batch['input_ids'][0].tolist())

'[CLS] amazon. in wee ] sold by : etrade marketing private limited * survey number 99 / 1, mamidipally village, shamshabad hyderabad, telangana, 500108 in pan no : aadcv4254h gst registration no : 36aadcv4254h1z9 order number : 407 - 2434953 - 0676359 order date : 05. 05. 2024 tax invoice / bill of supply / cash memo ( original for recipient ) billing address : venu pammi 18th ward, shivasai nagar, near ekart beside shiva tower road, khammam road kodad, telangana, 508206 in state / ut code : 36 shipping address : venu pammi venu pammi 18th ward, shivasai nagar, near ekart beside shiva tower road, khammam road kodad, telangana, 508206 in state / ut code : 36 place of supply : telangana place of delivery : telangana invoice number : hyd3 - 135239 invoice details : tg - hyd3 - 1317922175 - 2425 invoice date : 05. 05. 2024 si. no 1 | philips 9 - watts multipack b22 led cool day white led bulb, pack of 2, ( ace saver ) | bol6wozpvu ( bo16wozpvu ) hsn : 85395000 description total : amount in

In [22]:
from transformers import LayoutLMForSequenceClassification
import torch
from transformers import logging

logging.set_verbosity_warning()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = LayoutLMForSequenceClassification.from_pretrained("microsoft/layoutlm-base-uncased", num_labels=len(label2idx))
model.to(device)

Some weights of LayoutLMForSequenceClassification were not initialized from the model checkpoint at microsoft/layoutlm-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LayoutLMForSequenceClassification(
  (layoutlm): LayoutLMModel(
    (embeddings): LayoutLMEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (x_position_embeddings): Embedding(1024, 768)
      (y_position_embeddings): Embedding(1024, 768)
      (h_position_embeddings): Embedding(1024, 768)
      (w_position_embeddings): Embedding(1024, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): LayoutLMEncoder(
      (layer): ModuleList(
        (0-11): 12 x LayoutLMLayer(
          (attention): LayoutLMAttention(
            (self): LayoutLMSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True

In [24]:
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=4e-5)

global_step = 0
num_train_epochs = 10
t_total = len(dataloader) * num_train_epochs # total number of training steps

#put the model in training mode
model.train()
for epoch in range(num_train_epochs):
  print("Epoch:", epoch)
  running_loss = 0.0
  correct = 0
  for batch in dataloader:
      input_ids = batch["input_ids"].to(device)
      bbox = batch["bbox"].to(device)
      attention_mask = batch["attention_mask"].to(device)
      token_type_ids = batch["token_type_ids"].to(device)
      labels = batch["label"].to(device)

      # forward pass
      outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids,
                      labels=labels)
      loss = outputs.loss

      running_loss += loss.item()
      predictions = outputs.logits.argmax(-1)
      correct += (predictions == labels).float().sum()

      # backward pass to get the gradients
      loss.backward()

      # update
      optimizer.step()
      optimizer.zero_grad()
      global_step += 1

  print("Loss:", running_loss / batch["input_ids"].shape[0])
  accuracy = 100 * correct / len(data)
  print("Training accuracy:", accuracy.item())



Epoch: 0
Loss: 1.04060173407197
Training accuracy: 100.0
Epoch: 1
Loss: 0.380856407340616
Training accuracy: 100.0
Epoch: 2
Loss: 0.14392431895248592
Training accuracy: 100.0
Epoch: 3
Loss: 0.09302293020300567
Training accuracy: 100.0
Epoch: 4
Loss: 0.06906957109458745
Training accuracy: 100.0
Epoch: 5
Loss: 0.053313315962441266
Training accuracy: 100.0
Epoch: 6
Loss: 0.0439343344187364
Training accuracy: 100.0
Epoch: 7
Loss: 0.03861845494247973
Training accuracy: 100.0
Epoch: 8
Loss: 0.030951142078265548
Training accuracy: 100.0
Epoch: 9
Loss: 0.028761691704858094
Training accuracy: 100.0


In [25]:
model_path = '/content/drive/MyDrive/dataset/saved_model-level2/'
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('/content/drive/MyDrive/dataset/saved_model-level2/tokenizer_config.json',
 '/content/drive/MyDrive/dataset/saved_model-level2/special_tokens_map.json',
 '/content/drive/MyDrive/dataset/saved_model-level2/vocab.txt',
 '/content/drive/MyDrive/dataset/saved_model-level2/added_tokens.json')

In [26]:
import pickle

labels_path = '/content/drive/MyDrive/dataset/label2idx-level2.pkl'
with open(labels_path, 'wb') as f:
    pickle.dump(label2idx, f)