In [30]:
import os
import pandas as pd

def list_images_and_categories(root_dir, extensions=['.png', '.jpg']):
    image_paths = []
    categories = []

    for root, dirs, files in os.walk(root_dir):
        for file in files:
            if file.endswith(tuple(extensions)):
                image_path = os.path.join(root, file)
                category = os.path.basename(root)
                image_paths.append(image_path)
                categories.append(category)

    return pd.DataFrame({'image_path': image_paths, 'label': categories})

# Set your root directory where images are stored
root_directory = '/content/drive/MyDrive/dataset/doc_datset'

# Create a DataFrame with image paths and categories
image_df = list_images_and_categories(root_directory)

# Display the DataFrame
print(image_df.head())


                                          image_path label
0  /content/drive/MyDrive/dataset/doc_datset/CV/2...    CV
1  /content/drive/MyDrive/dataset/doc_datset/CV/2...    CV
2  /content/drive/MyDrive/dataset/doc_datset/CV/2...    CV
3  /content/drive/MyDrive/dataset/doc_datset/CV/2...    CV
4  /content/drive/MyDrive/dataset/doc_datset/CV/2...    CV


In [10]:
!pip install pytesseract

Collecting pytesseract
  Downloading pytesseract-0.3.10-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.10


In [6]:
image_df

Unnamed: 0,image_path,label
0,/content/drive/MyDrive/dataset/doc_datset/CV/2...,CV
1,/content/drive/MyDrive/dataset/doc_datset/CV/2...,CV
2,/content/drive/MyDrive/dataset/doc_datset/CV/2...,CV
3,/content/drive/MyDrive/dataset/doc_datset/CV/2...,CV
4,/content/drive/MyDrive/dataset/doc_datset/CV/2...,CV
5,/content/drive/MyDrive/dataset/doc_datset/bon/...,bon
6,/content/drive/MyDrive/dataset/doc_datset/bon/...,bon
7,/content/drive/MyDrive/dataset/doc_datset/bon/...,bon
8,/content/drive/MyDrive/dataset/doc_datset/bon/...,bon
9,/content/drive/MyDrive/dataset/doc_datset/bon/...,bon


In [31]:
dataset_path = "/content/drive/MyDrive/dataset/doc_datset"
labels = [label for label in os.listdir(dataset_path)]
idx2label = {v: k for v, k in enumerate(labels)}
label2idx = {k: v for v, k in enumerate(labels)}
label2idx

{'CV': 0, 'bon': 1, 'invoice': 2, 'memo': 3}

In [15]:
images = []
labels = []

for label_folder, _, file_names in os.walk(dataset_path):
  if label_folder != dataset_path:
    label = label_folder[10:]
    for _, _, image_names in os.walk(label_folder):
      relative_image_names = []
      for image in image_names:
        relative_image_names.append(dataset_path + "/" + label + "/" + image)
      images.extend(relative_image_names)
      labels.extend([label] * len (relative_image_names))

data = pd.DataFrame.from_dict({'image_path': images, 'label': labels})
data.head()

Unnamed: 0,image_path,label
0,/content/drive/MyDrive/dataset/doc_datset/rive...,rive/MyDrive/dataset/doc_datset/CV
1,/content/drive/MyDrive/dataset/doc_datset/rive...,rive/MyDrive/dataset/doc_datset/CV
2,/content/drive/MyDrive/dataset/doc_datset/rive...,rive/MyDrive/dataset/doc_datset/CV
3,/content/drive/MyDrive/dataset/doc_datset/rive...,rive/MyDrive/dataset/doc_datset/CV
4,/content/drive/MyDrive/dataset/doc_datset/rive...,rive/MyDrive/dataset/doc_datset/CV


In [21]:
data = image_df
data.head()

Unnamed: 0,image_path,label
0,/content/drive/MyDrive/dataset/doc_datset/CV/2...,CV
1,/content/drive/MyDrive/dataset/doc_datset/CV/2...,CV
2,/content/drive/MyDrive/dataset/doc_datset/CV/2...,CV
3,/content/drive/MyDrive/dataset/doc_datset/CV/2...,CV
4,/content/drive/MyDrive/dataset/doc_datset/CV/2...,CV


In [18]:
#!pip install datasets

In [23]:
#!apt install tesseract-ocr

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 45 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 0s (14.0 MB/s)
Selecting previously unselected package tesseract-ocr-eng.
(Reading database ... 121752 files and directories currently installed.)
Preparing to unpack .../tesseract-ocr-e

In [24]:
from datasets import Dataset

def normalize_box(box, width, height):
     return [
         int(1000 * (box[0] / width)),
         int(1000 * (box[1] / height)),
         int(1000 * (box[2] / width)),
         int(1000 * (box[3] / height)),
     ]

def apply_ocr(example):
        # get the image
        image = Image.open(example['image_path'])

        width, height = image.size

        # apply ocr to the image
        ocr_df = pytesseract.image_to_data(image, output_type='data.frame')
        float_cols = ocr_df.select_dtypes('float').columns
        ocr_df = ocr_df.dropna().reset_index(drop=True)
        ocr_df[float_cols] = ocr_df[float_cols].round(0).astype(int)
        ocr_df = ocr_df.replace(r'^\s*$', np.nan, regex=True)
        ocr_df = ocr_df.dropna().reset_index(drop=True)

        # get the words and actual (unnormalized) bounding boxes
        #words = [word for word in ocr_df.text if str(word) != 'nan'])
        words = list(ocr_df.text)
        words = [str(w) for w in words]
        coordinates = ocr_df[['left', 'top', 'width', 'height']]
        actual_boxes = []
        for idx, row in coordinates.iterrows():
            x, y, w, h = tuple(row) # the row comes in (left, top, width, height) format
            actual_box = [x, y, x+w, y+h] # we turn it into (left, top, left+width, top+height) to get the actual box
            actual_boxes.append(actual_box)

        # normalize the bounding boxes
        boxes = []
        for box in actual_boxes:
            boxes.append(normalize_box(box, width, height))

        # add as extra columns
        assert len(words) == len(boxes)
        example['words'] = words
        example['bbox'] = boxes
        return example

dataset = Dataset.from_pandas(data)
updated_dataset = dataset.map(apply_ocr)

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [27]:
df = pd.DataFrame.from_dict(updated_dataset)
print(len(df["words"][0]))
print(df["words"][0])

121
['RESUME', 'Mynamexyz', 'Mb', '(+91)', '0000000005', 'Res.No', '+91-0000-0000000', 'aASIHH@nnfdni.com', 'Objective', 'Seeking.', 'for', 'a', 'position', 'as', 'Systems', 'Engineer', 'to', 'provide', 'value', 'addition', 'to', "organization's", 'products', 'and', 'services', 'that', 'will', 'provide', 'exposure', 'to', 'a', 'broad', 'range', 'of', 'responsibilities,', 'in', 'the', 'field', 'of', 'software', 'development', 'and', 'maintenance.', 'Technical', 'Exposure', 'Operating', 'System', ':', 'DOS,', 'UNIX,', 'WINDOWS', 'NT', 'Back', 'End', ':', 'Oracle', '8,', 'MS-SQL', 'SERVER', 'Front', 'End', 'Tools', ':', 'Developer', '2000,', 'VB', '6.0,', 'VB.NET', 'Web', 'Technologies', ':', 'ASP.NET,IIS,', 'HTML,XML,', 'Package', ':', 'FoxPro', '2.6', 'Tools', '&', 'Utiliti', ':', 'Crystal', 'Report,', 'ActivexReport', 'Relevant', 'Ex;', '*', '3', 'years', 'of', 'IT', 'experience', 'in', 'Application', 'Development,', 'Design,', 'Testing', 'and', 'Implementation.', '*', 'Exposure', 'to'

In [28]:
from transformers import LayoutLMTokenizer
import torch

tokenizer = LayoutLMTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")

def encode_example(example, max_seq_length=512, pad_token_box=[0, 0, 0, 0]):
  words = example['words']
  normalized_word_boxes = example['bbox']

  assert len(words) == len(normalized_word_boxes)

  token_boxes = []
  for word, box in zip(words, normalized_word_boxes):
      word_tokens = tokenizer.tokenize(word)
      token_boxes.extend([box] * len(word_tokens))

  # Truncation of token_boxes
  special_tokens_count = 2
  if len(token_boxes) > max_seq_length - special_tokens_count:
      token_boxes = token_boxes[: (max_seq_length - special_tokens_count)]

  # add bounding boxes of cls + sep tokens
  token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]

  encoding = tokenizer(' '.join(words), padding='max_length', truncation=True)
  # Padding of token_boxes up the bounding boxes to the sequence length.
  input_ids = tokenizer(' '.join(words), truncation=True)["input_ids"]
  padding_length = max_seq_length - len(input_ids)
  token_boxes += [pad_token_box] * padding_length
  encoding['bbox'] = token_boxes
  encoding['label'] = label2idx[example['label']]

  assert len(encoding['input_ids']) == max_seq_length
  assert len(encoding['attention_mask']) == max_seq_length
  assert len(encoding['token_type_ids']) == max_seq_length
  assert len(encoding['bbox']) == max_seq_length

  return encoding

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/170 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/606 [00:00<?, ?B/s]

In [33]:
from datasets import Features, Sequence, ClassLabel, Value, Array2D
# we need to define the features ourselves as the bbox of LayoutLM are an extra feature
features = Features({
    'input_ids': Sequence(feature=Value(dtype='int64')),
    'bbox': Array2D(dtype="int64", shape=(512, 4)),
    'attention_mask': Sequence(Value(dtype='int64')),
    'token_type_ids': Sequence(Value(dtype='int64')),
    'label': ClassLabel(names=['CV', 'Invoice','Bon','memo']),
    'image_path': Value(dtype='string'),
    'words': Sequence(feature=Value(dtype='string')),
})

encoded_dataset = updated_dataset.map(lambda example: encode_example(example),
                                      features=features)

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [34]:
encoded_dataset.set_format(type='torch', columns=['input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'label'])
dataloader = torch.utils.data.DataLoader(encoded_dataset, batch_size=1, shuffle=True)
batch = next(iter(dataloader))

In [35]:
tokenizer.decode(batch['input_ids'][0].tolist())

'[CLS] ‘ onder 10 : 00122016524004910000 bit to ‘ sipto owier dat 0410202 ) ‘ tyga ward svssape, stb eh ward shas : se serv 208, et ‘ invoice date : 05 - 10 - 2021 “ khammam road. “ khammam road marten bor ft pan angctn odd s00 tergene — — — _ kooad susie teangana = rove rorenoe trove ‘ one raso. 2nr6ercstsas tlie : prodi tie gy goss dicontt tonle 1651? toul? anmutt viet ‘ owacergmion ery rapid 7500 + misogo oom o7es 36182900 pscrsneonnes. oer me onde heustc esa ee ’ blac ) tol 1 e900 1000 mm072 36138200800 grand total £269. 00 ent roos pate lied ahr sty fiplort therk you! fetus plc pec ye in pr aan ey ee fas nen ip it. ginal brand bo pice ‘ a, orginal packing animes sta imation mnt pa hep fem sn etn hl de mgt fr de fe cnt cn a mtd fe 60h a ig 3 er im ig, come, er ei cri de ( contact fipkar : 1800 708 9898 oe fipkar - con / hpeente eroe paste [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [

In [36]:
from transformers import LayoutLMForSequenceClassification
import torch
from transformers import logging

logging.set_verbosity_warning()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = LayoutLMForSequenceClassification.from_pretrained("microsoft/layoutlm-base-uncased", num_labels=len(label2idx))
model.to(device)

model.safetensors:   0%|          | 0.00/451M [00:00<?, ?B/s]

Some weights of LayoutLMForSequenceClassification were not initialized from the model checkpoint at microsoft/layoutlm-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LayoutLMForSequenceClassification(
  (layoutlm): LayoutLMModel(
    (embeddings): LayoutLMEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (x_position_embeddings): Embedding(1024, 768)
      (y_position_embeddings): Embedding(1024, 768)
      (h_position_embeddings): Embedding(1024, 768)
      (w_position_embeddings): Embedding(1024, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): LayoutLMEncoder(
      (layer): ModuleList(
        (0-11): 12 x LayoutLMLayer(
          (attention): LayoutLMAttention(
            (self): LayoutLMSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True

In [38]:
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=4e-5)

global_step = 0
num_train_epochs = 5
t_total = len(dataloader) * num_train_epochs # total number of training steps

#put the model in training mode
model.train()
for epoch in range(num_train_epochs):
  print("Epoch:", epoch)
  running_loss = 0.0
  correct = 0
  for batch in dataloader:
      input_ids = batch["input_ids"].to(device)
      bbox = batch["bbox"].to(device)
      attention_mask = batch["attention_mask"].to(device)
      token_type_ids = batch["token_type_ids"].to(device)
      labels = batch["label"].to(device)

      # forward pass
      outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids,
                      labels=labels)
      loss = outputs.loss

      running_loss += loss.item()
      predictions = outputs.logits.argmax(-1)
      correct += (predictions == labels).float().sum()

      # backward pass to get the gradients
      loss.backward()

      # update
      optimizer.step()
      optimizer.zero_grad()
      global_step += 1

  print("Loss:", running_loss / batch["input_ids"].shape[0])
  accuracy = 100 * correct / len(data)
  print("Training accuracy:", accuracy.item())



Epoch: 0
Loss: 0.6177377598360181
Training accuracy: 100.0
Epoch: 1
Loss: 0.2265232433564961
Training accuracy: 100.0
Epoch: 2
Loss: 0.11397573119029403
Training accuracy: 100.0
Epoch: 3
Loss: 0.06839154986664653
Training accuracy: 100.0
Epoch: 4
Loss: 0.050723695079796016
Training accuracy: 100.0


In [39]:
model.save_pretrained('/content/drive/MyDrive/dataset/saved_model/')

In [42]:
classes = ['CV', 'Invoice','Bon','memo']



In [43]:
images = []
labels = []
dataset_path = '/content/drive/MyDrive/dataset/doc_datset/invoice'

for label_folder, _, file_names in os.walk(dataset_path):
    print(label_folder, _, file_names)
    print(file_names)
    relative_image_names = []
    relative_image_names.append(dataset_path + "/" + file_names[0])
    images.extend(relative_image_names)
test_data = pd.DataFrame.from_dict({'image_path': images})
test_data.head()


/content/drive/MyDrive/dataset/doc_datset/invoice [] ['25.04.2024_14.41.08_REC.png', '25.04.2024_14.42.33_REC.png', '25.04.2024_14.42.58_REC.png', '25.04.2024_14.43.23_REC.png', '25.04.2024_14.43.45_REC.png', 'OD122113790943409000.pdf', 'OD123000879013753000.pdf', 'OD123016624004910000.pdf', 'OD123018989184857000.pdf', 'OD123029621447684000.pdf', 'OD123121458248064000.pdf', 'OD123130986629533000.pdf', 'OD123243504374210000.pdf', 'OD123172848494727000.pdf', 'OD123267753020750000.pdf', 'OD123277877417980000.pdf', 'OD123181642400128000.pdf', 'OD123278380975635000.pdf', 'OD123279485307984000.pdf', 'OD123416449931105000.pdf', 'OD124156933233239000.pdf', 'OD123638836522575000.pdf', 'OD124655475133321000.pdf']
['25.04.2024_14.41.08_REC.png', '25.04.2024_14.42.33_REC.png', '25.04.2024_14.42.58_REC.png', '25.04.2024_14.43.23_REC.png', '25.04.2024_14.43.45_REC.png', 'OD122113790943409000.pdf', 'OD123000879013753000.pdf', 'OD123016624004910000.pdf', 'OD123018989184857000.pdf', 'OD1230296214476840

Unnamed: 0,image_path
0,/content/drive/MyDrive/dataset/doc_datset/invo...
