In [1]:
!pip install pdf2image
!sudo apt-get install poppler-utils
!pip install datasets
!pip install pytesseract
!apt install tesseract-ocr

Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pdf2image
Successfully installed pdf2image-1.17.0
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 45 not upgraded.
Need to get 186 kB of archives.
After this operation, 696 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.3 [186 kB]
Fetched 186 kB in 1s (233 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 1.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: fallin

In [2]:
from pdf2image import convert_from_path
from PIL import Image
import pytesseract
import torch
from transformers import LayoutLMForSequenceClassification, LayoutLMTokenizer
import pickle

In [3]:
# Load the saved model and tokenizer

def load_model(model_path):
    model = LayoutLMForSequenceClassification.from_pretrained(model_path)
    tokenizer = LayoutLMTokenizer.from_pretrained(model_path)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Load the model onto the device
    model.to(device)

    return model, tokenizer,device


In [13]:
def preprocess_new_pdf_images(pdf_path):
    # Convert PDF to images
    images = convert_from_path(pdf_path)

    # Normalize bounding boxes
    def normalize_box(box, width, height):
        return [
            int(1000 * (box[0] / width)),
            int(1000 * (box[1] / height)),
            int(1000 * (box[2] / width)),
            int(1000 * (box[3] / height)),
        ]


    # Initialize a list to store preprocessed examples
    preprocessed_examples = []

    # Perform OCR on each image and create examples
    for i, image in enumerate(images):
        # Perform OCR to extract text
        text = pytesseract.image_to_string(image)

        # Extract bounding boxes and words using OCR data frame
        ocr_df = pytesseract.image_to_data(image, output_type='data.frame')
        ocr_df = ocr_df.dropna().reset_index(drop=True)
        words = list(ocr_df.text)
        coordinates = ocr_df[['left', 'top', 'width', 'height']]
        actual_boxes = []
        for _, row in coordinates.iterrows():
            x, y, w, h = tuple(row)
            actual_box = [x, y, x + w, y + h]
            actual_boxes.append(actual_box)

        # Normalize bounding boxes
        width, height = image.size
        normalized_boxes = []
        for box in actual_boxes:
            normalized_box = normalize_box(box, width, height)
            normalized_boxes.append(normalized_box)

        # Create example dictionary
        example = {
            'image_path': pdf_path,  # Store the path to the PDF image
            'words': words,
            'bbox': normalized_boxes
        }

        preprocessed_examples.append(example)

    return preprocessed_examples


    preprocessed_data = preprocess_new_pdf_images(pdf_path)

    return preprocessed_data

#Tokenization
def tokenization(preprocessed_data, tokenizer):
    #Encode inputs
    def encode_example(example, max_seq_length=512, pad_token_box=[0, 0, 0, 0]):
        words = example['words']
        normalized_word_boxes = example['bbox']

        assert len(words) == len(normalized_word_boxes)

        token_boxes = []
        for word, box in zip(words, normalized_word_boxes):
            word_tokens = tokenizer.tokenize(word)
            token_boxes.extend([box] * len(word_tokens))

        # Truncation of token_boxes
        special_tokens_count = 2
        if len(token_boxes) > max_seq_length - special_tokens_count:
            token_boxes = token_boxes[: (max_seq_length - special_tokens_count)]

        # add bounding boxes of cls + sep tokens
        token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]

        encoding = tokenizer(' '.join(words), padding='max_length', truncation=True)
        # Padding of token_boxes up the bounding boxes to the sequence length.
        input_ids = tokenizer(' '.join(words), truncation=True)["input_ids"]
        padding_length = max_seq_length - len(input_ids)
        token_boxes += [pad_token_box] * padding_length
        encoding['bbox'] = token_boxes

        # Check if 'label' key exists in example dictionary
        if 'label' in example:
            encoding['label'] = label2idx[example['label']]

        assert len(encoding['input_ids']) == max_seq_length
        assert len(encoding['attention_mask']) == max_seq_length
        assert len(encoding['token_type_ids']) == max_seq_length
        assert len(encoding['bbox']) == max_seq_length

        return encoding
    # Encode the examples using the tokenizer
    encoded_examples = []
    for example in preprocessed_data:
        encoded_example = encode_example(example)  # Assuming you have encode_example function defined
        encoded_examples.append(encoded_example)
        #break
    return encoded_examples

In [18]:
def predict_label(device,model,tokenizer,pdf_path,label2idx, encoded_examples):

    classes= list(label2idx.keys())
    idx2label = {v: k for k, v in label2idx.items()}


    def prediction(encoded_examples):
        # Convert the encoded examples to PyTorch tensors
        input_ids = torch.tensor([example['input_ids'] for example in encoded_examples], dtype=torch.long).to(device)
        bbox = torch.tensor([example['bbox'] for example in encoded_examples], dtype=torch.long).to(device)
        attention_mask = torch.tensor([example['attention_mask'] for example in encoded_examples], dtype=torch.long).to(device)
        token_type_ids = torch.tensor([example['token_type_ids'] for example in encoded_examples], dtype=torch.long).to(device)

        # Run inference using the loaded model
        with torch.no_grad():
            outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids)
            predictions = torch.argmax(outputs.logits, dim=-1).cpu().numpy()

        classification_logits = outputs.logits
        classification_results = torch.softmax(classification_logits, dim=1).tolist()[0]
        result = {}

        for i in range(len(classes)):
            #print(f"{classes[i]}: {int(round(classification_results[i] * 100))}%")
            result[classes[i]] = int(round(classification_results[i] * 100))
        #print(result)

        #highest_index = classification_results.index(max(classification_results))
        #predicted_label = idx2label[highest_index]

        #return predicted_label
        return result

    #claculate mean probs
    sum_values = {}
    def cal_probs(probs):
        #global sum_values
        # Initialize sum_values dictionary
        for entry in probs:
            for key, value in entry.items():
                if key not in sum_values:
                    sum_values[key] = 0

        # Calculate the sum of values for each key across all dictionaries
        for entry in probs:
            for key, value in entry.items():
                sum_values[key] += value

        # Calculate the mean values for each key
        mean_values = {key: value / len(probs) for key, value in sum_values.items()}

        # Find the key with the highest mean value
        max_key = max(mean_values, key=mean_values.get)
        max_value = mean_values[max_key]

        return max_key#, max_value

    probs = []
    for example in encoded_examples:
        result = prediction([example])
        probs.append(result)

    #print(probs)
    predicted_label = cal_probs(probs)

    return predicted_label

In [20]:
def start_prediction(pdf_path):
    model_path_level1 = '/content/drive/MyDrive/dataset/saved_model/'
    model_level1, tokenizer_level1,device_level1 = load_model(model_path_level1)

    labels_path_level1 = '/content/drive/MyDrive/dataset/label2idx.pkl'
    with open(labels_path_level1, 'rb') as f:
        label2idx_level1 = pickle.load(f)

    model_path_level2 = '/content/drive/MyDrive/dataset/saved_model-level2/'
    model_level2, tokenizer_level2,device_level2 = load_model(model_path_level2)

    labels_path_level2 = '/content/drive/MyDrive/dataset/label2idx-level2.pkl'
    with open(labels_path_level2, 'rb') as f:
        label2idx_level2 = pickle.load(f)


    preprocessed_data = preprocess_new_pdf_images(pdf_path)

    encoded_examples_level1 = tokenization(preprocessed_data, tokenizer_level1)
    encoded_examples_level2 = tokenization(preprocessed_data, tokenizer_level2)

    level1_label = predict_label(device_level1 ,model_level1,tokenizer_level1 ,pdf_path, label2idx_level1,encoded_examples_level1)
    level2_label = predict_label(device_level2 ,model_level2,tokenizer_level2 ,pdf_path, label2idx_level2,encoded_examples_level2)

    return level1_label, level2_label


In [8]:



# model_path_level1 = '/content/drive/MyDrive/dataset/saved_model/'
# model_level1, tokenizer_level1,device_level1 = load_model(model_path_level1)

# labels_path_level1 = '/content/drive/MyDrive/dataset/label2idx.pkl'
# with open(labels_path_level1, 'rb') as f:
#     label2idx_level1 = pickle.load(f)

# model_path_level2 = '/content/drive/MyDrive/dataset/saved_model-level2/'
# model_level2, tokenizer_level2,device_level2 = load_model(model_path_level2)

# labels_path_level2 = '/content/drive/MyDrive/dataset/label2idx-level2.pkl'
# with open(labels_path_level2, 'rb') as f:
#     label2idx_level2 = pickle.load(f)


# pdf_path = "/content/drive/MyDrive/layout_dataset_multi_dir/cv/cat1/SUBBARAOGOGULAMUDI 4Y_6M.pdf"

# preprocessed_data = preprocess_new_pdf_images(pdf_path)

# encoded_examples_level1 = tokenization(preprocessed_data, tokenizer_level1)
# encoded_examples_level2 = tokenization(preprocessed_data, tokenizer_level2)

# predict_label(device_level1 ,model_level1,tokenizer_level1 ,pdf_path, label2idx_level1,encoded_examples_level1)

In [27]:
#Evalution

import os
import pandas as pd

def list_pdf_files_and_categories(root_dir):
    pdf_paths = []
    categories = []
    categories2 = []
    tree_split = root_dir.split("/")[-1]
    #print(tree_split )
    for root, dirs, files in os.walk(root_dir):
        for file in files:
            if file.endswith('.pdf'):
                pdf_path = os.path.join(root, file)
                pdf_paths.append(pdf_path)
                category = os.path.basename(root)
                category = pdf_path.split(tree_split)[-1].strip("/").split("/")[0]
                #print(category)
                categories.append(category)
                category = pdf_path.split(tree_split)[-1].strip("/").split("/")[1]
                categories2.append(category)

    if not pdf_paths:
        raise ValueError("No PDF files found in the directory.")

    return pd.DataFrame({'pdf_path': pdf_paths, 'level1_actual_label': categories, 'level2_actual_label': categories2})



# Set your root directory where PDF files are stored
root_directory = '/content/drive/MyDrive/layout_dataset_multi_dir'


data = list_pdf_files_and_categories(root_directory)

data

Unnamed: 0,pdf_path,level1_actual_label,level2_actual_label
0,/content/drive/MyDrive/layout_dataset_multi_di...,cv,cat1
1,/content/drive/MyDrive/layout_dataset_multi_di...,cv,cat1
2,/content/drive/MyDrive/layout_dataset_multi_di...,cv,cat2
3,/content/drive/MyDrive/layout_dataset_multi_di...,cv,cat2
4,/content/drive/MyDrive/layout_dataset_multi_di...,invoive,cat4
5,/content/drive/MyDrive/layout_dataset_multi_di...,invoive,cat4
6,/content/drive/MyDrive/layout_dataset_multi_di...,invoive,cat4
7,/content/drive/MyDrive/layout_dataset_multi_di...,invoive,cat4
8,/content/drive/MyDrive/layout_dataset_multi_di...,invoive,cat4
9,/content/drive/MyDrive/layout_dataset_multi_di...,invoive,cat3


In [32]:
data[["level1_predicted_label","level2_predicted_label"]] = data["pdf_path"].apply(lambda pdf_path: pd.Series(start_prediction(pdf_path)))
data

Unnamed: 0,pdf_path,level1_actual_label,level2_actual_label,level1_predicted_label,level2_predicted_label
0,/content/drive/MyDrive/layout_dataset_multi_di...,cv,cat1,cv,cat1
1,/content/drive/MyDrive/layout_dataset_multi_di...,cv,cat1,cv,cat1
2,/content/drive/MyDrive/layout_dataset_multi_di...,cv,cat2,cv,cat2
3,/content/drive/MyDrive/layout_dataset_multi_di...,cv,cat2,cv,cat2
4,/content/drive/MyDrive/layout_dataset_multi_di...,invoive,cat4,invoive,cat4
5,/content/drive/MyDrive/layout_dataset_multi_di...,invoive,cat4,invoive,cat4
6,/content/drive/MyDrive/layout_dataset_multi_di...,invoive,cat4,invoive,cat4
7,/content/drive/MyDrive/layout_dataset_multi_di...,invoive,cat4,invoive,cat4
8,/content/drive/MyDrive/layout_dataset_multi_di...,invoive,cat4,invoive,cat4
9,/content/drive/MyDrive/layout_dataset_multi_di...,invoive,cat3,invoive,cat3


In [34]:
data[['pdf_path', 'level1_actual_label','level1_predicted_label',  'level2_actual_label',
       'level2_predicted_label']]

Unnamed: 0,pdf_path,level1_actual_label,level1_predicted_label,level2_actual_label,level2_predicted_label
0,/content/drive/MyDrive/layout_dataset_multi_di...,cv,cv,cat1,cat1
1,/content/drive/MyDrive/layout_dataset_multi_di...,cv,cv,cat1,cat1
2,/content/drive/MyDrive/layout_dataset_multi_di...,cv,cv,cat2,cat2
3,/content/drive/MyDrive/layout_dataset_multi_di...,cv,cv,cat2,cat2
4,/content/drive/MyDrive/layout_dataset_multi_di...,invoive,invoive,cat4,cat4
5,/content/drive/MyDrive/layout_dataset_multi_di...,invoive,invoive,cat4,cat4
6,/content/drive/MyDrive/layout_dataset_multi_di...,invoive,invoive,cat4,cat4
7,/content/drive/MyDrive/layout_dataset_multi_di...,invoive,invoive,cat4,cat4
8,/content/drive/MyDrive/layout_dataset_multi_di...,invoive,invoive,cat4,cat4
9,/content/drive/MyDrive/layout_dataset_multi_di...,invoive,invoive,cat3,cat3


In [None]:
#Usage

In [21]:
pdf_path = "/content/drive/MyDrive/layout_dataset_multi_dir/cv/cat1/SUBBARAOGOGULAMUDI 4Y_6M.pdf"

label1 , label2 = start_prediction(pdf_path)

print(f"Document belongs to {label2}  in {label1}")

Document belongs to cat1  in cv


In [22]:
pdf_path = "/content/drive/MyDrive/input/OD124156933233239000.pdf"

label1 , label2 = start_prediction(pdf_path)

print(f"Document belongs to {label2}  in {label1}")


Document belongs to cat4  in invoive


In [24]:
pdf_path = "/content/drive/MyDrive/input/invoice (4).pdf"

label1 , label2 = start_prediction(pdf_path)

print(f"Document belongs to {label2}  in {label1}")

Document belongs to cat3  in invoive
