In [8]:
# !pip install pdf2image
# !sudo apt-get install poppler-utils
# !pip install datasets
# !pip install pytesseract
# !apt install tesseract-ocr

In [9]:
from pdf2image import convert_from_path
from PIL import Image
import pytesseract
import torch
from transformers import LayoutLMForSequenceClassification, LayoutLMTokenizer

In [10]:
# Load the saved model and tokenizer

def load_model(model_path):
    """Restore the saved LayoutLM classifier and its tokenizer from ``model_path``.

    Args:
        model_path: Directory (or identifier) understood by ``from_pretrained``.

    Returns:
        A ``(model, tokenizer, device)`` tuple. The model has already been
        moved to ``device``, which is CUDA when available and the CPU otherwise.
    """
    # Pick the accelerator first so the choice is visible up front.
    backend = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(backend)

    tokenizer = LayoutLMTokenizer.from_pretrained(model_path)
    model = LayoutLMForSequenceClassification.from_pretrained(model_path)
    model.to(device)

    return model, tokenizer, device


In [53]:
def _normalize_box(box, width, height):
    """Scale a pixel-space ``[x0, y0, x1, y1]`` box onto the 0-1000 grid.

    Args:
        box: Four pixel coordinates ``[x0, y0, x1, y1]``.
        width: Page width in pixels.
        height: Page height in pixels.

    Returns:
        A list of four ints, each clamped to the inclusive range 0-1000.
    """
    scaled = [
        int(1000 * (box[0] / width)),
        int(1000 * (box[1] / height)),
        int(1000 * (box[2] / width)),
        int(1000 * (box[3] / height)),
    ]
    # Clamp so a box reported slightly outside the page cannot leave the
    # 0-1000 coordinate range the bounding-box inputs are expressed in.
    return [min(1000, max(0, value)) for value in scaled]


def _ocr_first_page(pdf_path):
    """OCR the first page of ``pdf_path`` into words and normalized boxes.

    Only page 1 is rendered because ``predict_label`` derives its answer from
    the first page alone; rendering and OCR-ing the remaining pages would be
    wasted work.

    Returns:
        ``{'image_path': pdf_path, 'words': [...], 'bbox': [...]}`` where
        ``words`` and ``bbox`` have the same length.

    Raises:
        ValueError: If no page could be rendered from the PDF.
    """
    pages = convert_from_path(pdf_path, first_page=1, last_page=1)
    if not pages:
        raise ValueError(f"No pages could be rendered from PDF: {pdf_path}")
    image = pages[0]

    # A single image_to_data call yields both the words and their boxes.
    ocr_df = pytesseract.image_to_data(image, output_type='data.frame')
    ocr_df = ocr_df.dropna().reset_index(drop=True)

    # The OCR data frame can type numeric-looking tokens as numbers; the
    # tokenizer and ' '.join below both require strings.
    words = [str(word) for word in ocr_df['text']]

    width, height = image.size
    boxes = [
        _normalize_box([x, y, x + w, y + h], width, height)
        for x, y, w, h in zip(
            ocr_df['left'], ocr_df['top'], ocr_df['width'], ocr_df['height']
        )
    ]

    return {'image_path': pdf_path, 'words': words, 'bbox': boxes}


def _encode_page(tokenizer, example, max_seq_length=512, pad_token_box=(0, 0, 0, 0)):
    """Turn one OCR'd page into fixed-length model inputs.

    Args:
        tokenizer: Tokenizer exposing ``tokenize`` and ``__call__``.
        example: Dict with parallel ``'words'`` and ``'bbox'`` lists.
        max_seq_length: Length every returned sequence is padded/truncated to.
        pad_token_box: Box assigned to padding positions.

    Returns:
        The tokenizer's encoding with an added ``'bbox'`` entry holding one box
        per position (``max_seq_length`` boxes in total).

    Raises:
        ValueError: If words and boxes are not parallel, or if the per-token
            boxes do not line up with the tokenizer's output.
    """
    words = example['words']
    word_boxes = example['bbox']
    if len(words) != len(word_boxes):
        raise ValueError(
            f"Got {len(words)} words but {len(word_boxes)} bounding boxes."
        )

    # Every word-piece inherits the box of the word it came from.
    token_boxes = []
    for word, box in zip(words, word_boxes):
        token_boxes.extend([box] * len(tokenizer.tokenize(word)))

    # Leave room for the two special tokens, then add their boxes.
    special_tokens_count = 2
    token_boxes = token_boxes[: max_seq_length - special_tokens_count]
    token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]

    # One tokenizer call, with the length stated explicitly so truncation does
    # not depend on whatever maximum the tokenizer was saved with.
    encoding = tokenizer(
        ' '.join(words),
        padding='max_length',
        truncation=True,
        max_length=max_seq_length,
    )

    real_token_count = sum(encoding['attention_mask'])
    if real_token_count != len(token_boxes):
        raise ValueError(
            f"Token/box misalignment: {real_token_count} tokens vs "
            f"{len(token_boxes)} boxes."
        )

    token_boxes += [list(pad_token_box)] * (max_seq_length - len(token_boxes))
    encoding['bbox'] = token_boxes

    for key in ('input_ids', 'attention_mask', 'token_type_ids', 'bbox'):
        if len(encoding[key]) != max_seq_length:
            raise ValueError(
                f"'{key}' has length {len(encoding[key])}, expected {max_seq_length}."
            )

    return encoding


def predict_label(device, model, tokenizer, pdf_path, id2label=None):
    """Classify a PDF from the OCR'd content of its first page.

    Args:
        device: Torch device (or device string) the input tensors are moved to.
        model: Sequence-classification model called with ``input_ids``,
            ``bbox``, ``attention_mask`` and ``token_type_ids``; its result must
            expose ``.logits``.
        tokenizer: Tokenizer matching ``model``.
        pdf_path: Path of the PDF to classify.
        id2label: Optional mapping from class index to label name. When
            omitted, the notebook-level ``idx2label`` mapping is used.

    Returns:
        The label whose class has the highest probability for the first page.

    Raises:
        NameError: If ``id2label`` is omitted and no global ``idx2label`` exists.
        ValueError: If the PDF yields no page or the encoding is inconsistent.
    """
    if id2label is None:
        # Backward-compatible fallback to the mapping defined in the notebook.
        id2label = globals().get('idx2label')
        if id2label is None:
            raise NameError(
                "name 'idx2label' is not defined; pass id2label=... or define "
                "idx2label before calling predict_label"
            )

    example = _ocr_first_page(pdf_path)
    encoding = _encode_page(tokenizer, example)

    def as_batch(key):
        # Wrap the single page in a batch dimension of size one.
        return torch.tensor([encoding[key]], dtype=torch.long).to(device)

    with torch.no_grad():
        outputs = model(
            input_ids=as_batch('input_ids'),
            bbox=as_batch('bbox'),
            attention_mask=as_batch('attention_mask'),
            token_type_ids=as_batch('token_type_ids'),
        )

    probabilities = torch.softmax(outputs.logits, dim=1).tolist()[0]
    highest_index = probabilities.index(max(probabilities))
    return id2label[highest_index]

In [None]:
#Prediction

In [59]:
# Restore the saved classifier, its tokenizer and the device it lives on.
model_path = '/content/drive/MyDrive/dataset/saved_model/'
model, tokenizer, device = load_model(model_path)

# Class names; a name's position in the list is its class index.
labels = ['cv', 'invoive']
idx2label = dict(enumerate(labels))
label2idx = {name: index for index, name in enumerate(labels)}
label2idx

{'cv': 0, 'invoive': 1}

In [55]:
# Smoke test on a single file taken from the cv folder.
pdf_path = "/content/drive/MyDrive/layout_dataset_multi_dir/cv/cat1/SUBBARAOGOGULAMUDI 4Y_6M.pdf"
predict_label(device, model, tokenizer, pdf_path)

'CV'

In [56]:
import os
import pandas as pd

def list_pdf_files_and_categories(root_dir):
    """Collect every PDF under ``root_dir`` together with its category.

    The category of a file is the first component of its path relative to
    ``root_dir`` (for ``root/cv/cat1/a.pdf`` that is ``'cv'``). For a PDF lying
    directly in ``root_dir`` the first component is the file name itself.

    Args:
        root_dir: Directory to search recursively. A trailing path separator
            is accepted.

    Returns:
        A DataFrame with columns ``pdf_path`` and ``actual_label``, ordered by
        a sorted directory walk so repeated runs list files in the same order.

    Raises:
        ValueError: If no PDF file is found under ``root_dir``.
    """
    pdf_paths = []
    categories = []

    for current_dir, dirs, files in os.walk(root_dir):
        # Sorting in place makes os.walk descend in a deterministic order.
        dirs.sort()
        for file_name in sorted(files):
            # Accept '.PDF' and other case variants as well.
            if not file_name.lower().endswith('.pdf'):
                continue

            pdf_path = os.path.join(current_dir, file_name)

            # Derive the category from the path relative to the root instead
            # of splitting on the root's name, which fails for a trailing
            # slash and when that name reappears deeper in the path.
            relative_path = os.path.relpath(pdf_path, root_dir)
            category = relative_path.split(os.sep)[0]

            pdf_paths.append(pdf_path)
            categories.append(category)

    if not pdf_paths:
        raise ValueError("No PDF files found in the directory.")

    return pd.DataFrame({'pdf_path': pdf_paths, 'actual_label': categories})



# Set your root directory where PDF files are stored
# Top-level folder whose sub-folders name the document categories.
root_directory = "/content/drive/MyDrive/layout_dataset_multi_dir"

# One row per PDF: its path and the label implied by its folder.
data = list_pdf_files_and_categories(root_directory)
print(data)

                                            pdf_path actual_label
0  /content/drive/MyDrive/layout_dataset_multi_di...           cv
1  /content/drive/MyDrive/layout_dataset_multi_di...           cv
2  /content/drive/MyDrive/layout_dataset_multi_di...           cv
3  /content/drive/MyDrive/layout_dataset_multi_di...           cv
4  /content/drive/MyDrive/layout_dataset_multi_di...      invoive
5  /content/drive/MyDrive/layout_dataset_multi_di...      invoive
6  /content/drive/MyDrive/layout_dataset_multi_di...      invoive
7  /content/drive/MyDrive/layout_dataset_multi_di...      invoive
8  /content/drive/MyDrive/layout_dataset_multi_di...      invoive


In [57]:
# Classify every listed PDF and store the result next to the folder-derived label.
data["predicted_label"] = [
    predict_label(device, model, tokenizer, path) for path in data["pdf_path"]
]
data

Unnamed: 0,pdf_path,actual_label,predicted_label
0,/content/drive/MyDrive/layout_dataset_multi_di...,cv,CV
1,/content/drive/MyDrive/layout_dataset_multi_di...,cv,CV
2,/content/drive/MyDrive/layout_dataset_multi_di...,cv,CV
3,/content/drive/MyDrive/layout_dataset_multi_di...,cv,CV
4,/content/drive/MyDrive/layout_dataset_multi_di...,invoive,invoive
5,/content/drive/MyDrive/layout_dataset_multi_di...,invoive,invoive
6,/content/drive/MyDrive/layout_dataset_multi_di...,invoive,invoive
7,/content/drive/MyDrive/layout_dataset_multi_di...,invoive,invoive
8,/content/drive/MyDrive/layout_dataset_multi_di...,invoive,invoive
