<a href="https://colab.research.google.com/github/rahulgourshetty/collegezone/blob/master/Copy_of_Untitled2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from datasets import load_dataset
from transformers import (
    LayoutLMv2Processor,
    LayoutLMv2ImageProcessor,
    LayoutLMv2ForTokenClassification,
    TrainingArguments,
    Trainer,
    EvalPrediction
)
import numpy as np
from sklearn.metrics import classification_report

# Check GPU availability
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# Load the dataset and carve out DISJOINT training / evaluation subsets.
# Evaluating on rows that are also trained on would report inflated metrics,
# so the evaluation rows are taken right after the last training row.
# The sizes are clamped to the split length so a smaller split cannot cause
# an out-of-range selection.
dataset = load_dataset("mathieu1256/FATURA2-invoices")
full_split = dataset["train"]
MAX_TRAIN_SAMPLES = 10000
NUM_EVAL_SAMPLES = 20
num_eval = min(NUM_EVAL_SAMPLES, len(full_split))
num_train = min(MAX_TRAIN_SAMPLES, len(full_split) - num_eval)
train_dataset = full_split.select(range(num_train))
test_dataset = full_split.select(range(num_train, num_train + num_eval))

# Create a label mapping for contiguous labels.
# The raw tag ids are not contiguous, so they are remapped to 0..num_labels-1.
# Both subsets are scanned so that an evaluation row can never contain a tag
# that is missing from label2id.
unique_labels = sorted(
    set(
        label
        for subset in (train_dataset, test_dataset)
        for sublist in subset["ner_tags"]
        for label in sublist
    )
)
label2id = {label: idx for idx, label in enumerate(unique_labels)}
# NOTE(review): id2label maps to the raw integer tag ids, not to field names;
# if the dataset exposes tag names, storing those would be more readable.
id2label = {idx: label for idx, label in enumerate(unique_labels)}
num_labels = len(unique_labels)

print("Original unique labels:", unique_labels)
print("Mapping (label2id):", label2id)
print("num_labels:", num_labels)

# Load processor & image processor.
# OCR is disabled on both: words and boxes are supplied by the dataset, so
# running Tesseract on every image would only cost time.
processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased", apply_ocr=False)
image_processor = LayoutLMv2ImageProcessor.from_pretrained(
    "microsoft/layoutlmv2-base-uncased", apply_ocr=False
)

# Load LayoutLMv2 model with the correct number of labels
model = LayoutLMv2ForTokenClassification.from_pretrained(
    "microsoft/layoutlmv2-base-uncased", num_labels=num_labels
)
model.config.label2id = label2id
model.config.id2label = id2label
model.to(device)

# Preprocessing function
def preprocess_data(example, max_length=512):
    """Turn one raw dataset row into model-ready LayoutLMv2 inputs.

    The processor (created with ``apply_ocr=False``) is given the page image,
    the words, their boxes and the word-level labels in a single call. It
    resizes the image, tokenizes the words into sub-tokens and aligns both
    the boxes and the labels to those sub-tokens, so ``input_ids``, ``bbox``
    and ``labels`` all line up position by position.

    Args:
        example: A dataset row with the keys ``"image"``, ``"tokens"``,
            ``"bboxes"`` and ``"ner_tags"``.
        max_length: Sequence length every field is padded / truncated to.

    Returns:
        dict mapping each processor output name to a tensor without the
        leading batch dimension.

    Raises:
        KeyError: If a tag in ``example["ner_tags"]`` is not in ``label2id``.
    """
    image = example["image"]
    # The LayoutLMv2 usage examples feed RGB images; convert when the object
    # supports it (PIL images do) and leave other inputs untouched.
    if hasattr(image, "convert"):
        image = image.convert("RGB")

    # Remap the raw tag ids to the contiguous ids the classifier head uses.
    word_labels = [label2id[label] for label in example["ner_tags"]]

    # NOTE(review): boxes are assumed to be already normalised to the
    # 0-1000 range LayoutLMv2 expects - confirm against the dataset.
    encoding = processor(
        images=image,
        text=example["tokens"],
        boxes=example["bboxes"],
        # Passing the labels here lets the processor expand them to sub-token
        # level; positions that must not contribute to the loss (special
        # tokens, padding, non-first sub-tokens) receive -100.
        word_labels=word_labels,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt",
    )

    # A single example was encoded, so drop the batch dimension of size 1.
    return {k: v.squeeze(0) for k, v in encoding.items()}

# Apply preprocessing: every raw column is dropped once the encoded,
# model-ready fields have been produced from it.
raw_columns = ["image", "tokens", "bboxes", "id", "ner_tags"]
train_dataset = train_dataset.map(preprocess_data, remove_columns=list(raw_columns))
test_dataset = test_dataset.map(preprocess_data, remove_columns=list(raw_columns))

# Training arguments
# Mixed precision is only enabled when a CUDA device is present.
use_fp16 = torch.cuda.is_available()
# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    num_train_epochs=5,
    per_device_train_batch_size=3,
    per_device_eval_batch_size=1,
    evaluation_strategy="epoch",
    logging_steps=10,
    save_steps=50,
    save_total_limit=1,
    fp16=use_fp16,
)

# Compute metrics
def compute_metrics(p: EvalPrediction):
    """Build a per-class classification report for token predictions.

    Args:
        p: Evaluation output whose ``predictions`` hold per-token logits
            (class scores on the last axis) and whose ``label_ids`` hold
            the gold class ids, with -100 marking positions to ignore.

    Returns:
        dict: The ``classification_report`` dictionary with every top-level
        key converted to ``str``.
    """
    preds = np.argmax(p.predictions, axis=-1).flatten()
    labels = p.label_ids.flatten()
    mask = labels != -100  # Ignore padded / non-scored positions
    preds = preds[mask]
    labels = labels[mask]

    # Pin the class list explicitly: without `labels=` the report infers the
    # classes from the data and raises when the (small) evaluation set does
    # not contain every class named in `target_names`.
    class_ids = list(range(num_labels))
    report = classification_report(
        labels,
        preds,
        labels=class_ids,
        target_names=[str(id2label[i]) for i in class_ids],
        output_dict=True,
        # Classes absent from the evaluation set score 0 instead of warning.
        zero_division=0,
    )

    # Convert all keys in the report to strings
    return {str(key): value for key, value in report.items()}

# Setup Trainer: gather the pieces first, then hand them over in one go.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

# Fine-tune the model.
trainer.train()

# Score the fine-tuned model on the evaluation subset and show the metrics.
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)

# Persist the fine-tuned weights and config for the inference cell.
trainer.save_model("./saved_model")
print("✅ Training and evaluation complete! Model saved in './saved_model'")


Using device: cuda
Original unique labels: [1, 2, 3, 4, 5, 6, 10, 11, 12, 13]
Mapping (label2id): {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 10: 6, 11: 7, 12: 8, 13: 9}
num_labels: 10


Some weights of LayoutLMv2ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv2-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]



KeyboardInterrupt: 

In [None]:
!pip install pytesseract
!apt-get install -y tesseract-ocr


Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 19 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 

In [None]:
pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [None]:
!pip install torch torchvision torchaudio
!pip install 'git+https://github.com/facebookresearch/detectron2.git'


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
# Install Poppler (needed by pdf2image) and the required Python packages.
!apt-get install -y poppler-utils
!pip install pdf2image pytesseract transformers datasets pandas


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 19 not upgraded.
Need to get 186 kB of archives.
After this operation, 696 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.6 [186 kB]
Fetched 186 kB in 0s (444 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 124973 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.6_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.6) ...
Setting up poppler-utils (22.02.0-2ubuntu0.6) ...
Processing triggers for man-db (2.10.2-1) ...
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pdf2image
Success

In [None]:
import torch
import pytesseract
from pdf2image import convert_from_path
from transformers import LayoutLMv2Processor, LayoutLMv2ForTokenClassification
import pandas as pd

# Pick the compute device: GPU when one is visible, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)

# Directory the training cell wrote the fine-tuned model to.
model_path = "./saved_model"

# The processor comes from the original checkpoint because the saved_model
# folder does not include processor configuration files. OCR is disabled:
# words and boxes are supplied explicitly at inference time.
processor = LayoutLMv2Processor.from_pretrained(
    "microsoft/layoutlmv2-base-uncased",
    apply_ocr=False,
)

# Load the fine-tuned weights, move them to the device and switch to
# evaluation mode.
model = LayoutLMv2ForTokenClassification.from_pretrained(model_path)
model.to(device).eval()

# Class-id -> label mapping as stored in the model configuration.
id2label = model.config.id2label

def extract_text_and_boxes(image):
    """
    Run pytesseract OCR on a PIL image.

    Returns a pair ``(tokens, boxes)``: the non-empty recognised strings and,
    for each of them, its ``[x1, y1, x2, y2]`` box scaled to 0-1000 relative
    to the image width and height.
    """
    ocr = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
    img_w, img_h = image.size

    def to_thousand(value, extent):
        # LayoutLMv2 expects coordinates normalised to the 0-1000 range.
        return int(1000 * (value / extent))

    tokens = []
    boxes = []
    fields = zip(ocr['text'], ocr['left'], ocr['top'], ocr['width'], ocr['height'])
    for raw_text, left, top, box_w, box_h in fields:
        word = raw_text.strip()
        if not word:
            # Tesseract also emits empty entries; skip them.
            continue
        tokens.append(word)
        # Tesseract gives (left, top, width, height); convert to corners.
        boxes.append([
            to_thousand(left, img_w),
            to_thousand(top, img_h),
            to_thousand(left + box_w, img_w),
            to_thousand(top + box_h, img_h),
        ])
    return tokens, boxes

def _first_subtoken_labels(word_ids, preds, id2label, num_words):
    """Map per-token predictions back to one label per OCR word.

    Each word takes the prediction of its first sub-token. Words that were
    cut off by truncation never show up in ``word_ids`` and stay ``None``.
    """
    labels = [None] * num_words
    for position, word_index in enumerate(word_ids):
        # `None` marks special / padding tokens; a word that already has a
        # label is being revisited through one of its later sub-tokens.
        if word_index is None or labels[word_index] is not None:
            continue
        labels[word_index] = id2label.get(preds[position], "O")
    return labels


def process_pdf(pdf_path):
    """
    Processes a PDF file page by page:
      1. Converts each page to an image.
      2. Uses OCR to extract tokens and bounding boxes.
      3. Uses the LayoutLMv2 processor and model to predict token labels.
      4. Aggregates tokens by label (ignoring those with label "O").
    Returns a list of dictionaries (one per page) containing the extracted fields.

    Pages on which OCR finds no text contribute an empty dictionary, so the
    result always has one entry per page. Words beyond the 512-token limit
    are truncated by the processor and therefore not part of the output.
    """
    # Convert PDF pages to images.
    pages = convert_from_path(pdf_path)
    extracted_data_list = []

    for page in pages:
        # Extract tokens and bounding boxes from the image using OCR.
        tokens, boxes = extract_text_and_boxes(page)

        # Nothing to classify on a blank page; keep the page's row, though.
        if not tokens:
            extracted_data_list.append({})
            continue

        # Process the image and OCR output with the processor.
        # Note: tokens and boxes are wrapped in another list because a batch
        # (of one page) is passed.
        encoding = processor(
            images=[page],
            text=[tokens],
            boxes=[boxes],
            truncation=True,
            padding="max_length",
            max_length=512,
            return_tensors="pt"
        )
        # Words are split into sub-tokens, so position i of the model output
        # is NOT word i. Grab the token -> word mapping before the encoding
        # is turned into a plain dict (requires the fast tokenizer).
        word_ids = encoding.word_ids(batch_index=0)

        # Move tensors to the same device as the model.
        model_inputs = {k: v.to(device) for k, v in encoding.items()}

        # Run the model to get predictions.
        with torch.no_grad():
            outputs = model(**model_inputs)
        # Get predictions (logits) and convert to label IDs.
        preds = outputs.logits.argmax(-1).squeeze(0).tolist()

        predicted_labels = _first_subtoken_labels(word_ids, preds, id2label, len(tokens))

        # Aggregate tokens by their predicted label (skip label "O" and
        # words that were truncated away and thus have no prediction).
        extracted = {}
        for token, label in zip(tokens, predicted_labels):
            if label is None or label == "O":
                continue
            if label in extracted:
                extracted[label] += " " + token
            else:
                extracted[label] = token
        extracted_data_list.append(extracted)

    return extracted_data_list

# --- Main Execution ---

# Location of the PDF to analyse.
pdf_path = "/content/sample.pdf"  # Replace with your PDF file path

# Run the full OCR + LayoutLMv2 pipeline; one dict of fields per page.
extracted_data_list = process_pdf(pdf_path)

# Tabulate the per-page dictionaries (one row per PDF page) and write them
# out as CSV without the index column.
df = pd.DataFrame(data=extracted_data_list)
df.to_csv(path_or_buf="extracted_data.csv", index=False)
print("Data extracted and saved to extracted_data.csv")


Using device: cuda
Data extracted and saved to extracted_data.csv
