<a href="https://colab.research.google.com/github/Rishal14/data_preprocessor/blob/base/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import fitz  # PyMuPDF
import json
import re

def extract_text_from_pdf(pdf_path):
    """Extract the plain text of every page of a PDF.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        The text of all pages, concatenated in page order, as one string.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string page by page.
        return "".join(
            doc.load_page(page_num).get_text()
            for page_num in range(doc.page_count)
        )

def is_heading(line):
    """Return True when a line looks like a section heading.

    A line counts as a heading when it is entirely upper-case or when it
    starts with a number followed by a period (e.g. ``1. Introduction``).

    NOTE: ``str.isupper`` only requires that every cased character is
    upper-case, so lines made of upper-case codes and digits (for example a
    catalogue-number list such as ``A21-1; A21-4;``) are also reported as
    headings.

    Args:
        line: A single line of text, already stripped by the caller.

    Returns:
        bool: True if the line matches either heading rule, else False.
    """
    heading_pattern = r'^\d+\.\s*'  # Matches lines starting with a number followed by a period
    # Compare against None so the function always returns a real bool rather
    # than leaking a re.Match object (or None) to callers.
    return line.isupper() or re.match(heading_pattern, line) is not None

def convert_to_json(text):
    """Group the lines of ``text`` into heading/content sections.

    Each non-blank line is stripped. A line accepted by ``is_heading`` opens
    a new section; every other line is added to the content of the section
    that is currently open. Lines seen before the first heading are dropped.

    Returns a dict of the form
    ``{"sections": [{"heading": str, "content": [str, ...]}, ...]}``.
    """
    sections = []
    active = None  # section currently collecting content, if any

    for raw_line in text.split('\n'):
        stripped = raw_line.strip()
        if not stripped:
            continue  # blank line

        if is_heading(stripped):
            # Register the section as soon as it is opened; content appended
            # later lands in this same dict.
            active = {"heading": stripped, "content": []}
            sections.append(active)
            continue

        if active is not None:
            active["content"].append(stripped)

    return {"sections": sections}

def save_json(data, output_path):
    """Write ``data`` to ``output_path`` as JSON indented by four spaces.

    The target file is created, or overwritten if it already exists.
    """
    with open(output_path, mode='w') as handle:
        json.dump(obj=data, fp=handle, indent=4)

# Example usage: run the text-based pipeline end to end on one PDF.
# The input path is relative, so the PDF must be in the working directory;
# the structured result is written to output.json next to it.
pdf_file_path = "acetonitrile-hplc-grade-l (1).pdf"
output_json_path = "output.json"

# Extract the text of every page from the PDF
pdf_text = extract_text_from_pdf(pdf_file_path)

# Group the extracted lines into heading/content sections
structured_json = convert_to_json(pdf_text)

# Save the structured sections to the JSON file
save_json(structured_json, output_json_path)

# Also print the same JSON so it shows up in the cell output
print(json.dumps(structured_json, indent=4))


{
    "sections": [
        {
            "heading": "SAFETY DATA SHEET",
            "content": [
                "Creation Date  16-Jun-2009",
                "Revision Date  13-Oct-2023",
                "Revision Number  8"
            ]
        },
        {
            "heading": "1. Identification",
            "content": [
                "Product Name",
                "Acetonitrile",
                "Cat No. :"
            ]
        },
        {
            "heading": "A21-1; A21-4; A21-20; A21-200; A21-200LC; A21FB-19; A21FB-50;",
            "content": []
        },
        {
            "heading": "A21FB-115; A21FB-200; A21RB-115; A21RS-19; A21RS-28; A21RS-50;",
            "content": []
        },
        {
            "heading": "A21RS-115; A21RS-200; A21RS-1350; A21FB-445; XXA21PD200LI;",
            "content": []
        },
        {
            "heading": "A993-1; A993RS-19; A996-1; A996-4; A996-4LC; A996N2-19;",
            "content": []
        },
        {
         

In [None]:
!pip install pytesseract
!sudo apt install tesseract-ocr
from transformers import LayoutLMv3Tokenizer, LayoutLMv3ForTokenClassification
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
import torch

# Load the LayoutLMv3 tokenizer and token-classification model from Hugging Face.
# NOTE: the "microsoft/layoutlmv3-base" checkpoint does not contain the
# classifier weights, so the 5-label head created here is newly initialised
# (this cell prints a warning saying so). Predictions are not meaningful until
# the model has been trained on a down-stream task.
tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base", num_labels=5)  # Adjust num_labels if needed

def pdf_to_images(pdf_path):
    """Render the PDF at ``pdf_path`` with pdf2image and return the result."""
    return convert_from_path(pdf_path)

def preprocess_images(images):
    """Run Tesseract OCR on each page image and collect text plus word boxes.

    The previous implementation used ``pytesseract.image_to_boxes``, which
    reports one box per *character*. LayoutLM needs one box per *word*, and
    the mismatch caused "You must provide as many words as there are bounding
    boxes". Boxes are now taken from ``pytesseract.image_to_data`` (word
    level) and serialised in the same line grammar as before, so the return
    type is unchanged.

    Args:
        images: Iterable of PIL images, one per page.

    Returns:
        A list with one ``(text, boxes)`` tuple per image. ``text`` is the
        plain OCR text. ``boxes`` is a newline-separated string with one line
        per recognised word, formatted as ``word left bottom right top page``
        with the origin at the bottom-left corner of the image (the Tesseract
        box-file convention).
    """
    ocr_data = []
    for image in images:
        image_rgb = image.convert("RGB")
        page_height = image_rgb.size[1]

        # Plain text of the page, kept exactly as before for display/output.
        text = pytesseract.image_to_string(image_rgb)

        # Word-level layout: parallel lists keyed by column name, with
        # coordinates measured from the top-left corner.
        data = pytesseract.image_to_data(image_rgb, output_type=pytesseract.Output.DICT)

        box_lines = []
        rows = zip(data["text"], data["left"], data["top"],
                   data["width"], data["height"], data["page_num"])
        for word, left, top, box_width, box_height, page_num in rows:
            word = str(word).strip()
            if not word:
                continue  # page/block/paragraph/line rows carry no text

            # Convert to the bottom-left origin used by the box-file format.
            upper = page_height - top
            lower = page_height - (top + box_height)
            box_lines.append(
                f"{word} {left} {lower} {left + box_width} {upper} {page_num - 1}"
            )

        ocr_data.append((text, "\n".join(box_lines)))
    return ocr_data


def _scale_to_layoutlm(value, extent):
    """Map a pixel coordinate to LayoutLM's 0-1000 range, clamped to it."""
    return max(0, min(1000, int(1000 * value / extent)))


def convert_to_layoutlm_input(text, boxes, image):
    """Convert OCR results and boxes to LayoutLM input format.

    Each line of ``boxes`` is treated as one token with its own box, so the
    number of words always equals the number of boxes handed to the
    tokenizer.

    Args:
        text: Plain OCR text of the page. Kept for interface compatibility;
            the tokens are read from ``boxes`` so that every token is
            guaranteed to have a box.
        boxes: Newline-separated ``token left bottom right top page`` lines
            with a bottom-left origin, as produced by ``preprocess_images``.
            Malformed lines are skipped.
        image: The page image; only ``image.size`` is used, to normalise the
            coordinates.

    Returns:
        The encoding returned by the module-level ``tokenizer`` (PyTorch
        tensors, padded to the maximum length and truncated if longer).
    """
    words = []
    normalized_boxes = []

    width, height = image.size
    for entry in boxes.splitlines():
        # Split from the right so an unusual token cannot shift the numbers.
        fields = entry.rsplit(' ', 5)
        if len(fields) != 6 or not fields[0]:
            continue
        try:
            left, bottom, right, top = (int(value) for value in fields[1:5])
        except ValueError:
            continue

        # Flip y to a top-left origin: the box's *top* edge becomes the
        # smaller y value. (The old code swapped these, giving ymin > ymax.)
        x0, x1 = sorted((_scale_to_layoutlm(left, width),
                         _scale_to_layoutlm(right, width)))
        y0, y1 = sorted((_scale_to_layoutlm(height - top, height),
                         _scale_to_layoutlm(height - bottom, height)))

        words.append(fields[0])
        normalized_boxes.append([x0, y0, x1, y1])

    # Tokenize the words and prepare inputs for LayoutLM
    encoded_inputs = tokenizer(words, boxes=normalized_boxes, return_tensors="pt", padding="max_length", truncation=True)
    return encoded_inputs

def extract_structured_data_from_pdf(pdf_path):
    """Run OCR and LayoutLMv3 token classification on every page of a PDF.

    Args:
        pdf_path: Path of the PDF to process.

    Returns:
        A list with one dict per page, containing:
        ``page_number`` (1-based), ``text`` (the OCR text of the page),
        ``predictions`` (the arg-max label id for each position of the
        encoded page) and ``bounding_boxes`` (the ``bbox`` tensor given to
        the model, as nested lists).
    """
    images = pdf_to_images(pdf_path)
    ocr_data = preprocess_images(images)

    structured_data = []
    # Pair each page image with its OCR result instead of indexing by position.
    for page_number, (image, (text, boxes)) in enumerate(zip(images, ocr_data), start=1):
        input_data = convert_to_layoutlm_input(text, boxes, image)
        bbox = input_data['bbox']

        # Inference only: disable autograd so no gradient graph is built and
        # kept in memory for every page.
        with torch.no_grad():
            outputs = model(
                input_ids=input_data['input_ids'],
                attention_mask=input_data['attention_mask'],
                bbox=bbox,
            )
        # Batch size is 1, so take the label ids of the only sequence.
        predictions = outputs.logits.argmax(dim=-1).tolist()[0]

        structured_data.append({
            "page_number": page_number,
            "text": text,
            "predictions": predictions,
            "bounding_boxes": bbox.tolist()
        })

    return structured_data

# Example usage: run the OCR + LayoutLMv3 pipeline on one PDF.
pdf_file_path = "acetonitrile-hplc-grade-l (1).pdf"  # Upload your PDF to Colab
structured_data = extract_structured_data_from_pdf(pdf_file_path)

# Print, for every page, the OCR text, the predicted label ids and the
# bounding boxes that were passed to the model
for page in structured_data:
    print(f"Page {page['page_number']} Content:")
    print("Text:", page['text'])
    print("Predictions:", page['predictions'])
    print("Bounding Boxes:", page['bounding_boxes'])


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 49 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 1s (4,036 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debc

Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ValueError: You must provide as many words as there are bounding boxes