<a href="https://colab.research.google.com/github/Rishal14/data_preprocessor/blob/base/data_preprocessor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
# Install required packages
!apt-get install -y poppler-utils
!pip install pdf2image pytesseract
!apt-get install tesseract-ocr

from pdf2image import convert_from_path
import pytesseract
import json
import re
from google.colab import files

# Step 1: Upload PDF file
uploaded = files.upload()

# Fail with a clear message instead of a bare StopIteration when nothing was
# uploaded (for example, the upload dialog was cancelled).
if not uploaded:
    raise ValueError("No file was uploaded; please select a PDF to process.")

# Get the uploaded file name: the first key of the mapping returned by
# files.upload().
pdf_file_path = next(iter(uploaded))

def extract_text_with_ocr(pdf_path):
    """Run OCR over every page of a PDF and collect its non-blank lines.

    The PDF is rendered to page images at 300 DPI with ``convert_from_path``
    and each image is passed to ``pytesseract.image_to_string``.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of dicts, in page and line order, one per non-blank OCR line:
        ``"text"`` holds the whitespace-stripped line, ``"font_size"`` is
        always 12 and ``"is_bold"`` is always False, because this function
        does not read any font information from the OCR result.
    """
    # One OCR string per rendered page.
    page_texts = (
        pytesseract.image_to_string(page_image)
        for page_image in convert_from_path(pdf_path, dpi=300)
    )
    # Flatten the pages into individual lines with surrounding whitespace removed.
    stripped_lines = (
        raw_line.strip()
        for page_text in page_texts
        for raw_line in page_text.splitlines()
    )
    # Lines that are empty after stripping are skipped.
    return [
        {"text": content, "font_size": 12, "is_bold": False}
        for content in stripped_lines
        if content
    ]

# Whole-line page-marker shapes, matched case-insensitively:
#   "12"                                   - a bare number
#   "3 of 10"                              - number "of" total
#   "Page 3", "Page 3 of 10", "Page 3/10"  - "Page" followed by a number,
#                                            optionally with a total
_PAGE_NUMBER_RE = re.compile(
    r'(?:\d+(?:\s+of\s+\d+)?|page\s+\d+(?:\s*(?:of|/)\s*\d+)?)',
    re.IGNORECASE,
)


def is_page_number(text):
    """Check whether a line consists solely of a page-number marker.

    The whole line (after stripping surrounding whitespace) must be a page
    marker. A line that merely starts with one, such as
    "Page 3 shows the results", is ordinary content and is not matched.
    Matching ignores case so that "PAGE 2" is recognised as well.

    Args:
        text: A single line of text.

    Returns:
        True if the line is a page marker, False otherwise (including for an
        empty or whitespace-only line).
    """
    # fullmatch anchors both ends, so trailing prose disqualifies the line.
    return _PAGE_NUMBER_RE.fullmatch(text.strip()) is not None

def is_heading(text):
    """Decide whether a line should be treated as a section heading.

    A line counts as a heading when it is written entirely in upper case
    (per ``str.isupper``) or when it begins with one or more digits followed
    by a period, e.g. "1. Identification".

    Args:
        text: A single line of text.

    Returns:
        True if the line looks like a heading, False otherwise.
    """
    # Adjust these rules to better fit the style of your PDF.
    if text.isupper():
        return True
    numbered_prefix = re.match(r'^\d+\.', text)
    return bool(numbered_prefix)

def convert_to_json(text_blocks):
    """Group OCR text blocks into sections keyed by their headings.

    Blocks are processed in order. Empty lines and page-number lines (per
    ``is_page_number``) are skipped. A line recognised by ``is_heading``
    starts a new section; every other line is appended to the content of the
    section currently open.

    Lines that appear before the first heading are kept rather than
    discarded: they are collected in a leading section whose heading is the
    empty string. A document without any heading therefore yields a single
    section with heading "" instead of an empty result.

    Args:
        text_blocks: Iterable of dicts that each have a ``"text"`` key.

    Returns:
        A dict of the form
        ``{"sections": [{"heading": str, "content": [str, ...]}, ...]}``
        with sections in document order.
    """
    sections = []
    current_section = None

    for block in text_blocks:
        text = block["text"]

        if not text or is_page_number(text):  # Skip empty lines and page numbers
            continue

        if is_heading(text):
            # A heading closes the previous section and opens a new one.
            # Registering the section right away keeps document order and
            # removes the need for a final flush after the loop.
            current_section = {"heading": text, "content": []}
            sections.append(current_section)
            continue

        if current_section is None:
            # Content seen before any heading: open an untitled section so
            # this text is not silently lost.
            current_section = {"heading": "", "content": []}
            sections.append(current_section)

        current_section["content"].append(text)

    return {"sections": sections}

def save_json(data, output_path):
    """Write ``data`` to ``output_path`` as JSON indented by four spaces.

    Args:
        data: JSON-serialisable object to store.
        output_path: Path of the file to create or overwrite.
    """
    with open(output_path, mode='w') as handle:
        json.dump(obj=data, fp=handle, indent=4)

# Step 2: Extract text with OCR from the uploaded PDF.
# Yields one dict per non-blank OCR line (see extract_text_with_ocr).
pdf_text_blocks = extract_text_with_ocr(pdf_file_path)

# Step 3: Convert the extracted text to a structured JSON format.
# The result is a dict with a single "sections" list.
structured_json = convert_to_json(pdf_text_blocks)

# Step 4: Save the JSON to a file.
# The path is relative, so the file is created in the current working directory.
output_json_path = "output.json"
save_json(structured_json, output_json_path)

# Step 5: Display the structured JSON output, using the same 4-space
# indentation as the saved file.
print(json.dumps(structured_json, indent=4))


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.5).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


Saving water-hplc-nowpak-l (1).pdf to water-hplc-nowpak-l (1) (5).pdf
{
    "sections": [
        {
            "heading": "SCIENTIFIC",
            "content": []
        },
        {
            "heading": "SAFETY DATA SHEET",
            "content": [
                "Creation Date 26-Jan-2010 Revision Date 24-Dec-2021 Revision Number 9"
            ]
        },
        {
            "heading": "1. Identification",
            "content": [
                "Product Name Water",
                "Cat No. : W5-1; W5-4; W5-4LC; W5N1-19; W5N2-19; W5SK-1; W5SK-4",
                "CAS No 7732-18-5",
                "Synonyms No information available",
                "Recommended Use Laboratory chemicals.",
                "Uses advised against Food, drug, pesticide or biocidal product use.",
                "Details of the supplier of the safety data sheet",
                "Company",
                "Fisher Scientific Company",
                "One Reagent Lane",
                "Fair Lawn