In [1]:
import pdfplumber
import pandas as pd
import re
import json

# Regular-expression patterns for the section headers used to split the
# extracted text. segment_text_based_on_headers joins them with "|" and
# matches them case-insensitively, so regex metacharacters that are meant
# literally must be escaped.
headers = [
    r"Product Name",
    r"Cat No\.",  # escaped: a bare "." would match any character, e.g. "Cat Nos"
    r"Company",
    r"Synonyms",
]

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one newline-joined string.

    Args:
        pdf_path: Path of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        The per-page results of ``page.extract_text()`` joined with ``"\\n"``.
        Pages whose result is falsy (``None`` or an empty string) are left
        out; if no page yields text the result is an empty string.
    """
    with pdfplumber.open(pdf_path) as document:
        # Extraction has to happen while the document is still open.
        per_page = [page.extract_text() for page in document.pages]
    # filter(None, ...) drops the falsy entries, i.e. pages without text.
    return "\n".join(filter(None, per_page))

def segment_text_based_on_headers(text, header_patterns=None):
    """Split *text* into sections keyed by the header that introduces them.

    The header patterns are OR-ed into one regular expression and matched
    case-insensitively. Each match starts a section that runs up to the next
    header match (or to the end of *text* for the last one). Text before the
    first header is discarded.

    Args:
        text: The text to segment.
        header_patterns: Optional iterable of regular-expression strings to
            use as headers. When ``None`` (the default) the module-level
            ``headers`` list is used.

    Returns:
        A dict mapping each header exactly as it appears in *text*
        (whitespace-stripped, original casing kept) to the stripped text
        that follows it. If the same header text occurs more than once, the
        last occurrence wins. An empty dict is returned when nothing matches
        or when there are no patterns.
    """
    patterns = list(headers if header_patterns is None else header_patterns)
    if not patterns:
        # An empty alternation would match the empty string at every position.
        return {}

    header_re = re.compile('|'.join(patterns), flags=re.IGNORECASE)

    # A single pass yields both the header text and its position, so the
    # section body is simply the slice between consecutive matches. Using
    # group(0) keeps this correct even if a pattern contains capture groups.
    matches = list(header_re.finditer(text))

    categorized_text = {}
    for current, following in zip(matches, matches[1:] + [None]):
        section_end = following.start() if following is not None else len(text)
        categorized_text[current.group(0).strip()] = text[current.end():section_end].strip()

    return categorized_text

def extract_table_data(pdf_path):
    """Collect the rows of every table found in a PDF into one flat list.

    Args:
        pdf_path: Path of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        A single list holding the rows of each table returned by
        ``page.extract_tables()``, in page order and then table order.
        Table and page boundaries are not preserved. The list is empty when
        no page yields a table.
    """
    with pdfplumber.open(pdf_path) as document:
        return [
            row
            for page in document.pages
            for table in page.extract_tables()
            for row in table
        ]

def get_cleaned_text_remove_paragraph(pdf_path):
    """Extract the text of a PDF with page boilerplate removed.

    For every page that yields text, three substitutions are applied in
    order: "Page N of M" markers are removed, the literal
    "Specification File" is removed, and everything from "Disclaimer" up to
    the next blank line (or the end of the page text) is removed. The
    result is then stripped of surrounding whitespace.

    Args:
        pdf_path: Path of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        The cleaned page texts joined with a blank line (``"\\n\\n"``).
        Pages whose ``extract_text()`` result is falsy are skipped.
    """
    # Applied in this order to each page's text.
    boilerplate_patterns = (
        r"Page \d+ of \d+",
        r"Specification File",
        r"(?s)Disclaimer.*?(\n\n|\Z)",
    )

    cleaned_pages = []
    with pdfplumber.open(pdf_path) as document:
        for page in document.pages:
            content = page.extract_text()
            if not content:
                continue
            for pattern in boilerplate_patterns:
                content = re.sub(pattern, "", content)
            cleaned_pages.append(content.strip())
    return "\n\n".join(cleaned_pages)

def save_pdf_data_to_json(pdf_text_path, pdf_table_path, json_file):
    """Extract text sections and table rows from PDFs and write them as JSON.

    Args:
        pdf_text_path: PDF whose cleaned text is segmented by header.
        pdf_table_path: PDF whose tables are extracted.
        json_file: Destination path; opened in mode ``"w"``, so an existing
            file is overwritten.

    The written JSON object has two keys: ``"text_data"`` (the result of
    ``segment_text_based_on_headers``) and ``"table_data"`` (the result of
    ``extract_table_data``), indented by four spaces.
    """
    # Text pipeline first, then tables, as two independent reads.
    sections = segment_text_based_on_headers(
        get_cleaned_text_remove_paragraph(pdf_text_path)
    )
    rows = extract_table_data(pdf_table_path)

    payload = {"text_data": sections, "table_data": rows}

    with open(json_file, "w") as out:
        json.dump(payload, out, indent=4)

# Hard-coded absolute input/output locations on drive A:. The section text
# and the tables are read from two different PDFs and merged into one JSON
# file.
pdf_path_text = 'A:/dp/data_preprocessor/data/citric-acid-gran-cert-acs-kg.pdf'
pdf_path_table = 'A:/dp/data_preprocessor/data/phenol-liquid-cert-.pdf'
json_file = 'A:/dp/data_preprocessor/combined_data.json'

# Runs at module level (there is no __main__ guard), so executing or
# importing this file reads both PDFs and overwrites json_file.
save_pdf_data_to_json(pdf_path_text, pdf_path_table, json_file)