In [21]:
import pdfplumber
import json
import re
import os

def extract_text_from_pdf_plumber(pdf_path):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``pdfplumber.open``.

    Returns:
        A single string holding each page's extracted text followed by a
        newline, in page order. A PDF with no pages yields ``""``.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may give None instead of a string for a page with
        # no extractable text (e.g. a scanned image); treat that as empty so
        # the concatenation below cannot raise TypeError.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    return "".join(f"{page_text}\n" for page_text in page_texts)

def load_patterns(config_path):
    """Load the regex pattern configuration from a JSON file.

    Args:
        config_path: Path of the JSON configuration file.

    Returns:
        The decoded JSON document (whatever ``json.load`` produces for it).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON text is UTF-8; without an explicit encoding open() falls back to
    # the locale default, which can mis-decode non-ASCII pattern characters.
    with open(config_path, "r", encoding="utf-8") as config_file:
        return json.load(config_file)

def _search_field(pattern, text):
    """Return the stripped value *pattern* captures in *text*, or ``None``.

    The value is capture group 1 when the pattern defines any group,
    otherwise the whole match. ``None`` is returned when the pattern does
    not match or when group 1 did not take part in the match.
    """
    match = re.search(pattern, text)
    if match is None:
        return None
    # Patterns without a capture group would make group(1) raise IndexError,
    # so fall back to the full match for them.
    value = match.group(1) if match.re.groups else match.group(0)
    # An optional group that did not participate yields None; report that
    # as "no value" instead of failing on None.strip().
    return None if value is None else value.strip()


def parse_sds_text(text, patterns):
    """Extract fields from SDS text using a nested mapping of regexes.

    Args:
        text: The text to search.
        patterns: Mapping of section name -> {field name -> pattern}. A
            pattern is either a regex string or a dict of
            {sub-field name -> regex string}, searched one level deep.

    Returns:
        A dict with one entry per section. Each section maps the fields
        whose pattern produced a value to that stripped value; fields given
        as a dict map to a dict of their matching sub-fields (possibly
        empty). Fields and sub-fields without a value are omitted.
    """
    sds_dict = {}

    for section, field_patterns in patterns.items():
        section_data = sds_dict[section] = {}

        for field, pattern in field_patterns.items():
            if isinstance(pattern, dict):
                # Nested group of patterns (e.g., Company information): the
                # field is always present, holding only matching sub-fields.
                sub_values = section_data[field] = {}
                for sub_field, sub_pattern in pattern.items():
                    value = _search_field(sub_pattern, text)
                    if value is not None:
                        sub_values[sub_field] = value
            else:
                value = _search_field(pattern, text)
                if value is not None:
                    section_data[field] = value

    return sds_dict


def process_multiple_pdfs(pdf_folder, config_path, output_folder):
    """Parse every PDF in a folder and write one JSON file per PDF.

    Args:
        pdf_folder: Directory whose ``.pdf`` files (any letter case) are
            processed. Sub-directories are not descended into.
        config_path: Path of the JSON file holding the regex patterns.
        output_folder: Directory receiving the JSON files; created if it
            does not exist. Each output is named after its PDF with the
            extension replaced by ``.json``.
    """
    # Load regex patterns from config
    section_patterns = load_patterns(config_path)

    # exist_ok avoids the race between a separate existence check and the
    # directory creation.
    os.makedirs(output_folder, exist_ok=True)

    # Sorted so files are processed and reported in a reproducible order
    # (os.listdir order is arbitrary).
    for pdf_filename in sorted(os.listdir(pdf_folder)):
        pdf_path = os.path.join(pdf_folder, pdf_filename)

        # Accept ".PDF" as well as ".pdf", and skip directories that merely
        # carry a PDF-like name.
        if not pdf_filename.lower().endswith(".pdf"):
            continue
        if not os.path.isfile(pdf_path):
            continue

        # Extract text from the PDF
        extracted_text = extract_text_from_pdf_plumber(pdf_path)

        # Parse the extracted text using the patterns
        sds_data = parse_sds_text(extracted_text, section_patterns)

        # Save the parsed data as JSON
        json_filename = os.path.splitext(pdf_filename)[0] + ".json"
        json_path = os.path.join(output_folder, json_filename)
        with open(json_path, "w", encoding="utf-8") as json_file:
            json.dump(sds_data, json_file, indent=4)

        print(f"Processed {pdf_filename} and saved JSON data to {json_filename}")

# Hard-coded absolute locations for this run: the folder of input PDFs, the
# JSON file of regex patterns, and the folder that receives the JSON output.
pdf_folder = "/home/sabrina/Documents/Project/data/"
config_path = "/home/sabrina/Documents/Project/config.json"
output_folder = "/home/sabrina/Documents/Project/json_output/"

# Run the batch conversion immediately with the paths above.
process_multiple_pdfs(pdf_folder, config_path, output_folder)


Processed edta--ph--lt.pdf and saved JSON data to edta--ph--lt.json
