In [23]:
import pdfplumber
import json
import re
import os

def extract_text_from_pdf_plumber(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``pdfplumber.open``.

    Returns:
        One string in which each page's text is followed by a newline.
        A page that yields no text contributes only that newline.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() can hand back None instead of a string for a page
        # with nothing extractable (seen with image-only pages on some
        # pdfplumber releases); fall back to "" so the join never fails.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    return "".join(f"{page_text}\n" for page_text in page_texts)

def load_patterns(config_path):
    """Load the pattern configuration from a JSON file.

    Args:
        config_path: Path of the JSON configuration file.

    Returns:
        The decoded JSON document, exactly as ``json.load`` produces it.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Read as UTF-8 explicitly: the platform default encoding would corrupt
    # non-ASCII characters in the patterns on non-UTF-8 locales.
    with open(config_path, "r", encoding="utf-8") as config_file:
        return json.load(config_file)

def _first_capture(pattern, text):
    """Return the stripped first capture of *pattern* in *text*, or None."""
    match = re.search(pattern, text)
    if match is None:
        return None
    # A pattern written without a capture group has no group(1); use the
    # whole match rather than failing with IndexError.
    captured = match.group(1) if match.re.groups else match.group(0)
    # An optional group that did not take part in the match is None.
    if captured is None:
        return None
    return captured.strip()


def parse_sds_text(text, patterns):
    """Extract fields from *text* using a nested mapping of regex patterns.

    Args:
        text: The text to search.
        patterns: Mapping ``section -> field -> pattern``. A field's value is
            either a regular expression or a mapping ``sub_field -> pattern``.
            The first capture group of each pattern supplies the value.

    Returns:
        A dict with one entry per section. Each section maps a field to its
        stripped captured text, or to a dict of sub-field captures. Fields
        and sub-fields whose pattern finds nothing are omitted; a field with
        sub-patterns is always present, possibly as an empty dict.
    """
    sds_dict = {}

    for section, field_patterns in patterns.items():
        section_data = sds_dict[section] = {}

        for field, pattern in field_patterns.items():
            if isinstance(pattern, dict):
                sub_values = {}
                for sub_field, sub_pattern in pattern.items():
                    value = _first_capture(sub_pattern, text)
                    if value is not None:
                        sub_values[sub_field] = value
                section_data[field] = sub_values
            else:
                value = _first_capture(pattern, text)
                if value is not None:
                    section_data[field] = value

    return sds_dict


def process_multiple_pdfs(pdf_folder, config_path, output_folder):
    """Convert every PDF in a folder into a JSON file of extracted fields.

    For each PDF directly inside *pdf_folder*, the text is extracted, parsed
    with the patterns loaded from *config_path*, and written to
    ``<output_folder>/<pdf name without extension>.json``. A progress line is
    printed for each file.

    Args:
        pdf_folder: Folder scanned (non-recursively) for PDF files.
        config_path: JSON file holding the patterns for ``load_patterns``.
        output_folder: Destination folder; created if it does not exist.
    """
    section_patterns = load_patterns(config_path)

    # exist_ok avoids the race between checking for the folder and creating it.
    os.makedirs(output_folder, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sort so runs are
    # reproducible and the progress output is stable.
    for pdf_filename in sorted(os.listdir(pdf_folder)):
        # Compare case-insensitively so "REPORT.PDF" is not silently skipped.
        if not pdf_filename.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(pdf_folder, pdf_filename)
        extracted_text = extract_text_from_pdf_plumber(pdf_path)
        sds_data = parse_sds_text(extracted_text, section_patterns)

        json_filename = os.path.splitext(pdf_filename)[0] + ".json"
        json_path = os.path.join(output_folder, json_filename)
        with open(json_path, "w", encoding="utf-8") as json_file:
            json.dump(sds_data, json_file, indent=4)

        print(f"Processed {pdf_filename} and saved JSON data to {json_filename}")

# Folder that is scanned for the PDF files to convert (absolute, machine-specific path).
pdf_folder = "/home/sabrina/data_preprocessor/data"
# JSON file with the patterns that process_multiple_pdfs() loads.
config_path = "/home/sabrina/data_preprocessor/code/config.json"
# Folder that receives one JSON file per converted PDF.
output_folder = "/home/sabrina/data_preprocessor/output"

# Run the whole batch immediately when this code is executed.
process_multiple_pdfs(pdf_folder, config_path, output_folder)


Processed acetone-acs-l (1).pdf and saved JSON data to acetone-acs-l (1).json
Processed acetone-acs-l.pdf and saved JSON data to acetone-acs-l.json
Processed acetonitrile-hplc-grade-l (1).pdf and saved JSON data to acetonitrile-hplc-grade-l (1).json
Processed acetonitrile-hplc-grade-l.pdf and saved JSON data to acetonitrile-hplc-grade-l.json
Processed ammonium-hydroxide-acs-lb.pdf and saved JSON data to ammonium-hydroxide-acs-lb.json
Processed buffer-x-tbs-ph--ml.pdf and saved JSON data to buffer-x-tbs-ph--ml.json
Processed chloroform-certified-acs-l.pdf and saved JSON data to chloroform-certified-acs-l.json
Processed citric-acid-gran-cert-acs-kg.pdf and saved JSON data to citric-acid-gran-cert-acs-kg.json
Processed dichloromethane.pdf and saved JSON data to dichloromethane.json
Processed edta--ph--lt.pdf and saved JSON data to edta--ph--lt.json
Processed ethyl-acetate-cr-acs-l.pdf and saved JSON data to ethyl-acetate-cr-acs-l.json
Processed hexanes-acs-l.pdf and saved JSON data to hex