In [13]:
pip install PyPDF2




In [14]:
import PyPDF2
import json
import re

class SafetyDataSheetParser:
    """Extract selected fields from a Safety Data Sheet (SDS) PDF into JSON.

    The parser pulls the raw text out of the PDF with PyPDF2, fills the
    "Identification" and "HazardIdentification" sections of ``sds_data``
    using regular expressions, and writes the whole structure to a JSON
    file. The remaining section keys are created empty and are not
    populated by this class.

    All methods are best-effort: failures are reported with ``print`` and
    never propagate to the caller.
    """

    # Placeholder stored for any field that could not be found in the text.
    _MISSING = "N/A"

    def __init__(self, pdf_path, output_json_path):
        """Remember the input/output paths and build the empty result skeleton.

        Args:
            pdf_path: Path of the SDS PDF to read.
            output_json_path: Path of the JSON file to write.
        """
        self.pdf_path = pdf_path
        self.output_json_path = output_json_path
        self.sds_data = {
            "SafetyDataSheet": {
                "Identification": {},
                "HazardIdentification": {},
                "Composition/Information on Ingredients": {},
                "First-aid measures": {},
                "Fire-fighting measures": {},
                "Accidental release measures": {},
                "Handling and storage": {},
                "Exposure controls/personal protection": {},
                "Physical and chemical properties": {},
                "Stability and reactivity": {},
                "Toxicological information": {},
                "Ecological information": {},
                "Disposal considerations": {},
                "Transport information": {},
                "Regulatory information": {},
                "Other information": {},
            }
        }

    @classmethod
    def _clean(cls, value):
        """Strip surrounding whitespace; fall back to the placeholder if empty."""
        return value.strip() or cls._MISSING

    @classmethod
    def _first_group(cls, pattern, text):
        """Return the cleaned first capture group of *pattern*, or the placeholder."""
        match = re.search(pattern, text)
        return cls._clean(match.group(1)) if match else cls._MISSING

    @classmethod
    def _all_groups(cls, pattern, text):
        """Return every non-blank capture of *pattern*, or ``["N/A"]`` if none."""
        values = [hit.strip() for hit in re.findall(pattern, text) if hit.strip()]
        return values or [cls._MISSING]

    def extract_pdf_text(self):
        """Return the concatenated text of every page, or "" on any failure.

        Returns:
            The extracted text. An empty string is returned (after printing
            an error) when the file cannot be read or yields no text.
        """
        try:
            with open(self.pdf_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                # extract_text() may return None for a page; treat it as "".
                text = "".join(page.extract_text() or "" for page in reader.pages)
        except Exception as e:
            print(f"Error during PDF extraction: {e}")
            return ""

        if not text:
            # Same message the caller saw before, without raising just to
            # catch the exception again on the next line.
            print("Error during PDF extraction: PDF text extraction failed or PDF is empty.")
            return ""

        print("PDF text extraction successful!")
        return text

    def parse_identification_section(self, text):
        """Parse the Identification section into ``sds_data``.

        Fields that cannot be located are stored as "N/A". Captured values
        have surrounding whitespace removed.

        Args:
            text: Full text extracted from the SDS.
        """
        try:
            supplier_match = re.search(
                r'Details of the supplier.+\n(.+)\n(.+)\n(.+)', text
            )
            if supplier_match:
                name, address, telephone = (
                    self._clean(part) for part in supplier_match.groups()
                )
            else:
                name = address = telephone = self._MISSING

            self.sds_data["SafetyDataSheet"]["Identification"] = {
                "ProductName": self._first_group(r'Product Name\s+(.+)', text),
                # Capture only the catalogue number itself; the label
                # ("Cat No. :") is matched but not stored.
                "Cat No.": self._first_group(
                    r'Cat No\. :\s+([A-Z]+\d+[A-Z]*-\d+[A-Z\d\-]*)', text
                ),
                "CASNo": self._first_group(r'CAS No\s+(\d+-\d+-\d+)', text),
                "RecommendedUse": self._first_group(r'Recommended Use\s+(.+)', text),
                "Supplier": {
                    "Name": name,
                    "Address": address,
                    "Telephone": telephone,
                },
            }
            print("Identification section parsed successfully!")
        except Exception as e:
            # Best-effort: e.g. a non-string *text* must not abort processing.
            print(f"Error during Identification section parsing: {e}")

    def parse_hazard_identification_section(self, text):
        """Parse the Hazard Identification section into ``sds_data``.

        Only the first line following each heading is captured. Missing
        values are stored as "N/A" (or ``["N/A"]`` for the list fields).

        Args:
            text: Full text extracted from the SDS.
        """
        try:
            self.sds_data["SafetyDataSheet"]["HazardIdentification"] = {
                "SignalWord": self._first_group(r'Signal Word\s+(.+)', text),
                "HazardStatements": self._all_groups(
                    r'Hazard Statements\s+(.+)', text
                ),
                "PrecautionaryStatements": self._all_groups(
                    r'Precautionary Statements\s+(.+)', text
                ),
            }
            print("Hazard Identification section parsed successfully!")
        except Exception as e:
            # Best-effort: e.g. a non-string *text* must not abort processing.
            print(f"Error during Hazard Identification section parsing: {e}")

    def process_sds(self):
        """Extract, parse and save the SDS as JSON at ``output_json_path``.

        Nothing is written when no text could be extracted from the PDF.
        """
        try:
            text = self.extract_pdf_text()
            if not text:
                print("No text extracted from the PDF.")
                return

            # Parse relevant sections
            self.parse_identification_section(text)
            self.parse_hazard_identification_section(text)

            # Save the structured data as JSON
            with open(self.output_json_path, 'w', encoding="utf-8") as json_file:
                json.dump(self.sds_data, json_file, indent=4)

            print(f"SDS data successfully saved to {self.output_json_path}")
        except Exception as e:
            print(f"Error during SDS processing: {e}")

# Example Usage:
if __name__ == "__main__":
    # Source SDS document and the destination for the structured output.
    pdf_file, output_json = "data/acetone-acs-l.pdf", "output_sds.json"

    # Build the parser with explicit keyword arguments, then run the
    # extract -> parse -> save pipeline.
    parser = SafetyDataSheetParser(
        pdf_path=pdf_file,
        output_json_path=output_json,
    )
    parser.process_sds()


PDF text extraction successful!
Identification section parsed successfully!
Hazard Identification section parsed successfully!
SDS data successfully saved to output_sds.json
