In [None]:
# Install necessary libraries (uncomment if needed)
# !pip install langchain openai PyPDF2

import os
import json
import PyPDF2
from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

# Provide the OpenAI API key through the OPENAI_API_KEY environment variable.
# Prefer exporting a real key outside the notebook so it is never saved in a cell.
# setdefault() only installs the placeholder below when the variable is not
# already set, so a key configured in the environment is not overwritten.
os.environ.setdefault("OPENAI_API_KEY", "YOUR_OPENAI_API_KEY")

In [None]:
# --------------------------------------------------
# Step 1: Extract Text from the PDF
# --------------------------------------------------
def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Read a PDF and return the text of all of its pages as a single string.

    Pages for which no text is extracted (``None`` or an empty string) are
    skipped; the text of every remaining page is followed by a newline.
    """
    with open(pdf_path, "rb") as pdf_file:
        # Run the extraction while the underlying file is still open.
        chunks = [
            page.extract_text() for page in PyPDF2.PdfReader(pdf_file).pages
        ]
    return "".join(chunk + "\n" for chunk in chunks if chunk)

# Point pdf_path at the PDF to process ('regulatory_rules.pdf' is a placeholder).
pdf_path = "regulatory_rules.pdf"
pdf_text = extract_text_from_pdf(pdf_path)

# Show a short preview of the extracted text: heading line, then the first 500 characters.
print("Extracted PDF text (first 500 characters):", pdf_text[:500], sep="\n")

In [None]:
# --------------------------------------------------
# Step 2: Leverage the LLM to Extract Field Records
# --------------------------------------------------
# Define a prompt that asks the LLM to parse the extracted PDF text and output the field records in JSON format.
# NOTE: the template is rendered with format-string substitution (PromptTemplate's
# default "f-string" format), so {pdf_text} is the only placeholder. The literal
# braces of the JSON example are doubled ({{ and }}) so they render as single
# braces instead of being parsed as template variables.
extraction_prompt_template = """
You are given the extracted text from a regulatory PDF document containing field definitions.
Your task is to extract all field records from the text.
For each field record, extract the following details:
- field_no: the field number.
- field_name: the field name along with its technical name in parentheses.
- description: a short description of the field.
- constraints: any validation constraints (for example, "Must not contain a carriage return, line feed, comma or any unprintable character.")

Return the results as valid JSON in the following format:
[
  {{
    "field_no": "1",
    "field_name": "Customer ID (CustomerID)",
    "description": "Report the unique internal identifier for the customer relationship...",
    "constraints": "Must not contain a carriage return, line feed, comma or any unprintable character."
  }},
  ...
]

Only output the JSON without any additional commentary.

Extract the information from the text below:
====================
{pdf_text}
====================
"""

# Wrap the raw template string in a PromptTemplate; "pdf_text" is the single
# variable that is filled in when the prompt is rendered.
extraction_prompt = PromptTemplate(
    input_variables=["pdf_text"],
    template=extraction_prompt_template,
)