In [None]:
!pip install langchain openai PyPDF2

In [4]:
import google.generativeai as genai

# Set up your API key.
# The key is read from the GOOGLE_API_KEY environment variable, or requested
# interactively when that variable is unset, so it is never stored in the
# notebook. Any key that was ever committed to a notebook should be rotated.
import os
from getpass import getpass

genai.configure(api_key=os.environ.get("GOOGLE_API_KEY") or getpass("Google API key: "))
model = genai.GenerativeModel("gemini-2.0-flash")

In [None]:
# Install necessary libraries (uncomment if needed)
# !pip install langchain openai PyPDF2

import os
import json
import PyPDF2
from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

# Set your OpenAI API key.
# `setdefault` keeps a real key that is already exported in the environment;
# the placeholder is only written when no key is present (replace it, or export
# OPENAI_API_KEY before starting the kernel).
os.environ.setdefault("OPENAI_API_KEY", "YOUR_OPENAI_API_KEY")

In [None]:
# --------------------------------------------------
# Step 1: Extract Text from the PDF
# --------------------------------------------------
def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Read the PDF at ``pdf_path`` and return the text of all its pages.

    Every page that yields non-empty text contributes that text followed by a
    newline; pages that yield nothing are skipped. The result is the
    concatenation of those pieces, in page order.
    """
    chunks = []
    with open(pdf_path, "rb") as pdf_file:
        for pdf_page in PyPDF2.PdfReader(pdf_file).pages:
            extracted = pdf_page.extract_text()
            if not extracted:
                # Nothing extractable on this page.
                continue
            chunks.append(extracted + "\n")
    return "".join(chunks)

# Replace the path below with your actual PDF file path.
pdf_path = "FR_Y-14Q20240331_i-167-168.pdf"
pdf_text = extract_text_from_pdf(pdf_path)
print("Extracted PDF text (first 500 characters):")
# Slice so the preview matches its label instead of dumping the whole document.
print(pdf_text[:500])

In [None]:
# ----------------------------------------
# Step 2: Extract Field Validation Rules Using LLM
# ----------------------------------------
# Notice the double curly braces {{ ... }} to escape literal curly braces.
def get_validation_rules(pdf_text: str, api_url=None, model: str = "mistral") -> str:
    """
    Ask an Ollama-style text-generation endpoint to extract field validation rules.

    The PDF text is embedded in a prompt that asks for a JSON list of records
    with the keys ``field_name``, ``technical_name``, ``description`` and
    ``constraints``. The prompt is POSTed as a non-streaming request.

    Args:
        pdf_text: Text extracted from the regulatory PDF.
        api_url: Endpoint to POST to. When omitted, a notebook-level
            ``OLLAMA_API_URL`` is used if one is defined; otherwise the default
            local endpoint ``http://localhost:11434/api/generate``.
        model: Model name sent in the request payload.

    Returns:
        The ``"response"`` field of the JSON reply on HTTP 200; otherwise the
        string ``"Error: <response body>"``. The reply is returned as raw text
        and is not parsed or validated here.
    """
    # Imported here because the notebook's import cell does not pull in `requests`.
    import requests

    if api_url is None:
        # Keep honouring a global OLLAMA_API_URL when the notebook defines one.
        api_url = globals().get("OLLAMA_API_URL", "http://localhost:11434/api/generate")

    # Doubled braces {{ }} are literal braces inside the f-string; {pdf_text}
    # is the only interpolated value.
    extraction_prompt_template = f"""
    You are an AI that extracts structured field validation rules from financial regulatory documents (Extracted pdf text are given below).
    Extract the validation rules for each field and return JSON format.
    
    The JSON should look like this:
    [
      {{
        "field_name": "Customer ID",
        "technical_name": "CustomerID",
        "description": "Report the unique internal identifier...",
        "constraints": "Must not contain a carriage return, line feed, comma, or any unprintable character."
      }},
      ...
    ]
    Just give me the JSON nothing else.
    Extract the rules from the text below:  
    ====================
    {pdf_text}
    ====================
    """

    data = {"model": model, "prompt": extraction_prompt_template, "stream": False}
    response = requests.post(api_url, json=data)

    if response.status_code == 200:
        return response.json()["response"]
    return f"Error: {response.text}"

# Run the extraction against the PDF text and echo the model's raw reply.
validation_rules = get_validation_rules(pdf_text=pdf_text)
print(validation_rules)

In [None]:
# Initialize the LLM (using OpenAI); the code-generation step reuses `llm`.
llm = OpenAI(temperature=0.2)

# Reuse the raw reply already stored in `validation_rules` rather than running
# a second extraction chain: no `extraction_prompt` is defined anywhere in the
# notebook, so building that chain raised NameError.
extracted_fields_json = str(validation_rules).strip()

# In case the reply is wrapped in a Markdown code fence, drop the fence lines
# so that only the JSON payload remains.
if extracted_fields_json.startswith("```"):
    extracted_fields_json = "\n".join(
        line
        for line in extracted_fields_json.splitlines()
        if not line.strip().startswith("```")
    ).strip()

# Parse the JSON output
try:
    parsed_fields = json.loads(extracted_fields_json)
except json.JSONDecodeError as e:
    print("Error parsing JSON from LLM output:", e)
    parsed_fields = []  # Fallback to an empty list if parsing fails

if isinstance(parsed_fields, list):
    field_records = parsed_fields
else:
    # Later cells iterate over a list of records; reject any other JSON shape.
    print("Error parsing JSON from LLM output: expected a JSON list of field records")
    field_records = []

if field_records:
    print("\nExtracted Field Records:")
    for record in field_records:
        print(record)

In [None]:
# --------------------------------------------------
# Step 3: Generate Python Validation Code Using LangChain
# --------------------------------------------------
# Define a prompt template to instruct the LLM to generate validation code for a given field record.
# Prompt inputs mirror the record schema requested from the extraction step
# (field_name, technical_name, description, constraints). `field_no` is kept as
# an optional extra and filled with "N/A" when a record does not carry it.
# The function name is tied to the technical name because validate_csv looks
# validators up as "validate_<technical_name>".
code_generation_prompt_template = """
You are provided with the following field details:

Field No: {field_no}
Field Name: {field_name}
Technical Name: {technical_name}
Description: {description}
Constraints: {constraints}

Generate a Python function that validates an input string for this field.
- The function should return True if the input complies with the constraint, and False otherwise.
- Name the function validate_{technical_name} (the prefix "validate_" followed by the technical name exactly as written above).
- Include clear comments in the code.

Provide only the Python function code.
"""

code_generation_prompt = PromptTemplate(
    template=code_generation_prompt_template,
    input_variables=["field_no", "field_name", "technical_name", "description", "constraints"]
)

# Create a chain for code generation
code_generation_chain = LLMChain(llm=llm, prompt=code_generation_prompt)

# Dictionary to hold generated validation code for each field.
# NOTE: the generated source is only stored and printed here; it is not executed.
# Review it and define the functions yourself before running validate_csv.
validation_code_dict = {}

for record in field_records:
    if not isinstance(record, dict):
        print(f"Skipping malformed field record: {record!r}")
        continue

    # Label used for messages and as the dictionary key; never raises KeyError.
    field_label = record.get("field_name") or record.get("technical_name") or "<unnamed field>"

    # Pass exactly the variables the prompt declares, with safe defaults for
    # keys the extraction step may have omitted.
    prompt_inputs = {
        "field_no": record.get("field_no", "N/A"),
        "field_name": field_label,
        "technical_name": record.get("technical_name", field_label),
        "description": record.get("description", ""),
        "constraints": record.get("constraints", ""),
    }

    try:
        generated_code = code_generation_chain.run(**prompt_inputs)
    except Exception as e:
        # Best effort: report the failure and carry on with the next field.
        print(f"Error generating code for {field_label}: {e}")
        continue

    validation_code_dict[field_label] = generated_code
    print(f"\nGenerated validation code for {field_label}:\n")
    print(generated_code)
    print("-" * 60)

In [None]:
# --------------------------------------------------
# Step 4: Load & Validate CSV Data
# --------------------------------------------------
def validate_csv(csv_path, field_rules, validators=None):
    """Validate a CSV file against extracted field rules.

    For every rule whose ``technical_name`` is a column of the CSV, the callable
    registered under ``validate_<technical_name>`` is applied to each row's
    value; a falsy result is recorded as an error for that row.

    Args:
        csv_path: Path of the CSV file to load with pandas.
        field_rules: Iterable of dicts, each with a ``"technical_name"`` key.
        validators: Optional mapping from function name to callable. Defaults
            to this module's globals, i.e. functions defined in the notebook.

    Returns:
        A list with one ``{"row": index, "errors": [...]}`` dict per CSV row.
        The same information is printed, together with warnings for rule
        fields missing from the CSV and for fields that have no validator.
    """
    # Imported locally: pandas is not part of the notebook's import cell.
    import pandas as pd

    if validators is None:
        validators = globals()

    df = pd.read_csv(csv_path)

    # Get available columns from CSV
    csv_columns = set(df.columns)

    # Get required columns from regulatory instructions
    required_columns = {field["technical_name"] for field in field_rules}

    # Identify missing fields
    missing_fields = required_columns - csv_columns
    if missing_fields:
        print("\n⚠️ Missing Fields in CSV:", missing_fields)

    # Resolve validators once, outside the row loop, and remember which fields
    # cannot be checked so a clean report is not mistaken for a full check.
    checks = []
    unchecked = []
    for field in field_rules:
        column = field["technical_name"]
        if column not in csv_columns:
            continue
        validator = validators.get(f"validate_{column}")
        if callable(validator):
            checks.append((column, validator))
        else:
            unchecked.append(column)

    if unchecked:
        print("\n⚠️ No validator function found for:", unchecked)

    # Validate each row
    validation_results = []
    for index, row in df.iterrows():
        row_errors = [
            f"Validation failed for {column}: {row[column]}"
            for column, validator in checks
            if not validator(row[column])
        ]
        validation_results.append({"row": index, "errors": row_errors})

    # Print results
    for result in validation_results:
        if result["errors"]:
            print(f"\n❌ Row {result['row']} Errors:", result["errors"])
        else:
            print(f"\n✅ Row {result['row']} passed all validations.")

    return validation_results

# Example CSV file containing transaction data
csv_path = "banking_report.csv"  # Replace with actual CSV file
# The parsed rule list is stored in `field_records`; no `field_rules` variable
# is defined anywhere in the notebook.
validate_csv(csv_path, field_records)

print("\nValidation Complete ✅")