In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# google-generativeai provides the `google.generativeai` module this notebook imports.
%pip install langchain openai PyPDF2 google-generativeai

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install pandas

In [145]:
import pandas as pd
import json
import re
import ast

In [7]:
import os

import google.generativeai as genai
import PyPDF2

# Set up your API key.
# The key is read from the GOOGLE_API_KEY environment variable so that it is never
# stored in the notebook, its outputs, or version control. Any key that was
# previously hardcoded in this cell should be treated as compromised and rotated.
_api_key = os.environ.get("GOOGLE_API_KEY")
if not _api_key:
    raise RuntimeError(
        "Set the GOOGLE_API_KEY environment variable before running this notebook."
    )
genai.configure(api_key=_api_key)
model = genai.GenerativeModel("gemini-2.0-flash")

In [57]:
# --------------------------------------------------
# Step 1: Extract Text from the PDF
# --------------------------------------------------

def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Read the PDF at ``pdf_path`` and return the text of all its pages.

    Each page that yields non-empty text contributes that text followed by a
    newline; pages for which extraction returns nothing are skipped.
    """
    with open(pdf_path, "rb") as pdf_file:
        # Extraction happens while the file handle is still open.
        page_texts = [page.extract_text() for page in PyPDF2.PdfReader(pdf_file).pages]
    return "".join(chunk + "\n" for chunk in page_texts if chunk)

# Replace 'regulatory_rules.pdf' with your actual PDF file path.
# pdf_path = "FR_Y-14Q20240331_i-167-168.pdf"
# pdf_text = extract_text_from_pdf(pdf_path)
# print("Extracted PDF text (first 500 characters):")
# print(pdf_text)

In [77]:
# --------------------------------------------------
# Step 2: Leverage the LLM to Extract Field Records
# --------------------------------------------------
# Define a prompt that asks the LLM to parse the extracted PDF text and output the field records in JSON format.
def generate_rules(pdf_text):
    """
    Ask the LLM to turn extracted PDF text into a list of field records.

    Parameters
    ----------
    pdf_text : str
        Text extracted from the regulatory PDF; it is embedded verbatim in the prompt.

    Returns
    -------
    The object decoded from the model's JSON reply. The prompt requests a list
    of dicts with the keys ``field_no``, ``field_name``, ``technical_name``,
    ``description`` and ``constraints``.

    Raises
    ------
    json.JSONDecodeError
        If the reply (after removing an optional Markdown code fence) is not valid JSON.
    """
    # The example below must itself be valid JSON (commas between all members),
    # otherwise the prompt contradicts its own "valid JSON" requirement.
    extraction_prompt = f"""
    You are given the extracted text from a regulatory PDF document containing field definitions.
    Your task is to extract all field records from the text.
    For each field record, extract the following details:
    - field_no: the field number.
    - field_name: the field name.
    - technical_name : The technical name is in parenthesis beside the fieldName, Note that there should not be any space in technical name.
    - description: a short description of the field.
    - constraints: any validation constraints (for example, "Must not contain a carriage return, line feed, comma or any unprintable character.")
    - if you feels like any word is broken fix it.
    
    Return the results as valid JSON in the following format:
    [
      {{
        "field_no": "1",
        "field_name": "Customer ID",
        "technical_name": "CustomerID",
        "description": "Report the unique internal identifier for the customer relationship...",
        "constraints": "Must not contain a carriage return, line feed, comma or any unprintable character."
      }},
      ...
    ]
    
    Only output the JSON without any additional commentary, also dont't include ```josn ...... ``````` as i will parse this json response directly.
    
    Extract the information from the text below:
    ====================
    {pdf_text}
    ====================
    """

    response = model.generate_content(extraction_prompt)
    raw_reply = response.text

    # Models often wrap the payload in a ```json ... ``` fence despite the
    # instruction. str.strip("```")/strip("json") strips *character sets* and
    # fails when whitespace surrounds the fence, so unwrap it explicitly.
    fenced = re.match(
        r"\A\s*```(?:[\w+-]*[ \t]*\r?\n)?(.*?)\s*```\s*\Z", raw_reply, re.DOTALL
    )
    payload = fenced.group(1) if fenced else raw_reply
    return json.loads(payload.strip())

In [78]:
# Parse the extracted PDF text into field-rule records via the LLM.
# NOTE(review): `pdf_text` is not assigned by any active line visible in this
# notebook (the extraction call is commented out) — confirm it is defined
# before this cell runs on a fresh kernel.
rules = generate_rules(pdf_text)
print(rules)

[{'field_no': '1', 'field_name': 'Customer ID', 'technical_name': 'CustomerID', 'description': "Report the unique internal identifier for the customer relationship under which the obligor's exposure is aggregated in the reporting entity 's credit systems. Customer ID is a relationship concept under which multiple borrowers are aggregated because they have related risks, including, but not limited to parent/subsidiary relationships. For stand-alone or ultimate parent obligors, the Customer ID may be the same as the unique internal identifier for the obligor provided in Field 2.", 'constraints': 'Must not contain a carriage return, line feed, comma or any unprintable character.'}, {'field_no': '2', 'field_name': 'Internal ID', 'technical_name': 'InternalObligorID', 'description': 'Report the reporting entity’s unique internal identifier for the obligor. Internal ID is a borrower concept that identifies the entity under which multiple loans are aggregated.', 'constraints': 'Must not conta

In [167]:
def generate_python_code(rules):
    """
    Ask the LLM to write one Python validator function per field rule.

    Parameters
    ----------
    rules : list[dict]
        Field records (as produced by ``generate_rules``); they are serialised
        to JSON and embedded in the prompt.

    Returns
    -------
    dict
        Maps each field's technical name to the *source text* of a function
        named ``validate_<technical_name>``.

    Raises
    ------
    ValueError, SyntaxError
        If the reply (after removing an optional Markdown code fence) is not a
        Python literal, or the literal is not a dict.

    Notes
    -----
    The reply is parsed with ``ast.literal_eval``, which does not execute code.
    The returned strings are still untrusted LLM output; review them before
    running them with ``exec``.
    """
    # The prompt promises JSON, so send JSON rather than the Python repr.
    rules_json = json.dumps(rules, indent=2, ensure_ascii=False)

    # "\\n" keeps a literal backslash-n in the prompt; a bare "\n" in this
    # non-raw f-string would turn the instruction into an actual line break.
    code_generation_prompt = f"""
    You are given a field specification for regulatory data validation. The rules are given belowin json format, in which contains details like Field No, Field Name, Description and constraint.:
    
    {rules_json}
    
    Generate the Python function that validates an input string for this field.
    - Return a python dictionary wich contains the python code for all the the fields.
    - The function should return True if the input complies with the constraint, and False otherwise.
    - Name the function based on the field name. For example, for "Customer ID (CustomerID)", name it validate_CustomerID. (Basically validate_technicalName)
    - The dictionary should conatins the Technical_name as key then the python code as value
    - The fucntion should be in such a way that i can parse them while using.
    - use triple qoutes to properly handle new lines, don't use \\n in the function
    - Use proper Python syntax with correct indentation
    
    Provide only the python dictonary nothing else, as I will directly use the response in a python code..
    """

    response = model.generate_content(code_generation_prompt)
    raw_reply = response.text

    # Unwrap an optional ```python ... ``` fence. str.strip("```") strips a
    # character set and leaves the closing fence in place when the reply ends
    # with a newline, which makes literal_eval fail.
    fenced = re.match(
        r"\A\s*```(?:[\w+-]*[ \t]*\r?\n)?(.*?)\s*```\s*\Z", raw_reply, re.DOTALL
    )
    payload = fenced.group(1) if fenced else raw_reply

    validation_code_dict = ast.literal_eval(payload.strip())
    if not isinstance(validation_code_dict, dict):
        raise ValueError(
            "Expected the model to return a dict of validator sources, got "
            f"{type(validation_code_dict).__name__}"
        )
    return validation_code_dict

In [168]:
# Ask the LLM for validator source code for the fields in `rules`.
code = generate_python_code(rules)

In [169]:
# Show the generated validator sources so they can be inspected.
print(code)

{'CustomerID': '\ndef validate_CustomerID(input_string):\n    """\n    Validates that the input string for Customer ID does not contain a carriage return, line feed, comma or any unprintable character.\n    """\n    invalid_chars = [chr(i) for i in range(32) if chr(i) not in [\'\\t\', \'\\n\', \'\\r\']]\n    invalid_chars.append(\',\')\n    for char in invalid_chars:\n        if char in input_string:\n            return False\n    return True\n', 'InternalObligorID': '\ndef validate_InternalObligorID(input_string):\n    """\n    Validates that the input string for Internal ID does not contain a carriage return, line feed, comma or any unprintable character.\n    """\n    invalid_chars = [chr(i) for i in range(32) if chr(i) not in [\'\\t\', \'\\n\', \'\\r\']]\n    invalid_chars.append(\',\')\n    for char in invalid_chars:\n        if char in input_string:\n            return False\n    return True\n', 'OriginalInternalObligorID': '\ndef validate_OriginalInternalObligorID(input_string):

In [171]:
# --------------------------------------------------
# Step 4: Load & Validate CSV Data
# --------------------------------------------------
def validate_csv(csv_path, field_rules, validation_code=None):
    """
    Validate every row of a CSV file with the generated per-field validators.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to check. Column names are matched against each
        rule's ``technical_name``.
    field_rules : list[dict]
        Field records; only the ``"technical_name"`` key is read here.
    validation_code : dict, optional
        Maps a technical name to the source text of a function named
        ``validate_<technical_name>``. When omitted, the notebook-level
        ``code`` variable is used, as before.

    Returns
    -------
    list[dict]
        One ``{"row": <index>, "errors": [<message>, ...]}`` entry per CSV row.
        The same information is also printed.

    Raises
    ------
    KeyError
        If a validator source does not define ``validate_<technical_name>``.

    Notes
    -----
    SECURITY: the validator sources are run with ``exec``. They are
    LLM-generated and therefore untrusted; review them before running this on
    a machine or data set that matters.
    """
    # Read every cell as text, exactly as written in the file. With pandas'
    # default type inference the validators (written for strings) would get
    # ints/floats/NaN, and identifiers such as "007" would lose leading zeros.
    df = pd.read_csv(csv_path, dtype=str, keep_default_na=False)

    if validation_code is None:
        # Backward-compatible fallback to the notebook-level variable.
        validation_code = code

    # Report rule fields that have no matching CSV column.
    required_columns = {field["technical_name"] for field in field_rules}
    missing_fields = required_columns - set(df.columns)
    if missing_fields:
        print("\n⚠️ Missing Fields in CSV:", missing_fields)

    # Compile each validator once, instead of once per row and field.
    validators = {}
    for field in field_rules:
        field_name = field["technical_name"]
        if field_name in validators:
            continue
        if field_name not in df.columns or field_name not in validation_code:
            continue
        function_name = f"validate_{field_name}"
        namespace = {}
        exec(validation_code[field_name], namespace)  # untrusted code, see Notes
        if function_name not in namespace:
            raise KeyError(
                f"Generated code for {field_name} does not define {function_name}"
            )
        validators[field_name] = namespace[function_name]

    # Validate each row
    validation_results = []
    for index, row in df.iterrows():
        row_errors = [
            f"Validation failed for {field_name}: {row[field_name]}"
            for field_name, validator in validators.items()
            if not validator(row[field_name])
        ]
        validation_results.append({"row": index, "errors": row_errors})

    # Print results
    for result in validation_results:
        if result["errors"]:
            print(f"\n❌ Row {result['row']} Errors:", result["errors"])
        else:
            print(f"\n✅ Row {result['row']} passed all validations.")

    return validation_results

# Example CSV file containing transaction data
csv_path = "llm_generated_dummy_data.csv"  # Replace with actual CSV file
# Alternative that regenerates the rules from the PDF text instead of reusing `rules`:
# validate_csv(csv_path, generate_rules(pdf_text))
# Validate the CSV against the already-extracted `rules`; results are printed per row.
validate_csv(csv_path, rules)

print("\nValidation Complete ✅")


✅ Row 0 passed all validations.

✅ Row 1 passed all validations.

✅ Row 2 passed all validations.

✅ Row 3 passed all validations.

✅ Row 4 passed all validations.

✅ Row 5 passed all validations.

✅ Row 6 passed all validations.

✅ Row 7 passed all validations.

✅ Row 8 passed all validations.

✅ Row 9 passed all validations.

✅ Row 10 passed all validations.

✅ Row 11 passed all validations.

✅ Row 12 passed all validations.

✅ Row 13 passed all validations.

✅ Row 14 passed all validations.

✅ Row 15 passed all validations.

✅ Row 16 passed all validations.

✅ Row 17 passed all validations.

✅ Row 18 passed all validations.

Validation Complete ✅
