In [1]:
import json
import glob
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Input locations. Both are relative paths, so they are resolved against the
# directory the notebook kernel is running in.
# JSON_FOLDER is passed to load_knowledge_base(), which reads its *.json files.
JSON_FOLDER = "StructuredRegulatoryDocuments"
# TXT_FOLDER is passed to load_test_documents(), which reads its *.txt files.
TXT_FOLDER = "Testset"

# Status message printed once the imports and path constants have been set.
print("Libraries imported and paths set.")

Libraries imported and paths set.


In [2]:
def load_knowledge_base(folder_path):
    """Load regulatory passages from every ``*.json`` file in ``folder_path``.

    Each file must contain a JSON array of objects. An object contributes one
    row when its ``"Passage"`` value is a string that is longer than 10
    characters after stripping surrounding whitespace. Entries that are not
    objects, or whose ``"Passage"`` is not a string, are skipped individually
    (and reported) instead of aborting the rest of the file.

    Args:
        folder_path: Directory searched (non-recursively) for ``*.json`` files.

    Returns:
        pandas.DataFrame with the columns ``source_file`` (base name of the
        JSON file), ``rule_id`` (the entry's ``"PassageID"``, or ``"Unknown"``
        when absent) and ``text`` (the unmodified passage). The frame is empty
        (``.empty`` is True) when no JSON files exist or no passage qualifies.
    """
    print(f"Loading regulatory rules from: {folder_path}...")
    columns = ["source_file", "rule_id", "text"]

    # Sorted so that row order -- and therefore the order in which equally
    # scored rules are ranked later -- does not depend on the filesystem.
    json_files = sorted(glob.glob(os.path.join(folder_path, "*.json")))

    if not json_files:
        print(f"Error: No JSON files found in {folder_path}")
        return pd.DataFrame(columns=columns)

    data_rows = []
    for path in json_files:
        # Only the read/parse step is guarded: OSError covers I/O failures,
        # ValueError covers both invalid JSON and invalid UTF-8.
        try:
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            print(f"Error reading {path}: {e}")
            continue

        if not isinstance(data, list):
            print(f"Error reading {path}: expected a JSON array of passage objects")
            continue

        source_file = os.path.basename(path)
        skipped = 0
        for item in data:
            if not isinstance(item, dict):
                skipped += 1
                continue

            passage = item.get("Passage")
            if not passage:
                # Missing or empty passages are simply not rules.
                continue
            if not isinstance(passage, str):
                skipped += 1
                continue

            # Extract only valid passages (very short strings are ignored).
            if len(passage.strip()) > 10:
                data_rows.append({
                    "source_file": source_file,
                    "rule_id": item.get("PassageID", "Unknown"),
                    "text": passage,
                })

        if skipped:
            print(f"Warning: skipped {skipped} malformed entries in {path}")

    print(f"Successfully loaded {len(data_rows)} regulations.")
    # Explicit columns keep the schema stable even when no row qualified.
    return pd.DataFrame(data_rows, columns=columns)

def load_test_documents(folder_path):
    """Read every ``*.txt`` file in ``folder_path`` as UTF-8 text.

    A file that cannot be opened or decoded is reported and skipped, so one
    bad file does not prevent the remaining documents from being loaded.

    Args:
        folder_path: Directory searched (non-recursively) for ``*.txt`` files.

    Returns:
        dict mapping each file's base name to its full text, inserted in
        sorted path order. An empty dict is returned when the folder holds no
        ``*.txt`` files (or none of them could be read).
    """
    print(f"Loading test documents from: {folder_path}...")

    # Sorted so documents are always processed and reported in the same order.
    txt_files = sorted(glob.glob(os.path.join(folder_path, "*.txt")))

    if not txt_files:
        print(f"Error: No TXT files found in {folder_path}")
        return {}

    docs = {}
    for path in txt_files:
        try:
            with open(path, "r", encoding="utf-8") as f:
                docs[os.path.basename(path)] = f.read()
        except (OSError, UnicodeDecodeError) as e:
            # Same reporting style as the JSON loader: note the failure, move on.
            print(f"Error reading {path}: {e}")

    return docs

# Status message printed once the two loader functions above have been defined.
print("Data loading functions defined.")

Data loading functions defined.


In [3]:
class ComplianceMatcher:
    """Match document paragraphs to regulatory rules by TF-IDF cosine similarity.

    The vectorizer is fitted on the ``text`` column of the rule table; each
    paragraph of a document is then projected into the same vector space and
    compared against every rule.
    """

    def __init__(self, df):
        """Fit the TF-IDF vectorizer on the rule texts.

        Args:
            df: pandas.DataFrame of rules. The ``text`` column is used for
                fitting; ``rule_id`` and ``source_file`` are read when matches
                are reported.
        """
        self.df = df
        # Initialize the Vectorizer to convert text to numbers
        self.vectorizer = TfidfVectorizer(stop_words="english")

        # Train the model on the Regulatory Knowledge Base
        print("Training NLP model on regulatory rules...")
        # Row i of rule_vectors corresponds to row i (by position) of self.df.
        self.rule_vectors = self.vectorizer.fit_transform(self.df["text"])

    def find_relevant_rules(self, doc_text, top_k=3, threshold=0.15):
        """Return the best-matching rules for each paragraph of ``doc_text``.

        Args:
            doc_text: Document text; paragraphs are separated by blank lines
                (``"\\n\\n"``) and only paragraphs longer than 50 characters
                are analysed.
            top_k: Maximum number of rules reported per paragraph. Values of
                zero or less yield no matches.
            threshold: A rule is reported only when its cosine similarity is
                strictly greater than this value.

        Returns:
            list of dicts, one per paragraph that has at least one match, with
            the keys ``paragraph_snippet`` (first 200 characters, followed by
            ``"..."`` only when the paragraph was actually truncated) and
            ``matches``. ``matches`` is ordered best-first and each entry has
            ``rule_id``, ``source``, ``rule_text`` and ``confidence`` (the
            similarity as a percentage rounded to two decimals).
        """
        results = []

        # Guard: with top_k == 0 the slice below would be [-0:], i.e. the whole
        # array, so "no matches requested" would return every rule.
        if top_k <= 0:
            return results

        # Split document into paragraphs for granular analysis
        paragraphs = [p for p in doc_text.split("\n\n") if len(p) > 50]

        for para in paragraphs:
            para_vec = self.vectorizer.transform([para])

            # Compare paragraph against all rules
            scores = cosine_similarity(para_vec, self.rule_vectors).flatten()

            # Indices of the top_k highest scores, best first. Slicing copes
            # with top_k larger than the number of rules.
            best_indices = scores.argsort()[-top_k:][::-1]

            matches = []
            for idx in best_indices:
                score = scores[idx]
                if score > threshold:
                    row = self.df.iloc[idx]
                    matches.append({
                        "rule_id": row["rule_id"],
                        "source": row["source_file"],
                        "rule_text": row["text"],
                        # Plain float (not a NumPy scalar) so results
                        # serialise and print cleanly.
                        "confidence": float(round(score * 100, 2))
                    })

            if matches:
                # Only mark the snippet as elided when text was cut off.
                snippet = para[:200] + "..." if len(para) > 200 else para
                results.append({
                    "paragraph_snippet": snippet,
                    "matches": matches
                })

        return results

# Status message printed once the ComplianceMatcher class above has been defined.
print("Compliance Engine class defined.")

Compliance Engine class defined.


In [4]:
# Step 1: read the regulatory rules and the documents to be audited.
kb_df = load_knowledge_base(JSON_FOLDER)
documents = load_test_documents(TXT_FOLDER)

# Step 2: the audit only runs when both the rule table and the document set
# are non-empty; otherwise a single explanatory message is printed.
if not kb_df.empty and documents:
    # Step 3: fit the matcher on the rule table.
    matcher = ComplianceMatcher(kb_df)

    # Step 4: print the report, one section per document.
    banner = "=" * 60
    print("\n" + banner)
    print("STARTING COMPLIANCE AUDIT REPORT")
    print(banner)

    for doc_name, content in documents.items():
        print(f"\nAnalyzing Document: {doc_name}")
        print("-" * 30)

        analysis_results = matcher.find_relevant_rules(content)

        # Nothing matched above the threshold for this document.
        if not analysis_results:
            print("  No significant regulatory risks found.")
            continue

        for section in analysis_results:
            snippet = section["paragraph_snippet"]
            print(f'\n[Document Section]: "{snippet}"')

            for match in section["matches"]:
                rule_id = match["rule_id"]
                confidence = match["confidence"]
                source = match["source"]
                # Only the first 150 characters of the rule text are shown.
                rule_preview = match["rule_text"][:150]
                print(f"   >>> Match: Rule {rule_id} (Confidence: {confidence}%)")
                print(f"       Source: {source}")
                print(f"       Text: {rule_preview}...")
else:
    print("Execution Stopped: Missing Data. Please check your folders.")

Loading regulatory rules from: StructuredRegulatoryDocuments...
Successfully loaded 12857 regulations.
Loading test documents from: Testset...
Training NLP model on regulatory rules...

STARTING COMPLIANCE AUDIT REPORT

Analyzing Document: T1.txt
------------------------------

[Document Section]: "Inter-Affiliate Service and Financial Support Agreement
Preamble: This Agreement is entered into to formalize the business and financial relationships between related entities within the corporate Gro..."
   >>> Match: Rule Part 17.202.(3) (Confidence: 35.62%)
       Source: 17.json
       Text: A direction to an Affiliate under subsection ‎(2) may include, without limitation, a requirement that the Affiliate—
(a)	limit any activities it under...
   >>> Match: Rule Part 17.202.(2) (Confidence: 31.03%)
       Source: 17.json
       Text: The Regulator may direct an Affiliate of a Financial Institution or Recognised Body to take Specified steps or not to carry out Specified activities i...
   