# Automated Regulatory Data Profiling System
## End-to-End Compliance Pipeline with AI/ML Components

### System Architecture:
1. PDF Parser & Rule Extractor
2. Validation Code Generator
3. Anomaly Detection Engine
4. Risk Scoring System
5. Remediation Advisor
6. Interactive Auditor UI

In [None]:
import re
from typing import List, Dict

import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

def enhanced_pdf_parser(pdf_path: str) -> List[Dict]:
    """Extract field definitions from a regulatory PDF with an LLM.

    The text returned by ``extract_text_from_pdf`` is whitespace-normalised,
    embedded in an extraction prompt, and handed to ``process_with_llm``.

    Args:
        pdf_path: Location of the PDF; forwarded to ``extract_text_from_pdf``.

    Returns:
        Whatever ``process_with_llm`` returns for the prompt (annotated as a
        list of dicts).
    """
    raw_text = extract_text_from_pdf(pdf_path)

    # Normalise whitespace line by line: collapse runs of spaces/tabs to a
    # single space, trim each line, and drop blank lines. Word and line
    # boundaries are deliberately preserved -- deleting every whitespace run
    # between word characters would fuse all words (and lines) together.
    # Re-joining words that the PDF split mid-word is left to the LLM, which
    # the prompt below instructs explicitly.
    normalised_lines = (' '.join(line.split()) for line in raw_text.splitlines())
    cleaned_text = '\n'.join(line for line in normalised_lines if line)

    # Enhanced LLM prompt with formatting examples
    prompt = f"""Extract fields from this regulatory document. 
    Handle variations in spacing and line breaks. Example conversion:
    'Obli gor Name' → 'ObligorName'
    Document Content:\n{cleaned_text}\n"""

    return process_with_llm(prompt)

In [None]:
def generate_validation_functions(rules: List[Dict]) -> Dict:
    """Generate Python source for one ``validate_<name>`` function per rule.

    Each rule is a dict with a required ``'technical_name'`` and optional
    ``'constraints'`` and ``'description'`` strings. The generated validator is
    chosen as follows:

    * ``'constraints'`` contains ``"Must not contain"``: the value must not
      contain a comma, CR, LF or any character with code point 0-31.
    * otherwise, ``'description'`` contains ``"country code"``: the value must
      be exactly two alphabetic characters.
    * otherwise: the validator always returns ``True``.

    Args:
        rules: Rule dicts as described above.

    Returns:
        Mapping of the rule's original ``technical_name`` to the source code
        (a string) of its validator function.

    Raises:
        KeyError: If a rule has no ``'technical_name'`` key.
    """
    validation_map = {}

    # Shared constraint checker
    invalid_chars = ['\r', '\n', ','] + [chr(i) for i in range(0,32)]

    for rule in rules:
        tech_name = rule['technical_name']

        # The name is spliced into generated source, so it must form a valid
        # identifier; otherwise the emitted ``def`` line is a syntax error.
        # Names that are already valid are used untouched. Distinct names can
        # collapse to the same function name (e.g. "a b" and "a_b"); the map
        # is still keyed by the original name.
        func_name = f"validate_{tech_name}"
        if not func_name.isidentifier():
            func_name = re.sub(r'\W', '_', func_name, flags=re.ASCII)

        # ``or ''`` tolerates keys that are present but set to None.
        constraints = rule.get('constraints') or ''
        description = rule.get('description') or ''

        if "Must not contain" in constraints:
            code = f"""def {func_name}(value):
                return not any(char in value for char in {invalid_chars})"""
        elif "country code" in description:
            code = f"""def {func_name}(value):
                return len(value) == 2 and value.isalpha()"""
        else:
            code = f"def {func_name}(value): return True"

        validation_map[tech_name] = code

    return validation_map

In [None]:
class RiskAnalyzer:
    """Score rows of a DataFrame by their distance from a cluster centroid.

    Numeric columns are standardised with ``StandardScaler`` and clustered
    with ``DBSCAN(eps=0.5, min_samples=5)``.
    """

    # Columns written by calculate_risk_scores. They are excluded from the
    # feature set so that re-scoring an already scored frame does not feed
    # earlier results back into the clustering.
    _OUTPUT_COLUMNS = ('cluster_group', 'risk_score')
    # Label DBSCAN assigns to samples that belong to no cluster.
    _NOISE_LABEL = -1

    def __init__(self):
        self.scaler = StandardScaler()
        self.cluster_model = DBSCAN(eps=0.5, min_samples=5)

    def calculate_risk_scores(self, df: pd.DataFrame) -> pd.DataFrame:
        """Calculate anomaly scores using unsupervised learning.

        Adds two columns to ``df`` **in place** and returns the same object:

        * ``cluster_group``: cluster label per row (``-1`` for noise).
        * ``risk_score``: Euclidean distance, in scaled feature space, from
          the row to the centroid of its cluster. Noise rows have no cluster
          of their own, so they are scored against the nearest cluster
          centroid, or against the overall mean when no cluster was found.

        Args:
            df: Data to score; only its numeric columns are used as features.

        Returns:
            ``df`` with ``cluster_group`` and ``risk_score`` columns set.

        Raises:
            ValueError: If ``df`` has no rows or no numeric feature columns.
        """
        numeric_cols = [
            col for col in df.select_dtypes(include='number').columns
            if col not in self._OUTPUT_COLUMNS
        ]
        if df.empty or not numeric_cols:
            raise ValueError(
                "calculate_risk_scores needs at least one row and one numeric column"
            )

        scaled_data = np.asarray(self.scaler.fit_transform(df[numeric_cols]), dtype=float)

        # Cluster analysis
        clusters = np.asarray(self.cluster_model.fit_predict(scaled_data))

        # One centroid per real cluster, computed once instead of once per row.
        centroids = {
            label: scaled_data[clusters == label].mean(axis=0)
            for label in np.unique(clusters)
            if label != self._NOISE_LABEL
        }

        # All indexing below is positional (boolean masks over the scaled
        # array), so the result does not depend on the DataFrame's index.
        scores = np.empty(len(scaled_data), dtype=float)
        for label, centroid in centroids.items():
            members = clusters == label
            scores[members] = np.linalg.norm(scaled_data[members] - centroid, axis=1)

        noise = clusters == self._NOISE_LABEL
        if noise.any():
            if centroids:
                # Distance from every noise row to every centroid; keep the
                # smallest. Averaging the noise rows themselves would give a
                # lone outlier a score of 0.
                stacked = np.stack(list(centroids.values()))
                gaps = scaled_data[noise][:, None, :] - stacked[None, :, :]
                scores[noise] = np.linalg.norm(gaps, axis=2).min(axis=1)
            else:
                overall_mean = scaled_data.mean(axis=0)
                scores[noise] = np.linalg.norm(scaled_data[noise] - overall_mean, axis=1)

        df['cluster_group'] = clusters
        df['risk_score'] = scores

        return df

In [None]:
def generate_remediation(failed_rules: List[str]) -> str:
    """Ask the generative model for remediation steps for failed validations.

    Args:
        failed_rules: The validation failures to remediate; interpolated into
            the prompt as-is.

    Returns:
        The ``.text`` of the response from ``model.generate_content``, or a
        fixed message when ``failed_rules`` is empty.
    """
    # With nothing to remediate, skip the model call: prompting with an empty
    # list costs a request and invites made-up remediation steps.
    if not failed_rules:
        return "No validation failures detected; no remediation required."

    prompt = f"Generate remediation steps for these validation failures: {failed_rules}"
    return model.generate_content(prompt).text

In [None]:
import gradio as gr

def create_auditor_interface():
    """Build the auditor UI as a Gradio ``Blocks`` app.

    Layout: two file uploads (regulatory PDF, transaction data) in one row, a
    risk-threshold slider (0-1, default 0.7) inside an "Advanced Settings"
    accordion, and a JSON panel for the compliance report.

    ``process_files`` is re-run with ``(pdf, csv, risk_threshold)`` whenever
    any of the three inputs changes, and its result is shown in the report
    panel.

    Returns:
        The ``gr.Blocks`` instance; the caller is responsible for launching it.
    """
    with gr.Blocks() as ui:
        with gr.Row():
            pdf_input = gr.File(label="Upload Regulatory PDF")
            csv_input = gr.File(label="Upload Transaction Data")

        with gr.Accordion("Advanced Settings"):
            risk_threshold = gr.Slider(0, 1, value=0.7, label="Risk Threshold")

        report_output = gr.JSON(label="Compliance Report")

        # Wire the events Blocks-style. Nesting a gr.Interface here would try
        # to render the components above a second time, outside the layout
        # they were just placed in. A change listener on every input gives
        # the same "live" re-run behaviour.
        # NOTE(review): process_files will be called while one of the files
        # is still missing -- confirm it handles that.
        inputs = [pdf_input, csv_input, risk_threshold]
        for component in inputs:
            component.change(
                fn=process_files,
                inputs=inputs,
                outputs=report_output,
            )

    return ui