In [7]:
!pip install -q -U google-generativeai pdfplumber pandas regex

import os
import re
import json
import pandas as pd
import pdfplumber
import google.generativeai as genai
from google.colab import drive
from google.colab import userdata

# Mount Google Drive at /content/drive; BASE_DIR below points inside this mount.
drive.mount('/content/drive')

# Folder on the mounted Drive that holds the CSV artefacts read by this cell.
BASE_DIR = "/content/drive/MyDrive/sale_deeds"

# Logical dataset name -> full path of its CSV file under BASE_DIR.
PATHS = {
    dataset: os.path.join(BASE_DIR, csv_name)
    for dataset, csv_name in (
        ("clause_corpus", "clause_corpus.csv"),
        ("doc_texts", "doc_texts.csv"),
        ("property_dataset", "property_dataset_1.csv"),
    )
}

def load_reference_data():
    """
    Load the reference CSVs that serve as the 'Knowledge Base' for the AI.

    Reads the clause corpus and the property dataset from the locations in
    PATHS and prints one status line per file.

    Each file is loaded independently: a missing or unreadable file is
    reported and skipped, and never prevents the other file from loading.

    Returns:
        dict: may contain "clauses" (clause_corpus.csv) and/or "properties"
        (property_dataset_1.csv), each a pandas DataFrame. A key is absent
        when its file was not found or could not be read.
    """
    data = {}
    print(f"Loading Reference Data from: {BASE_DIR}")

    # (key in the returned dict, key in PATHS, file label, status-line suffix)
    sources = (
        ("clauses", "clause_corpus", "clause_corpus.csv", "examples found"),
        ("properties", "property_dataset", "property_dataset_1.csv", "docs indexed"),
    )

    for result_key, path_key, label, suffix in sources:
        csv_path = PATHS[path_key]

        if not os.path.exists(csv_path):
            print(f"File not found: {csv_path}")
            continue

        # Best-effort boundary: the try covers only the read, so a corrupt
        # or empty CSV is reported without aborting the remaining loads.
        try:
            frame = pd.read_csv(csv_path)
        except Exception as e:
            print(f"Error loading data: {e}")
            continue

        data[result_key] = frame
        print(f"Loaded '{label}' ({len(frame)} {suffix})")

    return data

# Loaded once when the cell runs; build_system_prompt() reads the "clauses" entry.
REFERENCE_DB = load_reference_data()

def extract_text_from_pdf(pdf_path):
    """
    Pull the text out of every page of a PDF.

    Pages for which pdfplumber yields no text are skipped; every other
    page's text is followed by a newline.

    Returns:
        tuple: (text, None) on success, or (None, error_message) when
        opening or reading the PDF raises.
    """
    page_chunks = []
    try:
        with pdfplumber.open(pdf_path) as document:
            for pg in document.pages:
                page_text = pg.extract_text()
                if not page_text:
                    continue
                page_chunks.append(page_text + "\n")
    except Exception as exc:
        return None, f"Error reading PDF: {exc}"
    return "".join(page_chunks), None

# Matches either a day + month-name + year date ("10 December 2025",
# "10th Dec, 2025", "10-Dec-2025") or an all-numeric date ("10/12/2025").
# The ordinal suffix is optional so plain "10 December 2025" is recognised.
_DATE_RE = re.compile(
    r"\d{1,2}(?:st|nd|rd|th)?[-/\s]+"
    r"(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*"
    r"[-/,.\s]+\d{2,4}"
    r"|\d{1,2}[-/]\d{1,2}[-/]\d{2,4}",
    re.IGNORECASE,
)

# Whole-word match so that words such as "courtyard" do not count as a
# reference to a court. Applied to the lower-cased text.
_COURT_RE = re.compile(r"\bcourts?\b")


def run_rule_engine(text):
    """
    Deterministic checks for High-Risk items.

    Args:
        text (str): full text of the document.

    Returns:
        list[dict]: one {"risk": ..., "severity": ...} entry per failed
        check, in this order:
          - "Possession Date Missing" (HIGH): no date of any kind was found
            anywhere in the text (the check does not tie the date to a
            possession clause).
          - "Missing Indemnity Clause" (HIGH): the text contains no word
            starting with "indemni" (indemnity, indemnify, ...).
          - "No Dispute Resolution Clause" (MEDIUM): the text mentions
            neither "arbitration" nor the word "court"/"courts".
    """
    flags = []
    text_lower = text.lower()

    if _DATE_RE.search(text) is None:
        flags.append({"risk": "Possession Date Missing", "severity": "HIGH"})

    if "indemni" not in text_lower:
        flags.append({"risk": "Missing Indemnity Clause", "severity": "HIGH"})

    if "arbitration" not in text_lower and _COURT_RE.search(text_lower) is None:
        flags.append({"risk": "No Dispute Resolution Clause", "severity": "MEDIUM"})

    return flags

def build_system_prompt(rule_flags):
    """
    Build the auditor prompt, injecting real examples from 'clause_corpus.csv'.

    Looks up the "clauses" DataFrame in REFERENCE_DB and embeds the first
    600 characters of the first row labelled 0 (shown as the NORMAL example)
    and of the first row labelled 1 (shown as the FRAUD example).

    Fallbacks:
      - "No reference data loaded." when the DataFrame is absent or empty.
      - "Error reading reference data." when the examples cannot be
        extracted (e.g. a missing column, or no row for one of the labels).

    Args:
        rule_flags: JSON-serialisable flags from the rule engine; embedded
            in the prompt via json.dumps.

    Returns:
        str: the complete prompt text.
    """
    try:
        df = REFERENCE_DB.get("clauses")
        if df is not None and not df.empty:
            normal_txt = df[df['label'] == 0]['raw_text'].iloc[0][:600]
            fraud_txt = df[df['label'] == 1]['raw_text'].iloc[0][:600]

            examples_block = f"""
            YOUR TRAINING DATA (Ground Truth):
            --- EXAMPLE 1: NORMAL (SAFE) ---
            "{normal_txt}..."

            --- EXAMPLE 2: FRAUD (RISKY) ---
            "{fraud_txt}..."
            """
        else:
            examples_block = "No reference data loaded."
    except Exception:
        # Deliberate best-effort: any data problem degrades to a prompt
        # without examples. `Exception` (not a bare except) keeps
        # KeyboardInterrupt / SystemExit propagating.
        examples_block = "Error reading reference data."

    return f"""
    You are an AI Legal Auditor (Project: Property Transactions).

    {examples_block}

    INPUT DATA:
    1. AUTOMATED RULE FLAGS: {json.dumps(rule_flags)}
    2. DOCUMENT TEXT: (Provided below)

    YOUR TASK:
    1. **Extract**: Parties, Property, Price.
    2. **Explain**: Convert legalese to simple English (Section 3.E).
    3. **Verify**: Check if the Rule Flags are correct or False Positives.
    4. **Verdict**: Compare the text against the TRAINING DATA examples above.

    OUTPUT FORMAT (JSON):
    {{
      "summary": {{ "parties": "...", "financials": "..." }},
      "clause_audit": [
        {{ "clause": "Indemnity", "status": "Present", "explanation": "Simple English..." }}
      ],
      "risks": [ {{ "issue": "...", "severity": "High" }} ],
      "final_verdict": "NORMAL / SUSPICIOUS"
    }}
    """

def run_gemini_combined_analysis(text, rule_flags):
    """
    Run the combined Gemini audit on a document in a single model call.

    The system instruction asks the model for three things in one JSON
    reply: an executive summary of the deal, a section-by-section
    explanation in simple English, and a risk assessment that also reviews
    the rule-engine flags.

    Args:
        text: raw document text, sent as the user content.
        rule_flags: JSON-serialisable rule-engine flags, embedded in the
            system instruction.

    Returns:
        dict: the parsed JSON reply, or {"error": "LLM Pipeline Failed: ..."}
        if any step (key lookup, model call, JSON parsing) raises.
    """
    flags_json = json.dumps(rule_flags)

    audit_instructions = f"""
    You are an AI Legal Assistant for Indian Property Law.

    INPUT CONTEXT:
    1. AUTOMATED RULE FLAGS: {flags_json}
    2. RAW TEXT: (Provided below)

    YOUR TASK:
    Perform a complete legal audit and output a JSON with exactly these three sections:

    PART 1: EXECUTIVE SUMMARY
    - Identify Parties (Seller, Buyer).
    - Identify Property (Address, Area, Survey No).
    - Identify Financials (Total Price, Advance, Balance).

    PART 2: SECTION-BY-SECTION EXPLAINER (The NLP Module)
    - Walk through every major clause found (e.g. Indemnity, Possession, Termination).
    - For each clause, provide:
      A. **Legal Meaning**: A brief technical summary.
      B. **Simple Explanation**: A 1-sentence translation for a layperson (e.g. "This protects you if...").

    PART 3: RISK ASSESSMENT
    - Evaluate the 'Rule Flags'. Are they correct or False Positives?
    - Identify hidden risks (vague timelines, missing dispute resolution).
    - Assign a Severity (High/Medium/Low) to each risk.

    OUTPUT FORMAT (Strict JSON):
    {{
      "executive_summary": {{
        "parties": "Seller: [Name] | Buyer: [Name]",
        "property_details": "[Address & Area]",
        "financial_terms": "[Total Consideration]"
      }},
      "section_analysis": [
        {{
          "section_title": "e.g. Indemnity Clause",
          "legal_summary": "Seller agrees to compensate for title defects...",
          "simple_explanation": "If someone claims they own this land later, the seller must pay you back."
        }},
        {{
          "section_title": "e.g. Possession Clause",
          "legal_summary": "...",
          "simple_explanation": "..."
        }}
      ],
      "risk_report": [
        {{ "issue": "...", "severity": "High/Med", "advice": "..." }}
      ],
      "final_verdict": "NORMAL / SUSPICIOUS"
    }}
    """

    try:
        secret = userdata.get('GEMINI_API_KEY')
        genai.configure(api_key=secret)

        llm = genai.GenerativeModel(
            'gemini-flash-latest',
            system_instruction=audit_instructions,
        )

        reply = llm.generate_content(
            text,
            generation_config=genai.types.GenerationConfig(
                # Keep sampling close to deterministic for factual output.
                temperature=0.2,
                response_mime_type="application/json",
            ),
        )
        parsed = json.loads(reply.text)
    except Exception as exc:
        return {"error": f"LLM Pipeline Failed: {exc}"}

    return parsed

target_file = "/content/DEED OF ABSOLUTE SALE.pdf"

# os.path.join() discards BASE_DIR when target_file is an absolute path (as it
# is here), so the path actually checked is kept in a variable and reported
# verbatim instead of claiming the file is missing "in BASE_DIR".
target_path = os.path.join(BASE_DIR, target_file)

if os.path.exists(target_path):
    # NOTE(review): analyze_document is not defined in this cell — presumably
    # it comes from an earlier notebook cell; confirm before running standalone.
    report = analyze_document(target_file)
    print(json.dumps(report, indent=2))
else:
    print(f"File not found: {target_path}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading Reference Data from: /content/drive/MyDrive/sale_deeds
Loaded 'clause_corpus.csv' (91 examples found)
Loaded 'property_dataset_1.csv' (60 docs indexed)

📄 Processing: /content/DEED OF ABSOLUTE SALE.pdf
{
  "summary": {
    "parties": "Seller: Mr. Aniket Vishwas Deshmukh (45, Business, Pune). Purchaser: Mrs. Priya Rajesh Kulkarni (38, Service, Pune).",
    "financials": "Total Consideration: Rs. 85,00,000/- (Eighty-Five Lakhs Only). Payment was made via traceable methods: Cheque (Rs. 5,00,000) and RTGS Transfer (Rs. 80,00,000, UTR HDFCR520251210).",
    "property": "Residential Flat No. A-502, 5th Floor, 'Silver Oak Enclave', Survey No. 45/1A, Baner, Pune. Built-up Area: 1150 Sq. Ft."
  },
  "clause_audit": [
    {
      "clause": "Indemnity",
      "status": "Present",
      "explanation": "The Seller guarantees that they will financially protect t