In [None]:
!pip install PyPDF2 pandas numpy scikit-learn tqdm joblib

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import re
import pandas as pd
from tqdm import tqdm
from PyPDF2 import PdfReader

from google.colab import drive
drive.mount('/content/drive')

BASE_DIR = "/content/drive/MyDrive/sale_deeds"
NON_FRAUD_DIR = os.path.join(BASE_DIR, "non_fraud")
FRAUD_DIR     = os.path.join(BASE_DIR, "fraud")

print("Non-fraud dir:", NON_FRAUD_DIR)
print("Fraud dir:", FRAUD_DIR)
print("Non-fraud files:", os.listdir(NON_FRAUD_DIR))
print("Fraud files:", os.listdir(FRAUD_DIR))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Non-fraud dir: /content/drive/MyDrive/sale_deeds/non_fraud
Fraud dir: /content/drive/MyDrive/sale_deeds/fraud
Non-fraud files: ['SALE DEED 1.pdf', 'SALE DEED 2.pdf', 'SALE DEED 3.pdf', 'SALE DEED 4.pdf', 'SALE DEED 5.pdf', 'SALE DEED 6.pdf', 'SALE DEED 7.pdf', 'SALE DEED 8.pdf', 'SALE DEED 9.pdf', 'SALE DEED 10.pdf', 'SALE DEED 11.pdf', 'SALE DEED 12.pdf', 'SALE DEED 13.pdf', 'SALE DEED 14.pdf', 'SALE DEED 15.pdf', 'SALE DEED 16.pdf', 'SALE DEED 17.pdf', 'SALE DEED 18.pdf', 'SALE DEED 19.pdf', 'SALE DEED 20.pdf', 'SALE DEED 21.pdf', 'SALE DEED 22.pdf', 'SALE DEED 23.pdf', 'SALE DEED 24.pdf', 'SALE DEED 25.pdf', 'SALE DEED 26.pdf', 'SALE DEED 27.pdf', 'SALE DEED 28.pdf', 'SALE DEED 29.pdf', 'SALE DEED 30.pdf', 'SALE DEED 31.pdf', 'SALE DEED 32.pdf', 'SALE DEED 33.pdf', 'SALE DEED 34.pdf', 'SALE DEED 35.pdf', 'SALE DEED 36.pdf', 'SALE DEED 37.pdf', 'SALE DEED 3

In [None]:
def extract_text_from_pdf(path: str) -> str:
    text_chunks = []
    with open(path, "rb") as f:
        reader = PdfReader(f)
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:
                text_chunks.append(page_text)
    return "\n".join(text_chunks)


def extract_text_generic(path: str) -> str:
    path_lower = path.lower()
    if path_lower.endswith(".pdf"):
        try:
            return extract_text_from_pdf(path)
        except Exception as e:
            print(f"[WARN] Failed to read {path}: {e}")
            return ""
    else:
        print(f"[WARN] Unsupported file type: {path}")
        return ""


In [None]:
def compute_text_stats(text: str) -> dict:
    lower = text.lower()

    date_pattern = r"\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b"
    num_dates = len(re.findall(date_pattern, lower))

    amounts = re.findall(r"\b\d{5,}\b", lower)
    num_large_amounts = len(amounts)

    doc_length_chars = len(text)

    return {
        "num_dates_mentioned": num_dates,
        "num_large_amounts": num_large_amounts,
        "doc_length_chars": doc_length_chars,
    }


In [None]:
def extract_simple_features(text: str) -> dict:
    lower = text.lower()

    #Parties
    has_seller = "hereinafter called the \"seller\"" in lower or "hereinafter referred to as the \"seller\"" in lower or " seller" in lower
    has_buyer  = "hereinafter called the \"purchaser\"" in lower or "purchaser" in lower or " buyer" in lower

    # Property details
    has_flat_or_plot = ("flat no" in lower or "flat no." in lower or
                        "plot no" in lower or "plot no." in lower)
    has_survey = "survey no" in lower or "s.no" in lower or "gut no" in lower
    has_cts = "cts no" in lower or "c.t.s. no" in lower
    has_location = "situated at" in lower or "lying and being at" in lower or "within the limits of" in lower

    # Financials
    total_consideration = None
    m = re.search(r"(total sale consideration|total consideration|sale consideration)\s*(of)?\s*(rs\.?|rupees)?\s*([\d,]+)", lower)
    if m:
        amt_str = m.group(4).replace(",", "")
        try:
            total_consideration = int(amt_str)
        except:
            total_consideration = None

    has_token = "token amount" in lower or "earnest money" in lower or "bayana" in lower
    has_schedule = "shall be paid" in lower or "paid as under" in lower or "balance amount" in lower

    # Clauses
    has_indemnity = "indemnity" in lower or "indemnify" in lower
    has_dispute = "arbitration" in lower or "dispute" in lower
    has_jurisdiction = "jurisdiction" in lower
    has_possession_clause = "possession" in lower
    has_cancellation = "cancellation" in lower or "termination" in lower or "rescission" in lower

    return {
        "has_seller": int(has_seller),
        "has_buyer": int(has_buyer),
        "has_flat_or_plot": int(has_flat_or_plot),
        "has_survey": int(has_survey),
        "has_cts": int(has_cts),
        "has_location": int(has_location),
        "total_consideration": total_consideration or 0,
        "has_token": int(has_token),
        "has_schedule": int(has_schedule),
        "has_indemnity_clause": int(has_indemnity),
        "has_dispute_clause": int(has_dispute),
        "has_jurisdiction_clause": int(has_jurisdiction),
        "has_possession_clause": int(has_possession_clause),
        "has_cancellation_clause": int(has_cancellation),
    }


In [None]:
def make_feature_row(doc_id: str, label: int, text: str) -> dict:
    stats = compute_text_stats(text)
    feats = extract_simple_features(text)

    row = {
        "doc_id": doc_id,
        "label": label,

        # Parties
        "seller_present": feats["has_seller"],
        "buyer_present": feats["has_buyer"],

        # Property
        "flat_or_plot_present": feats["has_flat_or_plot"],
        "survey_no_present": feats["has_survey"],
        "cts_no_present": feats["has_cts"],
        "location_present": feats["has_location"],

        # Financials
        "total_consideration": feats["total_consideration"],
        "token_present": feats["has_token"],
        "payment_schedule_present": feats["has_schedule"],

        # Clauses
        "indemnity_clause": feats["has_indemnity_clause"],
        "dispute_clause": feats["has_dispute_clause"],
        "jurisdiction_clause": feats["has_jurisdiction_clause"],
        "possession_clause": feats["has_possession_clause"],
        "cancellation_clause": feats["has_cancellation_clause"],

        # Text stats
        "num_dates": stats["num_dates_mentioned"],
        "num_large_amounts": stats["num_large_amounts"],
        "doc_length_chars": stats["doc_length_chars"],
    }

    return row


In [None]:
rows = []

# Non-fraud: 0
for fname in tqdm(os.listdir(NON_FRAUD_DIR), desc="Non-fraud docs"):
    path = os.path.join(NON_FRAUD_DIR, fname)
    if not os.path.isfile(path) or not fname.lower().endswith(".pdf"):
        continue

    text = extract_text_generic(path)
    if not text.strip():
        print(f"[WARN] No text extracted from {fname}")
        continue

    row = make_feature_row(doc_id=fname, label=0, text=text)
    rows.append(row)

# Fraud docs: 1
for fname in tqdm(os.listdir(FRAUD_DIR), desc="Fraud docs"):
    path = os.path.join(FRAUD_DIR, fname)
    if not os.path.isfile(path) or not fname.lower().endswith(".pdf"):
        continue

    text = extract_text_generic(path)
    if not text.strip():
        print(f"[WARN] No text extracted from {fname}")
        continue

    row = make_feature_row(doc_id=fname, label=1, text=text)
    rows.append(row)

print("Total rows:", len(rows))


Non-fraud docs: 100%|██████████| 50/50 [00:49<00:00,  1.01it/s]
Fraud docs: 100%|██████████| 10/10 [00:09<00:00,  1.03it/s]

Total rows: 60





In [None]:
df = pd.DataFrame(rows)
print(df.head())
print("Shape:", df.shape)

out_path = os.path.join(BASE_DIR, "property_dataset_1.csv")
df.to_csv(out_path, index=False)
print("Saved dataset to:", out_path)


            doc_id  label  seller_present  buyer_present  \
0  SALE DEED 1.pdf      0               1              1   
1  SALE DEED 2.pdf      0               1              1   
2  SALE DEED 3.pdf      0               1              1   
3  SALE DEED 4.pdf      0               1              1   
4  SALE DEED 5.pdf      0               1              1   

   flat_or_plot_present  survey_no_present  cts_no_present  location_present  \
0                     0                  0               1                 0   
1                     1                  1               1                 0   
2                     1                  1               0                 0   
3                     0                  0               1                 0   
4                     1                  1               1                 0   

   total_consideration  token_present  payment_schedule_present  \
0                    0              1                         0   
1                    0  

In [None]:
BASE_DIR = "/content/drive/MyDrive/sale_deeds"

PDF_DIRS = [
    os.path.join(BASE_DIR, "non_fraud"),
    os.path.join(BASE_DIR, "fraud"),
]

def extract_text_from_pdf(path: str) -> str:
    """Extract full text from a text-based PDF using PyPDF2."""
    text_chunks = []
    with open(path, "rb") as f:
        reader = PdfReader(f)
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:
                text_chunks.append(page_text.strip())
    return "\n\n".join(text_chunks)


rows = []

for pdf_dir in PDF_DIRS:
    if not os.path.isdir(pdf_dir):
        continue

    label = 0 if os.path.basename(pdf_dir).lower() == "non_fraud" else 1

    for fname in tqdm(os.listdir(pdf_dir), desc=f"Processing {pdf_dir}"):
        if not fname.lower().endswith(".pdf"):
            continue

        path = os.path.join(pdf_dir, fname)
        full_text = extract_text_from_pdf(path)

        if not full_text.strip():
            print(f"[WARN] No text from {fname}")
            continue

        rows.append({
            "doc_id": fname,
            "label": label,
            "full_text": full_text
        })

doc_texts_1_df = pd.DataFrame(rows)
out_path = os.path.join(BASE_DIR, "doc_texts_1.csv")
doc_texts_1_df.to_csv(out_path, index=False, encoding="utf-8")
print("Saved doc_texts.csv to:", out_path)
print(doc_texts_1_df.head(3))


Processing /content/drive/MyDrive/sale_deeds/non_fraud: 100%|██████████| 50/50 [01:27<00:00,  1.76s/it]
Processing /content/drive/MyDrive/sale_deeds/fraud: 100%|██████████| 10/10 [00:17<00:00,  1.73s/it]

Saved doc_texts.csv to: /content/drive/MyDrive/sale_deeds/doc_texts_1.csv
            doc_id  label                                          full_text
0  SALE DEED 1.pdf      0  SALE DEED  \nThis Deed of Absolute Sale is exe...
1  SALE DEED 2.pdf      0  SALE DEED  \nExecuted on 4th August 2022 , by:...
2  SALE DEED 3.pdf      0  SALE DEED  \nThis Deed of Absolute Sale is exe...





In [None]:
DOC_TEXTS_PATH = os.path.join(BASE_DIR, "doc_texts.csv")
doc_texts_df = pd.read_csv(DOC_TEXTS_PATH)

def extract_clause_snippets(text: str, keywords, window_paragraphs: int = 1):
    """
    Split text into rough 'paragraphs' and return snippets around any keyword.
    window_paragraphs controls how much context (before/after) to include.
    """
    if not isinstance(text, str):
        text = str(text)

    paragraphs = re.split(r"\n\s*\n|\r\n\s*\r\n", text)
    snippets = []

    for i, para in enumerate(paragraphs):
        low = para.lower()
        if any(kw in low for kw in keywords):
            start = max(0, i - window_paragraphs)
            end = min(len(paragraphs), i + window_paragraphs + 1)
            chunk = "\n\n".join(p.strip() for p in paragraphs[start:end] if p.strip())
            if chunk and chunk not in snippets:
                snippets.append(chunk)

    return snippets

CLAUSE_KEYWORDS = {
    "indemnity": ["indemnity", "indemnify"],
    "possession": ["possession"],
    "jurisdiction": ["jurisdiction"],
    "dispute_resolution": ["arbitration", "dispute resolution", "dispute"],
    "cancellation": ["cancellation", "termination", "rescission"],
}

clause_rows = []

for _, row in doc_texts_df.iterrows():
    doc_id = row["doc_id"]
    label = row.get("label", None)
    full_text = row.get("full_text", "")

    if not isinstance(full_text, str) or not full_text.strip():
        continue

    for clause_type, kw_list in CLAUSE_KEYWORDS.items():
        kws = [k.lower() for k in kw_list]
        snippets = extract_clause_snippets(full_text, kws, window_paragraphs=1)

        for snip in snippets:
            clause_rows.append({
                "doc_id": doc_id,
                "label": label,
                "clause_type": clause_type,
                "raw_text": snip
            })

clause_df = pd.DataFrame(clause_rows)
clause_out_path = os.path.join(BASE_DIR, "clause_corpus.csv")
clause_df.to_csv(clause_out_path, index=False, encoding="utf-8")

print("Saved clause_corpus.csv to:", clause_out_path)
print(clause_df.head(10))
print("Total snippets:", len(clause_df))

Saved clause_corpus.csv to: /content/drive/MyDrive/sale_deeds/clause_corpus.csv
             doc_id  label         clause_type  \
0   SALE DEED 1.pdf      0          possession   
1   SALE DEED 2.pdf      0          possession   
2   SALE DEED 2.pdf      0  dispute_resolution   
3   SALE DEED 3.pdf      0  dispute_resolution   
4   SALE DEED 5.pdf      0          possession   
5   SALE DEED 6.pdf      0          possession   
6   SALE DEED 6.pdf      0  dispute_resolution   
7   SALE DEED 8.pdf      0          possession   
8   SALE DEED 9.pdf      0          possession   
9  SALE DEED 11.pdf      0          possession   

                                            raw_text  
0  SALE DEED  \nThis Deed of Absolute Sale is exe...  
1  SALE DEED  \nExecuted on 4th August 2022 , by:...  
2  SALE DEED  \nExecuted on 4th August 2022 , by:...  
3  SALE DEED  \nThis Deed of Absolute Sale is exe...  
4  SALE DEED  \nThis Deed of Absolute Sale is exe...  
5  SALE DEED  \nExecuted on 3rd Decembe

In [None]:
from typing import Dict, Any, List

MANDATORY_CLAUSES = {
    "indemnity":        ["indemnity", "indemnify"],
    "encumbrance":      ["encumbrance", "encumbrances", "clear and marketable title"],
    "possession":       ["possession", "vacant and peaceful possession"],
    "consideration":    ["sale consideration", "total consideration", "consideration"],
    "property_schedule":["schedule of property", "schedule", "property description"],
    "boundaries":       ["boundaries", "bounded as follows"],
    "jurisdiction":     ["jurisdiction"],
    "dispute_resolution":["arbitration", "dispute resolution"],
    "witnesses":        ["witness", "witnesses"],
    "parties":          ["seller", "purchaser", "buyer"],
}

SUSPICIOUS_PHRASES = [
    "seller shall not be responsible",
    "buyer shall bear all past dues",
    "possession shall be deemed",
    "jurisdiction shall be decided later",
    "as may be mutually decided in future",
    "without any further liability of the seller",
    "seller will not be liable for any past or future claims",
]

WEIGHTS = {
    "missing_mandatory_clause": 3,
    "too_short": 12,
    "no_dates": 8,
    "few_dates": 4,
    "no_numbers": 6,
    "very_few_numbers": 3,
    "missing_boundaries": 6,
    "missing_witnesses": 5,
    "missing_parties_block": 6,
    "suspicious_phrase": 5,
}

THRESHOLDS = {
    "NORMAL": 0,
    "LOW_RISK": 12,
    "MEDIUM_RISK": 25,
    "FRAUD": 40,
}

def count_dates(text: str) -> int:
    """
    Rough count of date-like patterns:
    12/02/2023, 12-02-2023, 14.02.2023 etc.
    """
    pattern = r"\b\d{1,2}[-/\.]\d{1,2}[-/\.]\d{2,4}\b"
    return len(re.findall(pattern, text))

def count_big_numbers(text: str) -> int:
    """
    Count numbers with 5+ digits.
    These usually correspond to amounts, CTS numbers etc.
    """
    return len(re.findall(r"\b\d{5,}\b", text))

def detect_boundaries(text_lower: str) -> bool:
    """
    Check if all four directions are mentioned at least once.
    Very typical in proper sale deeds.
    """
    return all(d in text_lower for d in ["east", "west", "north", "south"])

def detect_witness_section(text_lower: str) -> bool:
    return "witness" in text_lower or "witnesses" in text_lower

def detect_parties_block(text_lower: str) -> bool:
    """
    Very loose: just check if we have seller & purchaser/buyer present somewhere.
    """
    has_seller = "seller" in text_lower
    has_buyer  = "purchaser" in text_lower or "buyer" in text_lower
    return has_seller and has_buyer

def check_clause_presence(text_lower: str) -> Dict[str, bool]:
    presence = {}
    for clause_name, keywords in MANDATORY_CLAUSES.items():
        presence[clause_name] = any(kw in text_lower for kw in keywords)
    return presence

def find_suspicious_phrases(text_lower: str) -> List[str]:
    hits = []
    for phrase in SUSPICIOUS_PHRASES:
        if phrase in text_lower:
            hits.append(phrase)
    return hits

def analyze_deed(text: str) -> Dict[str, Any]:
    """
    Heuristic rule engine for a single sale deed text.
    Returns dict with risk score, label, and explanations.
    """
    if not isinstance(text, str):
        text = str(text or "")

    text = text.strip()
    t = text.lower()

    doc_len = len(text)
    num_dates = count_dates(text)
    num_big_numbers = count_big_numbers(text)
    has_boundaries = detect_boundaries(t)
    has_witnesses = detect_witness_section(t)
    has_parties   = detect_parties_block(t)

    clause_presence = check_clause_presence(t)
    missing_clauses = [name for name, present in clause_presence.items() if not present]

    suspicious_hits = find_suspicious_phrases(t)

    score = 0
    reasons = []

    if doc_len < 2500:
        score += WEIGHTS["too_short"]
        reasons.append(f"Document very short: {doc_len} characters. (+{WEIGHTS['too_short']})")

    if missing_clauses:
        delta = WEIGHTS["missing_mandatory_clause"] * len(missing_clauses)
        score += delta
        reasons.append(f"{len(missing_clauses)} mandatory clauses not detected: {', '.join(missing_clauses)} (+{delta})")

    if num_dates == 0:
        score += WEIGHTS["no_dates"]
        reasons.append(f"No date patterns detected. (+{WEIGHTS['no_dates']})")
    elif num_dates < 3:
        score += WEIGHTS["few_dates"]
        reasons.append(f"Very few dates detected ({num_dates}). (+{WEIGHTS['few_dates']})")

    if num_big_numbers == 0:
        score += WEIGHTS["no_numbers"]
        reasons.append(f"No large numbers (amounts/CTS/etc.) detected. (+{WEIGHTS['no_numbers']})")
    elif num_big_numbers < 3:
        score += WEIGHTS["very_few_numbers"]
        reasons.append(f"Very few large numbers detected ({num_big_numbers}). (+{WEIGHTS['very_few_numbers']})")

    if not has_boundaries:
        score += WEIGHTS["missing_boundaries"]
        reasons.append(f"Full four-direction boundaries (East/West/North/South) not found. (+{WEIGHTS['missing_boundaries']})")

    if not has_witnesses:
        score += WEIGHTS["missing_witnesses"]
        reasons.append(f"No explicit witness section detected. (+{WEIGHTS['missing_witnesses']})")

    if not has_parties:
        score += WEIGHTS["missing_parties_block"]
        reasons.append(f'Could not detect clear "Seller/Purchaser" party block. (+{WEIGHTS["missing_parties_block"]})')

    if suspicious_hits:
        extra = WEIGHTS["suspicious_phrase"] * len(suspicious_hits)
        score += extra
        reasons.append(f"Suspicious phrases found: {', '.join(suspicious_hits)} (+{extra})")

    if score >= THRESHOLDS["FRAUD"]:
        level = "FRAUD"
    elif score >= THRESHOLDS["MEDIUM_RISK"]:
        level = "MEDIUM_RISK"
    elif score >= THRESHOLDS["LOW_RISK"]:
        level = "LOW_RISK"
    else:
        level = "NORMAL"

    return {
        "risk_score": score,
        "risk_level": level,
        "missing_mandatory_clauses": missing_clauses,
        "suspicious_phrases": suspicious_hits,
        "details": {
            "doc_length_chars": doc_len,
            "num_dates": num_dates,
            "num_big_numbers": num_big_numbers,
            "has_boundaries": has_boundaries,
            "has_witness_section": has_witnesses,
            "has_parties_block": has_parties,
            "clause_presence": clause_presence,
            "reasons": reasons,
        }
    }


In [None]:
from PyPDF2 import PdfReader
import os

def pdf_to_text(path: str) -> str:
    reader = PdfReader(path)
    chunks = []
    for page in reader.pages:
        t = page.extract_text() or ""
        chunks.append(t)
    return "\n\n".join(chunks)

sample_normal = os.path.join(NON_FRAUD_DIR, os.listdir(NON_FRAUD_DIR)[0])
sample_fraud  = os.path.join(FRAUD_DIR, os.listdir(FRAUD_DIR)[0])

print("Testing normal file:", sample_normal)
print("Testing fraud file:", sample_fraud)

normal_text = pdf_to_text(sample_normal)
fraud_text  = pdf_to_text(sample_fraud)

normal_res = analyze_deed(normal_text)
fraud_res  = analyze_deed(fraud_text)

print("\n===== NORMAL RESULT =====")
print("Risk level:", normal_res["risk_level"])
print("Risk score:", normal_res["risk_score"])
print("\nReasons:")
for r in normal_res["details"]["reasons"]:
    print("-", r)

print("\n===== FRAUD RESULT =====")
print("Risk level:", fraud_res["risk_level"])
print("Risk score:", fraud_res["risk_score"])
print("\nReasons:")
for r in fraud_res["details"]["reasons"]:
    print("-", r)

Testing normal file: /content/drive/MyDrive/sale_deeds/non_fraud/SALE DEED 1.pdf
Testing fraud file: /content/drive/MyDrive/sale_deeds/fraud/FRAUD SALE DEED 1.pdf

===== NORMAL RESULT =====
Risk level: LOW_RISK
Risk score: 20

Reasons:
- 3 mandatory clauses not detected: indemnity, jurisdiction, dispute_resolution (+9)
- No date patterns detected. (+8)
- Very few large numbers detected (2). (+3)

===== FRAUD RESULT =====
Risk level: NORMAL
Risk score: 11

Reasons:
- No date patterns detected. (+8)
- Very few large numbers detected (2). (+3)


In [None]:
import os
import pandas as pd

results = []

for fname in os.listdir(NON_FRAUD_DIR):
    if not fname.lower().endswith(".pdf"):
        continue

    path = os.path.join(NON_FRAUD_DIR, fname)
    try:
        text = pdf_to_text(path)
        res = analyze_deed(text)
    except Exception as e:
        print(f"[ERROR] Failed on non-fraud file {fname}: {e}")
        continue

    results.append({
        "file": fname,
        "true_label": "NORMAL",
        "predicted_label": res["risk_level"],
        "risk_score": res["risk_score"],
        "doc_length_chars": res["details"]["doc_length_chars"],
        "num_dates": res["details"]["num_dates"],
        "num_big_numbers": res["details"]["num_big_numbers"],
        "missing_clauses": ", ".join(res["missing_mandatory_clauses"]),
        "suspicious_phrases": ", ".join(res["suspicious_phrases"]),
    })

for fname in os.listdir(FRAUD_DIR):
    if not fname.lower().endswith(".pdf"):
        continue

    path = os.path.join(FRAUD_DIR, fname)
    try:
        text = pdf_to_text(path)
        res = analyze_deed(text)
    except Exception as e:
        print(f"[ERROR] Failed on fraud file {fname}: {e}")
        continue

    results.append({
        "file": fname,
        "true_label": "FRAUD",
        "predicted_label": res["risk_level"],
        "risk_score": res["risk_score"],
        "doc_length_chars": res["details"]["doc_length_chars"],
        "num_dates": res["details"]["num_dates"],
        "num_big_numbers": res["details"]["num_big_numbers"],
        "missing_clauses": ", ".join(res["missing_mandatory_clauses"]),
        "suspicious_phrases": ", ".join(res["suspicious_phrases"]),
    })

df_results = pd.DataFrame(results)
print(df_results.head())
print("Total docs processed:", len(df_results))

df_results["correct"] = df_results["true_label"] == df_results["predicted_label"]
print("\nAccuracy:", df_results["correct"].mean())

out_path = os.path.join(BASE_DIR, "rule_engine_results_60docs.csv")
df_results.to_csv(out_path, index=False)
print("Saved results to:", out_path)


              file true_label predicted_label  risk_score  doc_length_chars  \
0  SALE DEED 1.pdf     NORMAL        LOW_RISK          20              3453   
1  SALE DEED 2.pdf     NORMAL        LOW_RISK          20              2747   
2  SALE DEED 3.pdf     NORMAL        LOW_RISK          20              3131   
3  SALE DEED 4.pdf     NORMAL        LOW_RISK          23              2802   
4  SALE DEED 5.pdf     NORMAL        LOW_RISK          20              2872   

   num_dates  num_big_numbers  \
0          0                2   
1          0                3   
2          0                3   
3          0                3   
4          0                3   

                                     missing_clauses suspicious_phrases  
0        indemnity, jurisdiction, dispute_resolution                     
1  indemnity, boundaries, jurisdiction, dispute_r...                     
2  indemnity, possession, jurisdiction, dispute_r...                     
3  indemnity, possession, cons

In [None]:
df_results.head()

Unnamed: 0,file,true_label,predicted_label,risk_score,doc_length_chars,num_dates,num_big_numbers,missing_clauses,suspicious_phrases,correct
0,SALE DEED 1.pdf,NORMAL,LOW_RISK,20,3453,0,2,"indemnity, jurisdiction, dispute_resolution",,False
1,SALE DEED 2.pdf,NORMAL,LOW_RISK,20,2747,0,3,"indemnity, boundaries, jurisdiction, dispute_r...",,False
2,SALE DEED 3.pdf,NORMAL,LOW_RISK,20,3131,0,3,"indemnity, possession, jurisdiction, dispute_r...",,False
3,SALE DEED 4.pdf,NORMAL,LOW_RISK,23,2802,0,3,"indemnity, possession, consideration, jurisdic...",,False
4,SALE DEED 5.pdf,NORMAL,LOW_RISK,20,2872,0,3,"indemnity, boundaries, jurisdiction, dispute_r...",,False


In [None]:
df_results.tail()

Unnamed: 0,file,true_label,predicted_label,risk_score,doc_length_chars,num_dates,num_big_numbers,missing_clauses,suspicious_phrases,correct
55,FRAUD SALE DEED 6.pdf,FRAUD,NORMAL,8,2973,0,3,,,False
56,FRAUD SALE DEED 7.pdf,FRAUD,NORMAL,8,3006,0,4,,,False
57,FRAUD SALE DEED 8.pdf,FRAUD,NORMAL,11,2888,0,2,,,False
58,FRAUD SALE DEED 9.pdf,FRAUD,NORMAL,8,3111,0,3,,,False
59,FRAUD SALE DEED 10.pdf,FRAUD,NORMAL,8,3005,0,3,,,False
