In [6]:
# Install required packages:
# pip install transformers torch rapidfuzz pandas
import torch
import re
import pandas as pd
from transformers import pipeline
from rapidfuzz import process, fuzz

# 1) Load and clean OCR text
with open("Telangana Entities\\ocr txt\\1806-1-2025-642_Redacted.txt", "r", encoding="utf-8") as f:
    raw = f.read()

def clean_text(t):
    t = re.sub(r"[^\x00-\x7F]+", " ", t)  # drop non‑ASCII noise
    t = re.sub(r"\s+", " ", t).strip()    # normalize whitespace
    return t

text = clean_text(raw)

# 2) Domain‑specific regex patterns
patterns = [
    r"by[:-]\s*([A-Z][A-Z\s]+?)(?=\s|,|\.)",
    r"IN FAVOUR OF\s*([A-Z][A-Z\s]+?)(?=\s|,|\.)",
    r"Transferar\s*\( Name of previous.*?\):\s*([A-Z][A-Z\s]+?)(?=\s|\|)",
    r"Transferree\s*\( Name of PT.*?\):\s*([A-Z][A-Z\s]+?)(?=\s|\|)",
]
regex_matches = []
for p in patterns:
    regex_matches += re.findall(p, text)

# 3) Transformer NER for PERSON
ner = pipeline(
    "ner",
    model="dslim/bert-base-NER",
    aggregation_strategy="simple",
    device=0 if torch.cuda.is_available() else -1,
)
ner_ents = ner(text)
ner_names = [e["word"].upper() for e in ner_ents if e["entity_group"] == "PER"]

# 4) Combine & fuzzy‐dedupe
candidates = [n.strip() for n in regex_matches + ner_names if len(n.strip()) > 2]
# Use rapidfuzz to cluster names with similarity ≥ 85%
unique = []
for name in candidates:
    # if no existing cluster is similar, keep it
    if not any(fuzz.partial_ratio(name, u) > 85 for u in unique):
        unique.append(name)

# 5) Save to DataFrame / CSV
df = pd.DataFrame({"Name": unique})
print(df.to_string(index=False))
df.to_csv("extracted_names.csv", index=False)


config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

ValueError: Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.