In [1]:
import pytesseract

pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

print("Using:", pytesseract.pytesseract.tesseract_cmd)


Using: C:\Program Files\Tesseract-OCR\tesseract.exe


In [2]:
from pathlib import Path

BASE = Path.cwd()
PDF_DIR = BASE              # your PDFs (case1.pdf, case2.pdf, …) are in the root folder
OUT_RAW = BASE / "outputs" / "raw_text"
OUT_LOG = BASE / "outputs" / "logs"

for p in [OUT_RAW, OUT_LOG]:
    p.mkdir(parents=True, exist_ok=True)

print("Ready.\nPDF_DIR:", PDF_DIR, "\nOUT_RAW:", OUT_RAW, "\nOUT_LOG:", OUT_LOG)


Ready.
PDF_DIR: C:\Users\bbuser\Desktop\legal-extractor 
OUT_RAW: C:\Users\bbuser\Desktop\legal-extractor\outputs\raw_text 
OUT_LOG: C:\Users\bbuser\Desktop\legal-extractor\outputs\logs


In [3]:
import fitz  # PyMuPDF
import cv2, re, json
import numpy as np
from PIL import Image
from datetime import datetime

def direct_text_from_page(page, min_chars=50):
    """Try to extract text directly from a PDF page.
    If text length is too small, return None."""
    txt = page.get_text("text") or ""
    txt = re.sub(r"\s+", " ", txt).strip()
    return txt if len(txt) >= min_chars else None

def ocr_text_from_page(page, dpi=300, lang="ara+eng"):
    """Render PDF page as image and run OCR."""
    pix = page.get_pixmap(dpi=dpi, alpha=False)
    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    frame = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    gray = cv2.bilateralFilter(gray, 7, 50, 50)
    thr  = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                 cv2.THRESH_BINARY, 35, 11)
    return pytesseract.image_to_string(thr, lang=lang)

def extract_pdf_to_txt(pdf_path, out_txt_path, log_path):
    """Extract text from a PDF:
       - Direct text extraction if possible
       - Otherwise OCR
       Save raw text + log file with OCR pages info.
    """
    doc = fitz.open(pdf_path)
    all_text, ocr_pages = [], []
    for i, page in enumerate(doc, start=1):
        t = direct_text_from_page(page)
        if t is None:
            t = ocr_text_from_page(page)
            ocr_pages.append(i)
        all_text.append((t or "").strip())
    doc.close()

    # save text file
    out_txt_path.write_text(("\n\n==== [PAGE BREAK] ====\n\n").join(all_text), encoding="utf-8")

    # save log
    log = {
        "pdf": pdf_path.name,
        "timestamp": datetime.now().isoformat(timespec="seconds"),
        "pages_total": len(all_text),
        "ocr_pages": ocr_pages
    }
    log_path.write_text(json.dumps(log, ensure_ascii=False, indent=2), encoding="utf-8")
    return log


In [4]:
cases = sorted(PDF_DIR.glob("case*.pdf"))
if not cases:
    raise FileNotFoundError("No case*.pdf files found in the project folder.")

sample_pdf = cases[0]  # first file (case1.pdf)
out_txt = OUT_RAW / f"{sample_pdf.stem}.txt"
out_log = OUT_LOG / f"{sample_pdf.stem}.json"

log = extract_pdf_to_txt(sample_pdf, out_txt, out_log)
print("✅ Text extraction completed.")
print("Raw text file:", out_txt)
print("Log file:", out_log)
print("Summary:", log)


✅ Text extraction completed.
Raw text file: C:\Users\bbuser\Desktop\legal-extractor\outputs\raw_text\case1.txt
Log file: C:\Users\bbuser\Desktop\legal-extractor\outputs\logs\case1.json
Summary: {'pdf': 'case1.pdf', 'timestamp': '2025-08-27T01:42:20', 'pages_total': 5, 'ocr_pages': [1, 2, 3, 4, 5]}


In [5]:
results = []
for pdf in sorted(PDF_DIR.glob("case*.pdf")):
    out_txt = OUT_RAW / f"{pdf.stem}.txt"
    out_log = OUT_LOG / f"{pdf.stem}.json"
    results.append(extract_pdf_to_txt(pdf, out_txt, out_log))
results


[{'pdf': 'case1.pdf',
  'timestamp': '2025-08-27T01:42:43',
  'pages_total': 5,
  'ocr_pages': [1, 2, 3, 4, 5]},
 {'pdf': 'case2.pdf',
  'timestamp': '2025-08-27T01:46:34',
  'pages_total': 25,
  'ocr_pages': [1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25]},
 {'pdf': 'case3.pdf',
  'timestamp': '2025-08-27T01:53:30',
  'pages_total': 75,
  'ocr_pages': [1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   30,
   31,
   32,
   33,
   34,
   35,
   36,
   37,
   38,
   39,
   40,
   41,
   42,
   43,
   44,
   45,
   46,
   47,
   48,
   49,
   50,
   51,
   52,
   53,
   54,
   55,
   56,
   57,
   58,
   59,
   60,
   61,
   62,
   63,
   64,
   65,
   66,
   67,
   68,
   69,
   70,
   71,
   72,
   73,
   74,
   75]},
 {'pdf': 'cas

In [7]:
import re
import pandas as pd

def extract_legal_fields(text):
    # Regex patterns
    phone_pattern = re.compile(r"(?:\+?\d[\d\s\-]{7,15})")
    email_pattern = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")

    # Court
    court = None
    court_match = re.search(r"(?:أمام|امام)?\s*المحكمة.+?(?:\n|$)", text)
    if court_match:
        court = court_match.group(0).strip()

    # Plaintiff
    plaintiff = None
    plaintiff_match = re.search(r"(?:مقدمة من|المدعي.?[:：]?)\s*(.+?)(?:[،\n\"“])", text)
    if plaintiff_match:
        plaintiff = plaintiff_match.group(1).strip()

    # Defendant
    defendant = None
    defendant_match = re.search(r"(?:ضد|المدعى عليه.?[:：]?)\s*(.+?)(?:[،\n\"“])", text)
    if defendant_match:
        defendant = defendant_match.group(1).strip()

    # Phones (exclude lawyer numbers near كلمة "مكتب" or "محاماة")
    phones = phone_pattern.findall(text)
    phones_clean = []
    for p in phones:
        ctx = text[max(0, text.find(p)-25): text.find(p)+25]
        if "مكتب" not in ctx and "محام" not in ctx:
            phones_clean.append(p)

    # Emails (exclude lawyers)
    emails = email_pattern.findall(text)
    emails_clean = [e for e in emails if "law" not in e.lower() and "office" not in e.lower()]

    return {
        "court": court,
        "plaintiff": plaintiff,
        "defendant": defendant,
        "phone": phones_clean[0] if phones_clean else None,
        "email": emails_clean[0] if emails_clean else None
    }

# Process all extracted .txt files
records = []
for txt_file in sorted(OUT_RAW.glob("case*.txt")):
    text = txt_file.read_text(encoding="utf-8", errors="ignore")
    fields = extract_legal_fields(text)
    fields["file"] = txt_file.name
    records.append(fields)

# Build DataFrame
df = pd.DataFrame(records, columns=["file", "court", "plaintiff", "defendant", "phone", "email"])
df.to_csv("cases_extracted.csv", index=False, encoding="utf-8-sig")

df


Unnamed: 0,file,court,plaintiff,defendant,phone,email
0,case1.txt,أمام المحكمة الإبتدائية بمسقط,ويمثلها والوكيل عنها / سالم بن محمد بن سليم ال...,الثاني,985 1566,
1,case2.txt,المحكمة لطلب المستأنف ضدها وأصدر القرار القضائي,:‎ ca 05/0/١١ ‏الصادربتار يخ‎,/,5\n0018860,mj-co@omantel.net.om
2,case3.txt,المحكمة العليا,بعض ورثة/ راشد بن حميد بن سعيد المعمري. وهم:,كل من/ عبد الحكيم بن محمد بن عبدالله القاسمي و...,95094994\n,advocate.nasser@gmail.com
3,case4.txt,,: شركة االنامل السحرية املتحدة.,,99255479,sameralahd@gmail.com
4,case5.txt,,: شركة االنامل السحرية املتحدة.,,99255479,sameralahd@gmail.com
5,case6.txt,,شركة الفالح للمواش ي و التجارة,,99247030 - 99255,
