In [1]:
import os
import json
import re
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from datetime import datetime
from tqdm import tqdm

# Docling imports
from docling.document_converter import DocumentConverter
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions

# LLM imports
import ollama

# Supabase
from supabase import create_client, Client

# Validation
from jsonschema import validate, ValidationError

print(" All libraries imported successfully")

 All libraries imported successfully


In [2]:
!nvidia-smi

Mon Nov 24 08:36:09 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 575.57.08              Driver Version: 575.57.08      CUDA Version: 12.9     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA L4                      On  |   00000000:3C:00.0 Off |                    0 |
| N/A   38C    P8             16W /   72W |       0MiB /  23034MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
# Configuration: paths, credentials, LLM and batch-processing settings.
from dotenv import load_dotenv

# Load a local .env file first so that every os.getenv() call below
# (including the PDF_DIR override) can see the values it defines.
load_dotenv()

# Paths
# The fallback is the original author's local folder. Set the PDF_DIR
# environment variable (or add it to .env) to run on another machine
# without editing the notebook.
PDF_DIR = os.getenv("PDF_DIR") or r"C:\Users\samue\Documents\Work\Code\valuation_data_miner\data"
OUTPUT_DIR = "./extracted_data"
ERROR_LOG_DIR = "./errors"

# Supabase credentials (None when the variables are not set)
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_KEY = os.getenv("SUPABASE_KEY")
SUPABASE_TABLE = "property_valuations"

# LLM settings
LLM_MODEL = "mistral"
LLM_TEMPERATURE = 0.1

# Processing settings
BATCH_SIZE = 5
MAX_RETRIES = 2

# Create directories
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(ERROR_LOG_DIR, exist_ok=True)

print(" Configuration loaded")
print(f"  PDF Directory: {PDF_DIR}")
print(f"  Output Directory: {OUTPUT_DIR}")
print(f"  LLM Model: {LLM_MODEL}")

# Surface a wrong/missing input folder here instead of a silent
# "0 PDFs found" several cells later.
if not os.path.isdir(PDF_DIR):
    print(f"  WARNING: PDF directory does not exist: {PDF_DIR}")
    print("  Set the PDF_DIR environment variable (or add it to .env) to point at your PDFs")

 Configuration loaded
  PDF Directory: C:\Users\samue\Documents\Work\Code\valuation_data_miner\data
  Output Directory: ./extracted_data
  LLM Model: mistral


In [4]:
# JSON Schema used to validate the dict produced by the LLM extraction step.
# Only the four top-level keys listed under "required" are mandatory; nested
# objects declare the expected type of each known field but do not list any
# required members, and keys not mentioned here are not rejected.
VALUATION_SCHEMA = {
    "type": "object",
    "required": ["property_id", "valuation_report", "property_details", "valuations"],
    "properties": {
        "property_id": {"type": "string"},
        # Report metadata.
        "valuation_report": {
            "type": "object",
            "properties": {
                "report_reference": {"type": "string"},
                "valuer": {"type": "string"},
                # Array counterpart of "valuer": name/qualification objects.
                "valuers": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "name": {"type": "string"},
                            "qualification": {"type": "string"}
                        }
                    }
                },
                # Dates are only checked to be strings; no format is enforced.
                "inspection_date": {"type": "string"},
                "report_date": {"type": "string"},
                "client": {"type": "string"},
                "client_address": {"type": "string"},
                "purpose": {"type": "string"}
            }
        },
        # Property identification; the object-typed members are free-form.
        "property_details": {
            "type": "object",
            "properties": {
                "apartment_number": {"type": "string"},
                "block": {"type": "string"},
                "floor": {"type": "string"},
                "title_details": {"type": "object"},
                "location": {"type": "object"},
                "tenure": {"type": "object"},
                "registered_proprietors": {"type": "array"},
                "ownership_type": {"type": "string"},
                "encumbrances": {"type": "string"}
            }
        },
        "property_description": {"type": "object"},
        "apartment_details": {"type": "object"},
        "occupancy": {"type": "string"},
        "condition": {"type": "string"},
        "market_assessment": {"type": "object"},
        "valuations": {
            "type": "object",
            "properties": {
                "current_market_value": {
                    "type": "object",
                    "properties": {
                        # Must be a JSON number; a string amount fails validation.
                        "amount": {"type": "number"},
                        "currency": {"type": "string"},
                        "amount_words": {"type": "string"}
                    }
                }
            }
        },
        "valuation_methodology": {"type": "array"},
        "lease_details": {"type": "object"},
        "compliance": {"type": "object"}
    }
}

print("JSON schema defined")

JSON schema defined


In [5]:
import os
from typing import Optional
from supabase import create_client, Client
from dotenv import load_dotenv

# dotenv_path = r"C:\Users\samue\Documents\Work\Code\valuation_data_miner\.env"

# with open(dotenv_path, "r") as f:
#     for line in f:
#         if line.strip() == "" or line.startswith("#"):
#             continue
#         key, value = line.strip().split("=", 1)
#         os.environ[key] = value  # set environment variable

load_dotenv() 


def init_supabase() -> Optional[Client]:
    """Create a Supabase client from SUPABASE_URL / SUPABASE_KEY.

    Returns:
        The client on success, or None when either environment variable is
        missing/empty or when client creation raises. A one-line status
        message is printed in every case.
    """
    try:
        url, key = (os.getenv(name) for name in ("SUPABASE_URL", "SUPABASE_KEY"))

        # Both values are needed; an empty string counts as "not set".
        if not (url and key):
            print("Supabase credentials not configured")
            return None

        supabase = create_client(url, key)
    except Exception as e:
        print(f" Failed to initialize Supabase: {e}")
        return None
    else:
        print("Supabase client initialized")
        return supabase

supabase_client = init_supabase()

Supabase credentials not configured


In [6]:
import pytesseract
from pdf2image import convert_from_path
import tempfile
import os

def process_pdf_with_docling(pdf_path: str, ocr_threshold: int = 200) -> str:
    """
    Robust hybrid PDF extraction:
    1. Extract text using Docling (markdown export plus per-item text)
    2. Detect missing text (fewer than ``ocr_threshold`` characters)
    3. Run fallback Tesseract OCR on every page when text is missing
    4. Merge into final extracted text

    Args:
        pdf_path: Path of the PDF to read.
        ocr_threshold: Minimum number of Docling-extracted characters below
            which the OCR fallback is run. Defaults to 200.

    Returns:
        The extracted text with runs of 3+ newlines collapsed to one blank line.

    Raises:
        Exception: "Docling processing failed: ..." wrapping any error raised
            during conversion or OCR (the original error is chained).
    """
    try:
        print("    → Processing PDF with Docling...")

        # Imported inside the function, as in the original cell.
        from docling.document_converter import DocumentConverter

        # No options required for Docling 2.x
        converter = DocumentConverter()
        doc = converter.convert(pdf_path).document

        # ---------------------------
        # (1) DOC TEXT EXTRACTION
        # ---------------------------
        markdown_text = (doc.export_to_markdown() or "").replace("<!-- image -->", "").strip()

        # Supplement the markdown with item text it does not already contain.
        extracted_raw = []
        try:
            for entry in doc.iterate_items():
                # iterate_items() yields (item, level) tuples; also accept a
                # bare item so either shape works.
                item = entry[0] if isinstance(entry, tuple) else entry
                text = getattr(item, "text", None)
                if not isinstance(text, str):
                    continue
                text = text.strip()
                if not text or text == "<!-- image -->":
                    continue
                # Skip text already present in the markdown export so the
                # document is not duplicated in the output.
                if text in markdown_text:
                    continue
                extracted_raw.append(text)
        except Exception as item_err:
            # Best effort: the markdown export is still usable on its own.
            print(f"    ⚠ Could not read individual Docling items: {item_err}")

        combined_docling_text = (markdown_text + "\n" + "\n".join(extracted_raw)).strip()

        print(f"    → Docling text extracted: {len(combined_docling_text)} chars")

        # ---------------------------
        # (2) IF TEXT < THRESHOLD → RUN OCR
        # ---------------------------
        need_ocr = len(combined_docling_text) < ocr_threshold

        ocr_text = ""

        if need_ocr:
            print("    ⚠ Low text detected → Running Tesseract OCR fallback...")
            print("    → Converting PDF pages to images...")

            ocr_chunks = []
            # OCR runs inside the context manager because the rendered page
            # images are written under tmpdir, which is removed on exit.
            with tempfile.TemporaryDirectory() as tmpdir:
                pages = convert_from_path(pdf_path, dpi=300, output_folder=tmpdir)

                for page_no, page in enumerate(pages, start=1):
                    print(f"        → OCR page {page_no}/{len(pages)}")
                    page_text = pytesseract.image_to_string(page, lang="eng")
                    ocr_chunks.append(f"\n--- OCR Page {page_no} ---\n{page_text.strip()}\n")

            ocr_text = "".join(ocr_chunks)
            print(f"    → OCR extracted: {len(ocr_text)} chars")

        # ---------------------------
        # (3) MERGE RESULTS
        # ---------------------------
        final_text = combined_docling_text
        if need_ocr:
            final_text = combined_docling_text + "\n\n" + ocr_text

        # Clean formatting: collapse 3+ consecutive newlines to a blank line.
        final_text = re.sub(r"\n{3,}", "\n\n", final_text).strip()

        print(f"    → Final extracted text: {len(final_text)} chars")

        return final_text

    except Exception as e:
        print(f"    ✗ Docling failed: {str(e)}")
        raise Exception(f"Docling processing failed: {str(e)}") from e

print("✓ Hybrid Docling + Tesseract OCR processor loaded")


✓ Hybrid Docling + Tesseract OCR processor loaded


In [7]:
def validate_extracted_data(data: Dict) -> Tuple[bool, Optional[str]]:
    """
    Check ``data`` against VALUATION_SCHEMA.

    Returns:
        ``(True, None)`` when the data conforms, otherwise ``(False, message)``
        where ``message`` is the string form of the validation error.
    """
    try:
        validate(instance=data, schema=VALUATION_SCHEMA)
    except ValidationError as schema_error:
        return False, str(schema_error)
    else:
        return True, None

print(" Validation function defined")

 Validation function defined


### LLM extraction with Mistral

In [8]:
def _isolate_json_payload(raw_reply: str) -> str:
    """Return the substring of an LLM reply most likely to be the JSON object."""
    text = raw_reply.strip()

    # Prefer the contents of a fenced code block when one is present.
    if '```json' in text:
        text = text.split('```json')[1].split('```')[0]
    elif '```' in text:
        text = text.split('```')[1].split('```')[0]
    text = text.strip()

    # Drop any prose before the first '{' or after the last '}'.
    start = text.find('{')
    end = text.rfind('}')
    if start != -1 and end > start:
        text = text[start:end + 1]
    return text


def _fallback_extraction(filename: str) -> Dict:
    """Build the placeholder record used when the LLM reply cannot be parsed."""
    return {
        "property_id": Path(filename).stem,
        "valuation_report": {
            "report_reference": "",
            "valuer": "",
            "inspection_date": "",
            "report_date": "",
            "client": "",
            "purpose": ""
        },
        "property_details": {},
        "property_description": {},
        "apartment_details": {},
        "valuations": {
            "current_market_value": {
                "amount": 0,
                "currency": "KES"
            }
        },
        "valuation_methodology": [],
        # Marker so downstream consumers can tell placeholders from real data.
        "_extraction_note": "LLM failed to return valid JSON, using template"
    }


def extract_with_llm(text_content: str, filename: str, max_chars: int = 12000) -> Dict:
    """
    Extract structured data from text using the local Ollama model (LLM_MODEL).

    Args:
        text_content: Document text; only the first ``max_chars`` characters
            are sent to the model.
        filename: Source file name. Its stem names the debug/error files and
            is used as ``property_id`` when the model does not supply one.
        max_chars: Maximum number of characters of ``text_content`` placed in
            the prompt. Defaults to 12000.

    Returns:
        The parsed JSON object. If the reply is not a parseable JSON object, a
        placeholder record carrying an ``_extraction_note`` key is returned
        instead and the reply is saved under ERROR_LOG_DIR.

    Raises:
        Exception: "LLM extraction failed: ..." when the Ollama call fails or
            the reply is empty/very short (the original error is chained).
    """

    # Truncate content if too long
    text_sample = text_content[:max_chars]

    # Simplified, more direct prompt
    prompt = f"""You are extracting property valuation data. Return ONLY a JSON object, nothing else.

Extract these fields from the document:
- property_id
- valuation_report (report_reference, valuer, inspection_date, report_date, client, purpose)
- property_details (location, title_details, tenure)
- valuations (current_market_value with amount and currency)
- valuation_methodology (array of methods)

Document text:
{text_sample}

Return ONLY the JSON object with this structure:
{{
  "property_id": "",
  "valuation_report": {{}},
  "property_details": {{}},
  "property_description": {{}},
  "apartment_details": {{}},
  "valuations": {{}},
  "valuation_methodology": []
}}"""

    try:
        print(f"    → Sending to Mistral LLM...")

        response = ollama.chat(
            model=LLM_MODEL,
            messages=[
                {"role": "user", "content": prompt}
            ],
            # Ask Ollama to constrain the reply to JSON.
            format="json",
            options={
                # Use the notebook-level setting rather than a second literal.
                "temperature": LLM_TEMPERATURE,
                "num_predict": 4096
            }
        )

        # Extract response
        raw_reply = response['message']['content']
        response_text = raw_reply.strip()
        stem = Path(filename).stem

        # Debug: Save raw response
        debug_file = os.path.join(ERROR_LOG_DIR, f"{stem}_raw_response.txt")
        with open(debug_file, 'w', encoding='utf-8') as f:
            f.write(f"Raw LLM Response:\n{response_text}\n\n")
            f.write(f"Length: {len(response_text)} chars\n")

        print(f"    → LLM returned {len(response_text)} characters")

        # Check if response is empty
        if len(response_text) < 10:
            raise Exception(f"LLM returned empty or very short response: '{response_text}'")

        # Clean response: strip code fences and surrounding prose
        response_text = _isolate_json_payload(response_text)

        # Try to parse JSON
        used_fallback = False
        try:
            extracted_data = json.loads(response_text)
            # A valid JSON array/scalar is still unusable here: the rest of
            # the pipeline expects a dict.
            if not isinstance(extracted_data, dict):
                raise ValueError(
                    f"expected a JSON object, got {type(extracted_data).__name__}"
                )
        except ValueError as e:  # json.JSONDecodeError is a ValueError
            # Save failed response for debugging
            error_file = os.path.join(ERROR_LOG_DIR, f"{stem}_json_error.txt")
            with open(error_file, 'w', encoding='utf-8') as f:
                f.write(f"JSON Parse Error: {str(e)}\n\n")
                f.write(f"Cleaned Response:\n{response_text}\n\n")
                f.write(f"Original Response:\n{raw_reply}")

            # Return minimal valid structure instead of failing
            print(f"    ⚠ JSON parsing failed, creating minimal structure")
            extracted_data = _fallback_extraction(filename)
            used_fallback = True

        # Ensure property_id exists
        if not extracted_data.get('property_id'):
            extracted_data['property_id'] = stem

        # Do not report success for the placeholder record.
        if not used_fallback:
            print(f"    → Extraction successful")
        return extracted_data

    except Exception as e:
        print(f"    ✗ LLM extraction error: {str(e)[:100]}")
        raise Exception(f"LLM extraction failed: {str(e)}") from e

print("✓ Fixed LLM extraction function defined")

✓ Fixed LLM extraction function defined


In [9]:
def check_ollama_status(model: Optional[str] = None):
    """Verify Ollama is running and that the configured model answers.

    Args:
        model: Model name to test. Defaults to the notebook-level LLM_MODEL so
            the check exercises the same model the pipeline uses.

    Returns:
        True when listing models and a one-message test chat both succeed;
        False (after printing start-up instructions) on any error.
    """
    try:
        model_name = model or LLM_MODEL

        print("Checking Ollama status...")
        response = ollama.list()
        models = [m.model for m in response.models]
        print(f"✓ Ollama is running")
        print(f"  Available models: {models}")

        # Test a simple chat
        print(f"\nTesting '{model_name}' model with simple query...")
        test_response = ollama.chat(
            model=model_name,
            messages=[{'role': 'user', 'content': 'Return only this JSON: {"test": "success"}'}],
            options={'temperature': 0.1, 'num_predict': 100}
        )
        print(f"✓ Test response: {test_response['message']['content'][:100]}")
        return True
    except Exception as e:
        print(f"✗ Ollama check failed: {e}")
        print("\nMake sure Ollama is running:")
        print("  1. Open terminal/command prompt")
        print("  2. Run: ollama serve")
        print("  3. Keep it running in background")
        return False

check_ollama_status()

Checking Ollama status...
✗ Ollama check failed: Failed to connect to Ollama. Please check that Ollama is downloaded, running and accessible. https://ollama.com/download

Make sure Ollama is running:
  1. Open terminal/command prompt
  2. Run: ollama serve
  3. Keep it running in background


False

In [10]:
def test_single_pdf():
    """Test processing one PDF to debug issues"""
    pdf_files = list(Path(PDF_DIR).glob("*.pdf"))
    if not pdf_files:
        print("No PDFs found")
        return
    
    test_file = pdf_files[0]
    print(f"\n{'='*60}")
    print(f"Testing with: {test_file.name}")
    print(f"{'='*60}\n")
    
    try:
        # Step 1: Extract text
        print("Step 1: Extracting text...")
        text = process_pdf_with_docling(str(test_file))
        print(f"✓ Extracted {len(text)} characters")
        print(f"\nFirst 500 chars:\n{text[:500]}\n")
        
        # Step 2: LLM extraction
        print("\nStep 2: LLM extraction...")
        data = extract_with_llm(text, test_file.name)
        print(f"✓ Extracted data")
        print(json.dumps(data, indent=2)[:500])
        
        # Step 3: Validate
        print("\nStep 3: Validating...")
        is_valid, error = validate_extracted_data(data)
        if is_valid:
            print("✓ Validation passed")
        else:
            print(f"⚠ Validation warning: {error[:100]}")
        
        # Step 4: Save
        print("\nStep 4: Saving...")
        output_file = os.path.join(OUTPUT_DIR, f"{test_file.stem}.json")
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        print(f"✓ Saved to {output_file}")
        
        return True
        
    except Exception as e:
        print(f"\n✗ Test failed: {str(e)}")
        import traceback
        traceback.print_exc()
        return False

# Run the single-file smoke test immediately when this cell executes.
test_single_pdf()

# Reminder banner. Note that it is printed after the smoke test above has
# already run, and that process_all_pdfs() is defined in a later cell.
print("\n" + "="*60)
print("READY TO TEST")
print("="*60)
print("\n1. First run: test_single_pdf()")
print("2. If successful, run: results = process_all_pdfs()")

No PDFs found

READY TO TEST

1. First run: test_single_pdf()
2. If successful, run: results = process_all_pdfs()


### Supabase upload

In [11]:
def upload_to_supabase(client: Client, data: Dict, filename: str) -> bool:
    """
    Insert one extracted record into the SUPABASE_TABLE table.

    The record is a shallow copy of ``data`` with ``source_filename`` and
    ``processed_at`` (local ISO timestamp) added; ``data`` is not modified.

    Returns:
        False when ``client`` is None, True after a successful insert.

    Raises:
        Exception: "Supabase upload failed: ..." when the insert raises.
    """
    if client is None:
        return False

    try:
        # Add metadata on a copy so the caller's dict stays untouched.
        record = {
            **data,
            'source_filename': filename,
            'processed_at': datetime.now().isoformat(),
        }

        # Insert into Supabase
        client.table(SUPABASE_TABLE).insert(record).execute()
    except Exception as e:
        raise Exception(f"Supabase upload failed: {str(e)}")

    return True

print("Supabase upload function defined")

Supabase upload function defined


### Processing

In [12]:
def process_single_pdf(pdf_path: str, filename: str) -> Dict:
    """
    Process a single PDF through the entire pipeline.

    Stages, in order: 'docling' (text extraction), 'extraction' (LLM),
    'validation', 'saving' (JSON under OUTPUT_DIR), 'upload' (only when a
    Supabase client exists and validation passed), then 'complete'.

    Args:
        pdf_path: Path to the PDF file.
        filename: File name; its stem names the output and error-log files.

    Returns:
        A dict with keys 'filename', 'success', 'stage', 'error' and 'data'.
        On failure 'stage' holds the stage that raised and 'error' its
        message. A validation failure alone does not fail the run: 'error'
        carries the validation message while 'success' is still True.
        This function does not raise for pipeline errors.
    """
    result = {
        'filename': filename,
        'success': False,
        'stage': None,
        'error': None,
        'data': None
    }
    stem = Path(filename).stem

    try:
        # Stage 1: Docling OCR
        result['stage'] = 'docling'
        text_content = process_pdf_with_docling(pdf_path)

        # Stage 2: LLM Extraction
        result['stage'] = 'extraction'
        extracted_data = extract_with_llm(text_content, filename)

        # Stage 3: Validation
        result['stage'] = 'validation'
        is_valid, validation_error = validate_extracted_data(extracted_data)

        if not is_valid:
            result['error'] = f"Validation failed: {validation_error}"

        result['data'] = extracted_data

        # Stage 4: Save locally (even when validation failed, for inspection)
        result['stage'] = 'saving'
        output_file = os.path.join(OUTPUT_DIR, f"{stem}.json")
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(extracted_data, f, indent=2, ensure_ascii=False)

        # Stage 5: Upload to Supabase (valid records only)
        if supabase_client and is_valid:
            result['stage'] = 'upload'
            upload_to_supabase(supabase_client, extracted_data, filename)

        result['success'] = True
        result['stage'] = 'complete'

    except Exception as e:
        result['error'] = str(e)

        # Log error. UTF-8 is explicit because error text may contain
        # characters the platform default encoding cannot represent.
        error_file = os.path.join(ERROR_LOG_DIR, f"{stem}_error.txt")
        try:
            with open(error_file, 'w', encoding='utf-8') as f:
                f.write(f"Stage: {result['stage']}\n")
                f.write(f"Error: {result['error']}\n")
        except OSError as log_err:
            # A failed log write must not abort the caller's batch loop.
            print(f"    ⚠ Could not write error log {error_file}: {log_err}")

    return result

print(" Main pipeline function defined")

 Main pipeline function defined


### Batch processing

In [13]:
def process_all_pdfs():
    """
    Run process_single_pdf on every ``*.pdf`` directly inside PDF_DIR.

    Prints per-file status and a final summary, and writes
    ``processing_summary.json`` (timestamp, counts, per-file results) to
    OUTPUT_DIR.

    Returns:
        The list of per-file result dicts, or an empty list when PDF_DIR
        contains no PDF.
    """
    # Get all PDF files
    pdf_files = list(Path(PDF_DIR).glob("*.pdf"))

    if len(pdf_files) == 0:
        print(f"✗ No PDF files found in {PDF_DIR}")
        return []

    rule = '=' * 60
    print("\n" + rule)
    print(f"Found {len(pdf_files)} PDF files to process")
    print(rule + "\n")

    results = []

    # Process each PDF
    for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
        print(f"\n[{pdf_file.name}]")
        outcome = process_single_pdf(str(pdf_file), pdf_file.name)
        results.append(outcome)

        if not outcome['success']:
            print(f"  ✗ Failed at stage '{outcome['stage']}': {outcome['error']}")
        else:
            print("  ✓ Successfully processed")

    # Summary
    print("\n" + rule)
    print("PROCESSING SUMMARY")
    print(rule)

    total = len(results)
    successful = len([entry for entry in results if entry['success']])
    failed = total - successful

    print(f"Total PDFs: {total}")
    print(f"Successful: {successful}")
    print(f"Failed: {failed}")
    print(f"\nOutput saved to: {OUTPUT_DIR}")
    if failed > 0:
        print(f"Error logs saved to: {ERROR_LOG_DIR}")

    # Save summary report
    report = {
        'timestamp': datetime.now().isoformat(),
        'total': total,
        'successful': successful,
        'failed': failed,
        'results': results
    }
    summary_file = os.path.join(OUTPUT_DIR, "processing_summary.json")
    with open(summary_file, 'w') as sink:
        json.dump(report, sink, indent=2)

    return results

print("Batch processing function defined")


Batch processing function defined


In [14]:
# Count the input PDFs first so an empty or misconfigured PDF_DIR is reported
# before the batch run is started.
pdf_count = sum(1 for _ in Path(PDF_DIR).glob("*.pdf"))
print(f"Found {pdf_count} PDF files in {PDF_DIR}")

if pdf_count > 0:
    print("\n" + "="*60)
    print("STARTING PDF PROCESSING PIPELINE")
    print("="*60)
    print("\n".join([
        "\nThis will:",
        "1. Extract text from PDFs using Docling OCR",
        "2. Extract structured data using Mistral LLM",
        "3. Validate against JSON schema",
        "4. Save to local JSON files",
        "5. Upload to Supabase (if configured)",
    ]))
    print("\n" + "="*60)

    # RUN THE PIPELINE
    # `results` is only defined on this branch; later cells check for it.
    results = process_all_pdfs()

    print("\nPROCESSING COMPLETE!")
    print(f"Check results in: {OUTPUT_DIR}")
else:
    print("\nWARNING: No PDF files found!")
    print(f"Please check that PDFs exist in: {PDF_DIR}")

Found 0 PDF files in C:\Users\samue\Documents\Work\Code\valuation_data_miner\data

Please check that PDFs exist in: C:\Users\samue\Documents\Work\Code\valuation_data_miner\data


### Ollama setup

In [15]:
import ollama

# List the models Ollama has installed and report whether LLM_MODEL is among
# them, either under its bare name or with the ":latest" tag.
try:
    response = ollama.list()
    print("RAW MODEL RESPONSE:", response)

    models = response.models  # list of Model objects
    model_names = [entry.model for entry in models]  # names live on .model

    print("Detected models:", model_names)

    if any(candidate in model_names for candidate in (LLM_MODEL, f"{LLM_MODEL}:latest")):
        print(f"✓ Model '{LLM_MODEL}' is available")
    else:
        print(f"⚠ Warning: Model '{LLM_MODEL}' not found in Ollama")
        print(f"Available models: {model_names}")
        print("\nTo install Mistral, run in terminal: ollama pull mistral")

except Exception as e:
    print(f"✗ Could not check Ollama models: {e}")
    print("Make sure Ollama is running (run 'ollama serve' in terminal')")


✗ Could not check Ollama models: Failed to connect to Ollama. Please check that Ollama is downloaded, running and accessible. https://ollama.com/download
Make sure Ollama is running (run 'ollama serve' in terminal')


### View sample results

In [16]:
# Show one extracted JSON document as a quick check of the pipeline output.
# Sorted so the same file is picked on every run.
json_files = sorted(Path(OUTPUT_DIR).glob("*.json"))
json_files = [f for f in json_files if f.name != "processing_summary.json"]

if json_files:
    sample_file = json_files[0]
    print(f"Sample output from: {sample_file.name}\n")
    print("="*60)

    # The pipeline writes these files as UTF-8 with ensure_ascii=False, so
    # they must be read back as UTF-8 rather than the platform default.
    with open(sample_file, 'r', encoding='utf-8') as f:
        sample_data = json.load(f)

    print(json.dumps(sample_data, indent=2))
else:
    print("No output files found yet")

No output files found yet


### Reprocess failed files

In [17]:
def reprocess_failed(previous_results):
    """Run the pipeline again for every entry of ``previous_results`` that failed.

    Args:
        previous_results: Result dicts with at least 'filename' and 'success'.

    Returns:
        The new result dicts, in the order the failed files were retried;
        an empty list when nothing had failed.
    """
    failed_files = [entry['filename'] for entry in previous_results if not entry['success']]

    if len(failed_files) == 0:
        print("No failed files to reprocess")
        return []

    print(f"Reprocessing {len(failed_files)} failed PDFs...")

    retry_results = []
    for filename in tqdm(failed_files, desc="Retrying"):
        # Results only carry the file name, so rebuild the path from PDF_DIR.
        outcome = process_single_pdf(os.path.join(PDF_DIR, filename), filename)
        retry_results.append(outcome)

        if not outcome['success']:
            print(f"  ✗ {filename} - Failed again")
        else:
            print(f"  ✓ {filename} - Success on retry")

    return retry_results

# Retry only when an earlier cell defined a non-empty `results`
# (at cell top level, locals() is the notebook namespace).
if 'results' in locals() and results:
    retry_results = reprocess_failed(results)

In [18]:
def query_supabase_sample():
    """Print up to five rows of SUPABASE_TABLE as indented JSON.

    Prints a notice and returns when no Supabase client is available; query
    errors are printed rather than raised. Always returns None.
    """
    if supabase_client is None:
        print("Supabase client not initialized")
        return

    try:
        sample_query = supabase_client.table(SUPABASE_TABLE).select("*").limit(5)
        response = sample_query.execute()
        header = f"Sample records from Supabase ({SUPABASE_TABLE}):\n"
        print(header)
        print(json.dumps(response.data, indent=2))
    except Exception as e:
        print(f"Error querying Supabase: {e}")

query_supabase_sample()

Supabase client not initialized


In [19]:
def analyze_results(results):
    """Print a breakdown of a batch run.

    Reports failures per stage (most frequent first), the number of results
    whose error message mentions 'Validation', and the overall success rate.
    Prints a notice and returns when ``results`` is empty. Returns None.
    """
    if not results:
        print("No results to analyze")
        return

    rule = "=" * 60
    print("\n" + rule)
    print("DETAILED ANALYSIS")
    print(rule + "\n")

    # Stage failures: count unsuccessful results per recorded stage.
    stage_failures = {}
    for entry in results:
        if entry['success'] or not entry['stage']:
            continue
        stage_failures[entry['stage']] = stage_failures.get(entry['stage'], 0) + 1

    if stage_failures:
        print("Failures by stage:")
        # Negated count sorts descending while keeping ties in first-seen order.
        for stage, count in sorted(stage_failures.items(), key=lambda pair: -pair[1]):
            print(f"  {stage}: {count}")

    # Validation stats (these entries may still be marked successful)
    validation_count = sum(
        1 for entry in results if entry['error'] and 'Validation' in entry['error']
    )
    print(f"\nValidation issues: {validation_count}")

    # Success rate
    passed = sum(1 for entry in results if entry['success'])
    success_rate = (passed / len(results)) * 100
    print(f"\nSuccess rate: {success_rate:.1f}%")

    print("\n" + rule)

# Analyze only when an earlier cell defined a non-empty `results`
# (at cell top level, locals() is the notebook namespace).
if 'results' in locals() and results:
    analyze_results(results)

In [20]:
def export_to_csv():
    """Export extracted valuations to CSV for easy viewing.

    Reads every ``*.json`` in OUTPUT_DIR except ``processing_summary.json``
    (in file-name order) and writes one row per file to
    ``valuations_summary.csv`` in the same directory. Missing, null or
    unexpectedly-typed nested values become empty cells instead of raising.
    A ``location`` stored as plain text is exported as-is.

    Prints a notice and returns without writing when there is nothing to
    export. Returns None.
    """
    import csv

    def _dig(node, *keys):
        """Follow ``keys`` through nested dicts; '' when the path breaks."""
        for key in keys:
            if not isinstance(node, dict):
                return ''
            node = node.get(key, '')
        return '' if node is None else node

    # Sorted so the CSV row order is the same on every run.
    json_files = sorted(Path(OUTPUT_DIR).glob("*.json"))
    json_files = [f for f in json_files if f.name != "processing_summary.json"]

    if not json_files:
        print("No JSON files to export")
        return

    csv_path = os.path.join(OUTPUT_DIR, "valuations_summary.csv")

    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['property_id', 'client', 'location', 'valuation_amount',
                     'currency', 'report_date', 'source_file']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()

        for json_file in json_files:
            # The pipeline writes UTF-8 with ensure_ascii=False, so read it
            # back as UTF-8 rather than the platform default encoding.
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            if not isinstance(data, dict):
                print(f"  ⚠ Skipping {json_file.name}: top-level JSON value is not an object")
                continue

            # The model does not always return location as an object.
            location = _dig(data, 'property_details', 'location')
            if isinstance(location, dict):
                area = _dig(location, 'area')
            elif isinstance(location, str):
                area = location
            else:
                area = ''

            row = {
                'property_id': _dig(data, 'property_id'),
                'client': _dig(data, 'valuation_report', 'client'),
                'location': area,
                'valuation_amount': _dig(data, 'valuations', 'current_market_value', 'amount'),
                'currency': _dig(data, 'valuations', 'current_market_value', 'currency'),
                'report_date': _dig(data, 'valuation_report', 'report_date'),
                'source_file': json_file.name
            }

            writer.writerow(row)

    print(f"✓ CSV exported to: {csv_path}")

export_to_csv()

No JSON files to export
