In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import PyPDF2
import re
import pandas as pd
from pathlib import Path
import sys

# ------------------------------------------------------------------
# CONFIGURATION
# ------------------------------------------------------------------
# Folder the script was originally written against. It is kept as the first
# choice so behaviour is unchanged wherever that folder exists.
_LEGACY_DOWNLOADS_DIR = Path(r"C:\Users\steph\Downloads")

# Folder scanned for report PDFs. Falls back to the current user's own
# Downloads folder when the legacy path is absent, so the script is usable
# from other accounts and machines instead of always finding zero files.
DOWNLOADS_DIR = (
    _LEGACY_DOWNLOADS_DIR
    if _LEGACY_DOWNLOADS_DIR.is_dir()
    else Path.home() / "Downloads"
)
# Workbook that receives one row per successfully parsed report.
EXCEL_OUTPUT = DOWNLOADS_DIR / "EFFIS_All_Fires_Compiled.xlsx"
# Glob pattern (relative to DOWNLOADS_DIR) selecting the report PDFs.
PDF_PATTERN = "SingleFirereport-*.pdf"

print(f"Scanning: {DOWNLOADS_DIR}")
print(f"Pattern : {PDF_PATTERN}\n")

# ------------------------------------------------------------------
# 1. READ PDF
# ------------------------------------------------------------------
def extract_text(pdf_path: Path) -> str | None:
    try:
        with open(pdf_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            return "\n".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        print(f"   [ERROR] {pdf_path.name}: {e}")
        return None

# ------------------------------------------------------------------
# 2. PARSE ONE REPORT
# ------------------------------------------------------------------
def parse_report(text: str, filename: str) -> dict | None:
    if not text:
        return None

    # Normalise whitespace & camel-case
    t = re.sub(r'\s+', ' ', text)
    t = re.sub(r'([a-z])([A-Z])', r'\1 \2', t)
    t = t.replace('°', ' ')

    row = {
        'Source File': filename,
        'Fire ID': 'N/A',
        'Detected': 'N/A',
        'Updated': 'N/A',
        'Lat': 'N/A',
        'Lon': 'N/A',
        'Area (ha)': 'N/A',
        'Perimeter (km)': 'N/A',
        'Country': 'N/A',
        'Built-up Inside (m²)': 0,
        'Built-up Buffer (m²)': 0,
        'Population Inside': 0,
        'Population Buffer': 0,
        'Population 5km': 'N/A',
        'Pop Density 5km (pop/km²)': 'N/A',
        'Forest (ha)': 0.0,
        'Agricultural (ha)': 0.0,
        'Other Natural (ha)': 0.0,
        'Natura2000 (%)': 0.0,
        'Natura2000 (ha)': 0.0,
    }

    # --- Metadata ---
    meta_patterns = {
        'Fire ID': r'Fire ID\s*[:]\s*([^\s\n]+)',
        'Detected': r'Detected\s*[:]\s*([\d\- :]+)',
        'Updated': r'Updated\s*[:]\s*([\d\- :]+UTC)',
        'Lat': r'Location lat[^:]*[:]\s*([-\d.]+)',
        'Lon': r'Location lon[^:]*[:]\s*([-\d.]+)',
        'Area (ha)': r'Area\s*\(ha\)\s*[:]\s*([-\d.]+)',
        'Perimeter (km)': r'Perimeter\s*\(km\)\s*[:]\s*([-\d.]+)',
        'Country': r'Country\s*[:]\s*([A-Z]{3})',
    }
    for key, pat in meta_patterns.items():
        m = re.search(pat, t, re.IGNORECASE)
        if m:
            row[key] = m.group(1).strip()

    # --- Built-up ---
    built = re.search(r'Estimated built up area[^0-9]*(\d+)[^0-9]*(\d+)', t)
    if built:
        row['Built-up Inside (m²)'] = int(built.group(1))
        row['Built-up Buffer (m²)'] = int(built.group(2))

    # --- Population (total) ---
    pop = re.search(r'Estimated total population[^0-9]*(\d+)[^0-9]*(\d+)', t)
    if pop:
        row['Population Inside'] = int(pop.group(1))
        row['Population Buffer'] = int(pop.group(2))

    # --- 5 km radius ---
    pop5 = re.search(r'population in affected area is about ([\d,]+)', t, re.IGNORECASE)
    den5 = re.search(r'average population density of ([\d.]+) people/km²', t, re.IGNORECASE)
    row['Population 5km'] = pop5.group(1).replace(',', '') if pop5 else 'N/A'
    row['Pop Density 5km (pop/km²)'] = den5.group(1) if den5 else 'N/A'

    # --- Land Cover (ha) ---
    # Use raw strings to avoid \d warning
    lc_mapping = {
        'Forest broadLeaves': 'Forest (ha)',
        'Forest coniferous': 'Forest (ha)',
        'Forest mixed': 'Forest (ha)',
        'Agricultural areas': 'Agricultural (ha)',
        'Other natural areas': 'Other Natural (ha)',
    }

    for lc_name, target_col in lc_mapping.items():
        # r'' raw string → no escape warning
        m = re.search(rf'{re.escape(lc_name)}[^0-9]*[\d.]+\s+([\d.]+)', t)
        if m:
            row[target_col] += float(m.group(1))

    # --- Natura 2000 ---
    nat = re.search(r'Natura 2k areas[^0-9]*([\d.]+)[^0-9]*([\d.]+)', t)
    if nat:
        row['Natura2000 (%)'] = float(nat.group(1))
        row['Natura2000 (ha)'] = float(nat.group(2))

    return row

# ------------------------------------------------------------------
# 3. MAIN LOOP
# ------------------------------------------------------------------
# Sort the matches so the processing order, and therefore the row order in
# the exported workbook, does not depend on filesystem enumeration order.
pdf_files = sorted(DOWNLOADS_DIR.glob(PDF_PATTERN))
if not pdf_files:
    print(f"No files matching '{PDF_PATTERN}' found.")
    sys.exit(1)

print(f"Found {len(pdf_files)} report(s). Processing...\n")

# One dict per successfully parsed report.
rows = []
for i, pdf_path in enumerate(pdf_files, 1):
    print(f"[{i}/{len(pdf_files)}] {pdf_path.name}")
    text = extract_text(pdf_path)
    if text is None:
        # Read failure: extract_text has already printed the error.
        continue
    # An empty string (no extractable text) is passed on deliberately:
    # parse_report returns None for it, so the file is reported as failed
    # below instead of being skipped without any message.
    data = parse_report(text, pdf_path.name)
    if data:
        rows.append(data)
        print(f"   Success: {data['Country']} | {data['Area (ha)']} ha | "
              f"{data['Lat']}, {data['Lon']}")
    else:
        print("   [FAIL] Parsing failed.")

if not rows:
    print("\nNo data extracted.")
    sys.exit(1)

# ------------------------------------------------------------------
# 4. BUILD DATAFRAME & EXPORT
# ------------------------------------------------------------------
df = pd.DataFrame(rows)

# Order columns
col_order = [
    'Source File', 'Fire ID', 'Detected', 'Updated', 'Country',
    'Lat', 'Lon', 'Area (ha)', 'Perimeter (km)',
    'Built-up Inside (m²)', 'Built-up Buffer (m²)',
    'Population Inside', 'Population Buffer',
    'Population 5km', 'Pop Density 5km (pop/km²)',
    'Forest (ha)', 'Agricultural (ha)', 'Other Natural (ha)',
    'Natura2000 (%)', 'Natura2000 (ha)'
]
df = df[col_order]

# These fields arrive as strings captured from the report text. Written
# as-is they become text cells that cannot be summed or charted in Excel, so
# every value that parses as a number is converted; anything else (such as
# the 'N/A' placeholder) is kept unchanged.
text_numeric_cols = [
    'Lat', 'Lon', 'Area (ha)', 'Perimeter (km)',
    'Population 5km', 'Pop Density 5km (pop/km²)',
]
for col in text_numeric_cols:
    parsed = pd.to_numeric(df[col], errors='coerce')
    df[col] = [
        number if pd.notna(number) else original
        for number, original in zip(parsed, df[col])
    ]

try:
    df.to_excel(EXCEL_OUTPUT, index=False, engine='openpyxl')
    print(f"\nExcel saved: {EXCEL_OUTPUT}")
except Exception as e:
    # Typical causes are a missing openpyxl install or the workbook being
    # open in Excel; report the failure and exit with a non-zero status.
    print(f"\n[ERROR] Export failed: {e}")
    sys.exit(1)

# ------------------------------------------------------------------
# 5. SUMMARY
# ------------------------------------------------------------------
print("\n" + "="*60)
print("SUMMARY")
print("="*60)
# 'Area (ha)' may hold strings or the 'N/A' placeholder; coerce so that
# unparseable entries are ignored by the sum instead of raising.
total_area = pd.to_numeric(df['Area (ha)'], errors='coerce').sum()
total_forest = df['Forest (ha)'].sum()
total_agri = df['Agricultural (ha)'].sum()
# A missing country is stored as the string 'N/A', which dropna() does not
# remove, so the placeholder is filtered out explicitly.
countries = sorted(
    code for code in df['Country'].dropna().unique() if code != 'N/A'
)
# Number of reports with a non-zero Natura 2000 area.
natura_fires = df[df['Natura2000 (ha)'] > 0].shape[0]
print(f"Fires processed   : {len(df)}")
print(f"Total burned area : {total_area:.1f} ha")
print(f"Forest burned     : {total_forest:.1f} ha")
print(f"Agricultural      : {total_agri:.1f} ha")
print(f"Countries         : {', '.join(countries)}")
print(f"Natura 2000 fires : {natura_fires}")
print("="*60)

ImportError: DLL load failed while importing _multiarray_umath: The specified module could not be found.

Scanning: C:\Users\steph\Downloads
Pattern : SingleFirereport-*.pdf

Found 5 report(s). Processing...

[1/5] SingleFirereport-217884-rev_0.pdf
   Success: BGR | 1109 ha | 25.389579, 41.967216
[2/5] SingleFirereport-223385-rev_0.pdf
   Success: BGR | 2401 ha | 23.085883, 42.912783
[3/5] SingleFirereport-277499-rev_0.pdf
   Success: BGR | 2677 ha | 26.068463, 41.910791
[4/5] SingleFirereport-277844-rev_0.pdf
   Success: BGR | 1242 ha | 26.741844, 42.783496
[5/5] SingleFirereport-281009-rev_0.pdf
   Success: BGR | 1106 ha | 26.270351, 41.921013

Excel saved: C:\Users\steph\Downloads\EFFIS_All_Fires_Compiled.xlsx

SUMMARY
Fires processed   : 5
Total burned area : 8535.0 ha
Forest burned     : 0.0 ha
Agricultural      : 2436.0 ha
Countries         : BGR
Natura 2000 fires : 4
