In [None]:
!apt-get update -y
!apt-get install -y --fix-missing poppler-utils
!apt-get update -qq
!apt-get install -y -qq poppler-utils
!pip install docling-parse docling-core rapidfuzz docling


0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Hit:2 https://cli.github.com/packages stable InRelease
Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [83.2 kB]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:8 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [9,411 kB]
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:11 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Hit:12 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:13 http://security.ubuntu.com/ubuntu jammy-security/restricted amd64

In [None]:
import gdown
import pandas as pd
import numpy as np

# Download the report PDF (hosted on Google Drive) into the current working directory.
# The extraction cell further down reads it from '/content/FY2023_Asset_Report.pdf',
# so this assumes the notebook's working directory is /content.
url = 'https://drive.google.com/uc?export=download&id=1QtmW3UhIMXDLQoI3wI7zH88SACFurjcR'
output = 'FY2023_Asset_Report.pdf'
gdown.download(url, output, quiet=False)

In [None]:
import os
import re
import subprocess
from typing import List, Tuple, Optional, Dict

import numpy as np
import pandas as pd

# =============================================================================
# Common Regular Expressions
# =============================================================================
# Whole-token date: slash form (1-2 digit / 1-2 digit / 2-4 digit) or dash form
# (4 digit - 1-2 digit - 1-2 digit). A separator is mandatory, so a bare run of
# digits such as 39681 never matches.
DATE_RE = re.compile(r'^(?:\d{1,2}/\d{1,2}/\d{2,4}|\d{4}-\d{1,2}-\d{1,2})$')
# Whole-token number: one optional leading '(', ')' or '-', then 1-3 digits followed
# by comma-separated groups of three digits, then an optional decimal part.
# A trailing ')' and an ungrouped run of 4+ digits (e.g. "1234") are NOT accepted.
# NOTE(review): not referenced by any function visible in this cell.
NUM_RE  = re.compile(r'^[()\-]?\d{1,3}(?:,\d{3})*(?:\.\d+)?$')

# Region names that may start an Asset Details row (case-insensitive, anchored at
# the start of the line and followed by a word boundary). "United" optionally
# followed by whitespace and "States" is accepted.
REGION_PREFIX = re.compile(
    r'^(Europe|Japan|Korea|Okinawa|Pacific|United(?:\s+States)?)\b',
    re.IGNORECASE
)

# Serial-number token: ASCII letters, digits and '-' only (case-insensitive).
# Purely numeric tokens also match this pattern.
SERIAL_RE = re.compile(r'(?i)^[a-z0-9-]+$')

# =============================================================================
# Helper: safe concat (filter out empty/all-NA DataFrames to avoid dtype warnings/behavior differences)
# =============================================================================
def _safe_concat(frames: List[pd.DataFrame]) -> pd.DataFrame:
    """Concatenate the usable frames of ``frames`` row-wise with a fresh index.

    A frame is skipped when it is ``None``, when it is empty, or when every one
    of its rows is entirely NA. An empty DataFrame is returned when nothing
    usable remains (including when ``frames`` itself is empty).
    """
    usable: List[pd.DataFrame] = [
        frame
        for frame in (frames or [])
        if frame is not None
        and not frame.empty
        and not frame.dropna(how='all').empty
    ]
    if not usable:
        return pd.DataFrame()
    return pd.concat(usable, ignore_index=True)

# =============================================================================
# A) —— Three Summary Tables (reuse code 1 logic; renamed to avoid conflicts)
# =============================================================================
def get_month_v1(page_text: str) -> Optional[str]:
    """Return the report month announced on a page as ``"Month YYYY"``.

    Looks for the first case-insensitive ``for month of <name> <4-digit year>``
    phrase. The month name is capitalised and joined to the year with a single
    space. Returns ``None`` when the phrase is absent.
    """
    found = re.search(r'for month of\s+([A-Za-z]+)\s+(\d{4})', page_text, re.IGNORECASE)
    if found is None:
        return None
    month_name, year = found.groups()
    return f"{month_name.capitalize()} {year}"


def parse_region_and_field_office_v1(lines: List[str]) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Parse the region summary and the field-office summary from one page.

    Args:
        lines: The lines of a single page of layout-preserving text. Cells are
            recognised as runs of text separated by two or more spaces.

    Returns:
        ``(region_df, field_df)``. Either frame is empty when its section is
        not found on the page.

        * ``region_df``: rows following the first line that starts with
          ``'Slots Only'``, up to the next section heading. Columns are
          ``Region, #Locations, Army, Navy, Marine_Corps, Airforce, Total,
          Percent`` (truncated to the widest row). Rows wider than that get
          additional ``Extra_<n>`` columns instead of raising.
        * ``field_df``: rows following an ``Assets/EGMs by Field Office``
          heading. Columns are ``Region, FO#, Location, Slots, ACM_CountR,
          ITC, FRS, Total``.
    """
    region_df = pd.DataFrame()
    field_df = pd.DataFrame()

    # ----- Region summary -----
    idx_slots = next((i for i, l in enumerate(lines) if l.strip().startswith('Slots Only')), None)
    if idx_slots is not None:
        region_rows: List[List] = []
        for raw in lines[idx_slots + 1:]:
            line = raw.strip()
            if not line:
                continue
            # Any of these headings marks the start of the next section.
            if (
                line.startswith('Locations by Service') or
                'Assets by Field Office' in line or
                'EGMs by Field Office' in line or
                'Installed Assets by Location' in line
            ):
                break
            parts = [p for p in re.split(r'\s{2,}', line) if p]
            if len(parts) < 5:
                continue
            values: List = []
            for p in parts[1:]:
                x = p.replace('%', '')
                if x in ('-', ''):
                    # A lone dash (or a bare '%') stands for "no value".
                    values.append(np.nan)
                    continue
                try:
                    values.append(float(x.replace(',', '')))
                except ValueError:
                    # Keep non-numeric cells verbatim rather than dropping the row.
                    values.append(x)
            region_rows.append([parts[0]] + values)
        if region_rows:
            # Pad short rows with NaN so every row has the same width.
            maxlen = max(len(r) for r in region_rows)
            for r in region_rows:
                r.extend([np.nan] * (maxlen - len(r)))
            cols = ['Region', '#Locations', 'Army', 'Navy', 'Marine_Corps', 'Airforce', 'Total', 'Percent']
            if maxlen > len(cols):
                # A row wider than the known header previously made the DataFrame
                # constructor raise; name the surplus cells generically instead.
                cols = cols + [f'Extra_{k}' for k in range(1, maxlen - len(cols) + 1)]
            region_df = pd.DataFrame(region_rows, columns=cols[:maxlen])

    # ----- Field office summary -----
    field_rows: List[List] = []
    start = False
    current_region: Optional[str] = None
    for raw in lines:
        line = raw.strip()
        if 'Assets by Field Office' in line or 'EGMs by Field Office' in line:
            start = True
            continue
        if not start:
            continue
        if not line or line.startswith('Slots'):
            continue
        if not re.search(r'\d', line):
            # A digit-free line is treated as the region caption for the rows below it.
            current_region = line
            continue
        parts = [p for p in re.split(r'\s{2,}', line) if p]
        if len(parts) >= 6 and all(re.match(r'^[()0-9,.-]+$', p) for p in parts[-5:]):
            fo_number = parts[0]
            location_name = ' '.join(parts[1:-5])
            numbers: List[float] = []
            for p in parts[-5:]:
                if p == '-':
                    numbers.append(np.nan)
                    continue
                v = p
                if v.startswith('(') and v.endswith(')'):
                    # Accounting notation: "(12)" means -12.
                    v = '-' + v[1:-1]
                try:
                    numbers.append(float(v.replace(',', '')))
                except ValueError:
                    # The character-class check above also lets through tokens such
                    # as "1-2" or "."; record them as missing instead of aborting.
                    numbers.append(np.nan)
            field_rows.append([current_region, fo_number, location_name] + numbers)
    if field_rows:
        field_df = pd.DataFrame(field_rows, columns=[
            'Region', 'FO#', 'Location', 'Slots', 'ACM_CountR', 'ITC', 'FRS', 'Total'
        ])
    return region_df, field_df


def parse_installed_assets_v1(lines: List[str]) -> pd.DataFrame:
    """Parse the "Installed Assets by Location" table from one page of text.

    Parsing starts on the line after the first one containing
    ``'Installed Assets by Location'`` (or both ``'FO #'`` and ``'IGT'``).
    Each data row yields the location name, an optional all-digit FO number,
    a location code, a service label, seven manufacturer counts, and the
    Tot_EGMs / FRS / ACM / ITC / Total figures. ``Total_Computed`` is the sum
    of the Tot_EGMs, FRS, ACM and ITC values that could be parsed.

    Returns an empty DataFrame when no row qualifies.
    """

    def to_float(tok: Optional[str]) -> Optional[float]:
        # '-', '' and None mean "no value"; anything unparsable is also None.
        if tok is None or tok in ('-', ''):
            return None
        try:
            return float(tok.replace(',', ''))
        except Exception:
            return None

    header = [
        'LocationName', 'FO#', 'Loc', 'Svc',
        'NOV', 'AIN', 'IGT', 'WMS', 'BAL', 'KON', 'ITE',
        'Tot_EGMs', 'FRS', 'ACM', 'ITC', 'Total_PDF', 'Total_Computed'
    ]
    records: List[List] = []
    in_table = False

    for raw in lines:
        if not in_table:
            # The heading line itself is never parsed as data.
            in_table = ('Installed Assets by Location' in raw) or ('FO #' in raw and 'IGT' in raw)
            continue
        if not raw.strip() or not re.search(r'\d', raw):
            continue
        # Skip repeated column-header lines.
        if 'Tot/EGMs' in raw or ('NOV' in raw and 'AIN' in raw):
            continue

        cells = [c.strip() for c in re.split(r'\s{2,}', raw) if c.strip()]
        if len(cells) < 5:
            continue

        name, rest = cells[0], cells[1:]
        fo_number: Optional[str] = None
        if rest and re.match(r'^\d+$', rest[0]):
            fo_number, rest = rest[0], rest[1:]
        loc = rest[0] if rest else ''
        svc = rest[1] if len(rest) > 1 else ''
        # The service label must contain at least one letter.
        if not re.search(r'[A-Za-z]', svc):
            continue

        metric_tokens: List[str] = [tok for cell in rest[2:] for tok in cell.split()]
        if len(metric_tokens) < 8:
            continue

        manuf_vals = [to_float(tok) for tok in metric_tokens[:7]]
        tot_egms = to_float(metric_tokens[7])
        frs = to_float(metric_tokens[8]) if len(metric_tokens) > 8 else None
        acm = to_float(metric_tokens[9]) if len(metric_tokens) > 9 else None

        # Whatever follows ACM: the last token is the printed total, and the first
        # parsable token before it is taken as ITC.
        trailing = metric_tokens[10:]
        total_pdf = to_float(trailing[-1]) if trailing else None
        itc = next(
            (val for val in map(to_float, trailing[:-1]) if val is not None),
            None,
        )

        total_computed = sum(x for x in [tot_egms, frs, acm, itc] if x is not None)
        records.append(
            [name, fo_number, loc, svc]
            + manuf_vals
            + [tot_egms, frs, acm, itc, total_pdf, total_computed]
        )

    if not records:
        return pd.DataFrame()
    return pd.DataFrame(records, columns=header)


def extract_region_field_installed_v1(pdf_path: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Extract the three summary tables (region, field office, installed assets).

    The PDF is converted to layout-preserving text once with ``pdftotext``,
    split into pages on form feeds, and each page is routed to the matching
    parser based on the section headings it contains. The most recently seen
    "for month of ..." label is carried forward and attached as a ``Month``
    column to every frame parsed while a month is known.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        ``(region_df, field_df, installed_df)``; each is empty when nothing
        was parsed for it.

    Raises:
        FileNotFoundError: ``pdftotext`` is not installed.
        subprocess.CalledProcessError: ``pdftotext`` exited with an error.
    """
    # Decode explicitly as UTF-8 (the same decoding extract_detailed_tables uses)
    # instead of relying on the locale's preferred encoding via text=True.
    text = subprocess.check_output(['pdftotext', '-layout', pdf_path, '-'], encoding='utf-8')
    pages = text.split('\f')
    region_frames: List[pd.DataFrame] = []
    field_frames: List[pd.DataFrame] = []
    installed_frames: List[pd.DataFrame] = []
    current_month: Optional[str] = None

    for page_text in pages:
        if not page_text.strip():
            continue
        month = get_month_v1(page_text)
        if month:
            current_month = month

        lines = page_text.split('\n')
        page_upper = page_text.upper()
        has_region = ('ASSETS BY REGION' in page_upper) or ('EGMS BY REGION' in page_upper)
        has_field  = ('ASSETS BY FIELD OFFICE' in page_upper) or ('EGMS BY FIELD OFFICE' in page_upper)
        has_inst   = ('INSTALLED ASSETS BY LOCATION' in page_upper)

        if has_region or has_field:
            rdf, fdf = parse_region_and_field_office_v1(lines)
            if not rdf.empty:
                if current_month:
                    rdf = rdf.assign(Month=current_month)
                region_frames.append(rdf)
            if not fdf.empty:
                if current_month:
                    fdf = fdf.assign(Month=current_month)
                field_frames.append(fdf)

        if has_inst:
            inst_df = parse_installed_assets_v1(lines)
            if not inst_df.empty:
                if current_month:
                    inst_df = inst_df.assign(Month=current_month)
                installed_frames.append(inst_df)

    # _safe_concat drops empty / all-NA frames before concatenating.
    region_df    = _safe_concat(region_frames)
    field_df     = _safe_concat(field_frames)
    installed_df = _safe_concat(installed_frames)
    return region_df, field_df, installed_df


# =============================================================================
# B) —— Month Detection (enhanced)
# =============================================================================
def detect_month_map(pages: List[str]) -> Dict[int, str]:
    """Map 1-based page numbers to the report month they belong to.

    A page starts a new month the first time a given "for month of <name>
    <year>" label appears on it. The month then applies to that page and every
    following page, up to the page before the next new month (or the last
    page). Pages before the first detected month are absent from the result.

    Labels are normalised to ``"Month YYYY"`` (capitalised name, single
    space). This is the same form ``get_month_v1`` produces, so differently
    spaced or cased occurrences of one month are recognised as the same month.

    Args:
        pages: Page texts in document order.

    Returns:
        ``{page_number: month_label}``; empty when no label is found.
    """
    patterns = [
        re.compile(r'Assets by Region,\s*Service\s+for month of\s+([A-Za-z]+\s+\d{4})', re.I),
        re.compile(r'EGMs by Region,\s*Service\s+for month of\s+([A-Za-z]+\s+\d{4})', re.I),
        re.compile(r'for month of\s+([A-Za-z]+\s+\d{4})', re.I),  # fallback
    ]
    month_start_pages: List[Tuple[int, str]] = []
    seen: set = set()
    for page_no, page in enumerate(pages, start=1):
        for pat in patterns:
            m = pat.search(page)
            if not m:
                continue
            # The capture is "<letters><whitespace><4 digits>", so it always
            # splits into exactly two pieces.
            name, year = m.group(1).split()
            month = f"{name.capitalize()} {year}"
            if month not in seen:
                month_start_pages.append((page_no, month))
                seen.add(month)
            break  # only the first matching pattern is used for a page

    month_map: Dict[int, str] = {}
    for idx, (start, month) in enumerate(month_start_pages):
        if idx + 1 < len(month_start_pages):
            end = month_start_pages[idx + 1][0] - 1
        else:
            end = len(pages)
        for p in range(start, end + 1):
            month_map[p] = month
    return month_map


# =============================================================================
# C) —— Detailed Tables (fixing date-format issues)
# =============================================================================
def parse_asset_details_page(page_text: str) -> List[Dict[str, str]]:
    """Parse "Asset Details" rows from one page of layout text.

    Only lines that start with a region name accepted by ``REGION_PREFIX`` and
    contain at least eight whitespace-separated tokens are considered. Tokens
    are consumed left to right:

    * Region (``"United States"`` is kept together as one two-word region),
      FONUM, FOSHORT (everything up to the first all-digit token), Loc (that
      token), LNAME (everything up to the first 4-6 digit token), Asset (that
      token) and Class (the token after it).
    * The rest is split around the first run of valid dates: the token before
      the run is Type, the tokens before that are Desc, the dates fill
      Acquire / Effective / Disposed in order, and the token after the run is
      SerialNum when it matches ``SERIAL_RE``.
    * When there is no valid date but a 5-6 digit token (for example an
      Excel-serial-like ``39681``), the same layout is assumed but the date
      fields stay empty.
    * From what remains, the last two 1-3 digit tokens are Age and
      Years_in_Storage.

    ``Disposed``, ``Age`` and ``Years_in_Storage`` are only present in a record
    when a value was found. Rows that raise during parsing are skipped.

    Args:
        page_text: Text of one page.

    Returns:
        One dict per parsed row, in page order.
    """
    out: List[Dict[str, str]] = []

    def is_valid_date(s: str) -> bool:
        """Return True when ``s`` is a whole-token date accepted by DATE_RE.

        DATE_RE requires '/' or '-' separators, so digit-only tokens such as
        39681 are rejected by the pattern itself.
        """
        return bool(s) and DATE_RE.match(s) is not None

    for raw in page_text.split('\n'):
        line = raw.strip()
        if not line:
            continue
        if not REGION_PREFIX.match(line):
            continue
        toks = line.split()
        if len(toks) < 8:
            continue

        try:
            # --- Fixed fields on the left side ---
            # REGION_PREFIX also accepts the two-word region "United States";
            # keep both words together so FONUM and the later fields are not
            # shifted by one token.
            if toks[0].lower() == 'united' and toks[1].lower() == 'states':
                region = ' '.join(toks[:2])
                fo_idx = 2
            else:
                region = toks[0]
                fo_idx = 1
            fonum = toks[fo_idx]
            i = fo_idx + 1
            while i < len(toks) and not toks[i].isdigit():
                i += 1
            foshort = ' '.join(toks[fo_idx + 1:i]).strip()
            if i >= len(toks) or not toks[i].isdigit():
                continue
            loc = toks[i]
            i += 1
            j = i
            # The asset number is the first token made of 4-6 digits.
            while j < len(toks) and not re.fullmatch(r'\d{4,6}', toks[j]):
                j += 1
            if j >= len(toks):
                continue
            lname = ' '.join(toks[i:j]).strip()
            asset = toks[j]
            j += 1
            if j >= len(toks):
                continue
            clazz = toks[j]
            j += 1

            remaining = toks[j:]

            # Positions (within `remaining`) of tokens that are valid dates.
            date_rel_idx = [k for k, t in enumerate(remaining) if is_valid_date(t)]

            desc = ''
            type_tok = ''
            acquire = ''
            effective = ''
            disposed = ''
            serial = ''
            age = ''
            years_in_storage = ''

            if date_rel_idx:
                # The token right before the first date is the Type.
                first_rel = date_rel_idx[0]
                if first_rel > 0:
                    type_tok = remaining[first_rel - 1]
                # Collect the run of consecutive valid dates.
                rel_i = first_rel
                dates: List[str] = []
                while rel_i < len(remaining) and is_valid_date(remaining[rel_i]):
                    dates.append(remaining[rel_i])
                    rel_i += 1
                acquire = dates[0] if len(dates) > 0 else ''
                effective = dates[1] if len(dates) > 1 else ''
                disposed = dates[2] if len(dates) > 2 else ''
                # Serial: the token right after the date run, if it fits SERIAL_RE.
                # NOTE(review): SERIAL_RE also matches plain numbers, so when the
                # serial is missing the first trailing number lands here.
                if rel_i < len(remaining) and SERIAL_RE.match(remaining[rel_i]):
                    serial = remaining[rel_i]
                    rel_i += 1
                # Desc: everything before the Type token (or before the date).
                desc_end = first_rel - 1 if type_tok else first_rel
                desc = ' '.join(remaining[:desc_end]).strip()
                tail = remaining[rel_i:]
            else:
                # No valid date: look for 5-6 digit pseudo dates such as 39681.
                pseudo_date_idx = [k for k, t in enumerate(remaining) if re.match(r'^\d{5,6}$', t)]

                if pseudo_date_idx:
                    # Same layout as above, but the date fields are left empty.
                    first_pseudo = pseudo_date_idx[0]
                    if first_pseudo > 0:
                        type_tok = remaining[first_pseudo - 1]
                    # Skip the run of consecutive pseudo-date tokens.
                    rel_i = first_pseudo
                    while rel_i < len(remaining) and re.match(r'^\d{5,6}$', remaining[rel_i]):
                        rel_i += 1
                    if rel_i < len(remaining) and SERIAL_RE.match(remaining[rel_i]):
                        serial = remaining[rel_i]
                        rel_i += 1
                    desc_end = first_pseudo - 1 if type_tok else first_pseudo
                    desc = ' '.join(remaining[:desc_end]).strip()
                    tail = remaining[rel_i:]
                else:
                    # Nothing date-like at all: the whole remainder is the
                    # description, unless the row ends with 'Store'.
                    desc = ' '.join(remaining).strip()
                    if remaining and remaining[-1].lower() == 'store':
                        type_tok = remaining[-1]
                        if len(remaining) > 1 and SERIAL_RE.match(remaining[-2]):
                            serial = remaining[-2]
                            desc = ' '.join(remaining[:-2]).strip()
                        else:
                            desc = ' '.join(remaining[:-1]).strip()
                    tail = []

            # Tail: the last two short (1-3 digit) numbers are Age and
            # Years_in_Storage; longer numbers are ignored so that date-like
            # values are not mistaken for them.
            if tail:
                ints = [t for t in tail if re.match(r'^\d{1,3}$', t)]
                if len(ints) >= 1:
                    years_in_storage = ints[-1]
                if len(ints) >= 2:
                    age = ints[-2]

            rec = {
                'Region': region, 'FONUM': fonum, 'FOSHORT': foshort, 'Loc': loc, 'LNAME': lname,
                'Asset': asset, 'Class': clazz, 'Desc': desc, 'Type': type_tok,
                'Acquire': acquire, 'Effective': effective, 'SerialNum': serial,
            }
            if disposed:
                rec['Disposed'] = disposed
            if age:
                rec['Age'] = age
            if years_in_storage:
                rec['Years_in_Storage'] = years_in_storage
            out.append(rec)
        except Exception:
            # Deliberate best effort: a malformed row is skipped, not fatal.
            continue

    return out


def parse_floor_details_page(page_text: str) -> List[Dict[str, str]]:
    """Parse "Floor Asset Details" rows from one page of layout text.

    A row is a line that starts (after optional whitespace) with a digit,
    contains the word ``Floor`` and splits into at least six cells on runs of
    two or more spaces. Cells are read in this order: ``Loc Place``, Region,
    Service, ``Asset SerialNum``, ``Type Desc``, up to three leading dates
    (Acquire, Effective, Disposed), Class, MFG, LNAME, ``FONUM FOSHORT``, Cat,
    Year, then Age and/or a trailing ReportDate.

    The leading dates are read as one contiguous run: once a date has been
    seen, the first cell without any date ends the run. This keeps the
    trailing ReportDate from being mistaken for a missing Effective/Disposed
    date, which would also swallow every cell in between.

    Args:
        page_text: Text of one page.

    Returns:
        One dict per parsed row; missing fields are empty strings.
    """
    rows: List[Dict[str, str]] = []
    for line in page_text.split('\n'):
        if not line.strip():
            continue
        if not (re.match(r'^\s*\d', line) and 'Floor' in line):
            continue
        parts = [p.strip() for p in re.split(r'\s{2,}', line) if p.strip()]
        if len(parts) < 6:
            continue

        # Cell 0: "<Loc> <Place...>"
        first = parts[0]
        toks = first.split(None, 1)
        loc = toks[0]
        place = toks[1] if len(toks) > 1 else ''
        region = parts[1] if len(parts) > 1 else ''
        svc = parts[2] if len(parts) > 2 else ''

        # Cell 3: "<Asset> <SerialNum...>"
        asset = ''
        serial = ''
        if len(parts) > 3:
            asset_serial = parts[3].split(None, 1)
            asset = asset_serial[0]
            serial = asset_serial[1] if len(asset_serial) > 1 else ''

        # Cell 4: "<Type> <Desc...>"
        type_ = ''
        desc = ''
        i = 4
        if i < len(parts):
            td = parts[i].split()
            type_ = td[0] if td else ''
            desc = ' '.join(td[1:]) if len(td) > 1 else ''
            i += 1

        # Leading dates. A cell may hold several tokens; non-date tokens that
        # share a cell with a date are kept in `leftover` and feed Class/MFG.
        acquire = ''
        effective = ''
        disposed = ''
        dates: List[str] = []
        leftover: List[str] = []
        while i < len(parts) and len(dates) < 3:
            sub_tokens = parts[i].split()
            has_date = any(DATE_RE.match(st) for st in sub_tokens)
            if dates and not has_date:
                # The run of date cells is over; the next cells are Class/MFG.
                break
            for st in sub_tokens:
                if DATE_RE.match(st):
                    dates.append(st)
                else:
                    leftover.append(st)
            i += 1
        if len(dates) > 0:
            acquire = dates[0]
        if len(dates) > 1:
            effective = dates[1]
        if len(dates) > 2:
            disposed = dates[2]

        # Class and MFG: the first two tokens after the dates, which may be
        # spread over one or two cells.
        clazz = ''
        mfg = ''
        class_mfg_parts: List[str] = leftover.copy()
        while i < len(parts) and len(' '.join(class_mfg_parts).split()) < 2:
            class_mfg_parts.append(parts[i])
            i += 1
        cm = ' '.join(class_mfg_parts).split()
        if cm:
            clazz = cm[0]
        if len(cm) > 1:
            mfg = cm[1]

        lname = parts[i] if i < len(parts) else ''
        i += 1

        # "<FONUM> <FOSHORT...>"
        fonum = ''
        foshort = ''
        if i < len(parts):
            fo_parts = parts[i].split(None, 1)
            fonum = fo_parts[0]
            foshort = fo_parts[1] if len(fo_parts) > 1 else ''
            i += 1

        cat = parts[i] if i < len(parts) else ''
        i += 1
        year = parts[i] if i < len(parts) else ''
        i += 1

        # Whatever is left is Age and/or a trailing ReportDate.
        age = ''
        report_date = ''
        if i < len(parts):
            remaining = parts[i:]
            if len(remaining) == 1:
                token = remaining[0]
                if DATE_RE.match(token):
                    report_date = token
                else:
                    age = token
            else:
                age_candidate = remaining[0]
                last_token = remaining[-1]
                if DATE_RE.match(last_token):
                    report_date = last_token
                    age = age_candidate
                else:
                    age = ' '.join(remaining)

        rows.append({
            'Loc': loc, 'Place': place, 'Region': region, 'Service': svc, 'Asset': asset,
            'SerialNum': serial, 'Type': type_, 'Desc': desc, 'Acquire': acquire,
            'Effective': effective, 'Disposed': disposed, 'Class': clazz, 'MFG': mfg,
            'LNAME': lname, 'FONUM': fonum, 'FOSHORT': foshort, 'Cat': cat,
            'Year': year, 'Age': age, 'ReportDate': report_date
        })
    return rows


def parse_site_line(row: str) -> Optional[Dict[str, str]]:
    """Turn one stitched "site operational status" row into a field dict.

    Expected token order: ``Loc LNAME... Place Open [Closed] KSI CmtyNum
    Cmty... SVC FONUM FOSHORT...``. The first date token anchors the row. The
    service is the first ``Army`` / ``Navy`` / ``Air`` / ``Marine Corps``
    token found after CmtyNum; ``Air Force`` and ``Marine Corps`` are kept
    together as two-word services.

    Args:
        row: A single logical row (already joined from its physical lines).

    Returns:
        A dict with keys ``Loc, LNAME, Place, Open, Closed, KSI, CmtyNum,
        Cmty, SVC, FONUM, FOSHORT``. Returns ``None`` when the row does not
        start with an all-digit token, has no date, has no service token, or
        has nothing after the service.
    """
    toks = row.split()
    if not toks or not toks[0].isdigit():
        return None

    # The first date token is the Open date; everything before it is Loc/LNAME/Place.
    open_idx = next((i for i in range(1, len(toks)) if DATE_RE.match(toks[i])), None)
    if open_idx is None or open_idx < 2:
        return None

    loc = toks[0]
    place = toks[open_idx - 1]
    lname = ' '.join(toks[1:open_idx - 1])

    open_date = toks[open_idx]
    # A second date immediately after Open is the Closed date.
    closed_date = toks[open_idx + 1] if open_idx + 1 < len(toks) and DATE_RE.match(toks[open_idx + 1]) else ''
    base = open_idx + (2 if closed_date else 1)
    ksi = toks[base] if base < len(toks) else ''
    cmty_num = toks[base + 1] if base + 1 < len(toks) else ''

    def _followed_by(idx: int, word: str) -> bool:
        # True when the token right after position `idx` equals `word`.
        return idx + 1 < len(toks) and toks[idx + 1] == word

    svc_idx = None
    for j in range(base + 2, len(toks)):
        if toks[j] in {'Army', 'Navy', 'Air'} or (toks[j] == 'Marine' and _followed_by(j, 'Corps')):
            svc_idx = j
            break
    if svc_idx is None:
        return None

    cmty = ' '.join(toks[base + 2:svc_idx])
    if toks[svc_idx] == 'Marine' and _followed_by(svc_idx, 'Corps'):
        svc = 'Marine Corps'
        fonum_start = svc_idx + 2
    elif toks[svc_idx] == 'Air' and _followed_by(svc_idx, 'Force'):
        # Without this, "Force" was taken as the FO number and the real
        # number slid into FOSHORT.
        svc = 'Air Force'
        fonum_start = svc_idx + 2
    else:
        svc = toks[svc_idx]
        fonum_start = svc_idx + 1

    if fonum_start >= len(toks):
        return None
    fonum = toks[fonum_start]
    foshort = ' '.join(toks[fonum_start + 1:]) if fonum_start + 1 < len(toks) else ''

    return {
        'Loc': loc, 'LNAME': lname, 'Place': place,
        'Open': open_date, 'Closed': closed_date, 'KSI': ksi,
        'CmtyNum': cmty_num, 'Cmty': cmty, 'SVC': svc,
        'FONUM': fonum, 'FOSHORT': foshort,
    }


def parse_site_status_page(page_text: str) -> List[Dict[str, str]]:
    """Parse the site-status rows of one page.

    Physical lines are first stitched into logical rows: a line beginning
    (after optional whitespace) with digits opens a new row, and every other
    non-blank line is appended to the row currently open. Lines that appear
    before the first row are dropped. Each stitched row is then handed to
    ``parse_site_line``; rows it rejects are omitted.
    """
    stitched: List[List[str]] = []
    for physical in page_text.split('\n'):
        if not physical.strip():
            continue
        text = physical.rstrip()
        if re.match(r'^\s*\d+\b', text):
            # A leading number opens a new logical row.
            stitched.append([text.strip()])
        elif stitched:
            # Continuation of the row currently being built.
            stitched[-1].append(text.strip())

    parsed = (parse_site_line(' '.join(chunk)) for chunk in stitched)
    return [rec for rec in parsed if rec]


# =============================================================================
# D) —— Detailed Extraction (includes Site/Years; fixes empty tables + dtype-compatible casting)
# =============================================================================
def _pivot_age_by_storage(frame: pd.DataFrame) -> pd.DataFrame:
    """Count assets per (Age_int, Years_in_Storage_int) and flatten the result.

    Returns a frame whose first column is ``Age`` and whose other columns are
    named ``Storage_<years>``.
    """
    pt = pd.pivot_table(
        frame,
        index='Age_int',
        columns='Years_in_Storage_int',
        values='Asset',
        aggfunc='count',
        fill_value=0
    )
    if isinstance(pt, pd.Series):
        pt = pt.to_frame()
    pt = pt.reset_index().rename(columns={'Age_int': 'Age'})
    pt.columns = ['Age'] + [f'Storage_{c}' for c in pt.columns[1:]]
    return pt


def extract_detailed_tables(pdf_path: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Extract the detailed tables and the years-in-storage pivot from the PDF.

    The PDF is converted to layout text with ``pdftotext``, split into pages
    on form feeds, and every page is offered to the asset, floor and site
    parsers. The month from ``detect_month_map`` is carried forward across
    pages and attached to each record as ``Month`` once a month is known.
    Exact duplicate records are dropped.

    The pivot counts assets with a numeric age of at most 50 by age and years
    in storage, per month when a ``Month`` column exists. When no years
    column is present, ``Years_in_Storage`` is created with 0. When no
    ``Age`` column is present, every age is treated as missing, so the pivot
    holds no counts.

    Side effect on the returned ``asset_df``: helper columns ``Age_int`` and
    ``Years_in_Storage_int`` are added.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        ``(asset_df, floor_df, site_df, years_storage_df)``.

    Raises:
        FileNotFoundError: ``pdftotext`` is not installed.
        subprocess.CalledProcessError: ``pdftotext`` exited with an error.
    """
    text = subprocess.check_output(['pdftotext', '-layout', pdf_path, '-']).decode('utf-8')
    pages = text.split('\f')
    month_map = detect_month_map(pages)

    asset_data: List[Dict[str, str]] = []
    floor_data: List[Dict[str, str]] = []
    site_data: List[Dict[str, str]] = []

    current_month: Optional[str] = None
    for p_num, page_text in enumerate(pages, start=1):
        if not page_text.strip():
            continue
        month = month_map.get(p_num)
        if month:
            current_month = month

        # Every parser sees every page; each one ignores lines it does not recognise.
        for parser, sink in (
            (parse_asset_details_page, asset_data),
            (parse_floor_details_page, floor_data),
            (parse_site_status_page, site_data),
        ):
            for rec in parser(page_text):
                if current_month:
                    rec['Month'] = current_month
                sink.append(rec)

    asset_df = pd.DataFrame(asset_data).drop_duplicates()
    floor_df = pd.DataFrame(floor_data).drop_duplicates()
    site_df  = pd.DataFrame(site_data).drop_duplicates()

    # ---- Years in storage pivot ----
    if asset_df.empty:
        return asset_df, floor_df, site_df, pd.DataFrame()

    # Locate the years column under any of its accepted spellings.
    yrs_col = next(
        (c for c in asset_df.columns
         if c.lower() in {'years_in_storage', 'years', 'yrs', 'yrs_in_storage'}),
        None,
    )
    if yrs_col is None:
        asset_df['Years_in_Storage'] = 0
        yrs_col = 'Years_in_Storage'

    # 'Age' only exists when at least one record carried an age. Without this
    # guard the lookup yields None and the casts below fail.
    if 'Age' in asset_df.columns:
        age_num = pd.to_numeric(asset_df['Age'], errors='coerce')
    else:
        age_num = pd.Series(np.nan, index=asset_df.index, dtype='float64')
    yis_num = pd.to_numeric(asset_df[yrs_col], errors='coerce').fillna(0)

    # Prefer the nullable integer dtype; fall back to plain float/int if the cast fails.
    try:
        INT64_NULLABLE = pd.Int64Dtype()
        asset_df['Age_int'] = age_num.astype(INT64_NULLABLE)
        asset_df['Years_in_Storage_int'] = yis_num.astype(INT64_NULLABLE)
    except Exception:
        asset_df['Age_int'] = age_num.astype('float64')
        asset_df['Years_in_Storage_int'] = yis_num.astype('int64')

    filt = asset_df['Age_int'].notna() & (asset_df['Age_int'] <= 50)
    pivot_tables: List[pd.DataFrame] = []

    if 'Month' in asset_df.columns:
        # One pivot per month, each tagged with its month label.
        for mth, grp in asset_df[filt].groupby('Month', dropna=False):
            pt = _pivot_age_by_storage(grp)
            if mth is not None:
                pt.insert(0, 'Month', mth)
            pivot_tables.append(pt)
    else:
        # No month information: a single pivot over all filtered rows.
        pivot_tables.append(_pivot_age_by_storage(asset_df[filt]))

    years_storage_df = pd.concat(pivot_tables, ignore_index=True) if pivot_tables else pd.DataFrame()
    return asset_df, floor_df, site_df, years_storage_df


# =============================================================================
# E) —— Output / Runner
# =============================================================================
def save_outputs(outdir: str,
                 region_df: pd.DataFrame,
                 field_df: pd.DataFrame,
                 installed_df: pd.DataFrame,
                 asset_df: pd.DataFrame,
                 floor_df: pd.DataFrame,
                 site_df: pd.DataFrame,
                 years_storage_df: pd.DataFrame) -> None:
    """Write each non-empty table to its own CSV file inside ``outdir``.

    ``outdir`` is created when missing. Empty frames are skipped, so no file
    is produced (or overwritten) for them. Files are written without the
    index column.
    """
    os.makedirs(outdir, exist_ok=True)
    # (file name, frame) pairs in the order the files are written.
    targets = (
        ('assets_by_region_service.csv', region_df),
        ('assets_by_field_office.csv', field_df),
        ('installed_assets_location_manufacture.csv', installed_df),
        ('asset_details.csv', asset_df),
        ('floor_asset_details.csv', floor_df),
        ('site_operational_status.csv', site_df),
        ('years_in_storage.csv', years_storage_df),
    )
    for filename, frame in targets:
        if frame.empty:
            continue
        frame.to_csv(os.path.join(outdir, filename), index=False)


def run_in_notebook(pdf_path: str, outdir: str = '.'):
    """Run the whole extraction for one PDF and report what was produced.

    The three summary tables are extracted first, then the four detailed
    tables (asset, floor, site, years-in-storage). All of them are passed to
    ``save_outputs`` with ``outdir``. A one-line size summary is printed for
    every non-empty table.
    """
    region_df, field_df, installed_df = extract_region_field_installed_v1(pdf_path)
    asset_df, floor_df, site_df, years_storage_df = extract_detailed_tables(pdf_path)

    save_outputs(outdir, region_df, field_df, installed_df, asset_df, floor_df, site_df, years_storage_df)
    print('Extraction complete.')

    # Label -> table, in reporting order.
    report = {
        'Region': region_df,
        'FieldOffice': field_df,
        'Installed': installed_df,
        'AssetDetails': asset_df,
        'FloorDetails': floor_df,
        'SiteOperational': site_df,
        'YearsInStorage': years_storage_df,
    }
    for name, df in report.items():
        if df.empty:
            continue
        print(f'[{name}] rows={len(df):,}, cols={len(df.columns)}')

# Run the full extraction on the report at /content/FY2023_Asset_Report.pdf and
# write the resulting CSV files to /content.
run_in_notebook('/content/FY2023_Asset_Report.pdf', '/content')


  return pd.concat(kept, ignore_index=True) if kept else pd.DataFrame()


Extraction complete.
[Region] rows=48, cols=9
[FieldOffice] rows=300, cols=9
[Installed] rows=888, cols=18
[AssetDetails] rows=4,911, cols=17
[FloorDetails] rows=23,303, cols=21
[SiteOperational] rows=994, cols=12
[YearsInStorage] rows=125, cols=56
