In [1]:
!apt-get update -y
!apt-get install -y --fix-missing poppler-utils
!apt-get update -qq
!apt-get install -y -qq poppler-utils
!pip install docling-parse docling-core rapidfuzz docling


Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Hit:2 https://cli.github.com/packages stable InRelease
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:6 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:8 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [2,123 kB]
Get:9 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:10 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,825 kB]
Get:11 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]
Get:12 http://archive.ubuntu.com/ubuntu jammy-updates/restricted amd64 Packages [6,168 kB]
Hit:13 https://ppa.launchpadcontent.net/graphics-dri

In [2]:
import gdown
import pandas as pd
import numpy as np

# The yearly asset reports are hosted on Google Drive. Each entry maps the
# local filename to the Drive file id it is downloaded from.
REPORT_FILE_IDS = {
    'FY2020_Asset_Report.pdf': '1pAkcjieicAWNrMH5nSjWT8ak3P23vcIp',
    'FY2022_Asset_Report.pdf': '1KM8JnW5gaNNYn6eTYI1pRXDtQsKbFJHo',
    'FY2023_Asset_Report.pdf': '1QtmW3UhIMXDLQoI3wI7zH88SACFurjcR',
    'FY2024_Asset_Report.pdf': '10J0fzRZMbZfO7v0huyD0s7d5kQ8garfQ',
}

for output, file_id in REPORT_FILE_IDS.items():
    url = f'https://drive.google.com/uc?export=download&id={file_id}'
    # Fail loudly here rather than later with a confusing "PDF not found"
    # when a download silently did not produce a file.
    if gdown.download(url, output, quiet=False) is None:
        raise RuntimeError(f"Download failed for {output} ({url})")

Downloading...
From: https://drive.google.com/uc?export=download&id=1pAkcjieicAWNrMH5nSjWT8ak3P23vcIp
To: /content/FY2020_Asset_Report.pdf
100%|██████████| 12.4M/12.4M [00:00<00:00, 39.0MB/s]
Downloading...
From: https://drive.google.com/uc?export=download&id=1KM8JnW5gaNNYn6eTYI1pRXDtQsKbFJHo
To: /content/FY2022_Asset_Report.pdf
100%|██████████| 16.1M/16.1M [00:00<00:00, 106MB/s] 
Downloading...
From: https://drive.google.com/uc?export=download&id=1QtmW3UhIMXDLQoI3wI7zH88SACFurjcR
To: /content/FY2023_Asset_Report.pdf
100%|██████████| 13.5M/13.5M [00:00<00:00, 88.0MB/s]
Downloading...
From: https://drive.google.com/uc?export=download&id=10J0fzRZMbZfO7v0huyD0s7d5kQ8garfQ
To: /content/FY2024_Asset_Report.pdf
100%|██████████| 10.4M/10.4M [00:00<00:00, 48.0MB/s]


'FY2024_Asset_Report.pdf'

In [4]:

import os
import re
import shutil
import subprocess
from typing import List, Tuple, Optional, Dict, Any

import numpy as np
import pandas as pd

# =============================================================================
# Common regular expressions and helper functions
# =============================================================================
# A whole token that is a date: either M/D/Y (1-2 digit month and day,
# 2-4 digit year) or Y-M-D (4 digit year, 1-2 digit month and day).
DATE_RE = re.compile(r'^(?:\d{1,2}/\d{1,2}/\d{2,4}|\d{4}-\d{1,2}-\d{1,2})$')
# A region name at the very start of a string, matched case-insensitively and
# ending on a word boundary. "United" may optionally be followed by "States",
# so group(1) can contain internal whitespace.
REGION_PREFIX = re.compile(
    r'^(Europe|Japan|Korea|Okinawa|Pacific|United(?:\s+States)?)\b',
    re.IGNORECASE
)
# A whole token made only of ASCII letters, digits and hyphens
# (case-insensitive); used to recognise serial-number tokens.
SERIAL_RE = re.compile(r'(?i)^[a-z0-9-]+$')

# Region names accepted in the second column of a "Floor" row. Rows whose
# second column is not in this set are treated as summary rows by
# parse_floor_details_page. The comparison is an exact, case-sensitive match.
ALLOWED_FLOOR_REGIONS = {"Europe", "Japan", "Korea"}

def _safe_concat(frames: List[pd.DataFrame]) -> pd.DataFrame:
    """Safely concatenate DataFrames, ignoring empty DataFrames."""
    if not frames:
        return pd.DataFrame()
    kept: List[pd.DataFrame] = []
    for df in frames:
        if df is None or df.empty:
            continue
        if df.dropna(how='all').empty:
            continue
        kept.append(df)
    return pd.concat(kept, ignore_index=True) if kept else pd.DataFrame()

# =============================================================================
# PDF → Text (uses the pdftotext CLI from poppler-utils; there is no fallback extractor)
# =============================================================================
def read_pdf_text_layout(pdf_path: str) -> str:
    """Extract the text of a PDF with ``pdftotext -layout`` (no fallback).

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The decoded standard output of ``pdftotext``. Bytes that are not
        valid UTF-8 are dropped.

    Raises:
        FileNotFoundError: ``pdf_path`` does not exist.
        EnvironmentError: the ``pdftotext`` executable is not on ``PATH``.
        subprocess.CalledProcessError: ``pdftotext`` exited with a non-zero
            status (its diagnostics are available on the ``stderr`` attribute).
        ValueError: the extracted text is empty or whitespace only.
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF not found: {pdf_path}")
    if shutil.which("pdftotext") is None:
        raise EnvironmentError("pdftotext is not installed or not found in PATH.")
    # Capture stderr separately: merging it into stdout would inject
    # pdftotext's warnings into the text that the line parsers consume.
    result = subprocess.run(
        ["pdftotext", "-layout", pdf_path, "-"],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        check=True,
    )
    text = result.stdout.decode("utf-8", errors="ignore")
    if not text.strip():
        raise ValueError("Empty text extracted from PDF. Check the file or pdftotext output.")
    return text


# =============================================================================
# A) Region and Field Office summary parsing
# =============================================================================
def get_month_v1(page_text: str) -> Optional[str]:
    """Return the report month named on a page as ``"<Month> <YYYY>"``.

    The page is searched (case-insensitively) for ``for month of <name> <year>``.
    The month name is capitalised; ``None`` is returned when no such phrase
    is present.
    """
    found = re.search(r'for month of\s+([A-Za-z]+)\s+(\d{4})', page_text, re.IGNORECASE)
    if found is None:
        return None
    month_name, year = found.groups()
    return f"{month_name.capitalize()} {year}"

def parse_region_and_field_office_v1(lines: List[str]) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Parse the region summary and the field-office summary from one page.

    Args:
        lines: The text lines of a single page (layout-preserved text, where
            columns are separated by runs of two or more spaces).

    Returns:
        ``(region_df, field_df)``. Either frame is empty when the matching
        table was not found on the page.

        * ``region_df`` has one row per region line that follows the first
          line starting with ``'Slots Only'``. The first columns are named
          ``Region, #Locations, Army, Navy, Marine_Corps, Airforce, Total,
          Percent``; any further cells are kept in ``Extra_1``, ``Extra_2``...
        * ``field_df`` has the columns ``Region, FO#, Location, Slots,
          ACM_CountR, ITC, FRS, Total``.
    """
    region_df = pd.DataFrame()
    field_df = pd.DataFrame()

    # ------------------------------------------------------------------
    # Region summary: rows that follow the 'Slots Only' header line
    # ------------------------------------------------------------------
    idx_slots = next((i for i, l in enumerate(lines) if l.strip().startswith('Slots Only')), None)
    if idx_slots is not None:
        region_rows: List[List[Any]] = []
        for raw in lines[idx_slots + 1:]:
            line = raw.strip()
            if not line:
                continue
            # The start of any following table ends the region summary.
            if (
                line.startswith('Locations by Service') or
                'Assets by Field Office' in line or
                'EGMs by Field Office' in line or
                'Installed Assets by Location' in line
            ):
                break
            parts = [p for p in re.split(r'\s{2,}', line) if p]
            if len(parts) < 5:
                continue
            region_values: List[Any] = []
            for p in parts[1:]:
                x = p.replace('%', '')
                if x in ('-', ''):
                    region_values.append(np.nan)
                    continue
                try:
                    region_values.append(float(x.replace(',', '')))
                except ValueError:
                    # Keep non-numeric cells as text instead of dropping them.
                    region_values.append(x)
            region_rows.append([parts[0]] + region_values)
        if region_rows:
            maxlen = max(len(r) for r in region_rows)
            for r in region_rows:
                r.extend([np.nan] * (maxlen - len(r)))
            cols = ['Region', '#Locations', 'Army', 'Navy', 'Marine_Corps', 'Airforce', 'Total', 'Percent']
            # A row wider than the known header would make the DataFrame
            # constructor raise; name the surplus cells generically instead.
            cols += [f'Extra_{k}' for k in range(1, maxlen - len(cols) + 1)]
            region_df = pd.DataFrame(region_rows, columns=cols[:maxlen])

    # ------------------------------------------------------------------
    # Field office summary: everything after the '... by Field Office' title
    # ------------------------------------------------------------------
    field_rows: List[List[Any]] = []
    start = False
    current_region: Optional[str] = None
    for raw in lines:
        line = raw.rstrip(' ')
        if 'Assets by Field Office' in line or 'EGMs by Field Office' in line:
            start = True
            continue
        if not start:
            continue
        stripped = line.strip()
        if not stripped or stripped.startswith('Slots'):
            continue
        # A line without any digit is taken as the heading of the next rows.
        if not re.search(r'\d', stripped):
            current_region = stripped
            continue
        parts = [p for p in re.split(r'\s{2,}', stripped) if p]
        if len(parts) < 6 or not all(re.match(r'^[()0-9,.-]+$', p) for p in parts[-5:]):
            continue
        fo_split = parts[0].split()
        if len(fo_split) < 2 or not fo_split[0].isdigit():
            continue
        fo_number = fo_split[0]
        location_name = ' '.join(fo_split[1:] + parts[1:-5])
        field_values: List[float] = []
        for p in parts[-5:]:
            if p == '-':
                field_values.append(np.nan)
                continue
            v = p
            # Accounting style negatives: "(12)" means -12.
            if v.startswith('(') and v.endswith(')'):
                v = '-' + v[1:-1]
            try:
                field_values.append(float(v.replace(',', '')))
            except ValueError:
                # Tokens such as "1-2" or "." pass the character filter above
                # but are not numbers; record them as missing.
                field_values.append(np.nan)
        field_rows.append([current_region, fo_number, location_name] + field_values)
    if field_rows:
        field_df = pd.DataFrame(field_rows, columns=[
            'Region', 'FO#', 'Location', 'Slots', 'ACM_CountR', 'ITC', 'FRS', 'Total'
        ])

    return region_df, field_df

# =============================================================================
# B) Installed Assets by Location parsing (original v2)
# =============================================================================
def parse_installed_assets_v2(lines: List[str]) -> pd.DataFrame:
    """Parse the rows of an "Installed Assets by Location" page.

    Parsing starts after a line containing ``'Installed Assets by Location'``
    (or a header line containing both ``"FO #"`` and ``"IGT"``). Columns are
    split on runs of two or more spaces. A data row is read as: location name,
    an optional all-digit FO number, location code, service, then seven
    manufacturer counts (NOV, AIN, IGT, WMS, BAL, KON, ITE) followed by the
    trailing metrics, the last of which is the total printed in the PDF.

    Trailing metrics are mapped by position when exactly four tokens precede
    the printed total (Tot_EGMs, FRS, ACM, ITC), so that a ``-`` placeholder
    leaves its own column empty instead of shifting later values left. For any
    other token count the row falls back to a best-effort reading: the first
    three numeric tokens are Tot_EGMs, FRS and ACM, and ITC is the last
    numeric token only when there is a fourth one (so no value is counted
    twice in ``Total_Computed``).

    Args:
        lines: Text lines of one page.

    Returns:
        A DataFrame with columns ``Region, LocationName, FO#, Loc, Svc, NOV,
        AIN, IGT, WMS, BAL, KON, ITE, Tot_EGMs, FRS, ACM, ITC, Total_PDF,
        Total_Computed``; an empty DataFrame when no row was parsed.
    """
    def _num(tok: Optional[str]) -> Optional[float]:
        # '-' and '' are placeholders for "no value"; unparsable text is None.
        if tok is None or tok in ('-', ''):
            return None
        try:
            return float(tok.replace(',', ''))
        except Exception:
            return None

    rows: List[List[Any]] = []
    started = False
    current_region: Optional[str] = None

    for raw in lines:
        line = raw.rstrip()

        # Table title / column header: start parsing and pick up a region
        # name if the line begins with one.
        if 'Installed Assets by Location' in line or (("FO #" in line) and ("IGT" in line)):
            started = True
            m = REGION_PREFIX.match(line.strip())
            if m:
                current_region = m.group(1).strip()
            continue

        if not started:
            continue

        if not line.strip():
            continue

        # Lines without digits carry no data; they may name the next region.
        if not re.search(r'\d', line):
            m = REGION_PREFIX.match(line.strip())
            if m:
                current_region = m.group(1).strip()
            continue

        # Repeated column-header lines.
        if ('Tot/EGMs' in line) or (("NOV" in line) and ("AIN" in line)):
            parts_hdr = [p.strip() for p in re.split(r'\s{2,}', line) if p.strip()]
            if parts_hdr:
                m = REGION_PREFIX.match(parts_hdr[0])
                if m:
                    current_region = m.group(1).strip()
            continue

        parts = [p.strip() for p in re.split(r'\s{2,}', line) if p.strip()]

        # A row whose first cell starts with a region name is a region
        # line (e.g. a subtotal), not a location row.
        if parts:
            region_hit = REGION_PREFIX.match(parts[0])
            if region_hit:
                current_region = region_hit.group(1).strip()
                continue

        if len(parts) < 5:
            continue

        name = parts[0]
        idx = 1

        fo_number = None
        if idx < len(parts) and re.match(r'^\d+$', parts[idx]):
            fo_number = parts[idx]
            idx += 1

        if idx >= len(parts):
            continue
        loc = parts[idx]
        idx += 1

        if idx >= len(parts):
            continue
        svc = parts[idx]
        idx += 1

        # The service cell must contain at least one letter.
        if not re.search(r'[A-Za-z]', svc):
            continue

        metric_tokens: List[str] = [t for token in parts[idx:] for t in token.split()]

        # Seven manufacturer counts plus at least one trailing token are needed.
        if len(metric_tokens) < 8:
            continue

        manuf_vals: List[Optional[float]] = [_num(t) for t in metric_tokens[:7]]
        remaining = metric_tokens[7:]

        total_pdf = _num(remaining[-1])
        middle = remaining[:-1]

        if len(middle) == 4:
            # Full set of columns present: map strictly by position.
            tot_egms, frs, acm, itc = (_num(t) for t in middle)
        else:
            numerics = [n for n in (_num(t) for t in middle) if n is not None]
            head = numerics[:3]
            tot_egms, frs, acm = head + [None] * (3 - len(head))
            itc = numerics[-1] if len(numerics) > 3 else None

        total_computed = sum(x for x in [tot_egms, frs, acm, itc] if x is not None)

        rows.append([
            current_region, name, fo_number, loc, svc,
            *manuf_vals, tot_egms, frs, acm, itc, total_pdf, total_computed
        ])

    if not rows:
        return pd.DataFrame()

    columns = [
        'Region', 'LocationName', 'FO#', 'Loc', 'Svc',
        'NOV', 'AIN', 'IGT', 'WMS', 'BAL', 'KON', 'ITE',
        'Tot_EGMs', 'FRS', 'ACM', 'ITC', 'Total_PDF', 'Total_Computed'
    ]
    return pd.DataFrame(rows, columns=columns)

def extract_region_field_installed_v2(pdf_path: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Extract the three summary tables from a report PDF.

    The PDF text is obtained through ``read_pdf_text_layout`` and split into
    pages on form feeds. Each non-blank page is checked (case-insensitively)
    for the summary titles and handed to the matching parser. The most recent
    month found by ``get_month_v1`` is attached as a ``Month`` column.

    Returns:
        ``(region_df, field_df, installed_df)``, each the concatenation of
        the per-page results (empty when nothing was found).
    """
    def _with_month(frame: pd.DataFrame, month: Optional[str]) -> pd.DataFrame:
        # Only add the column once a month has actually been seen.
        return frame.assign(Month=month) if month else frame

    per_table: Dict[str, List[pd.DataFrame]] = {'region': [], 'field': [], 'installed': []}
    summary_titles = (
        'ASSETS BY REGION', 'EGMS BY REGION',
        'ASSETS BY FIELD OFFICE', 'EGMS BY FIELD OFFICE',
    )

    current_month: Optional[str] = None
    for page_text in read_pdf_text_layout(pdf_path).split('\f'):
        if not page_text.strip():
            continue

        # A page without a month phrase inherits the month of earlier pages.
        current_month = get_month_v1(page_text) or current_month

        lines = page_text.split('\n')
        page_upper = page_text.upper()

        if any(title in page_upper for title in summary_titles):
            rdf, fdf = parse_region_and_field_office_v1(lines)
            if not rdf.empty:
                per_table['region'].append(_with_month(rdf, current_month))
            if not fdf.empty:
                per_table['field'].append(_with_month(fdf, current_month))

        if 'INSTALLED ASSETS BY LOCATION' in page_upper:
            inst_df = parse_installed_assets_v2(lines)
            if not inst_df.empty:
                per_table['installed'].append(_with_month(inst_df, current_month))

    return (
        _safe_concat(per_table['region']),
        _safe_concat(per_table['field']),
        _safe_concat(per_table['installed']),
    )

# =============================================================================
# C) Other detailed table parsing
# =============================================================================
def detect_month_map(pages: List[str]) -> Dict[int, str]:
    """Map 1-based page numbers to the report month they belong to.

    A page starts a new month when it contains ``for month of <name> <year>``
    (optionally inside an "Assets/EGMs by Region, Service" title) naming a
    month that has not been seen before. Every page from that start up to the
    page before the next start (or the last page) is assigned that month.
    Pages before the first month start are absent from the result.

    Month labels are normalised to ``"<Capitalised name> <YYYY>"``, the same
    form ``get_month_v1`` produces, so differences in letter case or spacing
    do not create spurious extra months.

    Args:
        pages: Page texts in document order.

    Returns:
        ``{page_number: month_label}``; empty when no month is found.
    """
    patterns = [
        re.compile(r'Assets by Region,\s*Service\s+for month of\s+([A-Za-z]+\s+\d{4})', re.I),
        re.compile(r'EGMs by Region,\s*Service\s+for month of\s+([A-Za-z]+\s+\d{4})', re.I),
        re.compile(r'for month of\s+([A-Za-z]+\s+\d{4})', re.I),
    ]
    month_start_pages: List[Tuple[int, str]] = []
    seen: set = set()
    for page_no, page in enumerate(pages, start=1):
        # The first pattern that matches decides the month of this page.
        hit = next((m for m in (pat.search(page) for pat in patterns) if m), None)
        if hit is None:
            continue
        # The pattern guarantees exactly "<letters> <4 digits>".
        name, year = hit.group(1).split()
        month = f"{name.capitalize()} {year}"
        if month not in seen:
            seen.add(month)
            month_start_pages.append((page_no, month))

    month_map: Dict[int, str] = {}
    for idx, (start, month) in enumerate(month_start_pages):
        if idx + 1 < len(month_start_pages):
            end = month_start_pages[idx + 1][0] - 1
        else:
            end = len(pages)
        for p in range(start, end + 1):
            month_map[p] = month
    return month_map

# -- asset details parsing --
def parse_asset_details_page(page_text: str) -> List[Dict[str, str]]:
    """Parse the asset-detail rows of one page.

    Only lines that start with a region name (``REGION_PREFIX``) and have at
    least eight whitespace-separated tokens are considered. Tokens are read
    left to right as: region (one or two tokens, e.g. "United States"),
    field-office number, field-office short name (up to the next all-digit
    token), location code, location name (up to the next 4-6 digit token),
    asset number, class, and then a free-form remainder holding description,
    type, up to three dates, serial number and trailing small integers.

    Args:
        page_text: Text of one page.

    Returns:
        One dict per parsed row with the keys ``Region, FONUM, FOSHORT, Loc,
        LNAME, Asset, Class, Desc, Type, Acquire, Effective, SerialNum``, plus
        ``Disposed``, ``Age`` and ``Years_in_Storage`` only when a value was
        found. Lines that cannot be parsed are skipped.
    """
    out: List[Dict[str, str]] = []

    def is_valid_date(s: str) -> bool:
        # DATE_RE already requires '/' or '-' separators, so a bare run of
        # digits can never qualify as a date.
        return bool(s) and DATE_RE.match(s) is not None

    for raw in page_text.split('\n'):
        line = raw.strip()
        if not line:
            continue
        region_match = REGION_PREFIX.match(line)
        if not region_match:
            continue
        toks = line.split()
        if len(toks) < 8:
            continue
        try:
            # The region may span two tokens ("United States"); consume as
            # many tokens as the prefix matched so later fields do not shift.
            n_region = len(region_match.group(1).split())
            region = ' '.join(toks[:n_region])
            fonum = toks[n_region]
            fo_start = n_region + 1
            i = fo_start
            while i < len(toks) and not toks[i].isdigit():
                i += 1
            foshort = ' '.join(toks[fo_start:i]).strip()
            if i >= len(toks) or not toks[i].isdigit():
                continue
            loc = toks[i]
            i += 1
            # The asset number is the next token made of 4-6 digits.
            j = i
            while j < len(toks) and not re.fullmatch(r'\d{4,6}', toks[j]):
                j += 1
            if j >= len(toks):
                continue
            lname = ' '.join(toks[i:j]).strip()
            asset = toks[j]
            j += 1
            if j >= len(toks):
                continue
            clazz = toks[j]
            j += 1
            remaining = toks[j:]
            date_rel_idx = [k for k, t in enumerate(remaining) if is_valid_date(t)]
            desc = ''
            type_tok = ''
            acquire = ''
            effective = ''
            disposed = ''
            serial = ''
            age = ''
            years_in_storage = ''
            if date_rel_idx:
                # Normal case: the token just before the first date is the
                # type; everything before that is the description.
                first_rel = date_rel_idx[0]
                if first_rel > 0:
                    type_tok = remaining[first_rel - 1]
                rel_i = first_rel
                dates: List[str] = []
                while rel_i < len(remaining) and is_valid_date(remaining[rel_i]):
                    dates.append(remaining[rel_i])
                    rel_i += 1
                acquire = dates[0] if len(dates) > 0 else ''
                effective = dates[1] if len(dates) > 1 else ''
                disposed = dates[2] if len(dates) > 2 else ''
                if rel_i < len(remaining) and SERIAL_RE.match(remaining[rel_i]):
                    serial = remaining[rel_i]
                    rel_i += 1
                desc_end = first_rel - 1 if type_tok else first_rel
                desc = ' '.join(remaining[:desc_end]).strip()
                tail = remaining[rel_i:]
            else:
                # No real dates: look for 5-6 digit tokens standing where the
                # dates would be; they are skipped, not stored.
                pseudo_date_idx = [k for k, t in enumerate(remaining) if re.match(r'^\d{5,6}$', t)]
                if pseudo_date_idx:
                    first_pseudo = pseudo_date_idx[0]
                    if first_pseudo > 0:
                        type_tok = remaining[first_pseudo - 1]
                    rel_i = first_pseudo
                    while rel_i < len(remaining) and re.match(r'^\d{5,6}$', remaining[rel_i]):
                        rel_i += 1
                    if rel_i < len(remaining) and SERIAL_RE.match(remaining[rel_i]):
                        serial = remaining[rel_i]
                        rel_i += 1
                    desc_end = first_pseudo - 1 if type_tok else first_pseudo
                    desc = ' '.join(remaining[:desc_end]).strip()
                    tail = remaining[rel_i:]
                else:
                    # Neither dates nor pseudo dates: the remainder is the
                    # description, except for a trailing "store" type marker
                    # optionally preceded by a serial number.
                    desc = ' '.join(remaining).strip()
                    if remaining and remaining[-1].lower() == 'store':
                        type_tok = remaining[-1]
                        if len(remaining) > 1 and SERIAL_RE.match(remaining[-2]):
                            serial = remaining[-2]
                            desc = ' '.join(remaining[:-2]).strip()
                        else:
                            desc = ' '.join(remaining[:-1]).strip()
                    tail = []
            if tail:
                # Of the 1-3 digit integers after the serial number, the last
                # is years in storage and the one before it is the age.
                ints = [t for t in tail if re.match(r'^\d{1,3}$', t)]
                if len(ints) >= 1:
                    years_in_storage = ints[-1]
                if len(ints) >= 2:
                    age = ints[-2]
            rec = {
                'Region': region, 'FONUM': fonum, 'FOSHORT': foshort, 'Loc': loc, 'LNAME': lname,
                'Asset': asset, 'Class': clazz, 'Desc': desc, 'Type': type_tok,
                'Acquire': acquire, 'Effective': effective, 'SerialNum': serial,
            }
            if disposed:
                rec['Disposed'] = disposed
            if age:
                rec['Age'] = age
            if years_in_storage:
                rec['Years_in_Storage'] = years_in_storage
            out.append(rec)
        except Exception:
            # Best effort: a malformed line must not abort the whole page.
            continue
    return out

# -- Floor summary section parsing --
def parse_floor_summary_parts(parts: List[str]) -> Dict[str, Optional[str]]:
    """Turn the column cells of a floor-summary row into a record.

    Cells are read by position: cell 0 holds the FO number followed by the
    location, cell 2 the open/close dates, cell 3 a count, cells 4-6 a count
    followed by a label (service, region, country), and cells 7-12 single
    values. Cell 1 is not mapped to a field of its own. Missing or empty
    cells yield ``None`` for the fields they would have filled.

    Args:
        parts: Column cells of one row, left to right.

    Returns:
        The record, including a ``Fields`` entry with all cells joined by
        ``|``. An empty dict is returned when ``parts`` is empty or its first
        cell contains no tokens.
    """
    rec: Dict[str, Optional[str]] = {}
    tokens = parts[0].split() if parts else []
    if not tokens:
        return rec

    def _cell(idx: int) -> Optional[str]:
        # The cell at idx, or None when the row is shorter than that.
        return parts[idx] if len(parts) > idx else None

    def _count_and_label(idx: int) -> Tuple[Optional[str], Optional[str]]:
        # Split "<count> <label...>"; tolerate a missing or empty cell.
        sub = parts[idx].split(None, 1) if len(parts) > idx else []
        count = sub[0] if sub else None
        label = sub[1] if len(sub) > 1 else None
        return count, label

    rec['FONUM'] = tokens[0]
    rec['Location'] = ' '.join(tokens[1:])

    dates = parts[2].split() if len(parts) > 2 else []
    rec['Open'] = dates[0] if dates else None
    rec['Closed'] = dates[1] if len(dates) > 1 else None

    rec['CountA'] = _cell(3)
    rec['CountB'], rec['Service'] = _count_and_label(4)
    rec['CountC'], rec['Region2'] = _count_and_label(5)
    rec['CountD'], rec['Country'] = _count_and_label(6)

    rec['SiteCode'] = _cell(7)
    rec['Code'] = _cell(8)
    rec['Person'] = _cell(9)
    rec['DuplicateName'] = _cell(10)
    rec['Occupancy'] = _cell(11)
    rec['RegionCode'] = _cell(12)
    # Keep the raw cells so nothing is lost if the positional guess is wrong.
    rec['Fields'] = '|'.join(parts)
    return rec

# -- Floor asset details + summary detection/parsing --
def parse_floor_details_page(page_text: str) -> Tuple[List[Dict[str, str]], List[Dict[str, Optional[str]]]]:
    """Parse the "Floor" rows of one page into detail rows and summary rows.

    Only lines that start (after optional whitespace) with a digit and contain
    the text ``'Floor'`` are considered. Each such line is split into cells on
    runs of two or more spaces. When the second cell is one of
    ``ALLOWED_FLOOR_REGIONS`` the line is parsed as an asset detail row;
    otherwise the cells are handed to ``parse_floor_summary_parts`` and the
    result is collected as a summary row.

    Args:
        page_text: Text of one page.

    Returns:
        ``(rows, extras)``: ``rows`` are the detail records (keys ``Loc,
        Place, Region, Service, Asset, SerialNum, Type, Desc, Acquire,
        Effective, Disposed, Class, MFG, LNAME, FONUM, FOSHORT, Cat, Year,
        Age, ReportDate``); ``extras`` are the summary records.
    """
    rows: List[Dict[str, str]] = []
    extras: List[Dict[str, Optional[str]]] = []
    for line in page_text.split('\n'):
        if not line.strip():
            continue
        if not (re.match(r'^\s*\d', line) and 'Floor' in line):
            continue
        parts = [p.strip() for p in re.split(r'\s{2,}', line) if p.strip()]
        if len(parts) < 3:
            continue
        # The second cell decides between detail row and summary row.
        candidate_region = parts[1]
        if candidate_region not in ALLOWED_FLOOR_REGIONS:
            extras.append(parse_floor_summary_parts(parts))
            continue
        # detail row
        # Cell 0: "<loc> <place...>"
        first = parts[0]
        toks = first.split(None, 1)
        loc = toks[0]
        place = toks[1] if len(toks) > 1 else ''
        region = parts[1] if len(parts) > 1 else ''
        svc = parts[2] if len(parts) > 2 else ''
        # Cell 3: "<asset> <serial...>"
        asset = ''
        serial = ''
        if len(parts) > 3:
            asset_serial = parts[3].split(None, 1)
            asset = asset_serial[0]
            serial = asset_serial[1] if len(asset_serial) > 1 else ''
        # Cell 4: "<type> <description...>". From here on `i` is a cursor
        # into `parts` that each step advances.
        type_ = ''
        desc = ''
        i = 4
        if i < len(parts):
            td = parts[i].split()
            type_ = td[0] if td else ''
            desc = ' '.join(td[1:]) if len(td) > 1 else ''
            i += 1
        acquire = ''
        effective = ''
        disposed = ''
        dates: List[str] = []
        leftover: List[str] = []
        # Collect up to three dates; non-date tokens met on the way are kept
        # in `leftover` and later read as class / manufacturer.
        # NOTE(review): the loop only stops once three dates were found or the
        # cells run out. For a row with fewer than three dates it therefore
        # appears to consume every remaining cell into `leftover`, leaving
        # LNAME, FONUM, Cat, Year, Age and ReportDate empty. Confirm against
        # real rows whether a third date is always present.
        while i < len(parts) and len(dates) < 3:
            token = parts[i]
            for st in token.split():
                if DATE_RE.match(st):
                    dates.append(st)
                else:
                    leftover.append(st)
            i += 1
        if len(dates) > 0:
            acquire = dates[0]
        if len(dates) > 1:
            effective = dates[1]
        if len(dates) > 2:
            disposed = dates[2]
        clazz = ''
        mfg = ''
        # Class and manufacturer: take further cells until two tokens exist.
        class_mfg_parts: List[str] = leftover.copy()
        while i < len(parts) and len(' '.join(class_mfg_parts).split()) < 2:
            class_mfg_parts.append(parts[i])
            i += 1
        cm = ' '.join(class_mfg_parts).split()
        if cm:
            clazz = cm[0]
        if len(cm) > 1:
            mfg = cm[1]
        lname = parts[i] if i < len(parts) else ''
        i += 1
        # Next cell: "<fonum> <foshort...>"
        fonum = ''
        foshort = ''
        if i < len(parts):
            fo_parts = parts[i].split(None, 1)
            fonum = fo_parts[0]
            foshort = fo_parts[1] if len(fo_parts) > 1 else ''
            i += 1
        cat = parts[i] if i < len(parts) else ''
        i += 1
        year = parts[i] if i < len(parts) else ''
        i += 1
        age = ''
        report_date = ''
        # Whatever is left holds the age and, when the last cell is a date,
        # the report date.
        if i < len(parts):
            remaining = parts[i:]
            if len(remaining) == 1:
                token = remaining[0]
                if DATE_RE.match(token):
                    report_date = token
                else:
                    age = token
            else:
                age_candidate = remaining[0]
                last_token = remaining[-1]
                if DATE_RE.match(last_token):
                    report_date = last_token
                    age = age_candidate
                else:
                    age = ' '.join(remaining)
        rows.append({
            'Loc': loc, 'Place': place, 'Region': region, 'Service': svc, 'Asset': asset,
            'SerialNum': serial, 'Type': type_, 'Desc': desc, 'Acquire': acquire,
            'Effective': effective, 'Disposed': disposed, 'Class': clazz, 'MFG': mfg,
            'LNAME': lname, 'FONUM': fonum, 'FOSHORT': foshort, 'Cat': cat,
            'Year': year, 'Age': age, 'ReportDate': report_date
        })
    return rows, extras

# -- Site status parsing --
def parse_site_line(row: str) -> Optional[Dict[str, str]]:
    """Parse one (already joined) site-status row.

    Tokens are read as: location code (all digits), location name, place (the
    single token just before the first date), open date, optional closed
    date, KSI, community number, community name (up to the service), service,
    field-office number and field-office short name. Recognised services are
    ``Army``, ``Navy``, ``Air`` and the two-token ``Marine Corps``; ``Air``
    followed by ``Force`` is read as the two-token service ``Air Force``.

    Args:
        row: Whitespace-separated text of one site row.

    Returns:
        A dict with the keys ``Loc, LNAME, Place, Open, Closed, KSI, CmtyNum,
        Cmty, SVC, FONUM, FOSHORT``, or ``None`` when the row does not start
        with a digit token, has no date from the third token on, names no
        service, or ends right after the service.
    """
    toks = row.split()
    if not toks or not toks[0].isdigit():
        return None

    # The first date token anchors the layout of the whole row.
    open_idx = next((i for i in range(1, len(toks)) if DATE_RE.match(toks[i])), None)
    if open_idx is None or open_idx < 2:
        return None

    loc = toks[0]
    place = toks[open_idx - 1]
    lname = ' '.join(toks[1:open_idx - 1])
    open_date = toks[open_idx]
    closed_date = ''
    if open_idx + 1 < len(toks) and DATE_RE.match(toks[open_idx + 1]):
        closed_date = toks[open_idx + 1]
    base = open_idx + (2 if closed_date else 1)
    ksi = toks[base] if base < len(toks) else ''
    cmty_num = toks[base + 1] if base + 1 < len(toks) else ''

    # Locate the service and note how many tokens it spans.
    svc_idx: Optional[int] = None
    svc_width = 1
    for j in range(base + 2, len(toks)):
        tok = toks[j]
        following = toks[j + 1] if j + 1 < len(toks) else None
        if tok in ('Army', 'Navy'):
            svc_idx = j
            break
        if tok == 'Air':
            # Without this, "Force" would be taken as the FO number.
            svc_idx = j
            svc_width = 2 if following == 'Force' else 1
            break
        if tok == 'Marine' and following == 'Corps':
            svc_idx = j
            svc_width = 2
            break
    if svc_idx is None:
        return None

    cmty = ' '.join(toks[base + 2:svc_idx])
    svc = ' '.join(toks[svc_idx:svc_idx + svc_width])
    fonum_start = svc_idx + svc_width
    if fonum_start >= len(toks):
        return None
    fonum = toks[fonum_start]
    foshort = ' '.join(toks[fonum_start + 1:])
    return {
        'Loc': loc, 'LNAME': lname, 'Place': place,
        'Open': open_date, 'Closed': closed_date, 'KSI': ksi,
        'CmtyNum': cmty_num, 'Cmty': cmty, 'SVC': svc,
        'FONUM': fonum, 'FOSHORT': foshort,
    }

def parse_site_status_page(page_text: str) -> List[Dict[str, str]]:
    """Parse the site-status rows of one page.

    A logical row begins at a line whose first token starts with digits; the
    non-blank lines that follow, up to the next such line, are continuation
    lines and are joined onto it with single spaces. Lines before the first
    row start are ignored. Each joined row goes through ``parse_site_line``
    and only rows it could parse are returned.
    """
    grouped: List[List[str]] = []
    for ln in page_text.split('\n'):
        if not ln.strip():
            continue
        text = ln.rstrip()
        if re.match(r'^\s*\d+\b', text):
            # New logical row.
            grouped.append([text.strip()])
        elif grouped:
            # Continuation of the row currently being collected.
            grouped[-1].append(text.strip())

    parsed = (parse_site_line(' '.join(chunk)) for chunk in grouped)
    return [rec for rec in parsed if rec]

# =============================================================================
# D) Extract detailed tables (using read_pdf_text_layout)
# =============================================================================
def _years_in_storage_pivot(frame: pd.DataFrame) -> pd.DataFrame:
    """Count assets per (Age_int, Years_in_Storage_int) as a wide table with columns Age, Storage_<n>..."""
    pt = pd.pivot_table(
        frame,
        index='Age_int',
        columns='Years_in_Storage_int',
        values='Asset',
        aggfunc='count',
        fill_value=0
    )
    if isinstance(pt, pd.Series):
        pt = pt.to_frame()
    pt = pt.reset_index().rename(columns={'Age_int': 'Age'})
    pt.columns = ['Age'] + [f'Storage_{c}' for c in pt.columns[1:]]
    return pt


def extract_detailed_tables(pdf_path: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Extract the detailed tables from a report PDF.

    Every non-blank page is run through the asset-detail, floor-detail and
    site-status parsers. Records are tagged with the month of their page (from
    ``detect_month_map``, carried forward across pages) and de-duplicated.
    From the asset details a "years in storage" table is derived: for assets
    with a known age of at most 50, the number of assets per age and per
    years-in-storage value, computed per month when a ``Month`` column exists.
    Combinations that do not occur are reported as 0.

    Note: ``Age_int`` and ``Years_in_Storage_int`` helper columns (and a
    ``Years_in_Storage`` column of zeros when none was parsed) are added to
    the returned asset frame.

    Args:
        pdf_path: Path of the report PDF.

    Returns:
        ``(asset_df, floor_df, site_df, years_storage_df, extra_floor_df)``.
    """
    text = read_pdf_text_layout(pdf_path)
    pages = text.split('\f')
    month_map = detect_month_map(pages)

    asset_data: List[Dict[str, str]] = []
    floor_data: List[Dict[str, str]] = []
    extra_floor_data: List[Dict[str, Optional[str]]] = []
    site_data: List[Dict[str, str]] = []

    def _collect(records, sink, month: Optional[str]) -> None:
        # Tag each record with the current month (once known) and store it.
        for rec in records:
            if month:
                rec['Month'] = month
            sink.append(rec)

    current_month: Optional[str] = None
    for p_num, page_text in enumerate(pages, start=1):
        if not page_text.strip():
            continue
        current_month = month_map.get(p_num) or current_month

        _collect(parse_asset_details_page(page_text), asset_data, current_month)
        floor_recs, extra_recs = parse_floor_details_page(page_text)
        _collect(floor_recs, floor_data, current_month)
        _collect(extra_recs, extra_floor_data, current_month)
        _collect(parse_site_status_page(page_text), site_data, current_month)

    asset_df = pd.DataFrame(asset_data).drop_duplicates()
    floor_df = pd.DataFrame(floor_data).drop_duplicates()
    site_df = pd.DataFrame(site_data).drop_duplicates()
    extra_floor_df = pd.DataFrame(extra_floor_data).drop_duplicates()

    # Years in storage pivot
    years_storage_df = pd.DataFrame()
    if not asset_df.empty:
        yrs_col = next(
            (c for c in asset_df.columns
             if c.lower() in {'years_in_storage', 'years', 'yrs', 'yrs_in_storage'}),
            None,
        )
        if yrs_col is None:
            asset_df['Years_in_Storage'] = 0
            yrs_col = 'Years_in_Storage'
        # 'Age' only exists when at least one record carried an age.
        if 'Age' in asset_df.columns:
            age_source = asset_df['Age']
        else:
            age_source = pd.Series(np.nan, index=asset_df.index)
        age_num = pd.to_numeric(age_source, errors='coerce')
        yis_num = pd.to_numeric(asset_df[yrs_col], errors='coerce').fillna(0)
        try:
            INT64_NULLABLE = pd.Int64Dtype()
            asset_df['Age_int'] = age_num.astype(INT64_NULLABLE)
            asset_df['Years_in_Storage_int'] = yis_num.astype(INT64_NULLABLE)
        except Exception:
            # Fall back to plain numpy dtypes if the nullable cast is refused.
            asset_df['Age_int'] = age_num.astype('float64')
            asset_df['Years_in_Storage_int'] = yis_num.astype('int64')
        filt = asset_df['Age_int'].notna() & (asset_df['Age_int'] <= 50)
        eligible = asset_df[filt]

        pivot_tables: List[pd.DataFrame] = []
        if not eligible.empty:
            if 'Month' in asset_df.columns:
                for mth, grp in eligible.groupby('Month', dropna=False):
                    pt = _years_in_storage_pivot(grp)
                    if mth is not None:
                        pt.insert(0, 'Month', mth)
                    pivot_tables.append(pt)
            else:
                pivot_tables.append(_years_in_storage_pivot(eligible))

        if pivot_tables:
            years_storage_df = pd.concat(pivot_tables, ignore_index=True)
            # A storage value absent from one month comes out of concat as
            # NaN; these are counts, so the correct value is 0.
            storage_cols = [c for c in years_storage_df.columns if str(c).startswith('Storage_')]
            years_storage_df[storage_cols] = years_storage_df[storage_cols].fillna(0).astype('int64')

    return asset_df, floor_df, site_df, years_storage_df, extra_floor_df

# =============================================================================
# E) Save output results (all eight tables, including extra_floor_df)
# =============================================================================
def save_outputs(outdir: str,
                 region_df: pd.DataFrame,
                 field_df: pd.DataFrame,
                 installed_df: pd.DataFrame,
                 asset_df: pd.DataFrame,
                 floor_df: pd.DataFrame,
                 site_df: pd.DataFrame,
                 years_storage_df: pd.DataFrame,
                 extra_floor_df: pd.DataFrame) -> None:
    """Write every non-empty table to its own CSV file under ``outdir``.

    The directory is created if it does not exist yet. A table whose
    ``.empty`` attribute is true is skipped, so no file is written for it.
    All files are written without the DataFrame index.

    Args:
        outdir: Directory that receives the CSV files.
        region_df: Written to ``assets_by_region_service.csv``.
        field_df: Written to ``assets_by_field_office.csv``.
        installed_df: Written to ``installed_assets_location_manufacture.csv``.
        asset_df: Written to ``asset_details.csv``.
        floor_df: Written to ``floor_asset_details.csv``.
        site_df: Written to ``site_operational_status.csv``.
        years_storage_df: Written to ``years_in_storage.csv``.
        extra_floor_df: Eighth table (floor summary), written to
            ``floor_summary_details.csv``.
    """
    os.makedirs(outdir, exist_ok=True)

    # (file name, table) pairs, listed in the order the files are written.
    csv_targets = (
        ('assets_by_region_service.csv', region_df),
        ('assets_by_field_office.csv', field_df),
        ('installed_assets_location_manufacture.csv', installed_df),
        ('asset_details.csv', asset_df),
        ('floor_asset_details.csv', floor_df),
        ('site_operational_status.csv', site_df),
        ('years_in_storage.csv', years_storage_df),
        ('floor_summary_details.csv', extra_floor_df),
    )
    for csv_name, table in csv_targets:
        if table.empty:
            # Nothing extracted for this table: leave the directory untouched.
            continue
        table.to_csv(os.path.join(outdir, csv_name), index=False)

# =============================================================================
# F) Main execution function
# =============================================================================
def run_extraction(pdf_path: str, outdir: str='/content/output'):
    """Extract all tables from one PDF, save them as CSV, and print a summary.

    Runs ``extract_region_field_installed_v2`` and ``extract_detailed_tables``
    on ``pdf_path``, hands the eight resulting DataFrames to ``save_outputs``,
    then prints ``Extraction complete.`` followed by one line with the row and
    column count of each non-empty table.

    Args:
        pdf_path: Path of the PDF report to parse.
        outdir: Directory passed to ``save_outputs`` for the CSV files.
    """
    # Summary tables
    summary_tables = extract_region_field_installed_v2(pdf_path)
    region_df, field_df, installed_df = summary_tables

    # Detailed tables (including the eighth table)
    detailed_tables = extract_detailed_tables(pdf_path)
    asset_df, floor_df, site_df, years_storage_df, extra_floor_df = detailed_tables

    # Save all tables
    save_outputs(
        outdir,
        region_df, field_df, installed_df,
        asset_df, floor_df, site_df,
        years_storage_df, extra_floor_df,
    )
    print('Extraction complete.')

    # Report label -> table; insertion order is the order the lines are printed.
    labelled_tables = {
        'Region': region_df,
        'FieldOffice': field_df,
        'Installed': installed_df,
        'AssetDetails': asset_df,
        'FloorDetails': floor_df,
        'SiteOperational': site_df,
        'YearsInStorage': years_storage_df,
        'FloorSummary': extra_floor_df,
    }
    for name, table in labelled_tables.items():
        if table.empty:
            # Empty tables are left out of the summary.
            continue
        print(f'[{name}] rows={len(table):,}, cols={len(table.columns)}')

# -- Run the extraction once per fiscal-year report --
# Maps each input PDF to the directory that receives its CSV output;
# year_jobs is the resulting list of (pdf_path, outdir) pairs, in listing order.
year_jobs = list({
    "/content/FY2020_Asset_Report.pdf": "/content/FY2020_Asset_Report_output",
    "/content/FY2022_Asset_Report.pdf": "/content/FY2022_Asset_Report_output",
    "/content/FY2023_Asset_Report.pdf": "/content/FY2023_Asset_Report_output",
    "/content/FY2024_Asset_Report.pdf": "/content/FY2024_Asset_Report_output",
}.items())

for pdf_path, outdir in year_jobs:
    # Banner separates the output of consecutive reports.
    print(f"\n=== Running extraction for: {pdf_path} ===")
    run_extraction(pdf_path=pdf_path, outdir=outdir)




=== Running extraction for: /content/FY2020_Asset_Report.pdf ===


  return pd.concat(kept, ignore_index=True) if kept else pd.DataFrame()


Extraction complete.
[Region] rows=40, cols=9
[FieldOffice] rows=210, cols=9
[Installed] rows=566, cols=19
[AssetDetails] rows=3,084, cols=16
[FloorDetails] rows=16,157, cols=21
[SiteOperational] rows=582, cols=12
[FloorSummary] rows=582, cols=19

=== Running extraction for: /content/FY2022_Asset_Report.pdf ===


  return pd.concat(kept, ignore_index=True) if kept else pd.DataFrame()


Extraction complete.
[Region] rows=52, cols=9
[FieldOffice] rows=273, cols=9
[Installed] rows=819, cols=19
[AssetDetails] rows=3,014, cols=17
[FloorDetails] rows=22,445, cols=21
[SiteOperational] rows=836, cols=12
[YearsInStorage] rows=112, cols=58
[FloorSummary] rows=836, cols=19

=== Running extraction for: /content/FY2023_Asset_Report.pdf ===


  return pd.concat(kept, ignore_index=True) if kept else pd.DataFrame()


Extraction complete.
[Region] rows=48, cols=9
[FieldOffice] rows=252, cols=9
[Installed] rows=888, cols=19
[AssetDetails] rows=4,911, cols=17
[FloorDetails] rows=20,445, cols=21
[SiteOperational] rows=994, cols=12
[YearsInStorage] rows=125, cols=56
[FloorSummary] rows=2,858, cols=19

=== Running extraction for: /content/FY2024_Asset_Report.pdf ===


  return pd.concat(kept, ignore_index=True) if kept else pd.DataFrame()


Extraction complete.
[Region] rows=36, cols=9
[FieldOffice] rows=189, cols=9
[Installed] rows=743, cols=19
[AssetDetails] rows=4,526, cols=17
[FloorDetails] rows=9,240, cols=21
[SiteOperational] rows=753, cols=12
[YearsInStorage] rows=97, cols=62
[FloorSummary] rows=7,316, cols=19
