# In [24]:
"""
Extract multiple tables from the FY2021 Asset Report PDF.

This script brings together parsers for the following tables and writes the
results to CSV files:

* Assets by Region, Service (region summary)
* Assets by Field Office
* Installed assets by location & manufacture
* Asset details (blue header)
* Floor asset details (white 'Place' floor)
* Site operational status
* Years in storage pivot (summary of asset details by age and years in storage)

"""

import os
import re
import subprocess
from typing import List, Tuple, Dict, Optional
from pathlib import Path

import numpy as np
import pandas as pd


###############################################################################
# Parsers for the region summary and field office tables
###############################################################################

def parse_region_and_field_office(lines: List[str]) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Extract the "Assets by Region, Service" summary and the "Assets by Field
    Office" table from the lines of a single page.

    Region summary
        Starts on the line after the first line whose stripped text begins
        with "Slots Only".  Blank lines are skipped.  Scanning stops at the
        first line that starts with "Locations by Service" or contains
        "Assets by Field Office", "EGMs by Field Office" or
        "Installed Assets by Location".  A line is kept as a row when splitting
        on runs of two or more whitespace characters yields at least five
        tokens.  '%' is stripped, '-' becomes NaN, numbers are parsed as floats
        and anything else is kept as text.  Short rows are padded with NaN.
        Columns beyond the eight known names are called 'Extra_1', 'Extra_2', ...

    Field office table
        Starts after the first line containing "Assets by Field Office" or
        "EGMs by Field Office" and runs to the end of ``lines``.  A line with no
        digits sets the region for the rows that follow.  A data row has at
        least six tokens, the last five of which consist only of the characters
        ``()0-9,.-``.  '-' becomes NaN, '(n)' becomes -n, and a token that still
        cannot be parsed as a number becomes NaN.

    Returns ``(region_df, field_df)``; either may be an empty DataFrame.
    """
    region_columns = ['Region', '#Locations', 'Army', 'Navy', 'Marine_Corps', 'Airforce', 'Total', 'Percent']
    section_breaks = ('Assets by Field Office', 'EGMs by Field Office', 'Installed Assets by Location')

    def _region_value(token: str):
        # Percent signs and thousands separators are presentation only.
        cleaned = token.replace('%', '')
        if cleaned in ('-', ''):
            return np.nan
        try:
            return float(cleaned.replace(',', ''))
        except ValueError:
            # Keep the text of a non-numeric cell rather than dropping it.
            return cleaned

    def _field_value(token: str) -> float:
        if token == '-':
            return np.nan
        # Negative numbers are printed in parentheses.
        if token.startswith('(') and token.endswith(')'):
            token = '-' + token[1:-1]
        try:
            return float(token.replace(',', ''))
        except ValueError:
            # The row filter only checks the character set, so tokens such as
            # '1-2' or '..' can reach this point; do not abort the whole page.
            return np.nan

    region_df = pd.DataFrame()
    field_df = pd.DataFrame()

    # ----- Region summary -----
    idx_slots = next((i for i, l in enumerate(lines) if l.strip().startswith('Slots Only')), None)
    if idx_slots is not None:
        region_rows: List[List] = []
        for raw in lines[idx_slots + 1:]:
            line = raw.strip()
            if not line:
                continue
            if line.startswith('Locations by Service') or any(marker in line for marker in section_breaks):
                break
            parts = [p for p in re.split(r'\s{2,}', line) if p]
            # Region name plus at least four value cells.
            if len(parts) >= 5:
                region_rows.append([parts[0]] + [_region_value(p) for p in parts[1:]])
        if region_rows:
            width = max(len(r) for r in region_rows)
            for r in region_rows:
                r.extend([np.nan] * (width - len(r)))
            # The column list must match the row width exactly, otherwise the
            # DataFrame constructor raises; name any surplus cells generically.
            surplus = [f'Extra_{k}' for k in range(1, width - len(region_columns) + 1)]
            region_df = pd.DataFrame(region_rows, columns=region_columns[:width] + surplus)

    # ----- Field office summary -----
    field_rows: List[List] = []
    in_field_section = False
    current_region: Optional[str] = None
    for raw in lines:
        line = raw.strip()
        if 'Assets by Field Office' in line or 'EGMs by Field Office' in line:
            in_field_section = True
            continue
        if not in_field_section:
            continue
        # Blank lines and the column header line carry no data.
        if not line or line.startswith('Slots'):
            continue
        # A line without any digit is a region heading.
        if not re.search(r'\d', line):
            current_region = line
            continue
        parts = [p for p in re.split(r'\s{2,}', line) if p]
        # Data rows end with five numeric cells (Slots, ACM, ITC, FRS, Total).
        if len(parts) >= 6 and all(re.match(r'^[()0-9,.-]+$', p) for p in parts[-5:]):
            fo_number = parts[0]
            location_name = ' '.join(parts[1:-5])
            numbers = [_field_value(p) for p in parts[-5:]]
            field_rows.append([current_region, fo_number, location_name] + numbers)
    if field_rows:
        field_df = pd.DataFrame(field_rows, columns=[
            'Region', 'FO#', 'Location', 'Slots', 'ACM_CountR', 'ITC', 'FRS', 'Total'
        ])

    return region_df, field_df


###############################################################################
# Parser for installed assets
###############################################################################

def parse_installed_assets(lines: List[str]) -> pd.DataFrame:
    """
    Parse the 'Installed Assets by Location, Manufacture' table.

    Parsing begins on the line after the first line that contains
    'Installed Assets by Location', or both 'FO #' and 'IGT'.  From there every
    line that contains a digit and is not a repeated header is split on runs of
    two or more whitespace characters.  The first cell is the location name; an
    all-digit second cell is taken as the field office number; the next two
    cells are Loc and Svc (rows whose Svc has no letter are dropped).  The
    remaining cells are re-split on whitespace into metric tokens: seven
    manufacturer counts, then Tot_EGMs, FRS and ACM.  Of anything after that,
    the last token is Total_PDF and the first numeric token before it is ITC.
    Total_Computed is the sum of whichever of Tot_EGMs, FRS, ACM and ITC are
    present.  '-' and unparseable tokens become None.

    Returns an empty DataFrame when no row qualifies.
    """
    def _as_number(tok: Optional[str]) -> Optional[float]:
        if tok is None or tok in ('-', ''):
            return None
        try:
            return float(tok.replace(',', ''))
        except ValueError:
            return None

    def _nth(seq: List[str], pos: int) -> Optional[str]:
        return seq[pos] if pos < len(seq) else None

    records: List[List] = []
    in_table = False
    for text_line in lines:
        if not in_table:
            # The triggering line itself is a title/header and is never parsed.
            title_hit = 'Installed Assets by Location' in text_line
            header_hit = 'FO #' in text_line and 'IGT' in text_line
            in_table = title_hit or header_hit
            continue
        # Blank lines and notes carry no digits.
        if not text_line.strip() or re.search(r'\d', text_line) is None:
            continue
        # Repeated column headers.
        if 'Tot/EGMs' in text_line or ('NOV' in text_line and 'AIN' in text_line):
            continue
        cells = [c.strip() for c in re.split(r'\s{2,}', text_line) if c.strip()]
        if len(cells) < 5:
            continue

        location_name = cells[0]
        has_fo = re.match(r'^\d+$', cells[1]) is not None
        fo_number: Optional[str] = cells[1] if has_fo else None
        cursor = 2 if has_fo else 1
        loc = cells[cursor]
        svc = cells[cursor + 1]
        if not re.search(r'[A-Za-z]', svc):
            continue

        # Numbers separated by a single space arrive glued in one cell.
        metrics = [piece for cell in cells[cursor + 2:] for piece in cell.split()]
        if len(metrics) < 8:
            continue

        manufacturers = [_as_number(t) for t in metrics[:7]]
        tot_egms = _as_number(metrics[7])
        frs = _as_number(_nth(metrics, 8))
        acm = _as_number(_nth(metrics, 9))

        itc: Optional[float] = None
        total_pdf: Optional[float] = None
        tail = metrics[10:]
        if tail:
            total_pdf = _as_number(tail[-1])
            itc = next((v for v in map(_as_number, tail[:-1]) if v is not None), None)

        total_computed = sum(v for v in (tot_egms, frs, acm, itc) if v is not None)
        records.append([
            location_name, fo_number, loc, svc,
            *manufacturers,
            tot_egms, frs, acm, itc, total_pdf, total_computed,
        ])

    if not records:
        return pd.DataFrame()
    return pd.DataFrame(records, columns=[
        'LocationName', 'FO#', 'Loc', 'Svc',
        'NOV', 'AIN', 'IGT', 'WMS', 'BAL', 'KON', 'ITE',
        'Tot_EGMs', 'FRS', 'ACM', 'ITC', 'Total_PDF', 'Total_Computed'
    ])


###############################################################################
# Helper functions to detect months and build month maps
###############################################################################

def get_month(page_text: str) -> Optional[str]:
    """
    Return the month label found after the phrase 'for month of' in
    ``page_text`` (matched case-insensitively), formatted as '<Month> <YYYY>'
    with the month name capitalised, or None when the phrase is absent.
    """
    found = re.search(r'for month of\s+([A-Za-z]+)\s+(\d{4})', page_text, re.IGNORECASE)
    if found is None:
        return None
    month_name, year = found.groups()
    return f"{month_name.capitalize()} {year}"


###############################################################################
# Extraction for region/field/installed tables
###############################################################################

def extract_region_field_installed(pdf_path: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Run ``pdftotext -layout`` on ``pdf_path`` and collect the region summary,
    field office and installed asset tables from every page.

    Pages are separated on the form feed character.  Each non-blank page is
    checked for a month heading with ``get_month``; a page without one keeps
    the month of the most recent page that had one (None before the first
    heading).  Every non-empty table parsed from a page gets a 'Month' column
    with that value.

    Returns ``(region_df, field_df, installed_df)``, each the row-wise
    concatenation of the per-page tables, or an empty DataFrame when nothing
    was found.  Raises whatever ``subprocess.check_output`` raises if
    ``pdftotext`` is missing or fails.
    """
    raw_text = subprocess.check_output(['pdftotext', '-layout', pdf_path, '-'], text=True)

    collected: Dict[str, List[pd.DataFrame]] = {'region': [], 'field': [], 'installed': []}

    def _keep(bucket: str, frame: pd.DataFrame, month: Optional[str]) -> None:
        # Tag and store a table, ignoring pages where the parser found no rows.
        if frame.empty:
            return
        frame['Month'] = month
        collected[bucket].append(frame)

    def _stack(frames: List[pd.DataFrame]) -> pd.DataFrame:
        return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    summary_titles = (
        'ASSETS BY REGION', 'EGMS BY REGION',
        'ASSETS BY FIELD OFFICE', 'EGMS BY FIELD OFFICE',
    )
    active_month: Optional[str] = None
    for page in raw_text.split('\f'):
        if not page.strip():
            continue
        active_month = get_month(page) or active_month
        page_lines = page.split('\n')
        shouted = page.upper()
        if any(title in shouted for title in summary_titles):
            region_part, field_part = parse_region_and_field_office(page_lines)
            _keep('region', region_part, active_month)
            _keep('field', field_part, active_month)
        if 'INSTALLED ASSETS BY LOCATION' in shouted:
            _keep('installed', parse_installed_assets(page_lines), active_month)

    return _stack(collected['region']), _stack(collected['field']), _stack(collected['installed'])


###############################################################################
# Detailed table parsers (asset details, floor details, site status)
###############################################################################

def detect_month_map(pages: List[str]) -> Dict[int, str]:
    """
    Map 1-indexed page numbers to month strings.

    A page opens a new month when it contains
    'Assets by Region, Service for month of <Month Year>' (case-insensitive)
    for a month that has not been seen on an earlier page.  That page and all
    following pages, up to the page before the next new month, map to it.
    Pages before the first such heading are absent from the result.  The month
    text is used as captured, apart from stripping.
    """
    heading = re.compile(r'Assets by Region, Service\s+for month of\s+([A-Za-z]+\s+\d{4})', re.I)
    known: List[str] = []
    active: Optional[str] = None
    page_to_month: Dict[int, str] = {}
    for page_no, page in enumerate(pages, start=1):
        hit = heading.search(page)
        if hit is not None:
            label = hit.group(1).strip()
            # A heading for an already-seen month does not start a new range.
            if label not in known:
                known.append(label)
                active = label
        if active is not None:
            page_to_month[page_no] = active
    return page_to_month


def parse_asset_line(line: str) -> Optional[Dict[str, str]]:
    """
    Split one whitespace-delimited asset details row into named fields.

    The last six tokens are Type, Acquire, Effective, SerialNum, Age and
    Years_in_Storage.  Of what is left, the first two tokens are Region and
    FONUM.  The first all-digit token from the third position onwards is Loc,
    and the tokens before it form FOSHORT.  The next token made of 4 to 6
    digits is Asset, and the tokens between Loc and Asset form LNAME.  The
    token after Asset is Class and the rest is Desc.

    Returns None when the line has fewer than ten tokens, fewer than five
    tokens before the trailing six, no Loc, no Asset, or nothing after Asset.
    """
    fields = line.split()
    if len(fields) < 10:
        return None
    *head, _type, acquire, eff_date, serial, age, years = fields
    if len(head) < 5:
        return None

    loc_at = next((k for k in range(2, len(head)) if head[k].isdigit()), None)
    if loc_at is None:
        return None

    asset_at = next(
        (k for k in range(loc_at + 1, len(head)) if re.fullmatch(r'\d{4,6}', head[k])),
        None,
    )
    # A Class token must follow the asset number.
    if asset_at is None or asset_at + 1 >= len(head):
        return None

    return {
        'Region': head[0],
        'FONUM': head[1],
        'FOSHORT': ' '.join(head[2:loc_at]),
        'Loc': head[loc_at],
        'LNAME': ' '.join(head[loc_at + 1:asset_at]),
        'Asset': head[asset_at],
        'Class': head[asset_at + 1],
        'Desc': ' '.join(head[asset_at + 2:]),
        'Type': _type,
        'Acquire': acquire,
        'Effective': eff_date,
        'SerialNum': serial,
        'Age': age,
        'Years_in_Storage': years,
    }


def parse_asset_details_page(page_text: str) -> List[Dict[str, str]]:
    """
    Return the asset detail records found on one page.

    Only lines whose stripped text starts with one of the region keywords are
    handed to ``parse_asset_line``; lines it rejects are dropped.
    """
    region_prefixes = ('Europe', 'Korea', 'Japan', 'Okinawa', 'Pacific', 'United')
    candidates = (raw.strip() for raw in page_text.split('\n'))
    attempts = (parse_asset_line(text) for text in candidates if text.startswith(region_prefixes))
    return [record for record in attempts if record]


def parse_floor_line(line: str) -> Optional[Dict[str, str]]:
    """
    Split one whitespace-delimited floor asset details row into named fields.

    Layout, as read by this parser:

    * tokens 0-6: Loc, Place, Region, SVC, Asset, SerialNum, Type
    * tokens 7 up to the first date-like token (d/m/y with slashes): Desc
    * that date and the four tokens after it: Acquire, Effective, Disposed,
      Class, MFG
    * last three tokens: Cat, Year, Age
    * FONUM is the first all-digit token found scanning leftwards from the
      token just before Cat; the tokens between FONUM and Cat form FOSHORT
    * LNAME is whatever lies between MFG and FONUM

    Returns None when the line has fewer than 15 tokens, no all-digit token
    precedes Cat, no date is found from token 7 onwards, or fewer than seven
    tokens follow the date.
    """
    tokens = line.split()
    if len(tokens) < 15:
        return None
    cat, year, age = tokens[-3:]
    cat_idx = len(tokens) - 3

    # Locate FONUM by position.  Looking it up by value afterwards would hit
    # the first token with the same text (for example Loc or Class) and slice
    # LNAME from the wrong place.
    fonum_idx: Optional[int] = None
    for k in range(cat_idx - 1, -1, -1):
        if tokens[k].isdigit():
            fonum_idx = k
            break
    if fonum_idx is None:
        return None
    fonum = tokens[fonum_idx]
    foshort = ' '.join(tokens[fonum_idx + 1:cat_idx])

    # The first date marks the end of the free-text description.
    date_pattern = re.compile(r'\d{1,2}/\d{1,2}/\d{2,4}')
    idx_desc_end: Optional[int] = None
    for i in range(7, len(tokens)):
        if date_pattern.fullmatch(tokens[i]):
            idx_desc_end = i
            break
    if idx_desc_end is None or idx_desc_end + 6 >= len(tokens):
        return None

    return {
        'Loc': tokens[0],
        'Place': tokens[1],
        'Region': tokens[2],
        'SVC': tokens[3],
        'Asset': tokens[4],
        'SerialNum': tokens[5],
        'Type': tokens[6],
        'Desc': ' '.join(tokens[7:idx_desc_end]),
        'Acquire': tokens[idx_desc_end],
        'Effective': tokens[idx_desc_end + 1],
        'Disposed': tokens[idx_desc_end + 2],
        'Class': tokens[idx_desc_end + 3],
        'MFG': tokens[idx_desc_end + 4],
        'LNAME': ' '.join(tokens[idx_desc_end + 5:fonum_idx]),
        'FONUM': fonum,
        'FOSHORT': foshort,
        'Cat': cat,
        'Year': year,
        'Age': age,
    }


def parse_floor_details_page(page_text: str) -> List[Dict[str, str]]:
    """
    Return the floor asset records found on one page.

    A line is a candidate when its first non-space character is a digit and
    the text 'Floor' occurs anywhere in it.  Candidates are stripped and passed
    to ``parse_floor_line``; lines it rejects are dropped.
    """
    starts_with_number = re.compile(r'\s*\d')
    candidates = [
        raw.strip()
        for raw in page_text.split('\n')
        if 'Floor' in raw and starts_with_number.match(raw)
    ]
    return [record for record in map(parse_floor_line, candidates) if record]


def parse_site_line(row: str) -> Optional[Dict[str, str]]:
    """
    Split one (already re-joined) site operational status row into fields.

    The first token must be all digits and becomes Loc.  The first date-like
    token (d/m/y with slashes) is Open; the token just before it is Place and
    the tokens between Loc and Place form LNAME.  The three tokens after Open
    are Closed, KSI and CmtyNum.  From there the first token that is one of
    'Army', 'Navy', 'Air', 'Marine' or 'Corps' marks the service, with
    'Marine' immediately followed by 'Corps' read as 'Marine Corps'.  Tokens
    before the service form Cmty, the token after it is FONUM and everything
    else is FOSHORT.

    Returns None when any of these anchors is missing.
    """
    parts = row.split()
    if not parts or not parts[0].isdigit():
        return None

    is_date = re.compile(r'\d{1,2}/\d{1,2}/\d{2,4}').fullmatch
    open_at = next((k for k in range(1, len(parts)) if is_date(parts[k])), None)
    # Need at least Loc and Place ahead of the open date ...
    if open_at is None or open_at < 2:
        return None
    # ... and Closed, KSI and CmtyNum after it.
    if open_at + 3 >= len(parts):
        return None
    open_date, closed_date, ksi, cmty_num = parts[open_at:open_at + 4]

    service_words = {'Army', 'Navy', 'Air', 'Marine', 'Corps'}
    svc_at = next((k for k in range(open_at + 4, len(parts)) if parts[k] in service_words), None)
    if svc_at is None:
        return None

    two_word_service = parts[svc_at:svc_at + 2] == ['Marine', 'Corps']
    svc = 'Marine Corps' if two_word_service else parts[svc_at]
    fonum_at = svc_at + (2 if two_word_service else 1)
    if fonum_at >= len(parts):
        return None

    return {
        'Loc': parts[0],
        'LNAME': ' '.join(parts[1:open_at - 1]),
        'Place': parts[open_at - 1],
        'Open': open_date,
        'Closed': closed_date,
        'KSI': ksi,
        'CmtyNum': cmty_num,
        'Cmty': ' '.join(parts[open_at + 4:svc_at]),
        'SVC': svc,
        'FONUM': parts[fonum_at],
        'FOSHORT': ' '.join(parts[fonum_at + 1:]),
    }


def parse_site_status_page(page_text: str) -> List[Dict[str, str]]:
    """
    Return the site operational status records found on one page.

    Physical lines are first merged into logical rows: a line whose first
    non-space character is a digit opens a new row, and every following
    non-blank line is appended to it (separated by one space) until the next
    such line.  Text before the first numbered line is ignored.  Each merged
    row goes through ``parse_site_line``; rows it rejects are dropped.
    """
    groups: List[List[str]] = []
    for raw in page_text.split('\n'):
        piece = raw.strip()
        if not piece:
            continue
        if re.match(r'\s*\d', raw):
            groups.append([piece])
        elif groups:
            groups[-1].append(piece)
    merged_rows = [' '.join(group) for group in groups]
    return [record for record in map(parse_site_line, merged_rows) if record]


###############################################################################
# Extraction for detailed tables (asset, floor, site) and years in storage
###############################################################################

# Alternation of the region keywords recognised in a FOSHORT string.  It is
# interpolated into _FOSHORT_HEAD below; because it carries its own
# parentheses, the named group 'Region' there contains one extra unnamed group.
_REGION_WORDS = r"(Europe|Korea|Japan|Okinawa|Pacific|United)"

# Head of a FOSHORT string: <city> <locnum> <Region> <regcode> <service> <after>.
# Compiled with re.VERBOSE (so the whitespace and '#' notes inside the pattern
# are ignored) and re.IGNORECASE (so the upper-case character classes also
# accept lower-case input).
# Service can be ANY single letter + digit (E0/K0/J0…)
_FOSHORT_HEAD = re.compile(
    rf"""
    ^\s*
    (?P<city>[A-Z0-9'&./\- ]+?)\s+     # KAISERSLAUTERN / STUTTGART / UIJEONGBU ...
    (?P<locnum>\d+)\s+                 # 2 / 4 / 5 ...
    (?P<Region>{_REGION_WORDS})\s+     # Region
    (?P<regcode>[A-Z]{{2}}\d{{2}})\s+  # KA02 / ST03 / UJ01 ...
    (?P<service>[A-Z]\d)\s+            # E0 / K0 / J0
    (?P<after>.+?)\s*$                 # remainder
    """,
    re.IGNORECASE | re.VERBOSE
)

# Module-level accumulator: _parse_foshort_row appends every cleaned FOSHORT
# string whose head or tail it could not parse.  It is never cleared here, so
# entries persist across calls within one interpreter session.
UNPARSED_FOSHORT: List[str] = []
# Token-level matchers used by _split_tail (all case-insensitive):
_RPTGRP_CODE = re.compile(r"^[A-Z]/C$", re.IGNORECASE)     # B/C, C/C, E/C...
_BOOL_RX     = re.compile(r"^(?:TRUE|FALSE)$", re.IGNORECASE)  # trailing flag token
_OPEN_RX     = re.compile(r"^(?:OPEN|CLOSED|~CLOSED)$", re.IGNORECASE)  # status token after Split%

def _fallback_head_tokens(s: str):
    """
    Fallback when _FOSHORT_HEAD doesn't match.
    We look for: ... <locnum> <Region> <regcode> <service> <after...>
    and derive <city> from the tokens before <locnum>.
    """
    toks = s.split()
    if len(toks) < 5:
        return None
    REGIONS = {"EUROPE","KOREA","JAPAN","OKINAWA","PACIFIC","UNITED"}
    ridxs = [i for i,t in enumerate(toks) if t.upper() in REGIONS]
    if not ridxs:
        return None
    for ri in ridxs:
        if ri == 0 or ri+3 >= len(toks):
            continue
        if not re.fullmatch(r"\d+", toks[ri-1]):
            continue
        locnum  = toks[ri-1]
        regcode = toks[ri+1] if ri+1 < len(toks) else None
        # accept e.g. KA02 or a mild OCR like KAO2 (we'll normalize O→0 between digits)
        if not regcode or not re.fullmatch(r"[A-Za-z]{2}[0-9O][0-9]", regcode):
            continue
        service = toks[ri+2] if ri+2 < len(toks) else None
        if not service or not re.fullmatch(r"[A-Za-z]\d", service, flags=re.I):
            continue
        # normalize regcode OCR: letter-letter-(O|0)-digit  -> letter-letter-0-digit
        regcode = re.sub(r"^([A-Za-z]{2})O(\d)$", r"\g<1>0\g<2>", regcode)
        city = " ".join(toks[:ri-1]).strip()
        after = " ".join(toks[ri+3:]).strip()
        if city and after:
            return {
                "city": city, "locnum": locnum,
                "Region": toks[ri], "regcode": regcode,
                "service": service, "after": after
            }
    return None

def _split_tail(after: str):
    """
    Break up the part of FOSHORT that follows <Region> <REGCODE> <SERVICE>.

    Two layouts are handled:

    OLD (pre–May 2021):
      [MESSAGE…] <Banker> <Shortname…> [RptGrpCode?] <Split%> <TVLREST> <COVID19>

    NEW (May 2021+):
      [MESSAGE…] <Banker> <Shortname…> <Split%> <CMMTY words…>

    The last token ending in '%' is Split.  What follows it is read as
    TVLREST/COVID19 when it starts with an open/closed status and ends with
    TRUE/FALSE, and as CMMTY otherwise.  A report-group code such as 'B/C'
    directly before Split is peeled off.  In what remains, the banker is the
    first capitalised word (not a status word or month abbreviation) that is
    followed by an upper-case/code-like token; failing that, the last purely
    alphabetic token.  Everything before the banker is MESSAGE and everything
    after it is the shortname.

    Returns a dict with keys MESSAGE, banker, shortname, grp_code, Split,
    TVLREST, COVID19, CMMTY, or None when no Split token or banker/shortname
    pair can be found.
    """
    # Replace typographic quotes and dashes with their ASCII forms.
    for fancy, plain in (("’", "'"), ("‘", "'"), ("–", "-"), ("—", "-")):
        after = after.replace(fancy, plain)
    # Make sure '%' is separated from a following Open/Closed status.
    after = re.sub(r"%\s*(Open|Closed|~Closed)\b", r"% \1", after, flags=re.I)
    # Brackets become spaces, then runs of whitespace collapse.
    after = re.sub(r"[\[\](){}]", " ", after)
    after = re.sub(r"\s+", " ", after).strip()

    words = after.split()
    if len(words) < 2:
        return None

    percent_positions = [k for k, w in enumerate(words) if w.endswith("%")]
    if not percent_positions:
        return None
    split_at = percent_positions[-1]

    # ---- everything to the right of Split ----
    tvl = None
    covid = None
    cmty = None
    trailing = words[split_at + 1:]
    if trailing:
        old_layout = (
            len(trailing) >= 2
            and _OPEN_RX.match(trailing[0])
            and _BOOL_RX.match(trailing[-1])
        )
        if old_layout:
            tvl = trailing[0].title()
            covid = trailing[-1].upper()
        else:
            cmty = " ".join(trailing).strip() or None

    # ---- optional report-group code just left of Split ----
    grp_code = None
    head_end = split_at
    if split_at >= 1 and _RPTGRP_CODE.match(words[split_at - 1]):
        grp_code = words[split_at - 1]
        head_end = split_at - 1

    # ---- [MESSAGE … Banker Shortname…] ----
    head = words[:head_end]
    if len(head) < 2:
        return None

    not_a_banker = {
        "Open", "Closed", "~Closed", "Clsd", "SMS",
        "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Sept", "Oct", "Nov", "Dec",
        "Jan.", "Feb.", "Mar.", "Apr.", "Jun.", "Jul.", "Aug.", "Sep.", "Oct.", "Nov.", "Dec.",
    }

    def _capitalised(tok):   # Dave, Reno, Rick…
        return re.fullmatch(r"[A-Z][a-z]+(?:'[A-Za-z]+)?", tok) is not None

    def _code_like(tok):     # KAZABRA, PATCH, K-16, ACE'S, HILLTOP, G/C …
        return re.fullmatch(r"[A-Z0-9'&/()\-]+", tok) is not None

    banker_at = next(
        (
            k for k, tok in enumerate(head)
            if _capitalised(tok)
            and tok not in not_a_banker
            and _code_like(head[k + 1] if k + 1 < len(head) else "")
        ),
        None,
    )
    if banker_at is None:
        # No name/code pair: fall back to the right-most alphabetic token.
        banker_at = next(
            (k for k in range(len(head) - 1, -1, -1) if re.fullmatch(r"[A-Za-z']+", head[k])),
            None,
        )
    # The banker must leave at least one token for the shortname.
    if banker_at is None or banker_at == len(head) - 1:
        return None

    return {
        "MESSAGE":   " ".join(head[:banker_at]).strip() or None,
        "banker":    head[banker_at],
        "shortname": " ".join(head[banker_at + 1:]).strip() or None,
        "grp_code":  grp_code,
        "Split":     words[split_at].upper(),
        "TVLREST":   tvl,
        "COVID19":   covid,
        "CMMTY":     cmty,
    }

def _parse_foshort_row(s: str):
    out = {
        "city": None, "locnum": None, "Region": None,
        "regcode": None, "service": None,
        "MESSAGE": None, "banker": None, "shortname": None,
        "Split": None, "TVLREST": None, "COVID19": None, "CMMTY": None
    }
    if not isinstance(s, str) or not s.strip():
        return out

    # CLEANUP (note: we REMOVED the bad "([A-Z]{2})0(\d{2}) -> \1O\2" mutation)
    s = (s.replace("’","'").replace("‘","'")
           .replace("–","-").replace("—","-")
           .replace("|"," ").replace("[","").replace("]","").replace(";"," ")
           .strip())
    s = re.sub(r"\s+", " ", s)

    m = _FOSHORT_HEAD.match(s)
    if not m:
        f = _fallback_head_tokens(s)
        if not f:
            UNPARSED_FOSHORT.append(s)
            return out
        city, locnum = f["city"], f["locnum"]
        region, regcode, service = f["Region"], f["regcode"], f["service"]
        after = f["after"]
    else:
        city     = m.group("city").strip()
        locnum   = m.group("locnum")
        region   = m.group("Region")
        regcode  = m.group("regcode")
        service  = m.group("service")
        after    = m.group("after")

    out["city"]    = city
    out["locnum"]  = locnum
    out["Region"]  = region.title()
    out["regcode"] = regcode.upper()
    out["service"] = service.upper()

    tail = _split_tail(after)
    if tail:
        out.update(tail)
    else:
        UNPARSED_FOSHORT.append(s)

    return out

def _merge_parsed_foshort(site_df: pd.DataFrame) -> pd.DataFrame:
    """
    Split the raw FOSHORT column of *site_df* into its component fields and
    merge them into the frame.

    Each FOSHORT value is run through ``_parse_foshort_row``.  A boolean
    ``ParseOK`` column records whether any field was recovered, blank strings
    are normalised to ``pd.NA``, FOSHORT is overwritten with the upper-cased
    parsed city, and every other parsed field only fills gaps in an existing
    column of the same name (``combine_first``).  Columns are then reordered
    so that the expected layout comes first.  Returns the new frame; the
    caller must have checked that *site_df* is non-empty and has FOSHORT.
    """
    parsed = site_df["FOSHORT"].apply(_parse_foshort_row).apply(pd.Series)

    # A flag so you can see which rows parsed successfully
    site_df["ParseOK"] = ~parsed.isna().all(axis=1)

    # Normalize empties to NA aggressively.  DataFrame.applymap is deprecated
    # since pandas 2.1 in favour of DataFrame.map, so prefer the new name and
    # only fall back to the old one on pandas versions that lack it.
    elementwise = getattr(pd.DataFrame, "map", None) or pd.DataFrame.applymap
    site_df = elementwise(
        site_df.replace(r"^\s*$", pd.NA, regex=True),
        lambda v: pd.NA if isinstance(v, str) and not v.strip() else v,
    )

    # Case helpers that map anything but a non-blank string to NA
    def _up(v):
        return v.upper() if isinstance(v, str) and v.strip() else pd.NA

    def _title(v):
        return v.title() if isinstance(v, str) and v.strip() else pd.NA

    def _parsed(name):
        # Missing parsed columns degrade to an empty object Series
        return parsed.get(name, pd.Series(dtype=object))

    def _col(df, name, dtype="object"):
        # Existing column if present, otherwise an all-NA column of *dtype*
        if name in df.columns:
            return df[name].reindex(df.index)
        return pd.Series(pd.NA, index=df.index, dtype=dtype)

    p_locnum = pd.to_numeric(_parsed("locnum"), errors="coerce").astype("Int64")
    p_tvl = _parsed("TVLREST").apply(
        lambda v: v.title() if isinstance(v, str) else pd.NA
    )

    # overwrite/merge (assignment order fixes the order of any new columns)
    site_df["FOSHORT"]    = _parsed("city").apply(_up)
    site_df["REGNUM"]     = _col(site_df, "REGNUM", "Int64").combine_first(p_locnum)
    site_df["Region"]     = _col(site_df, "Region").combine_first(_parsed("Region"))
    site_df["SMSSection"] = _col(site_df, "SMSSection").combine_first(_parsed("regcode").apply(_up))
    site_df["SMSBank"]    = _col(site_df, "SMSBank").combine_first(_parsed("service").apply(_up))
    site_df["MESSAGE"]    = _col(site_df, "MESSAGE").combine_first(_parsed("MESSAGE"))
    site_df["RptGrp"]     = _col(site_df, "RptGrp").combine_first(_parsed("banker").apply(_title))
    site_df["Shortname"]  = _col(site_df, "Shortname").combine_first(_parsed("shortname"))
    site_df["Split"]      = _col(site_df, "Split").combine_first(_parsed("Split").apply(_up))
    site_df["COVID19"]    = _col(site_df, "COVID19").combine_first(_parsed("COVID19").apply(_up))
    site_df["TVLREST"]    = _col(site_df, "TVLREST").combine_first(p_tvl)
    site_df["CMMTY"]      = _col(site_df, "CMMTY").combine_first(_parsed("CMMTY"))

    # ---- Handle layout shift in May 2021 (CMMTY replaces COVID19/TVLREST) ----
    cols_needed = [
        "Loc", "LNAME", "Open", "Closed", "KSI", "CmtyNum", "SVC", "FONUM",
        "FOSHORT", "REGNUM", "Region", "SMSSection", "SMSBank",
        "MESSAGE", "RptGrp", "Shortname", "Split",
    ]
    has_may_2021 = (
        site_df["Month"].astype(str)
        .str.contains(r"May\s*2021", case=False, na=False)
        .any()
    )
    cols_needed += ["CMMTY"] if has_may_2021 else ["COVID19", "TVLREST"]

    # Not needed in final output
    site_df.drop(columns=["PLACE"], errors="ignore", inplace=True)

    # Guarantee all columns exist, then put them first in the agreed order
    for c in cols_needed:
        if c not in site_df.columns:
            site_df[c] = pd.NA
    site_df = site_df[cols_needed + [c for c in site_df.columns if c not in cols_needed]]

    # Fallbacks from original site parser if present
    if "Cmty" in site_df.columns:
        site_df["CMMTY"] = site_df["CMMTY"].combine_first(site_df["Cmty"])
    if "CmtyNum" in site_df.columns:
        site_df["CmtyNum"] = pd.to_numeric(site_df["CmtyNum"], errors="coerce").astype("Int64")

    return site_df


def _years_in_storage_pivot(asset_df: pd.DataFrame) -> pd.DataFrame:
    """
    Count assets per Month and Age, with one ``Storage_<n>`` column per
    whole-year storage bucket.

    Adds the helper columns Age_int, Years_in_Storage_float and
    Years_in_Storage_int to *asset_df* in place (they are part of the asset
    details output).  Only rows with a numeric Age of at most 24 are counted;
    a missing Years_in_Storage is bucketed as 0.  Returns an empty frame when
    there is nothing to count.
    """
    if asset_df.empty:
        return pd.DataFrame()

    asset_df['Age_int'] = pd.to_numeric(asset_df['Age'], errors='coerce').astype('Int64')
    asset_df['Years_in_Storage_float'] = pd.to_numeric(asset_df['Years_in_Storage'], errors='coerce')
    asset_df['Years_in_Storage_int'] = asset_df['Years_in_Storage_float'].fillna(0).astype(int)

    eligible = asset_df[(asset_df['Age_int'].notna()) & (asset_df['Age_int'] <= 24)]
    monthly: List[pd.DataFrame] = []
    for month, group in eligible.groupby('Month'):
        pt = pd.pivot_table(
            group,
            index='Age_int',
            columns='Years_in_Storage_int',
            values='Asset',
            aggfunc='count',
            fill_value=0
        )
        pt = pt.reset_index().rename(columns={'Age_int': 'Age'})
        pt.columns = ['Age'] + [f'Storage_{col}' for col in pt.columns[1:]]
        pt.insert(0, 'Month', month)
        monthly.append(pt)

    if not monthly:
        return pd.DataFrame()

    combined = pd.concat(monthly, ignore_index=True)

    # Months rarely share the same set of storage buckets, so the concat
    # leaves NaN (and float dtype) wherever a month lacks a bucket.  Those
    # cells are genuine zero counts: fill them, restore integer dtype and
    # order the buckets numerically rather than by first appearance.
    storage_cols = sorted(
        (c for c in combined.columns if c.startswith('Storage_')),
        key=lambda c: int(c.split('_', 1)[1]),
    )
    if storage_cols:
        combined[storage_cols] = combined[storage_cols].fillna(0).astype(int)
    return combined[['Month', 'Age'] + storage_cols]


def extract_detailed_tables(pdf_path: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Extract asset details, floor asset details, site operational status and a
    derived years-in-storage pivot from the PDF.

    The PDF is converted to text with ``pdftotext -layout`` and split into
    pages on form feeds.  ``detect_month_map`` assigns 1-based page numbers to
    months; pages without a month are skipped.  Every remaining page is fed
    to the asset, floor and site parsers, and each record is tagged with its
    Month.  Exact duplicate rows are dropped from all three tables.

    Returns four DataFrames: asset details (including the helper columns
    added for the pivot), floor details, site status (with FOSHORT split into
    fields when that column exists) and the years-in-storage pivot.

    Raises whatever ``subprocess.check_output`` raises when ``pdftotext`` is
    missing or fails.
    """
    text = subprocess.check_output(['pdftotext', '-layout', pdf_path, '-']).decode('utf-8')
    pages = text.split('\f')
    month_map = detect_month_map(pages)

    asset_data: List[Dict[str, str]] = []
    floor_data: List[Dict[str, str]] = []
    site_data: List[Dict[str, str]] = []
    page_parsers = (
        (parse_asset_details_page, asset_data),
        (parse_floor_details_page, floor_data),
        (parse_site_status_page, site_data),
    )
    for p_num, page_text in enumerate(pages, start=1):
        month = month_map.get(p_num)
        if not month:
            continue
        for parser, sink in page_parsers:
            for rec in parser(page_text):
                rec['Month'] = month
                sink.append(rec)

    asset_df = pd.DataFrame(asset_data).drop_duplicates()
    floor_df = pd.DataFrame(floor_data).drop_duplicates()
    site_df = pd.DataFrame(site_data).drop_duplicates()

    if not site_df.empty and "FOSHORT" in site_df.columns:
        site_df = _merge_parsed_foshort(site_df)

    # Derive years in storage pivot: bucket Years_in_Storage to integers and count by age
    years_storage_df = _years_in_storage_pivot(asset_df)
    return asset_df, floor_df, site_df, years_storage_df


###############################################################################
# Main entry point
###############################################################################

def run_all(pdf_path: str,
            outdir: str = 'csv_output',
            excel_path: str = None,
            also_write_csvs: bool = True) -> None:
    """
    Run every extractor against *pdf_path* and persist the non-empty tables.

    When *also_write_csvs* is true, each non-empty table is written as its
    own CSV file inside *outdir* (created if needed).  Independently of that
    flag, all non-empty tables are written to one Excel workbook, one sheet
    per table; *excel_path* defaults to ``FY2021_Asset_Report.xlsx`` inside
    *outdir*.  A row-count summary is printed at the end.
    """
    region_df, field_df, installed_df = extract_region_field_installed(pdf_path)
    asset_df, floor_df, site_df, years_storage_df = extract_detailed_tables(pdf_path)

    # One registry drives both outputs: (sheet name, CSV file name, frame).
    # Sheet names must be <= 31 chars and unique.
    registry = [
        ("Region_Service", 'assets_by_region_service.csv', region_df),
        ("Field_Office", 'assets_by_field_office.csv', field_df),
        ("Installed_Assets", 'installed_assets_location_manufacture.csv', installed_df),
        ("Asset_Details", 'asset_details.csv', asset_df),
        ("Floor_Details", 'floor_asset_details.csv', floor_df),
        ("Site_Status", 'site_operational_status.csv', site_df),
        ("Years_Storage", 'years_in_storage.csv', years_storage_df),
    ]
    # Empty frames are never written anywhere
    populated = [entry for entry in registry if not entry[2].empty]

    # ---- CSVs (optional) ----
    if also_write_csvs:
        os.makedirs(outdir, exist_ok=True)
        for _, csv_name, frame in populated:
            frame.to_csv(os.path.join(outdir, csv_name), index=False)

    # ---- Single Excel workbook ----
    if excel_path is None:
        excel_path = os.path.join(outdir, 'FY2021_Asset_Report.xlsx')

    # The workbook's folder is created even if nothing ends up being written
    Path(excel_path).parent.mkdir(parents=True, exist_ok=True)

    if not populated:
        print('No tables to write to Excel (all DataFrames were empty).')
    else:
        # Defensive: truncate to 31 chars and suffix _1, _2, ... on collision
        taken = set()
        named_frames = []
        for sheet, _, frame in populated:
            stem = sheet[:31]
            candidate = stem
            attempt = 1
            while candidate in taken:
                tag = f"_{attempt}"
                candidate = stem[:31 - len(tag)] + tag
                attempt += 1
            taken.add(candidate)
            named_frames.append((candidate, frame))

        with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
            for sheet_name, frame in named_frames:
                frame.to_excel(writer, sheet_name=sheet_name, index=False)

        print(f'Excel workbook written to: {excel_path}')

    # Row counts for every table, including the empty ones
    print('Extraction complete.')
    print(f'Region summary rows: {len(region_df)}')
    print(f'Field office rows: {len(field_df)}')
    print(f'Installed assets rows: {len(installed_df)}')
    print(f'Asset details rows: {len(asset_df)}')
    print(f'Floor asset details rows: {len(floor_df)}')
    print(f'Site operational rows: {len(site_df)}')
    print(f'Years in storage rows: {len(years_storage_df)}')

# Script entry point: adjust the three paths below to the local layout.
if __name__ == '__main__':
    # Source PDF to parse
    pdf_path = r"C:\Users\dawid\Desktop\MUCKROCK\data\FY2021 Asset Report.pdf"
    # Folder that receives the per-table CSV files
    output_dir = r"C:\Users\dawid\Desktop\MUCKROCK\database\FY2021_CSVs\csv_output"
    # Single workbook holding one sheet per table
    excel_out = r"C:\Users\dawid\Desktop\MUCKROCK\database\FY2021_CSVs\FY2021_Asset_Report.xlsx"

    if not os.path.exists(pdf_path):
        print(f'PDF file not found: {pdf_path}')
    else:
        run_all(pdf_path, outdir=output_dir, excel_path=excel_out, also_write_csvs=True)


  installed_df = pd.concat(installed_frames, ignore_index=True) if installed_frames else pd.DataFrame()
  .applymap(lambda v: pd.NA if isinstance(v, str) and not v.strip() else v))


Excel workbook written to: C:\Users\dawid\Desktop\MUCKROCK\database\FY2021_CSVs\FY2021_Asset_Report.xlsx
Extraction complete.
Region summary rows: 68
Field office rows: 424
Installed assets rows: 1066
Asset details rows: 4383
Floor asset details rows: 22940
Site operational rows: 925
Years in storage rows: 182
