In [1]:
!pip install pdfplumber tabula-py

Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tabula-py
  Downloading tabula_py-2.10.0-py3-none-any.whl.metadata (7.6 kB)
Collecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading 

In [2]:
import gdown
import pickle
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Source PDFs on Google Drive as (share URL, local filename) pairs.
DOWNLOADS = [
    ('https://drive.google.com/uc?export=download&id=1jKoXzzd5bqpcJeT_TtchmoGza3WRY8Xc',
     'Marine_Revenue_FY20-FY24.pdf'),
    ('https://drive.google.com/uc?export=download&id=1QtmW3UhIMXDLQoI3wI7zH88SACFurjcR',
     'FY2023_Asset_Report.pdf'),
]

# `url` and `output` stay module-level names (holding the last pair) exactly as
# before, in case a later cell reads them.
for url, output in DOWNLOADS:
    saved_as = gdown.download(url, output, quiet=False)
    # NOTE(review): some gdown releases report a failed fetch by returning None
    # rather than raising -- confirm against the installed version. Failing here
    # is clearer than a PDF parse error several cells later.
    if saved_as is None:
        raise RuntimeError(f"Download failed for {output}: {url}")


Downloading...
From: https://drive.google.com/uc?export=download&id=1jKoXzzd5bqpcJeT_TtchmoGza3WRY8Xc
To: /content/Marine_Revenue_FY20-FY24.pdf
100%|██████████| 1.64M/1.64M [00:00<00:00, 19.9MB/s]
Downloading...
From: https://drive.google.com/uc?export=download&id=1QtmW3UhIMXDLQoI3wI7zH88SACFurjcR
To: /content/FY2023_Asset_Report.pdf
100%|██████████| 13.5M/13.5M [00:00<00:00, 88.6MB/s]


'FY2023_Asset_Report.pdf'

In [4]:
import re
import numpy as np
import pandas as pd
import pdfplumber
from pathlib import Path

# ---- Paths ----
# Input report and the CSV that main() writes into the same directory.
PDF = Path("/content/Marine_Revenue_FY20-FY24.pdf")
OUT = PDF.with_name("Marine_Revenue_FY20-FY24__p1to120_skipped_stable.csv")

# ---- Columns and Extraction Parameters ----
# Output column names, in left-to-right table order.
COLS = [
    "Loc #",
    "Location",
    "Month",
    "Revenue",
    "NAFI Amt",
    "Annual Revenue",
    "Annual NAFI",
]

# Fallback vertical cut lines: len(COLS) + 1 x-positions, with +/-1e9 sentinels
# so the first and last columns are open-ended.
BASE_CUTS = [
    -1e9,
    156.21, 214.20, 269.19, 327.93, 393.67, 460.17,
    1e9,
]
# Maximum distance each detected cut may move away from its BASE_CUTS entry
# (index-aligned with BASE_CUTS; the sentinels never move).
CLAMP_DELTA = [
    0,
    30, 40, 40, 30, 30, 30,
    0,
]

# ---- Layout/Aggregation Parameters ----
Y_TOL = 3.0       # max vertical distance for text to count as the same line
X_JOIN = 3.0      # max horizontal gap between characters of one word
GAP_RATIO = 0.8   # fraction of the median token width below which tokens join without a space
DROP_MIN = 2      # rows with fewer non-empty cells than this are discarded
EDGE_PAD = 2.0    # slack added on both sides of a cut when binning a token

# Leftward adjustments applied to the left boundary of the Month / Revenue columns.
LEFT_SHIFT_MONTH, LEFT_SHIFT_REVENUE = 18, 8

# ---- Regex ----
# "Sep-21", "Sep/21" or "Sep21" (month abbreviation first, case-insensitive).
MONTH_FULL = re.compile(r"^(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[-/]?\d{2}$", re.I)
# "21-Sep" or "21/Sep" (two-digit year first).
MONTH_REV  = re.compile(r"^\d{2}[-/](?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)$", re.I)
# A bare month abbreviation with no year.
MONTH_ONLY = re.compile(r"^(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)$", re.I)
# Amount with comma-grouped thousands, optional decimals, optional leading "-"
# and/or surrounding parentheses. Ungrouped "1234" does NOT match.
NUM_RE     = re.compile(r"^-?\(?\d{1,3}(?:,\d{3})*(?:\.\d+)?\)?$")
# One or more digits only.
INT_RE     = re.compile(r"^\d+$")
# Year suffixes that end up in the Revenue cell when "Sep-21" is split at the
# Month/Revenue cut ("Sep" | "-21"). The previous set stopped at -20 although
# the report covers FY20-FY24, so fragments -21..-24 were never re-attached.
# The earlier values are kept for backward compatibility.
YEAR_FRAG  = {-yy for yy in range(17, 25)}
# Same pattern as NUM_RE; kept as a separate name for existing callers.
NUM_RE_STRICT = re.compile(r"^-?\(?\d{1,3}(?:,\d{3})*(?:\.\d+)?\)?$")

# ---- Header Patterns ----
# One case-insensitive, whole-token pattern per column header. Insertion order
# is the left-to-right column order, so HEADER_ORDER is derived from it.
HEADER_PATTERNS = {
    name: re.compile(source, re.I)
    for name, source in (
        ("Loc #", r"^Loc\s*#?$"),
        ("Location", r"^Location$"),
        ("Month", r"^Month$"),
        ("Revenue", r"^Revenue$"),
        ("NAFI Amt", r"^NAFI\s*Amt$"),
        ("Annual Revenue", r"^Annual\s+Revenue$"),
        ("Annual NAFI", r"^Annual\s+NAFI$"),
    )
}
HEADER_ORDER = list(HEADER_PATTERNS)

# ==== Header/Noise Keywords (to exclude non-data row tokens) ====
# Report titles and the region label, followed by every column header pattern.
EXCLUDE_LINE_PATTERNS = [
    *(
        re.compile(source, re.I)
        for source in (
            r"^ARMP\s+Marine\s+Slot\s+Report$",
            r"^Monthly\s+Summary\s+by\s+Location$",
            r"^Slot\s+Revenue$",
            r"^NAFI\s+Reimbursement\s+from\s+ARMP$",
            r"^Region$",
        )
    ),
    *HEADER_PATTERNS.values(),
]

def _is_exclude_token(t):
    """Return True when the token's stripped text matches any EXCLUDE_LINE_PATTERNS entry.

    `t` is a word dict; only its "text" key is read.
    """
    candidate = t["text"].strip()
    for pattern in EXCLUDE_LINE_PATTERNS:
        if pattern.match(candidate):
            return True
    return False

# ---------------- Basic Utilities ----------------
def chars_to_words(page, x_tol=X_JOIN, y_tol=Y_TOL):
    """Rebuild word tokens from the individual characters in `page.chars`.

    Characters are grouped into lines (a character joins the current line
    unless its "top" is more than `y_tol` away from the top of the line's first
    character), then each line is cut into words wherever the gap between one
    character's "x1" and the next one's "x0" exceeds `x_tol`.

    Returns a list of dicts with keys "text", "x0", "x1" and "top"; "top" is
    the top of the first character of the line the word belongs to.
    """
    def as_word(run, line_top):
        return {
            "text": "".join(g["text"] for g in run),
            "x0": run[0]["x0"],
            "x1": run[-1]["x1"],
            "top": line_top,
        }

    glyphs = sorted(page.chars, key=lambda c: (round(c["top"], 1), c["x0"]))

    # Pass 1: bands of characters sharing a baseline, as (anchor top, members).
    bands = []
    for glyph in glyphs:
        if not bands or abs(glyph["top"] - bands[-1][0]) > y_tol:
            bands.append((glyph["top"], [glyph]))
        else:
            bands[-1][1].append(glyph)

    # Pass 2: split every band into runs of horizontally adjacent characters.
    words = []
    for line_top, members in bands:
        ordered = sorted(members, key=lambda c: c["x0"])
        run = [ordered[0]]
        for glyph in ordered[1:]:
            if glyph["x0"] - run[-1]["x1"] <= x_tol:
                run.append(glyph)
            else:
                words.append(as_word(run, line_top))
                run = [glyph]
        words.append(as_word(run, line_top))
    return words

def smart_join(tokens):
    """Concatenate word tokens left to right, inserting spaces only at wide gaps.

    The gap threshold is GAP_RATIO times the median token width (1.0 is used in
    place of a zero median). Neighbours whose gap is at most the threshold are
    glued together; wider gaps get a single space. Returns "" for no tokens;
    the result is stripped.
    """
    if not tokens:
        return ""
    ordered = sorted(tokens, key=lambda w: w["x0"])
    median_width = np.median([tok["x1"] - tok["x0"] for tok in ordered])
    threshold = (median_width or 1.0) * GAP_RATIO

    pieces = [ordered[0]["text"]]
    for left, right in zip(ordered, ordered[1:]):
        separator = "" if (right["x0"] - left["x1"]) <= threshold else " "
        pieces.append(separator + right["text"])
    return "".join(pieces).strip()

def assign_bin(xmid, cuts):
    """Return the index of the first column whose padded span contains `xmid`.

    Column i spans cuts[i]..cuts[i + 1], widened by EDGE_PAD on both sides, so
    adjacent columns overlap slightly and the leftmost match wins. Returns None
    when no column contains `xmid`.
    """
    spans = zip(cuts, cuts[1:])
    return next(
        (i for i, (lo, hi) in enumerate(spans) if lo - EDGE_PAD <= xmid <= hi + EDGE_PAD),
        None,
    )

def header_y_of(words):
    """Return the smallest "top" among words whose text is exactly
    "Location", "Month" or "Revenue"; 0 when there is no such word."""
    header_labels = {"Location", "Month", "Revenue"}
    return min((w["top"] for w in words if w["text"] in header_labels), default=0)

def group_by_y(words, y_tol=Y_TOL):
    """Group word tokens into lines by vertical position.

    Words are visited in (rounded top, x0) order; a word opens a new line when
    its "top" is more than `y_tol` away from the "top" of the current line's
    first word. Returns a list of {"y": <first word's top>, "tokens": [...]}.
    """
    ordered = sorted(words, key=lambda w: (round(w["top"], 1), w["x0"]))
    lines = []
    for word in ordered:
        opens_new_line = not lines or abs(word["top"] - lines[-1]["y"]) > y_tol
        if opens_new_line:
            lines.append({"y": word["top"], "tokens": []})
        lines[-1]["tokens"].append(word)
    return lines

# ---------------- Infer Cut Lines from Header ----------------
def detect_header_line(words):
    """Find the text line that looks most like the table header.

    A line qualifies when at least three distinct HEADER_PATTERNS match one of
    its tokens. Among qualifying lines the one with the most matches wins, ties
    going to the larger y. Returns (y, tokens), or (0, None) if none qualifies.
    """
    scored = []
    for line in group_by_y(words):
        labels = [tok["text"].strip() for tok in line["tokens"]]
        matched = 0
        for pattern in HEADER_PATTERNS.values():
            if any(pattern.match(label) for label in labels):
                matched += 1
        if matched >= 3:
            scored.append((matched, line["y"], line["tokens"]))

    if not scored:
        return 0, None
    # Rank on (match count, y) only; the token lists are never compared.
    _, best_y, best_tokens = sorted(scored, key=lambda entry: (entry[0], entry[1]))[-1]
    return best_y, best_tokens

def cuts_from_header(words, base_cuts, prev_cuts=None):
    """Derive this page's column cut lines from the position of its header labels.

    Args:
        words: word tokens of the page (dicts with "text", "x0", "top", ...).
        base_cuts: fallback cut positions; also the centre of the clamp window.
        prev_cuts: cuts of the previous page, used (instead of `base_cuts`) to
            estimate the start of a column whose header label was not found.

    Returns:
        (cuts, header_y). When no header line is detected, `cuts` is a copy of
        `base_cuts` and `header_y` falls back to header_y_of(words).
    """
    header_y, header_tokens = detect_header_line(words)
    if not header_tokens:
        # No header on this page: start from the defaults rather than carrying
        # a possibly drifted estimate forward.
        return list(base_cuts), (header_y or header_y_of(words))

    reference = prev_cuts or base_cuts

    # Left edge of every column: the header label's x0 when found, otherwise a
    # point 35% of the way into the reference column.
    starts = []
    for col_idx, name in enumerate(HEADER_ORDER):
        pattern = HEADER_PATTERNS.get(name)
        label_x0s = (
            [tok["x0"] for tok in header_tokens if pattern.match(tok["text"].strip())]
            if pattern is not None
            else []
        )
        if label_x0s:
            starts.append(float(min(label_x0s)))
        else:
            starts.append(reference[col_idx] * 0.65 + reference[col_idx + 1] * 0.35)

    # Interior cuts sit midway between neighbouring column starts.
    midpoints = [(left + right) / 2.0 for left, right in zip(starts, starts[1:])]
    cuts = [-1e9, *midpoints, 1e9]

    # cuts[i] is the LEFT boundary of column i; pull Month and Revenue leftwards.
    cuts[HEADER_ORDER.index("Month")] -= LEFT_SHIFT_MONTH
    cuts[HEADER_ORDER.index("Revenue")] -= LEFT_SHIFT_REVENUE

    def clamp(i, value):
        # Keep each cut within CLAMP_DELTA[i] of its base position.
        centre = base_cuts[i] if i < len(base_cuts) else value
        slack = CLAMP_DELTA[i] if i < len(CLAMP_DELTA) else 50
        return max(centre - slack, min(centre + slack, value))

    return [clamp(i, value) for i, value in enumerate(cuts)], header_y

# ---------------- Type-Aware Cut Line Refinement ----------------
def _is_month_token(txt: str) -> bool:
    """True when the stripped text matches MONTH_FULL, MONTH_REV or MONTH_ONLY."""
    candidate = str(txt).strip()
    return any(
        pattern.match(candidate) is not None
        for pattern in (MONTH_FULL, MONTH_REV, MONTH_ONLY)
    )

def _is_money_token(txt: str) -> bool:
    """True when the stripped text looks like a monetary amount per NUM_RE.

    NUM_RE requires thousands to be comma-grouped, so the text is matched with
    its commas in place. The previous version removed the commas first, which
    turned "1,234.56" into "1234.56" and made every amount of 1,000 or more
    fail the match. The comma-less form is still tried second so that every
    token accepted before (e.g. a fragment like "1,") is accepted now.
    """
    s = str(txt).strip()
    return bool(NUM_RE.match(s) or NUM_RE.match(s.replace(",", "")))

def refine_cuts_typeaware(page, cuts, hy, sample_rows=None):
    """Nudge the numeric column boundaries using the tokens actually on the page.

    Body tokens (below `hy + 1`, not header/noise) are binned with the incoming
    `cuts`. Month-like tokens in the Month bin (2) and money-like tokens in
    bins 3-6 then give, for each boundary cuts[3]..cuts[6], the right-most
    edge of the column on its left and the left-most edge of the column on its
    right:

    * both known and not overlapping: the cut moves to their midpoint, clamped
      to BASE_CUTS[k] +/- CLAMP_DELTA[k];
    * only the left column has tokens: the cut is pushed right to at least
      one unit past that column's right-most edge (not clamped);
    * otherwise the cut is left alone.

    All tokens are binned before any cut is moved, so the boundaries are
    adjusted independently of each other.

    Args:
        page: pdfplumber page (passed to chars_to_words).
        cuts: list of cut positions; modified in place.
        hy: y position of the header line.
        sample_rows: unused; kept for interface compatibility.

    Returns:
        The same `cuts` list, after adjustment.
    """
    month_bin, first_money_bin, last_bin = 2, 3, 6

    words = chars_to_words(page)
    body = [w for w in words if w["top"] > hy + 1 and not _is_exclude_token(w)]

    # Per-bin x extents of the tokens whose text has the type expected there.
    left_edges = {b: [] for b in range(month_bin, last_bin + 1)}
    right_edges = {b: [] for b in range(month_bin, last_bin + 1)}
    for w in body:
        bi = assign_bin((w["x0"] + w["x1"]) / 2, cuts)
        if bi is None or bi not in left_edges:
            continue
        expected_type = _is_month_token if bi == month_bin else _is_money_token
        if expected_type(w["text"]):
            left_edges[bi].append(w["x0"])
            right_edges[bi].append(w["x1"])

    # cuts[k] separates bin k-1 (to its left) from bin k (to its right).
    for k in range(first_money_bin, last_bin + 1):
        if not right_edges[k - 1]:
            continue
        left_p = float(max(right_edges[k - 1]))
        if not left_edges[k]:
            cuts[k] = max(cuts[k], left_p + 1)
            continue
        right_p = float(min(left_edges[k]))
        if right_p > left_p:
            target = (left_p + right_p) / 2
            base, delta = BASE_CUTS[k], CLAMP_DELTA[k]
            cuts[k] = max(base - delta, min(base + delta, target))

    return cuts

# ---------------- Extract Rows Using Cut Lines ----------------
def rows_from_page(page, cuts, hy):
    """Extract the table rows of one page as a DataFrame with columns COLS.

    Body tokens (below `hy + 1`, not header/noise) are grouped into visual rows
    by a running-average y, each token is dropped into the column given by
    assign_bin on its horizontal midpoint, and the tokens of each cell are
    merged with smart_join. If the Month cell holds a number while the Revenue
    cell holds a month, the two are swapped. Rows with fewer than DROP_MIN
    cells other than "" / "-" are discarded.
    """
    n_cols = len(cuts) - 1
    tokens = [
        w for w in chars_to_words(page)
        if w["top"] > hy + 1 and not _is_exclude_token(w)
    ]
    tokens.sort(key=lambda w: (round(w["top"], 1), w["x0"]))

    # Group into visual rows; the reference y drifts towards each added token.
    visual_rows, running_y = [], None
    for tok in tokens:
        y = round(tok["top"], 1)
        if running_y is None:
            visual_rows.append([tok])
            running_y = y
        elif abs(y - running_y) <= Y_TOL:
            visual_rows[-1].append(tok)
            running_y = (running_y + y) / 2
        else:
            visual_rows.append([tok])
            running_y = y

    month_patterns = (MONTH_FULL, MONTH_REV, MONTH_ONLY)
    records = []
    for row_tokens in visual_rows:
        cells = [[] for _ in range(n_cols)]
        for tok in row_tokens:
            slot = assign_bin((tok["x0"] + tok["x1"]) / 2, cuts)
            if slot is not None:
                cells[slot].append(tok)
        vals = [smart_join(cell) for cell in cells]

        # Month and Revenue landed in each other's column -> put them back.
        if len(vals) >= 4:
            month_txt, revenue_txt = vals[2].strip(), vals[3].strip()
            month_is_number = bool(month_txt) and bool(NUM_RE_STRICT.match(month_txt))
            revenue_is_month = bool(revenue_txt) and any(
                p.match(revenue_txt) for p in month_patterns
            )
            if month_is_number and revenue_is_month:
                vals[2], vals[3] = revenue_txt, month_txt

        meaningful = [v for v in vals if v not in ("", "-")]
        if len(meaningful) >= DROP_MIN:
            records.append(vals)

    return pd.DataFrame(records, columns=COLS)

# ---------------- Repair Functions ----------------
def repair_loc_and_location(df):
    """Repair rows where location text ended up in the wrong cell or row.

    Three passes, all editing `df` in place (it is also returned):

    1. A "Loc #" cell holding digits followed by text ("12 Fort") is split:
       the digits stay in "Loc #", the text is prepended to "Location".
    2. A "Location" cell that is exactly six digits is appended to the
       previous row's Location and blanked (NaN) on its own row.
    3. A "Loc #" cell that is exactly six digits, on a row with no Location,
       is appended to the previous row's Location, and the row inherits the
       previous row's "Loc #".

    Passes 2 and 3 address rows by integer label (`df.at[i, ...]`), so `df`
    must carry the default 0..n-1 index.
    """
    mask = df["Loc #"].astype(str).str.match(r"^\d+\s+\S+", na=False)
    if mask.any():
        ex = df.loc[mask, "Loc #"].astype(str).str.extract(r"^(\d+)\s+(.*)$")
        df.loc[mask, "Loc #"] = ex[0]
        df.loc[mask, "Location"] = (ex[1].fillna("").str.strip() + " " + df.loc[mask, "Location"].fillna("")).str.strip().replace({"": None})

    # Pass 2, bottom-up so a blanked row is never read as someone's "previous".
    for i in range(len(df) - 1, 0, -1):
        cur_loc = str(df.at[i, "Location"]).strip()
        if not re.fullmatch(r"\d{6}", cur_loc):
            continue
        prev_raw = df.at[i - 1, "Location"]
        # A missing previous Location must not be stringified: str(None) and
        # str(nan) are truthy and used to produce "None 123456" / "nan 123456".
        if pd.isna(prev_raw):
            continue
        prev_loc = str(prev_raw).strip()
        if prev_loc:
            df.at[i - 1, "Location"] = (prev_loc + " " + cur_loc).strip()
            df.at[i, "Location"] = np.nan

    # Pass 3.
    for i in range(1, len(df)):
        cur_locnum = str(df.at[i, "Loc #"]) if pd.notna(df.at[i, "Loc #"]) else ""
        cur_loc = str(df.at[i, "Location"]) if pd.notna(df.at[i, "Location"]) else ""
        prev_loc = str(df.at[i - 1, "Location"]) if pd.notna(df.at[i - 1, "Location"]) else ""
        if re.fullmatch(r"\d{6}", cur_locnum) and (cur_loc == "" or cur_loc.lower() == "nan") and prev_loc not in ("", "nan"):
            df.at[i - 1, "Location"] = (prev_loc + " " + cur_locnum).strip()
            df.at[i, "Loc #"] = df.at[i - 1, "Loc #"]
    return df

def split_and_swap_month_revenue(df):
    """Untangle Month/Revenue cells that were merged or landed in each other's column.

    For every row:

    * A Month cell such as "Sep-21 1,234.50" is split: the month token stays
      in Month, and the remaining text (commas removed, tokens concatenated)
      becomes Revenue unless Revenue already holds a number.
    * When Revenue holds a month token and Month holds a number (optionally
      with "$"), the two cells are swapped.
    * Cells that are empty or consist only of "-" become None.

    Edits `df` in place and returns it. An empty frame is returned unchanged.
    """
    def split_cell(mon, rev):
        ms = (None if pd.isna(mon) else str(mon).strip())
        rs = (None if pd.isna(rev) else str(rev).strip())
        if ms:
            cleaned = ms.replace(",", "")
            toks = re.split(r"\s+", cleaned)
            mon_tok = next((t for t in toks if _is_month_token(t)), None)
            money = "".join(t for t in toks if not _is_month_token(t))
            if mon_tok:
                ms = mon_tok
            rev_is_number = bool(rs and NUM_RE_STRICT.match(rs))
            rev_is_month = bool(rs and _is_month_token(rs))
            # If Month has no month token while Revenue IS a month, the cells
            # are transposed. Leave Revenue alone so the swap below can run;
            # overwriting it here used to discard the month and made the swap
            # branch unreachable.
            transposed = mon_tok is None and rev_is_month
            if money and not rev_is_number and not transposed:
                rs = money
        if rs and _is_month_token(rs):
            if ms and (NUM_RE_STRICT.match(ms) or NUM_RE_STRICT.match(ms.replace("$", ""))):
                ms, rs = rs, ms
        ms = ms if (ms and ms.strip("-")) else None
        rs = rs if (rs and rs.strip("-")) else None
        return ms, rs

    if df.empty:
        # apply(..., result_type="expand") on an empty frame yields no columns 0/1.
        return df
    tmp = df.apply(lambda r: split_cell(r["Month"], r["Revenue"]), axis=1, result_type="expand")
    df["Month"], df["Revenue"] = tmp[0], tmp[1]
    return df

def normalize_months(df):
    """Rewrite recognised month labels in df["Month"] to the canonical "Mon-YY" form.

    * Reversed labels matched by MONTH_REV ("21-Sep", "21/sep") -> "Sep-21".
    * Forward labels matched by MONTH_FULL ("Sep/21", "Sep21", "SEP-21") ->
      "Sep-21". These used to pass through unchanged, and fill_missing_annual
      only understands the exact "Mon-YY" spelling.
    * Anything else is returned stripped; missing values are left untouched.

    Edits `df` in place and returns it.
    """
    def norm(m):
        if pd.isna(m):
            return m
        s = str(m).strip()
        if MONTH_REV.match(s):
            parts = re.split(r'[-/]', s)
            if len(parts) == 2:
                year, mon = parts
                return mon[:3].capitalize() + '-' + year
        if MONTH_FULL.match(s):
            # MONTH_FULL is three letters, an optional separator, two digits.
            return s[:3].capitalize() + '-' + s[-2:]
        return s
    df["Month"] = df["Month"].apply(norm)
    return df

def _seeded_ffill(series: pd.Series, seed):
    s = series.replace({"": None}).copy()
    if s.empty: return s
    if pd.isna(s.iloc[0]) and seed is not None:
        s.iloc[0] = seed
    return s.ffill()

def pagewise_seeded_ffill(dfp: pd.DataFrame, prev_locnum, prev_loc):
    """Forward-fill "Loc #" and "Location" on one page's frame.

    Blank cells are treated as missing. If the first row of a column is
    missing, it is seeded with the value carried over from the previous page
    (`prev_locnum` / `prev_loc`) before filling. Edits `dfp` in place and
    returns it.
    """
    carried = (("Loc #", prev_locnum), ("Location", prev_loc))
    for column, seed in carried:
        blanks_removed = dfp[column].replace({"": None})
        dfp[column] = _seeded_ffill(blanks_removed, seed)
    return dfp

def finalize(df):
    """Final per-page clean-up: rejoin split month labels, fill keys, parse numbers.

    1. Rows whose Month is a bare month name and whose Revenue is one of the
       YEAR_FRAG values (e.g. "Sep" | "-20") get Month "Sep-20" and an empty
       Revenue.
    2. "Loc #" and "Location" are forward-filled (blank strings count as
       missing).
    3. The four amount columns are stripped of spaces, commas and "$",
       "(" becomes "-", ")" is dropped, and the result is parsed with
       pd.to_numeric; anything unparseable becomes NaN.

    Edits `df` in place for steps 1-3, then returns a new frame with all-NaN
    rows dropped and the index reset.
    """
    year_fragments = {str(x) for x in YEAR_FRAG}
    # Series.str.match is documented to take a string pattern; a compiled
    # Pattern only happens to work on some string backends. Apply MONTH_ONLY
    # per element so the result does not depend on the dtype in use.
    bare_month = df["Month"].astype(str).map(lambda s: bool(MONTH_ONLY.match(s))).astype(bool)
    mask = bare_month & df["Revenue"].astype(str).isin(year_fragments)
    df.loc[mask, "Month"] = df.loc[mask, "Month"] + "-" + df.loc[mask, "Revenue"].astype(str).str[-2:]
    df.loc[mask, "Revenue"] = np.nan

    df["Loc #"] = df["Loc #"].replace({"": None}).ffill()
    df["Location"] = df["Location"].replace({"": None}).ffill()

    for c in ["Revenue", "NAFI Amt", "Annual Revenue", "Annual NAFI"]:
        df[c] = (df[c].astype(str)
                 .str.replace(" ", "", regex=False)
                 .str.replace(",", "", regex=False)
                 .str.replace("$", "", regex=False)
                 .str.replace("(", "-", regex=False)   # accounting-style negative
                 .str.replace(")", "", regex=False))
        df[c] = pd.to_numeric(df[c], errors="coerce")
    return df.dropna(how="all").reset_index(drop=True)

# --- Left-pack Monthly Numeric Values ---
def monthly_numeric_left_pack(df):
    """Shift the numeric cells of monthly rows leftwards to close gaps.

    For rows whose Month looks like a month label, the non-missing values of
    Revenue, NAFI Amt, Annual Revenue and Annual NAFI are written back into
    those columns left to right, padding the remainder with NaN. Rows without
    a month-like Month, and monthly rows with no numeric value at all, are
    returned unchanged. Returns the frame produced by DataFrame.apply.
    """
    num_cols = ["Revenue", "NAFI Amt", "Annual Revenue", "Annual NAFI"]
    month_patterns = (MONTH_FULL, MONTH_REV, MONTH_ONLY)

    def is_monthlike(value):
        if pd.isna(value):
            return False
        text = str(value).strip()
        return any(p.match(text) for p in month_patterns)

    def pack_row(r):
        if not is_monthlike(r.get("Month")):
            return r
        present = [r[c] for c in num_cols if pd.notna(r[c])]
        if present:
            padded = present + [np.nan] * (len(num_cols) - len(present))
            for column, value in zip(num_cols, padded):
                r[column] = value
        return r

    return df.apply(pack_row, axis=1)

# ---------------- Fill Missing Annual Totals ----------------
def fill_missing_annual(df: pd.DataFrame) -> pd.DataFrame:
    """Fill missing annual totals on September rows from that location's monthly values.

    Rows are grouped by ("Loc #", "Location"). For every row whose Month is
    September ("Sep-YY") and whose "Annual Revenue" and/or "Annual NAFI" is
    missing, the missing total is set to the sum of "Revenue" / "NAFI Amt"
    over the group's rows dated within the twelve months ending that September
    (October of the previous year through that September). Missing monthly
    values count as 0; totals that are already present are left untouched.

    The window is selected by date. The previous version summed the twelve
    preceding rows in date order, so a location with fewer than twelve months
    in a year had months of the prior year added to its total.

    Month labels are read in the "Mon-YY" form, case-insensitively; rows with
    any other label are treated as undated and never contribute to a sum.

    Returns a modified copy; the input frame is not changed. Rows are looked up
    by index label, so the index is expected to be unique.
    """
    df = df.copy()
    month_no = {
        abbr: n
        for n, abbr in enumerate(
            ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"],
            start=1,
        )
    }

    def month_ordinal(label):
        """Map "Sep-21" to a running month count (year * 12 + month), else None."""
        if pd.isna(label):
            return None
        hit = re.match(r"^([A-Za-z]{3})-(\d{2})$", str(label))
        if not hit:
            return None
        month = month_no.get(hit.group(1).capitalize())
        if month is None:
            return None
        return (2000 + int(hit.group(2))) * 12 + month

    ordinals = {idx: month_ordinal(label) for idx, label in df["Month"].items()}

    for _, idxs in df.groupby(["Loc #", "Location"], dropna=False).groups.items():
        dated = [(idx, ordinals[idx]) for idx in idxs if ordinals[idx] is not None]
        for idx, ordinal in dated:
            # month is 1..12, so ordinal % 12 == 9 exactly for September.
            if ordinal % 12 != 9:
                continue
            need_rev = pd.isna(df.at[idx, "Annual Revenue"])
            need_naf = pd.isna(df.at[idx, "Annual NAFI"])
            if not (need_rev or need_naf):
                continue
            window = [j for j, o in dated if ordinal - 11 <= o <= ordinal]
            if need_rev:
                df.at[idx, "Annual Revenue"] = float(df.loc[window, "Revenue"].fillna(0).sum())
            if need_naf:
                df.at[idx, "Annual NAFI"] = float(df.loc[window, "NAFI Amt"].fillna(0).sum())
    return df

# ---------------- Main Process ----------------
def main(pdf_path=None, out_path=None, max_pages=120, skip_pages=(1, 35, 73, 115)):
    """Extract the revenue table from the PDF page by page and write it to CSV.

    Args:
        pdf_path: PDF to read; defaults to the module-level PDF.
        out_path: CSV to write; defaults to the module-level OUT.
        max_pages: highest 1-based page number to process.
        skip_pages: 1-based page numbers that are not processed.

    Called with no arguments it behaves as before: pages 1-120 of PDF, skipping
    pages 1, 35, 73 and 115, written to OUT. Progress is printed per page.
    Nothing is written when no page yields any rows.
    """
    pdf_path = PDF if pdf_path is None else pdf_path
    out_path = OUT if out_path is None else out_path
    skip_pages = set(skip_pages)

    all_pages = []
    # State carried from one page to the next.
    prev_loc = None
    prev_locnum = None
    prev_cuts = None

    with pdfplumber.open(pdf_path) as pdf:
        last = min(max_pages, len(pdf.pages))

        # Fallback header y-position, taken from page 2 (index 1) and used for
        # pages on which no header position can be determined. A one-page
        # document has no page 2; fall back to 0 instead of raising IndexError.
        header2_y = header_y_of(chars_to_words(pdf.pages[1])) if len(pdf.pages) > 1 else 0

        # The loop covers every page from 1; unwanted pages (including page 1
        # by default) are excluded solely through `skip_pages`.
        for page_num in range(1, last + 1):
            if page_num in skip_pages:
                print(f"[p{page_num}] SKIPPED.")
                continue

            page = pdf.pages[page_num - 1]
            words = chars_to_words(page)

            # 1) Infer cut lines from header; fallback to default cuts if not found
            cuts, hy = cuts_from_header(words, BASE_CUTS, prev_cuts=prev_cuts)
            hy = hy or header2_y

            # 2) Type-aware refinement
            cuts = refine_cuts_typeaware(page, cuts, hy)

            # 3) Extract row data
            dfp = rows_from_page(page, cuts, hy)
            if dfp.empty:
                print(f"[p{page_num}] rows=0")
                continue

            # 4) Repair columns, split month and amount
            dfp = repair_loc_and_location(dfp)
            dfp = split_and_swap_month_revenue(dfp)
            dfp = normalize_months(dfp)

            # 5) Forward fill across pages
            dfp = pagewise_seeded_ffill(dfp, prev_locnum, prev_loc)

            # 6) Data cleaning and transformation
            dfp = finalize(dfp)

            # 7) Left-pack numeric values
            dfp = monthly_numeric_left_pack(dfp)

            # 8) Remember the last location and the cuts for the next page
            if dfp["Location"].notna().any():
                prev_loc = dfp["Location"].dropna().iloc[-1]
            if dfp["Loc #"].notna().any():
                prev_locnum = dfp["Loc #"].dropna().iloc[-1]
            prev_cuts = cuts

            dfp.insert(0, "Page", page_num)
            all_pages.append(dfp)
            print(f"[p{page_num}] rows={len(dfp)}")

    if all_pages:
        out = pd.concat(all_pages, ignore_index=True)
        # Fill missing annual totals
        out = fill_missing_annual(out)
        out.to_csv(out_path, index=False)
        print(f"✅ Done. rows={len(out)} → {out_path}")
    else:
        print("⚠️ No valid data extracted.")

if __name__ == "__main__":
    main()

[p1] SKIPPED.
[p2] rows=14
[p3] rows=38
[p4] rows=20
[p5] rows=41
[p6] rows=43
[p7] rows=29
[p8] rows=41
[p9] rows=43
[p10] rows=8
[p11] rows=40
[p12] rows=42
[p13] rows=47
[p14] rows=39
[p15] rows=44
[p16] rows=41
[p17] rows=44
[p18] rows=41
[p19] rows=44
[p20] rows=38
[p21] rows=40
[p22] rows=44
[p23] rows=33
[p24] rows=39
[p25] rows=4
[p26] rows=26
[p27] rows=41
[p28] rows=42
[p29] rows=39
[p30] rows=17
[p31] rows=39
[p32] rows=38
[p33] rows=36
[p34] rows=10
[p35] SKIPPED.
[p36] rows=14
[p37] rows=40
[p38] rows=30
[p39] rows=36
[p40] rows=38
[p41] rows=37
[p42] rows=15
[p43] rows=44
[p44] rows=47
[p45] rows=21
[p46] rows=38
[p47] rows=42
[p48] rows=41
[p49] rows=44
[p50] rows=25
[p51] rows=44
[p52] rows=43
[p53] rows=42
[p54] rows=44
[p55] rows=40
[p56] rows=44
[p57] rows=40
[p58] rows=40
[p59] rows=44
[p60] rows=32
[p61] rows=41
[p62] rows=16
[p63] rows=26
[p64] rows=40
[p65] rows=44
[p66] rows=11
[p67] rows=41
[p68] rows=31
[p69] rows=41
[p70] rows=38
[p71] rows=40
[p72] rows=13
[