## **Title II (2012–2024) Harmonization to 2024 Schema**

In [None]:
import pandas as pd
import requests
import io
from pathlib import Path
from typing import List

# ============================================================
# Title II (2012–2024) Harmonization to 2024 Schema
# ============================================================

# Name of the worksheet read from every downloaded workbook.
TITLEII_SHEET_NAME = "Program"

# One AllStates workbook is listed per report year; the year is the only
# part of the address that varies, so the list is generated from a template.
_TITLEII_URL_TEMPLATE = "https://title2.ed.gov/Public/DataTools/{year}/AllStates.xlsx"
TITLEII_URLS = [
    _TITLEII_URL_TEMPLATE.format(year=report_year)
    for report_year in range(2012, 2025)  # 2012 through 2024 inclusive
]

# Rows are kept only when their ProgramType is exactly one of these values.
TARGET_PROGRAM_TYPES = [
    "Traditional",
    "Alternative, not IHE-based",
    "Alternative, IHE-based",
]

# --- Canonical 2024 column order ---
# Each group below contributes a "<Group>Completers" column immediately
# followed by a "<Group>Enrollment" column, in the order listed here.
_DEMOGRAPHIC_GROUPS = (
    "Asian",
    "Black",
    "Hispanic",
    "Indian",
    "Islander",
    "Female",
    "Male",
    "MultiRacial",
    "NonReportGender",
    "NonReportRaceEth",
    "OtherGender",
    "White",
)

ORDERED_COLUMNS = [
    # Leading columns, up to the overall totals.
    "ReportYear",
    "State",
    "ProgramCode",
    "Program",
    "ProgramType",
    "SchoolNeeds",
    "StateLocalNeeds",
    "LocalityTrn",
    "SpecEdCore",
    "GenEdDisabilities",
    "GenEdLEP",
    "GenEdLowInc",
    "GrantMajors",
    "TotalEnrollment",
    "CompletersCurrent",
    # Completers/Enrollment pair for every demographic group.
    *(
        f"{group}{measure}"
        for group in _DEMOGRAPHIC_GROUPS
        for measure in ("Completers", "Enrollment")
    ),
    # PG* columns.
    "PGMinGPAEntry",
    "PGMinGPAExit",
    "PGPrograms",
    # SCE* columns.
    "SCEAvgHrsFor_StdntTch",
    "SCEAvgHrsPrior_StdntTch",
    "SCEAvgHrsPrior_TeachRcd",
    "SCENumAdjunctIHE",
    "SCENumCoopTeachK12",
    # Super* columns and the program count.
    "SuperFTEFaculty",
    "SuperStudents",
    "TotalNumPrepPrgs",
    # UG* columns.
    "UGMinGPAEntry",
    "UGMinGPAExit",
    "UGPrograms",
    # Trailing column.
    "AssuranceComments",
]

# --- Pre-2020 → 2024-schema rename map ---
# Only these pre-2020 headers are spelled differently in the 2024 schema;
# every other carried-forward header keeps its name.
_PRE2020_RENAMED = {
    "SuperAvgForStuTeach": "SCEAvgHrsFor_StdntTch",
    "SuperAvgPriorStuTeach": "SCEAvgHrsPrior_StdntTch",
    "SuperFTEAdjunct": "SCENumAdjunctIHE",
}

# Pre-2020 headers that are carried forward, listed in the map's key order.
_PRE2020_SOURCE_COLUMNS = (
    "AsianEnrollment",
    "BlackEnrollment",
    "CompletersCurrent",
    "FemaleEnrollment",
    "GenEdDisabilities",
    "GenEdLEP",
    "GenEdLowInc",
    "HispanicEnrollment",
    "IndianEnrollment",
    "IslanderEnrollment",
    "LocalityTrn",
    "MaleEnrollment",
    "MultiRacialEnrollment",
    "PGMinGPAEntry",
    "PGMinGPAExit",
    "PGPrograms",
    "Program",
    "ProgramCode",
    "ProgramType",
    "ReportYear",
    "SchoolNeeds",
    "SpecEdCore",
    "State",
    "StateLocalNeeds",
    "SuperAvgForStuTeach",
    "SuperAvgPriorStuTeach",
    "SuperFTEAdjunct",
    "SuperFTEFaculty",
    "SuperStudents",
    "TotalEnrollment",
    "UGMinGPAEntry",
    "UGMinGPAExit",
    "UGPrograms",
    "WhiteEnrollment",
)

# Maps each pre-2020 header to its 2024 name (itself unless listed above).
PRE2020_RENAME_TO_2024 = {
    source: _PRE2020_RENAMED.get(source, source)
    for source in _PRE2020_SOURCE_COLUMNS
}

# --- Columns introduced in 2020+ (fill as NA for pre-2020 files) ---
# The list is assembled from three groups and then sorted, which yields the
# alphabetical order this constant has always had.
NEW_COLUMNS_2020_PLUS = sorted(
    [
        # A "<Group>Completers" column for every demographic group.
        *(
            f"{group}Completers"
            for group in (
                "Asian",
                "Black",
                "Female",
                "Hispanic",
                "Indian",
                "Islander",
                "Male",
                "MultiRacial",
                "NonReportGender",
                "NonReportRaceEth",
                "OtherGender",
                "White",
            )
        ),
        # Enrollment columns for the groups that are themselves new.
        "NonReportGenderEnrollment",
        "NonReportRaceEthEnrollment",
        "OtherGenderEnrollment",
        # Remaining new columns.
        "AssuranceComments",
        "GrantMajors",
        "SCEAvgHrsPrior_TeachRcd",
        "SCENumCoopTeachK12",
        "TotalNumPrepPrgs",
    ]
)


def download_excel_sheet(url: str, sheet_name: str = TITLEII_SHEET_NAME, timeout: int = 30) -> pd.DataFrame:
    """Fetch an Excel workbook over HTTP and load one of its sheets.

    Args:
        url: Address of the workbook to download.
        sheet_name: Sheet to read; defaults to ``TITLEII_SHEET_NAME``.
        timeout: Passed straight through to ``requests.get`` as its timeout.

    Returns:
        The requested sheet as parsed by ``pandas.read_excel`` with the
        ``openpyxl`` engine.

    Raises:
        requests.HTTPError: When ``raise_for_status`` rejects the response.
    """
    response = requests.get(
        url,
        headers={"User-Agent": "titleII-ipeds-crosswalk/1.0 (research; colab)"},
        timeout=timeout,
    )
    response.raise_for_status()

    # The full payload is kept in memory and handed to pandas as a file-like
    # object, so nothing is written to disk.
    workbook_bytes = io.BytesIO(response.content)
    return pd.read_excel(workbook_bytes, sheet_name=sheet_name, engine="openpyxl")


def normalize_headers(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* whose column labels contain no spaces.

    Each label is converted to ``str``; non-breaking spaces (U+00A0) are turned
    into ordinary spaces, surrounding whitespace is trimmed, and any remaining
    ordinary spaces are removed. The input frame is left untouched.
    """
    cleaned = df.copy()

    labels = cleaned.columns.astype(str)
    # Convert NBSP first so that the strip/remove steps below also catch it.
    labels = labels.str.replace("\u00A0", " ", regex=False)
    labels = labels.str.strip()
    labels = labels.str.replace(" ", "", regex=False)

    cleaned.columns = labels
    return cleaned


def harmonize_one_year(url: str) -> pd.DataFrame:
    """Download one year's workbook and reshape it to ``ORDERED_COLUMNS``.

    The report year is taken from the second-to-last ``/``-separated segment
    of *url* and written into the ``ReportYear`` column. Rows are kept only if
    their ``ProgramType`` is in ``TARGET_PROGRAM_TYPES``.

    For years up to and including 2019, only the columns named in
    ``PRE2020_RENAME_TO_2024`` are retained and they are renamed to their 2024
    spellings. For every year, any column of ``ORDERED_COLUMNS`` that is still
    absent is added filled with ``pd.NA``, and the result is returned with
    exactly ``ORDERED_COLUMNS`` in that order.

    Raises:
        KeyError: If the sheet has no ``ProgramType`` column after header
            normalization.
    """
    report_year = int(url.split("/")[-2])

    sheet = normalize_headers(download_excel_sheet(url))
    sheet["ReportYear"] = report_year

    # Keep only target program types.
    if "ProgramType" not in sheet.columns:
        raise KeyError(f"{report_year}: Missing ProgramType column.")
    wanted_rows = sheet["ProgramType"].isin(TARGET_PROGRAM_TYPES)
    frame = sheet[wanted_rows].copy()

    if report_year <= 2019:
        # Restrict to the headers that can be mapped, then move them into the
        # 2024 naming. Later years are used with their headers as-is.
        applicable = {
            old_name: new_name
            for old_name, new_name in PRE2020_RENAME_TO_2024.items()
            if old_name in frame.columns
        }
        frame = frame[list(applicable)].rename(columns=applicable)

    # Both the pre-2020 and the 2020+ paths finish the same way: create any
    # expected column that is missing, then select the canonical order (which
    # also discards columns outside the schema).
    for column in ORDERED_COLUMNS:
        if column not in frame.columns:
            frame[column] = pd.NA

    return frame[ORDERED_COLUMNS]


def build_titleII_all_years(urls: List[str]) -> pd.DataFrame:
    """Harmonize every URL in *urls* and stack the results into one frame.

    Each URL is processed with ``harmonize_one_year``. A year that raises any
    ``Exception`` is reported on stdout and skipped, so the combined frame can
    be missing years; check the printed output for ``ERROR`` lines.

    Args:
        urls: Workbook addresses; the second-to-last ``/``-separated segment
            of each is used as the year label in progress messages.

    Returns:
        The successfully harmonized frames concatenated with a fresh index.

    Raises:
        RuntimeError: If no URL could be processed (including an empty list).
    """
    yearly_frames = []

    for source_url in urls:
        year_label = source_url.split("/")[-2]
        try:
            print(f"Processing {year_label}...")
            yearly_frames.append(harmonize_one_year(source_url))
        except Exception as exc:
            # Best effort: one bad year must not abort the remaining years.
            print(f"  ERROR: {year_label} failed: {exc}")

    if not yearly_frames:
        raise RuntimeError("No Title II data was processed.")

    return pd.concat(yearly_frames, ignore_index=True)


# ============================================================
# Run + export
# ============================================================

# Download, harmonize, and stack every configured report year.
titleII_all_years = build_titleII_all_years(TITLEII_URLS)

# Quick look at the combined result before exporting.
print("Combined Title II rows:", len(titleII_all_years))
print(titleII_all_years.head())

# Exports: the same table is written in both formats, without the index.
OUTPUT_XLSX = "TitleII_AllYears_Harmonized_2024Schema.xlsx"
OUTPUT_CSV = "TitleII_AllYears_Harmonized_2024Schema.csv"

titleII_all_years.to_excel(OUTPUT_XLSX, index=False)
titleII_all_years.to_csv(OUTPUT_CSV, index=False)

# Confirm the outputs only after both writes have completed.
print(
    *(f"Saved: {written}" for written in (OUTPUT_XLSX, OUTPUT_CSV)),
    sep="\n",
)