# **Merge Title II (all years, harmonized) with MANUALLY VALIDATED crosswalk**

# In [None]:
import pandas as pd
import re
from pathlib import Path

# ============================================================
# Block A: Merge Title II (all years, harmonized) with MANUALLY VALIDATED crosswalk
# ============================================================

# Input workbooks (resolved relative to the current working directory).
TITLEII_ALL_YEARS_FILE = Path("TitleII_AllYears_Harmonized_2024Schema.xlsx")
CROSSWALK_VALIDATED_FILE = Path("_FINAL_TitleII_with_IPEDS_Matched_UnitID_and_Fuzzy_Details.xlsx")
# Worksheet of the crosswalk workbook that is read.
CROSSWALK_SHEET = "Cleaned_Final"
# Workbook written at the end of this block.
OUTPUT_FILE = Path("T2_IPEDS_Combined_Data_Matched_Final.xlsx")

# The fifty states, alphabetical by full name.
_STATE_ABBREVIATIONS = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
}

# Every other jurisdiction name the mapping recognises.
_NON_STATE_ABBREVIATIONS = {
    "District of Columbia": "DC",
    "American Samoa": "AS",
    "Guam": "GU",
    "Northern Mariana Islands": "MP",
    "Puerto Rico": "PR",
    "Virgin Islands": "VI",
    "Marshall Islands": "MH",
    "Micronesia": "FM",
}

# Full jurisdiction name -> two-letter code; lookups are exact and case-sensitive.
STATE_NAME_TO_ABBR = {**_STATE_ABBREVIATIONS, **_NON_STATE_ABBREVIATIONS}

def standardize_name(name):
    """Return a lower-case, punctuation-free matching key for a program name.

    Steps, in order:
      1. Missing values (``None`` / NaN) become ``""``.
      2. Lower-case; " at " becomes "-" and " & " becomes " and ".
      3. Whitespace around hyphens is removed and hyphen runs are collapsed.
      4. Every character other than a-z, 0-9, whitespace and "-" is dropped.
      5. Common abbreviations are expanded as whole, space-delimited words.
      6. The word "of" (between two words) and a leading "the " are removed.
      7. Whitespace is collapsed; leading/trailing spaces and hyphens are
         trimmed.

    Punctuation is stripped *before* abbreviations are expanded and the text
    is padded with one space on each side while expanding, so an abbreviation
    is recognised at the start or end of the name and in its dotted form
    ("St." -> "saint", "Boston Univ" -> "boston university"). An abbreviation
    directly attached to a hyphen (e.g. "univ-buffalo") is still left as is.

    Args:
        name: Raw program name; any scalar is accepted and converted with
            ``str``.

    Returns:
        The standardized name, or ``""`` for a missing value.
    """
    if name is None or pd.isna(name):
        return ""

    # (abbreviation, expansion) pairs, applied in this order.
    expansions = (
        ("uni", "university"),
        ("univ", "university"),
        ("coll", "college"),
        ("clg", "college"),
        ("inst", "institute"),
        ("tech", "technical"),
        ("st", "saint"),
        ("mt", "mount"),
        ("dept", "department"),
        ("isd", "independent school district"),
        ("sch", "school"),
    )

    text = str(name).lower()
    text = text.replace(" at ", "-").replace(" & ", " and ")
    text = re.sub(r"\s*-\s*", "-", text)
    text = re.sub(r"-+", "-", text)

    # Drop punctuation first so "st." / "univ." are seen as bare tokens below.
    text = re.sub(r"[^a-z0-9\s-]", "", text)

    # Single-space the text and pad both ends so the " token " patterns also
    # match the first and last word of the name.
    text = " " + re.sub(r"\s+", " ", text).strip() + " "
    for short, full in expansions:
        text = text.replace(f" {short} ", f" {full} ")
    text = text.strip()

    text = text.replace(" of ", " ")
    if text.startswith("the "):
        text = text[4:]

    return re.sub(r"\s+", " ", text).strip().strip("-")

def normalize_key_series(s: pd.Series) -> pd.Series:
    """Return ``s`` as trimmed text with look-alike characters unified.

    Missing values become ``""`` and every value is converted to ``str``.
    Non-breaking spaces become regular spaces, en/em dashes become "-",
    and the right single quotation mark becomes "'". Leading and trailing
    whitespace is stripped last.
    """
    # (character to replace, replacement), applied literally in this order.
    lookalikes = (
        ("\u00A0", " "),
        ("–", "-"),
        ("—", "-"),
        ("’", "'"),
    )

    cleaned = s.fillna("").astype(str)
    for found, replacement in lookalikes:
        cleaned = cleaned.str.replace(found, replacement, regex=False)
    return cleaned.str.strip()

# ------------------------------------------------------------
# Existence checks
# ------------------------------------------------------------
# Fail on the first required input that is not on disk.
missing_input = next(
    (path for path in (TITLEII_ALL_YEARS_FILE, CROSSWALK_VALIDATED_FILE) if not path.exists()),
    None,
)
if missing_input is not None:
    raise FileNotFoundError(f"Required file not found: {missing_input}. Upload it to /content in Colab.")

# ------------------------------------------------------------
# Load
# ------------------------------------------------------------
# Title II is read from the workbook's default (first) sheet; the crosswalk
# from the named validated sheet.
titleii = pd.read_excel(TITLEII_ALL_YEARS_FILE)
crosswalk = pd.read_excel(CROSSWALK_VALIDATED_FILE, sheet_name=CROSSWALK_SHEET)

print("Title II rows:", len(titleii))
print("Validated crosswalk rows:", len(crosswalk))

# ------------------------------------------------------------
# Validate required raw columns
# ------------------------------------------------------------
# (label used in the error message, frame, columns it must contain)
required_raw_columns = (
    ("Title II", titleii, ("State", "Program", "ProgramType")),
    ("Crosswalk", crosswalk, ("Standardized_State", "Program", "ProgramType", "Matched_UnitID")),
)
for label, frame, needed in required_raw_columns:
    for needed_col in needed:
        if needed_col not in frame.columns:
            raise KeyError(f"{label} missing '{needed_col}'. Available: {frame.columns.tolist()}")

# ------------------------------------------------------------
# Build merge keys (RECOMPUTE, do not trust saved Standardized_Program)
# ------------------------------------------------------------
# The raw text is cleaned with normalize_key_series BEFORE it is mapped or
# standardized, for two reasons:
#   * `.astype(str)` on its own turns a missing value into the literal string
#     "nan", which then behaves like a real key; normalize_key_series turns
#     missing values into "" instead.
#   * standardize_name deletes every non-ASCII character, so en/em dashes,
#     non-breaking spaces and curly apostrophes must be converted to their
#     ASCII forms first or "A – B" and "A - B" produce different keys.
titleii["TitleII_State_Abbr"] = normalize_key_series(titleii["State"]).map(STATE_NAME_TO_ABBR)
titleii["Standardized_Program"] = normalize_key_series(titleii["Program"]).map(standardize_name)
titleii["ProgramType"] = normalize_key_series(titleii["ProgramType"])

crosswalk["Crosswalk_State_Abbr"] = normalize_key_series(crosswalk["Standardized_State"])
crosswalk["Standardized_Program"] = normalize_key_series(crosswalk["Program"]).map(standardize_name)
crosswalk["ProgramType"] = normalize_key_series(crosswalk["ProgramType"])

# Report state names that have no abbreviation (rows with no state at all
# are not listed).
unmapped_states = titleii[titleii["TitleII_State_Abbr"].isna()]["State"].dropna().unique().tolist()
if unmapped_states:
    print("WARNING: Unmapped Title II states/territories:", unmapped_states)

# ------------------------------------------------------------
# Carry-through columns from validated crosswalk
# ------------------------------------------------------------
columns_to_add = [
    "Matched_UnitID",
    "Exact_Matched_IPEDS_Name",
    "Exact_Matched_State",
    "Match_Source",
    "Match_Category",
    "IHE_Title_IV_Funded",
    "IHE_Based_RPT",
    "NON_IHE_Based_RPT",
    "Notes",
]
missing_add = [c for c in columns_to_add if c not in crosswalk.columns]
if missing_add:
    raise KeyError(f"Crosswalk missing expected columns: {missing_add}")

# ------------------------------------------------------------
# Harden merge keys (both sides)
# ------------------------------------------------------------
# Unmapped states are NaN after the dict lookup; turn them into "" so all
# three left-hand key columns are plain, trimmed strings.
titleii["TitleII_State_Abbr"] = normalize_key_series(titleii["TitleII_State_Abbr"])

# ------------------------------------------------------------
# Subset + dedupe crosswalk on join keys
# ------------------------------------------------------------
right_keys = ["Crosswalk_State_Abbr", "ProgramType", "Standardized_Program"]
crosswalk_subset = crosswalk.loc[:, right_keys + columns_to_add]

# A crosswalk row with no state or no program name cannot identify a program.
# Leave it out of the lookup so a blank on the left never matches a blank on
# the right.
blank_key = (crosswalk_subset[["Crosswalk_State_Abbr", "Standardized_Program"]] == "").any(axis=1)
if blank_key.any():
    print(f"WARNING: Ignoring {int(blank_key.sum())} crosswalk rows with a blank state or program key.")
    crosswalk_subset = crosswalk_subset.loc[~blank_key]

# Surface join keys whose duplicate rows disagree on the UnitID before the
# first row of each key is kept.
unitids_per_key = crosswalk_subset.groupby(right_keys)["Matched_UnitID"].nunique(dropna=False)
conflicting_keys = unitids_per_key[unitids_per_key > 1]
if not conflicting_keys.empty:
    print(
        f"WARNING: {len(conflicting_keys)} crosswalk keys map to more than one Matched_UnitID; "
        "keeping the first row for each. First conflicts:"
    )
    print(conflicting_keys.head(20))

crosswalk_subset = crosswalk_subset.drop_duplicates(subset=right_keys)

# ------------------------------------------------------------
# Merge
# ------------------------------------------------------------
left_keys = ["TitleII_State_Abbr", "ProgramType", "Standardized_Program"]

# Left join keeps every Title II row. `validate` makes pandas raise a
# MergeError if the right-hand keys are not unique, so the merge can never
# multiply Title II rows.
merged = pd.merge(
    titleii,
    crosswalk_subset,
    left_on=left_keys,
    right_on=right_keys,
    how="left",
    suffixes=("", "_cw"),
    validate="many_to_one",
)

# ------------------------------------------------------------
# POST-MERGE FAILSAFE
# ------------------------------------------------------------
merged["ProgramType"] = merged["ProgramType"].astype(str).str.strip()

# Program types for which a blank UnitID is labelled as expected.
EXPECTED_NO_UNITID_PROGRAMTYPES = {"Alternative, not IHE-based"}
# Program types audited below when they end up without a UnitID.
IHE_EXPECTED_PROGRAMTYPES = {"Traditional", "Alternative, IHE-based"}

# Row-level masks reused by the status column, the audit and the summary.
no_unitid = merged["Matched_UnitID"].isna()
expected_blank = no_unitid & merged["ProgramType"].isin(EXPECTED_NO_UNITID_PROGRAMTYPES)
ihe_expected = merged["ProgramType"].isin(IHE_EXPECTED_PROGRAMTYPES)
any_flag_missing = (
    merged[["IHE_Title_IV_Funded", "IHE_Based_RPT", "NON_IHE_Based_RPT"]].isna().any(axis=1)
)

# Later assignments win: "Expected blank" overrides "Unmatched".
match_status = pd.Series("Matched", index=merged.index)
match_status = match_status.mask(no_unitid, "Unmatched")
match_status = match_status.mask(expected_blank, "Expected blank (non-IHE program)")
merged["Match_Status"] = match_status

# IHE-expected rows with no UnitID and at least one crosswalk flag missing.
needs_review = merged.loc[ihe_expected & no_unitid & any_flag_missing].copy()

print("\n--- Post-merge audit ---")
print("Expected blank (non-IHE program) rows:", expected_blank.sum())
print("Needs review (IHE-expected, still unmatched) rows:", len(needs_review))

needs_review.to_csv("audit_needs_review_ihe_expected.csv", index=False)

print("Final rows:", len(merged))
print("Matched UnitIDs:", (~no_unitid).sum())
print("Unmatched rows:", no_unitid.sum())

merged.to_excel(OUTPUT_FILE, index=False)
print(f"Saved merged file → {OUTPUT_FILE}")


# **Post-merge Derivations & Final Column Ordering**

# In [None]:
import pandas as pd
import numpy as np

# ============================================================
# Post-merge derivations & final column ordering
# ============================================================

# Workbook read by this cell, and the workbook it writes at the end.
INPUT_FILE = "T2_IPEDS_Combined_Data_Matched_Final.xlsx"
OUTPUT_FILE = "T2_IPEDS_Combined_Data_Final_Updated_Reordered.xlsx"

df = pd.read_excel(INPUT_FILE)

print("Loaded merged dataset:", len(df), "rows")

# ------------------------------------------------------------
# 1) Adjust TotalEnrollment to account for post-2019 definition change
# ------------------------------------------------------------
# The integer cast raises if any ReportYear is missing or non-numeric.
df["ReportYear"] = df["ReportYear"].astype(int)

# Report years 2012-2019 (inclusive) keep TotalEnrollment unchanged; every
# other year has CompletersCurrent subtracted.
# NOTE(review): "every other year" includes years before 2012, not only
# years after 2019 — confirm that is intended if such rows can occur.
keeps_reported_total = df["ReportYear"].between(2012, 2019)
enrollment_less_completers = df["TotalEnrollment"] - df["CompletersCurrent"]
df["TotalEnrollment_Adj"] = np.where(
    keeps_reported_total,
    df["TotalEnrollment"],
    enrollment_less_completers,
)

# Optional safety check
negatives = df.loc[df["TotalEnrollment_Adj"] < 0]
if len(negatives) > 0:
    print("WARNING: Negative adjusted enrollment values detected:")
    audit_columns = ["ReportYear", "Program", "TotalEnrollment", "CompletersCurrent", "TotalEnrollment_Adj"]
    print(negatives[audit_columns].head())

# ------------------------------------------------------------
# 2) Define authoritative column order
# ------------------------------------------------------------
# Exactly these columns, in this order, make up the final file. Any other
# column in df is dropped (and reported below).
ordered_columns = [
    'ReportYear',
    'State', 'TitleII_State_Abbr',
    'ProgramCode',
    'Program',
    'ProgramType',
    'SchoolNeeds',
    'StateLocalNeeds',
    'LocalityTrn',
    'SpecEdCore',
    'GenEdDisabilities',
    'GenEdLEP',
    'GenEdLowInc',
    'GrantMajors',
    'TotalEnrollment',
    'TotalEnrollment_Adj',
    'CompletersCurrent',
    'AsianCompleters',
    'AsianEnrollment',
    'BlackCompleters',
    'BlackEnrollment',
    'HispanicCompleters',
    'HispanicEnrollment',
    'IndianCompleters',
    'IndianEnrollment',
    'IslanderCompleters',
    'IslanderEnrollment',
    'FemaleCompleters',
    'FemaleEnrollment',
    'MaleCompleters',
    'MaleEnrollment',
    'MultiRacialCompleters',
    'MultiRacialEnrollment',
    'NonReportGenderCompleters',
    'NonReportGenderEnrollment',
    'NonReportRaceEthCompleters',
    'NonReportRaceEthEnrollment',
    'OtherGenderCompleters',
    'OtherGenderEnrollment',
    'WhiteCompleters',
    'WhiteEnrollment',
    'PGMinGPAEntry',
    'PGMinGPAExit',
    'PGPrograms',
    'SCEAvgHrsFor_StdntTch',
    'SCEAvgHrsPrior_StdntTch',
    'SCEAvgHrsPrior_TeachRcd',
    'SCENumAdjunctIHE',
    'SCENumCoopTeachK12',
    'SuperFTEFaculty',
    'SuperStudents',
    'TotalNumPrepPrgs',
    'UGMinGPAEntry',
    'UGMinGPAExit',
    'UGPrograms',
    'AssuranceComments',
    # Matching / provenance fields (kept at end by design)
    'Standardized_Program',
    'Matched_UnitID',
    'Exact_Matched_IPEDS_Name',
    'Exact_Matched_State',
    'Match_Source',
    'Match_Category',
    'IHE_Title_IV_Funded',
    'IHE_Based_RPT',
    'NON_IHE_Based_RPT',
    'Notes'
]

missing_cols = [c for c in ordered_columns if c not in df.columns]
if missing_cols:
    raise KeyError(f"Missing expected columns before reordering: {missing_cols}")

# Selecting by the list above discards every other column without a trace,
# so name them here; the exported file itself is unchanged.
dropped_cols = [c for c in df.columns if c not in ordered_columns]
if dropped_cols:
    print("NOTE: Columns not in the authoritative order are dropped from the final file:", dropped_cols)

df = df[ordered_columns]

# ------------------------------------------------------------
# 3) Export final analysis-ready dataset
# ------------------------------------------------------------
df.to_excel(OUTPUT_FILE, index=False)

print("Final dataset saved to:", OUTPUT_FILE)
print("Final column count:", len(df.columns))