## Optional: Title II Column Schema Audit

In [None]:
!pip install pandas requests fuzzywuzzy python-Levenshtein openpyxl

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.3-py3-none-any.whl.metadata (3.9 kB)
Collecting Levenshtein==0.27.3 (from python-Levenshtein)
  Downloading levenshtein-0.27.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (3.7 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.3->python-Levenshtein)
  Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Downloading python_levenshtein-0.27.3-py3-none-any.whl (9.5 kB)
Downloading levenshtein-0.27.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (153 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.3/153.3 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 M

In [None]:
import pandas as pd
import requests
from io import BytesIO
from fuzzywuzzy import process, fuzz
import numpy as np
from typing import List, Dict, Any

# If needed in Colab:
# !pip install pandas requests fuzzywuzzy python-Levenshtein openpyxl


# ============================================================
# Helpers
# ============================================================

def download_excel_sheet(
    url: str,
    sheet_name: str = "Program",
    timeout: int = 30
) -> pd.DataFrame:
    """Fetch an Excel workbook over HTTP and load one of its sheets.

    Args:
        url: Location of the .xlsx file to download.
        sheet_name: Name of the worksheet to read.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        The requested sheet parsed by pandas (openpyxl engine).

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            server answers with an HTTP error status.
    """
    response = requests.get(
        url,
        timeout=timeout,
        headers={"User-Agent": "titleII-ipeds-crosswalk/1.0 (research; colab)"},
    )
    # Surface 4xx/5xx responses as exceptions instead of parsing an error page.
    response.raise_for_status()

    workbook_bytes = BytesIO(response.content)
    return pd.read_excel(workbook_bytes, sheet_name=sheet_name, engine="openpyxl")


def safe_dtype_str(dtype_obj) -> str:
    """Return ``str(dtype_obj)``, or an empty string if the conversion raises."""
    try:
        text = str(dtype_obj)
    except Exception:
        # Best-effort: a dtype that cannot be rendered is reported as blank.
        text = ""
    return text


# ============================================================
# Main Report Function
# ============================================================

def _diagnose_dtype_mismatch(col_series, current_dtype_str, reference_dtype_str) -> tuple:
    """Describe why a column's dtype differs from its matched reference dtype.

    Args:
        col_series: The column (pandas Series) being inspected.
        current_dtype_str: String form of ``col_series``' dtype.
        reference_dtype_str: String form of the matched reference column's dtype.

    Returns:
        ``(explanations, problem_values)``: two lists of strings holding the
        human-readable findings and example offending values respectively.
    """
    explanations: List[str] = []
    problem_values: List[str] = []

    expects_int = reference_dtype_str.startswith("int")
    expects_numeric = expects_int or reference_dtype_str.startswith("float")

    if current_dtype_str == "object" and expects_numeric:
        # Case 1: object but numeric expected
        explanations.append("Current is 'object' where numeric expected—likely mixed values or symbols.")
        numeric_attempt = pd.to_numeric(col_series, errors="coerce")
        # Values that were present but could not be coerced to a number.
        non_numeric = col_series[numeric_attempt.isna() & col_series.notna()]

        if not non_numeric.empty:
            examples = [str(x) for x in list(pd.unique(non_numeric))[:5]]
            problem_values.extend(examples)
            explanations.append(f"Contains non-numeric values (examples: {examples[:3]}).")

        # Mixed python types (rough signal)
        type_counts = col_series.dropna().apply(lambda x: type(x).__name__).value_counts()
        if len(type_counts) > 1:
            explanations.append(f"Mixed Python types detected: {type_counts.to_dict()}.")

    elif current_dtype_str == "float64" and expects_int:
        # Case 2: float where int expected (often NaNs)
        nan_count = int(col_series.isna().sum())
        if nan_count > 0:
            explanations.append(f"NaNs present ({nan_count}), forcing float dtype.")
            problem_values.append(f"NaN count: {nan_count}")

        non_integer_floats = col_series[col_series.notna() & (col_series % 1 != 0)]
        if not non_integer_floats.empty:
            examples = [str(x) for x in list(pd.unique(non_integer_floats))[:5]]
            problem_values.extend(examples)
            explanations.append(f"Contains non-integer float values (examples: {examples[:3]}).")

    else:
        # Generic samples for other mismatch patterns
        explanations.append("Unexpected dtype mismatch—inspect samples.")
        problem_values.extend(str(x) for x in col_series.dropna().head(10).tolist())

    return explanations, problem_values


def generate_comprehensive_column_report(
    file_urls: List[str],
    reference_year_str: str = "2024",
    sheet_name: str = "Program",
    score_cutoff: int = 75,
    review_band_min: int = 75,
    review_band_max: int = 89,
    output_filename: str = "comprehensive_column_report.csv"
) -> pd.DataFrame:
    """
    Generates a comprehensive CSV report detailing, for every column of every year:
      - Column name and data type
      - Best match in the reference year (exact match first, then fuzzy)
      - Similarity score
      - Whether the dtype matches the matched reference column's dtype
      - Explanations for dtype mismatches + sample values
      - A NeedsReview flag for borderline name matches

    The year label of each file is taken from the second-to-last path segment
    of its URL. Files that fail to download or parse are reported on stdout and
    skipped.

    NOTE: the output column headers keep the literal "2024" regardless of
    ``reference_year_str`` so the CSV schema stays stable; only the explanation
    texts follow the chosen reference year.

    Args:
        file_urls: List of Title II AllStates.xlsx URLs (2012–2024).
        reference_year_str: Reference year to match against (default "2024").
        sheet_name: Excel sheet name (default "Program").
        score_cutoff: Fuzzywuzzy score cutoff for considering matches.
        review_band_min/review_band_max: Similarity range (inclusive) where
            non-exact matches are flagged for review.
        output_filename: CSV filename to write in the working directory.

    Returns:
        DataFrame of the report (also written to ``output_filename``).

    Raises:
        RuntimeError: If the reference year's sheet could not be loaded.
    """

    # Stores DataFrames of all files to avoid re-downloading
    all_dfs: Dict[str, pd.DataFrame] = {}

    # Stores reference column names and their data types
    reference_column_data_types: Dict[str, Any] = {}
    reference_column_names_list: List[str] = []

    output_rows: List[Dict[str, Any]] = []

    print("--- Step 1: Downloading and Loading All Excel Files ---")
    for url in file_urls:
        current_year_str = url.split("/")[-2]
        print(f"Loading {current_year_str} data from: {url}")

        try:
            df = download_excel_sheet(url, sheet_name=sheet_name, timeout=30)
            all_dfs[current_year_str] = df

            if current_year_str == reference_year_str:
                reference_column_data_types = {col: df[col].dtype for col in df.columns}
                reference_column_names_list = list(df.columns)
                print(f"  {reference_year_str} reference loaded with {len(reference_column_names_list)} columns.")

        except requests.exceptions.RequestException as e:
            print(f"  ERROR: Failed to download or access {url}: {e}. Skipping this file.")
            continue
        except ValueError as e:
            # pandas can raise ValueError if sheet_name missing
            print(f"  ERROR: Could not read sheet '{sheet_name}' from {url}: {e}. Skipping this file.")
            continue
        except Exception as e:
            # Deliberate best-effort: one bad file must not abort the whole audit.
            print(f"  ERROR: Unexpected error while processing {url}: {e}. Skipping this file.")
            continue

    if not reference_column_names_list:
        raise RuntimeError(f"Could not load {reference_year_str} reference data. Cannot proceed.")

    print("\n--- Step 2: Generating Comprehensive Report ---")
    sorted_years = sorted(all_dfs.keys())

    for current_year_str in sorted_years:
        df_current = all_dfs[current_year_str]
        is_reference_year = current_year_str == reference_year_str

        for col_name_current in list(df_current.columns):
            col_series = df_current[col_name_current]
            current_dtype = col_series.dtype
            current_dtype_str = safe_dtype_str(current_dtype)

            # Initialize report row fields
            possible_match_2024_name = ""
            similarity_score = np.nan
            dtype_2024_of_match = None
            dtypes_match = np.nan
            needs_review = False
            explanation_parts: List[str] = []
            problematic_values: List[str] = []

            # ------------------------------------------------------------
            # Name matching: exact match first, then fuzzy
            # ------------------------------------------------------------
            if is_reference_year:
                # Reference year: match is itself
                possible_match_2024_name = col_name_current
                similarity_score = 100
                dtype_2024_of_match = current_dtype
                dtypes_match = True
                explanation_parts.append(f"This is the {reference_year_str} reference file itself.")

            else:
                # The dtype dict shares its keys with the reference column list,
                # so membership here is an O(1) exact-name check.
                if col_name_current in reference_column_data_types:
                    possible_match_2024_name = col_name_current
                    similarity_score = 100
                else:
                    match_tuple = process.extractOne(
                        col_name_current,
                        reference_column_names_list,
                        scorer=fuzz.ratio,
                        score_cutoff=score_cutoff
                    )
                    if match_tuple:
                        possible_match_2024_name, similarity_score = match_tuple[0], match_tuple[1]

                if possible_match_2024_name:
                    dtype_2024_of_match = reference_column_data_types.get(possible_match_2024_name)
                    if dtype_2024_of_match is not None:
                        # Coerce to a builtin bool so the identity checks below
                        # (`is True` / `is False`) are reliable.
                        dtypes_match = bool(current_dtype == dtype_2024_of_match)
                    else:
                        dtypes_match = np.nan
                        explanation_parts.append(
                            f"Matched {reference_year_str} column has no recorded dtype (unexpected)."
                        )

                    # Flag borderline fuzzy matches for manual review
                    if (
                        pd.notna(similarity_score)
                        and review_band_min <= similarity_score <= review_band_max
                        and similarity_score != 100
                    ):
                        needs_review = True
                        explanation_parts.append("Borderline name match score—review recommended.")

                else:
                    explanation_parts.append(
                        f"Column not found in {reference_year_str} reference (no strong exact/fuzzy match)."
                    )
                    dtypes_match = np.nan

            # ------------------------------------------------------------
            # Dtype mismatch diagnostics
            # ------------------------------------------------------------
            if not is_reference_year and dtypes_match is False:
                reference_dtype_str = safe_dtype_str(dtype_2024_of_match)
                explanation_parts.append(
                    f"Dtype mismatch vs {reference_year_str} matched column ({current_dtype_str} vs {reference_dtype_str})."
                )
                extra_explanations, extra_problem_values = _diagnose_dtype_mismatch(
                    col_series, current_dtype_str, reference_dtype_str
                )
                explanation_parts.extend(extra_explanations)
                problematic_values.extend(extra_problem_values)

            elif not is_reference_year and dtypes_match is True and possible_match_2024_name:
                explanation_parts.append(f"Dtype matches {reference_year_str} matched column.")

            # ------------------------------------------------------------
            # Sample values (always useful)
            # ------------------------------------------------------------
            sample_values = [str(x) for x in col_series.dropna().head(5).tolist()]

            # Deduplicate (order-preserving) + cap lists
            problematic_values = list(dict.fromkeys(problematic_values))[:5]
            sample_values = list(dict.fromkeys(sample_values))[:5]

            # Add row
            output_rows.append({
                "ReportYear": current_year_str,
                "Column Name": col_name_current,
                "Data Type": current_dtype_str,
                "Possible Column Name Match from 2024 data": possible_match_2024_name,
                "Similarity Score for Column Name Match": similarity_score,
                "Needs Review (Name Match)": needs_review,
                "True/False column if data types match with 2024 data type": dtypes_match,
                "2024 Data Type": safe_dtype_str(dtype_2024_of_match) if dtype_2024_of_match is not None else "",
                "Explanation": "; ".join(explanation_parts),
                "Problematic Values (if any)": "; ".join(problematic_values),
                "Sample Values": "; ".join(sample_values),
            })

    # ------------------------------------------------------------
    # Step 3: Create CSV Output
    # ------------------------------------------------------------
    print("\n--- Step 3: Writing CSV Report ---")
    report_df = pd.DataFrame(output_rows)

    # Sorting for readability: per year, review-flagged rows first, then
    # dtype mismatches (False) ahead of matches (True).
    report_df = report_df.sort_values(
        by=[
            "ReportYear",
            "Needs Review (Name Match)",
            "True/False column if data types match with 2024 data type",
            "Column Name",
        ],
        ascending=[True, False, True, True]
    )

    report_df.to_csv(output_filename, index=False)
    print(f"Successfully created '{output_filename}'.")

    return report_df


# ============================================================
# Run (Title II Program sheets: 2012–2024 inclusive)
# ============================================================

# One AllStates.xlsx URL per reporting year, 2012 through 2024 inclusive.
file_urls = [
    f"https://title2.ed.gov/Public/DataTools/{year}/AllStates.xlsx"
    for year in range(2012, 2025)
]

# Build the schema audit against the 2024 "Program" sheet and write it to CSV.
report_df = generate_comprehensive_column_report(
    file_urls=file_urls,
    reference_year_str="2024",
    sheet_name="Program",
    score_cutoff=75,
    output_filename="comprehensive_column_report.csv",
)

# Show the first rows so the result can be eyeballed in the notebook output.
preview_rows = report_df.head(10)
print("\nReport preview:")
print(preview_rows)