In [1]:
import logging
import re
from typing import Callable, Dict, Iterable, Optional

import numpy as np
import pandas as pd

In [3]:
# --------------------------- Config / Globals --------------------------- #

INPUT_CSV  = "ufc_comprehensive_data.csv"
OUTPUT_CSV = "ufc_data_numeric_converted.csv"

# Use a fixed reference date (as in your original script). Make it a parameter for testability.
REF_DATE = pd.Timestamp(2025, 10, 17)

# Columns that can appear and should be dropped if present
DETAIL_COLS: tuple[str, ...] = ("height_detail", "weight_detail", "reach_detail", "stance_detail")

STANCE_MAP: Dict[str, int] = {
    "orthodox": 1,
    "southpaw": 2,
    "switch":   3,
}

PERCENT_COLUMNS: tuple[str, ...] = (
    "striking_accuracy",
    "striking_defense",
    "takedown_accuracy",
    "takedown_defense",
)

In [4]:
# --------------------------- Logging Setup ----------------------------- #

logging.basicConfig(
    level=logging.INFO,
    format="%(levelname)s:%(message)s"
)
log = logging.getLogger(__name__)

In [6]:
# --------------------------- Utilities --------------------------------- #

def drop_if_present(df: pd.DataFrame, cols: Iterable[str]) -> pd.DataFrame:
    """Drop any columns from 'cols' that are present in df, ignoring the rest."""
    present = [c for c in cols if c in df.columns]
    if present:
        log.info("Dropping columns: %s", present)
        return df.drop(columns=present, errors="ignore")
    return df

def parse_height_to_inches(value: object) -> float:
    if pd.isna(value):
        return np.nan
    s = str(value).strip().lower()
    if not s or s == "nan":
        return np.nan
    if "'" in s:
        try:
            feet, inches = s.replace('"', "").split("'")
            feet = float(feet) if feet else 0.0
            inches = float(inches) if inches else 0.0
            return feet * 12 + inches
        except Exception:
            return np.nan
    try:
        s = s.replace('"', "").replace("in", "").strip()
        return float(s)
    except Exception:
        return np.nan

def to_float_strip_units(value: object, units: Iterable[str]) -> float:
    if pd.isna(value):
        return np.nan
    s = str(value).strip().lower()
    if not s or s == "nan":
        return np.nan
    for u in units:
        s = s.replace(u.lower(), "")
    s = re.sub(r"\s+", "", s)
    try:
        return float(s)
    except Exception:
        return np.nan

def parse_inches(value: object) -> float:
    return to_float_strip_units(value, units=("in", '"'))

def parse_weight_lbs(value: object) -> float:
    return to_float_strip_units(value, units=("lbs", "lb", "pounds"))

def parse_percent(value: object) -> float:
    return to_float_strip_units(value, units=("%",))

def map_stance(value: object, mapping: Dict[str, int]) -> float:
    if pd.isna(value):
        return np.nan
    key = str(value).strip().lower()
    return float(mapping.get(key, np.nan))

def dob_series_to_age(dob: pd.Series, ref: pd.Timestamp) -> pd.Series:
    """
    Vectorized 'dob' (e.g., 'Jul 13, 1978') -> integer ages as of 'ref'.
    Invalid/unparseable rows become <NA> (nullable Int64 dtype).
    """
    parsed = pd.to_datetime(dob.astype("string").str.strip(), format="%b %d, %Y", errors="coerce")
    age_years = (
        ref.year - parsed.dt.year
        - ((ref.month, ref.day) < (parsed.dt.month, parsed.dt.day)).astype("int")
    )
    age_years = age_years.where(parsed.notna())  # Keep NaN when parse failed
    return age_years.astype("Int64")

def apply_cleaners(df: pd.DataFrame, spec: Dict[str, Callable[[object], float]]) -> pd.DataFrame:
    for col, fn in spec.items():
        if col in df.columns:
            log.info("Cleaning column: %s", col)
            df[col] = df[col].apply(fn)
        else:
            log.info("Column not found (skipped): %s", col)
    return df

def report_columns(df: pd.DataFrame, cols: Iterable[str]) -> None:
    """Compact report: dtype, missing %, and first 5 non-null values."""
    log.info("---- Column Report ----")
    n = len(df)
    for c in cols:
        if c not in df.columns:
            log.info("%s: <missing column>", c)
            continue
        miss = df[c].isna().sum()
        dtype = df[c].dtype
        sample = df[c].dropna().head(5).to_list()
        log.info("%s | dtype=%s | missing=%d (%.1f%%) | sample=%s",
                 c, dtype, miss, miss / max(n, 1) * 100.0, sample)

In [7]:
# --------------------------- Pipeline ---------------------------------- #

def clean_matches(
    csv_path: str = INPUT_CSV,
    ref_date: pd.Timestamp = REF_DATE,
    output_path: Optional[str] = OUTPUT_CSV,
) -> pd.DataFrame:
    """
    End-to-end cleaning pipeline:
      - Load
      - Drop *_detail columns if present
      - Replace 'dob' -> 'age'
      - Normalize height/weight/reach/stance
      - Normalize % columns
      - Save (optional)
    """
    log.info("Loading: %s", csv_path)
    matches = pd.read_csv(csv_path, index_col=0)
    df = matches.copy()

    # 1) Drop *_detail columns
    df = drop_if_present(df, DETAIL_COLS)

    # 2) dob -> age
    if "dob" in df.columns:
        log.info("Converting 'dob' to 'age' with reference date %s", ref_date.date())
        df["age"] = dob_series_to_age(df["dob"], ref=ref_date)
        df = df.drop(columns=["dob"])
    else:
        log.warning("'dob' column not found; skipping age calculation.")

    # 3) Column-specific cleaners (DRY via one mapping)
    cleaners: Dict[str, Callable[[object], float | np.nan]] = {
        "height": parse_height_to_inches,
        "reach":  parse_inches,
        "weight": parse_weight_lbs,
    }
    df = apply_cleaners(df, cleaners)

    # 4) Stance mapping (vectorized)
    if "stance" in df.columns:
        log.info("Mapping stance -> numeric codes")
        df["stance"] = df["stance"].map(lambda v: map_stance(v, STANCE_MAP))
    else:
        log.info("Column not found (skipped): stance")

    # 5) Percent columns (apply one function across many columns)
    percent_clean_spec = {c: parse_percent for c in PERCENT_COLUMNS}
    df = apply_cleaners(df, percent_clean_spec)

    # 6) Quick report
    columns_to_report = [
        "age", "height", "weight", "reach", "stance", *PERCENT_COLUMNS
    ]
    report_columns(df, columns_to_report)

    # 7) Save
    if output_path:
        df.to_csv(output_path)
        log.info("Saved cleaned data -> %s", output_path)

    # 8) dtypes overview (single line)
    log.info("Dtypes summary:\n%s", df.dtypes)
    return df

In [8]:
# --------------------------- Script Entry ------------------------------- #

if __name__ == "__main__":
    clean_matches()


INFO:Loading: ufc_comprehensive_data.csv
INFO:Dropping columns: ['height_detail', 'weight_detail', 'reach_detail', 'stance_detail']
INFO:Converting 'dob' to 'age' with reference date 2025-10-17


ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().