# Notebook 02 — Unsupervised Learning
**RaceDayAI ML Prediction Engine (Plan 07)**

Athlete clustering, pacing archetypes, UMAP visualization, anomaly detection.

**Reads:** `athlete_race.csv`, `athlete_profile.csv`
**Writes:** `cluster_assignments.csv`, `pacing_archetypes.csv`, `anomaly_flags.csv`

In [None]:
import pandas as pd
import numpy as np
import gc, warnings
from pathlib import Path
from time import time
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.ensemble import IsolationForest
warnings.filterwarnings('ignore')

BASE = Path('.').resolve().parent
CLEANED = BASE / 'data' / 'cleaned'
print(f"Data dir: {CLEANED}")
print(f"Files: {[f.name for f in CLEANED.glob('*.csv')]}")

Data dir: /Users/mykyta/projects/indie/racedayai/research/data/cleaned
Files: ['athlete_race.csv', 'athlete_profile.csv']


## 1. Load Data

In [None]:
profiles = pd.read_csv(CLEANED / 'athlete_profile.csv', low_memory=False)
races = pd.read_csv(CLEANED / 'athlete_race.csv',
                    usecols=['athlete_hash','gender','age_group','event_distance',
                             'swim_pct','bike_pct','run_pct','fade_ratio',
                             'bike_run_ratio','is_pro','total_sec',
                             'swim_sec','bike_sec','run_sec','t1_sec','t2_sec'],
                    low_memory=False)
print(f"Profiles: {len(profiles):,} | Races: {len(races):,}")
print(f"Profile columns: {list(profiles.columns)}")

Profiles: 1,629,366 | Races: 4,124,345
Profile columns: ['athlete_hash', 'athlete_name', 'gender', 'country', 'total_races', 'years_active', 'distances_raced', 'pb_swim_sec', 'pb_bike_sec', 'pb_run_sec', 'pb_total_sec', 'first_race_year', 'latest_race_year', 'improvement_slope', 'consistency_cv', 'swim_strength_z', 'bike_strength_z', 'run_strength_z', 'dominant_discipline', 'dnf_count', 'dnf_rate', 'avg_fade_ratio']


## 2. Athlete Clustering

Features: swim/bike/run strength z-scores, consistency, experience, improvement slope, fade ratio.
Compare K-Means (silhouette sweep), GMM (BIC), and HDBSCAN.

In [None]:
CLUSTER_FEATURES = [
    'swim_strength_z', 'bike_strength_z', 'run_strength_z',
    'consistency_cv', 'total_races', 'improvement_slope', 'avg_fade_ratio',
]

# Filter to athletes with 3+ races and sufficient features
df = profiles[profiles['total_races'] >= 3].copy()
valid = df[CLUSTER_FEATURES].notna().sum(axis=1) >= 5
df = df[valid].copy()
print(f"Clusterable athletes (3+ races, sufficient features): {len(df):,}")

# Prepare feature matrix
X = df[CLUSTER_FEATURES].copy()
X = X.fillna(X.median())
for col in X.columns:
    lo, hi = X[col].quantile([0.01, 0.99])
    X[col] = X[col].clip(lo, hi)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print(f"Feature matrix: {X_scaled.shape}")
print(f"Feature means (post-scale): {X_scaled.mean(axis=0).round(2)}")

Clusterable athletes (3+ races, sufficient features): 301,730
Feature matrix: (301730, 7)
Feature means (post-scale): [-0.  0.  0.  0. -0.  0. -0.]


### 2.1 K-Means Silhouette Sweep (k=5..20)

In [None]:
km_results = []
for k in range(5, 21):
    km = KMeans(n_clusters=k, n_init=10, random_state=42, max_iter=300)
    labels = km.fit_predict(X_scaled)
    sil = silhouette_score(X_scaled, labels, sample_size=min(50000, len(X_scaled)))
    ch = calinski_harabasz_score(X_scaled, labels)
    db = davies_bouldin_score(X_scaled, labels)
    km_results.append({'k': k, 'silhouette': sil, 'calinski_harabasz': ch, 'davies_bouldin': db})
    print(f"  k={k:2d}: sil={sil:.4f}  ch={ch:.0f}  db={db:.3f}")

km_results = pd.DataFrame(km_results)
best_km_k = int(km_results.loc[km_results['silhouette'].idxmax(), 'k'])
print(f"\nBest K-Means k={best_km_k} (silhouette={km_results['silhouette'].max():.4f})")

  k= 5: sil=0.2382  ch=87826  db=1.316
  k= 6: sil=0.2517  ch=84927  db=1.326
  k= 7: sil=0.2608  ch=80001  db=1.314
  k= 8: sil=0.2660  ch=77542  db=1.302
  k= 9: sil=0.2188  ch=75044  db=1.337
  k=10: sil=0.2227  ch=72703  db=1.292
  k=11: sil=0.2280  ch=70672  db=1.262
  k=12: sil=0.2256  ch=68211  db=1.306
  k=13: sil=0.2319  ch=65981  db=1.249
  k=14: sil=0.2173  ch=63629  db=1.302
  k=15: sil=0.2286  ch=61619  db=1.296
  k=16: sil=0.2055  ch=60120  db=1.342
  k=17: sil=0.1949  ch=58626  db=1.341
  k=18: sil=0.1945  ch=56820  db=1.347
  k=19: sil=0.1951  ch=55562  db=1.344
  k=20: sil=0.1867  ch=54043  db=1.369

Best K-Means k=8 (silhouette=0.2660)


### 2.2 GMM with BIC Selection (k=5..15)

In [None]:
gmm_results = []
for k in range(5, 16):
    gmm = GaussianMixture(n_components=k, covariance_type='full',
                          n_init=3, random_state=42, max_iter=200)
    gmm.fit(X_scaled)
    bic = gmm.bic(X_scaled)
    aic = gmm.aic(X_scaled)
    gmm_results.append({'k': k, 'bic': bic, 'aic': aic})
    print(f"  k={k:2d}: BIC={bic:.0f}  AIC={aic:.0f}")

gmm_results = pd.DataFrame(gmm_results)
best_gmm_k = int(gmm_results.loc[gmm_results['bic'].idxmin(), 'k'])
print(f"\nBest GMM k={best_gmm_k} (BIC={gmm_results['bic'].min():.0f})")

  k= 5: BIC=3255435  AIC=3253535
  k= 6: BIC=3194131  AIC=3191848
  k= 7: BIC=2468153  AIC=2465488
  k= 8: BIC=3077627  AIC=3074580
  k= 9: BIC=2353592  AIC=2350162
  k=10: BIC=2015323  AIC=2011511
  k=11: BIC=2123025  AIC=2118831
  k=12: BIC=1811825  AIC=1807249
  k=13: BIC=1427618  AIC=1422660
  k=14: BIC=1415540  AIC=1410200
  k=15: BIC=1414765  AIC=1409043

Best GMM k=15 (BIC=1414765)


### 2.3 HDBSCAN

In [None]:
try:
    import hdbscan
    clusterer = hdbscan.HDBSCAN(min_cluster_size=500, min_samples=50,
                                 metric='euclidean', cluster_selection_method='eom')
    hdb_labels = clusterer.fit_predict(X_scaled)
    n_clusters = len(set(hdb_labels)) - (1 if -1 in hdb_labels else 0)
    n_noise = (hdb_labels == -1).sum()
    print(f"HDBSCAN: {n_clusters} clusters, {n_noise:,} noise points ({100*n_noise/len(hdb_labels):.1f}%)")
    if n_clusters >= 2:
        valid_mask = hdb_labels != -1
        sil = silhouette_score(X_scaled[valid_mask], hdb_labels[valid_mask],
                               sample_size=min(50000, valid_mask.sum()))
        print(f"Silhouette (excl. noise): {sil:.4f}")
except ImportError:
    print("HDBSCAN not installed, skipping")
    hdb_labels = None

HDBSCAN: 2 clusters, 16,502 noise points (5.5%)
Silhouette (excl. noise): 0.4346


### 2.4 Final Clustering — K-Means with Best k

In [None]:
# Use best K-Means as primary (most interpretable)
print(f"Using K-Means k={best_km_k}")
km_final = KMeans(n_clusters=best_km_k, n_init=20, random_state=42)
df['cluster_id'] = km_final.fit_predict(X_scaled)

# GMM soft assignments
gmm_final = GaussianMixture(n_components=best_gmm_k, covariance_type='full',
                            n_init=5, random_state=42)
gmm_final.fit(X_scaled)
df['gmm_cluster'] = gmm_final.predict(X_scaled)
gmm_probs = gmm_final.predict_proba(X_scaled)
df['gmm_max_prob'] = gmm_probs.max(axis=1)

if hdb_labels is not None:
    df['hdbscan_cluster'] = hdb_labels

# Cluster centroids
centroids = pd.DataFrame(km_final.cluster_centers_, columns=CLUSTER_FEATURES)
centroids['size'] = df['cluster_id'].value_counts().sort_index().values
centroids['pct'] = 100 * centroids['size'] / centroids['size'].sum()

# Auto-name clusters
names = []
for i, row in centroids.iterrows():
    traits = []
    if row['swim_strength_z'] > 0.5: traits.append('StrongSwim')
    if row['bike_strength_z'] > 0.5: traits.append('StrongBike')
    if row['run_strength_z'] > 0.5: traits.append('StrongRun')
    if row['swim_strength_z'] < -0.5: traits.append('WeakSwim')
    if row['bike_strength_z'] < -0.5: traits.append('WeakBike')
    if row['run_strength_z'] < -0.5: traits.append('WeakRun')
    if row['total_races'] > centroids['total_races'].median() + 1: traits.append('Veteran')
    if row['total_races'] < centroids['total_races'].median() - 1: traits.append('Novice')
    if row['avg_fade_ratio'] > 1.1: traits.append('Fader')
    if row['improvement_slope'] < -100: traits.append('Improving')
    name = '_'.join(traits[:3]) if traits else f'Cluster_{i}'
    names.append(name)
centroids['name'] = names
df['cluster_name'] = df['cluster_id'].map(dict(enumerate(names)))

print("\nCluster Summary:")
for i, row in centroids.iterrows():
    print(f"  [{i}] {row['name']:30s} n={int(row['size']):,} ({row['pct']:.1f}%)")
    print(f"       swim_z={row['swim_strength_z']:.2f}  bike_z={row['bike_strength_z']:.2f}  "
          f"run_z={row['run_strength_z']:.2f}  races={row['total_races']:.1f}  fade={row['avg_fade_ratio']:.3f}")

Using K-Means k=8

Cluster Summary:
  [0] Veteran                        n=15,962 (5.3%)
       swim_z=0.30  bike_z=0.22  run_z=0.29  races=3.2  fade=-0.509
  [1] StrongSwim_StrongBike_StrongRun n=37,216 (12.3%)
       swim_z=1.06  bike_z=1.24  run_z=1.18  races=0.0  fade=-0.413
  [2] WeakSwim_WeakBike_WeakRun      n=14,655 (4.9%)
       swim_z=-1.26  bike_z=-1.49  run_z=-1.32  races=-0.4  fade=0.101
  [3] WeakSwim_WeakBike_WeakRun      n=33,377 (11.1%)
       swim_z=-0.51  bike_z=-0.80  run_z=-0.51  races=0.3  fade=-0.555
  [4] StrongRun                      n=91,692 (30.4%)
       swim_z=0.37  bike_z=0.38  run_z=0.52  races=-0.3  fade=-0.560
  [5] WeakRun_Fader                  n=68,343 (22.7%)
       swim_z=-0.32  bike_z=-0.21  run_z=-0.53  races=-0.3  fade=1.171
  [6] WeakSwim_WeakBike_WeakRun      n=23,533 (7.8%)
       swim_z=-1.55  bike_z=-1.77  run_z=-1.78  races=0.1  fade=0.639
  [7] StrongSwim_StrongBike_StrongRun n=16,952 (5.6%)
       swim_z=0.92  bike_z=1.18  run_z=1.08  r

### 2.5 UMAP Visualization

In [None]:
try:
    from umap import UMAP

    n_umap = min(100000, len(X_scaled))
    idx = np.random.RandomState(42).choice(len(X_scaled), n_umap, replace=False)
    print(f"Running UMAP on {n_umap:,} points...")

    reducer = UMAP(n_components=2, n_neighbors=30, min_dist=0.3, random_state=42)
    emb_2d = reducer.fit_transform(X_scaled[idx])

    umap_df = pd.DataFrame({
        'athlete_hash': df.iloc[idx]['athlete_hash'].values,
        'umap_x': emb_2d[:, 0], 'umap_y': emb_2d[:, 1],
        'cluster_id': df.iloc[idx]['cluster_id'].values,
    })
    umap_df.to_csv(CLEANED / 'umap_coords.csv', index=False)
    print(f"Saved UMAP coordinates: {len(umap_df):,} points")

    # Quick scatter plot
    try:
        import matplotlib
        matplotlib.use('Agg')
        import matplotlib.pyplot as plt
        fig, ax = plt.subplots(1, 1, figsize=(10, 8))
        scatter = ax.scatter(emb_2d[:, 0], emb_2d[:, 1],
                            c=df.iloc[idx]['cluster_id'].values,
                            cmap='tab10', s=1, alpha=0.3)
        ax.set_title('UMAP — Athlete Clusters')
        ax.set_xlabel('UMAP-1'); ax.set_ylabel('UMAP-2')
        plt.colorbar(scatter, label='Cluster ID')
        plt.tight_layout()
        plt.savefig(CLEANED / 'umap_clusters.png', dpi=150)
        plt.show()
        print("Saved umap_clusters.png")
    except Exception as e:
        print(f"Plotting failed: {e}")

except ImportError:
    print("UMAP not installed, skipping visualization")

UMAP not installed, skipping visualization


## 3. Pacing Archetypes

GMM on [swim_pct, bike_pct, run_pct, fade_ratio] per distance.

In [None]:
pac_cols = ['swim_pct', 'bike_pct', 'run_pct', 'fade_ratio']
pac_df = races.dropna(subset=pac_cols).copy()
pac_df = pac_df[pac_df['is_pro'] != True]
print(f"Records with complete pacing data: {len(pac_df):,}")

results_by_dist = {}

for dist in ['70.3', '140.6']:
    subset = pac_df[pac_df['event_distance'] == dist]
    if len(subset) < 1000:
        print(f"[{dist}] Too few records ({len(subset)}), skipping")
        continue
    print(f"\n[{dist}] {len(subset):,} records")

    X_pac = subset[pac_cols].values
    X_pac = np.clip(X_pac, np.percentile(X_pac, 1, axis=0), np.percentile(X_pac, 99, axis=0))
    pac_scaler = StandardScaler()
    X_pac_scaled = pac_scaler.fit_transform(X_pac)

    # GMM with BIC
    best_bic, best_k = np.inf, 4
    for k in range(3, 9):
        gmm = GaussianMixture(n_components=k, covariance_type='full',
                              n_init=3, random_state=42, max_iter=200)
        gmm.fit(X_pac_scaled)
        bic = gmm.bic(X_pac_scaled)
        if bic < best_bic:
            best_bic, best_k = bic, k
    print(f"  Best GMM components: {best_k}")

    gmm = GaussianMixture(n_components=best_k, covariance_type='full',
                          n_init=5, random_state=42)
    gmm.fit(X_pac_scaled)
    labels = gmm.predict(X_pac_scaled)
    probs = gmm.predict_proba(X_pac_scaled)

    for k_i in range(best_k):
        mask = labels == k_i
        means = subset.loc[mask, pac_cols].mean()
        med_total = subset.loc[mask, 'total_sec'].median()
        archetype = []
        if means['bike_pct'] > subset['bike_pct'].median() + 0.02: archetype.append('AggressiveBike')
        if means['fade_ratio'] > 1.08: archetype.append('HeavyFade')
        if means['fade_ratio'] < 0.95: archetype.append('StrongRun')
        if means['run_pct'] > subset['run_pct'].median() + 0.02: archetype.append('ConservativeBike')
        if not archetype: archetype.append('Balanced')
        name = '_'.join(archetype)
        print(f"  [{k_i}] {name:30s} n={mask.sum():,}  "
              f"swim={means['swim_pct']:.3f} bike={means['bike_pct']:.3f} "
              f"run={means['run_pct']:.3f} fade={means['fade_ratio']:.3f} "
              f"med_total={med_total/3600:.2f}h")

    results_by_dist[dist] = (subset.index, labels, probs)

# Build output
all_labels = pd.Series(np.nan, index=races.index, dtype='float')
all_probs = pd.Series(np.nan, index=races.index, dtype='float')
for dist, (idx, labels, probs) in results_by_dist.items():
    all_labels.loc[idx] = labels
    all_probs.loc[idx] = probs.max(axis=1)

print(f"\nPacing archetypes assigned: {all_labels.notna().sum():,}")

Records with complete pacing data: 3,720,788

[70.3] 2,085,244 records
  Best GMM components: 7
  [0] AggressiveBike                 n=122,374  swim=0.090 bike=0.557 run=0.330 fade=1.026 med_total=6.31h
  [1] HeavyFade                      n=25,196  swim=0.152 bike=0.448 run=0.374 fade=1.258 med_total=6.83h
  [2] HeavyFade_ConservativeBike     n=20,961  swim=0.100 bike=0.404 run=0.479 fade=1.553 med_total=6.92h
  [3] HeavyFade_ConservativeBike     n=469,765  swim=0.105 bike=0.475 run=0.394 fade=1.312 med_total=6.80h
  [4] StrongRun                      n=34,829  swim=0.157 bike=0.510 run=0.312 fade=0.877 med_total=5.80h
  [5] AggressiveBike_StrongRun       n=453,447  swim=0.112 bike=0.534 run=0.333 fade=0.816 med_total=5.00h
  [6] Balanced                       n=958,672  swim=0.113 bike=0.507 run=0.353 fade=0.994 med_total=5.77h

[140.6] 1,408,771 records
  Best GMM components: 7
  [0] StrongRun                      n=122,485  swim=0.119 bike=0.509 run=0.358 fade=0.898 med_total=11.42

## 4. Anomaly Detection

Rule-based flags + Isolation Forest per distance.

In [None]:
flags = pd.DataFrame(index=races.index)

# Sum check
computed_sum = (races['swim_sec'].fillna(0) + races['bike_sec'].fillna(0) +
                races['run_sec'].fillna(0) + races['t1_sec'].fillna(0) + races['t2_sec'].fillna(0))
has_all = races['swim_sec'].notna() & races['bike_sec'].notna() & races['run_sec'].notna()
flags['sum_mismatch'] = has_all & ((races['total_sec'] - computed_sum).abs() > 120)

# Extreme transitions
flags['extreme_t1'] = races['t1_sec'].notna() & (races['t1_sec'] > 900)
flags['extreme_t2'] = races['t2_sec'].notna() & (races['t2_sec'] > 900)

# Impossible split ratios
for seg in ['swim', 'bike', 'run']:
    col = f'{seg}_pct'
    if col in races.columns:
        flags[f'{seg}_pct_extreme'] = races[col].notna() & (
            (races[col] < 0.02) | (races[col] > 0.70))

print("Rule-based flags:")
for col in flags.columns:
    n = flags[col].sum()
    if n > 0:
        print(f"  {col}: {n:,}")

# Isolation Forest per distance
print("\nIsolation Forest per distance...")
iso_cols = ['swim_sec', 'bike_sec', 'run_sec', 'total_sec']
flags['isolation_forest'] = False

for dist in races['event_distance'].dropna().unique():
    subset = races[races['event_distance'] == dist].dropna(subset=iso_cols)
    if len(subset) < 100:
        continue
    X_iso = subset[iso_cols].values
    iso = IsolationForest(contamination=0.01, random_state=42, n_jobs=-1)
    preds = iso.fit_predict(X_iso)
    anomalies = preds == -1
    flags.loc[subset.index[anomalies], 'isolation_forest'] = True
    print(f"  [{dist}] {anomalies.sum():,} anomalies ({100*anomalies.mean():.1f}%)")

# Combined
flags['is_anomaly'] = flags.any(axis=1)
flags['reason'] = flags.apply(
    lambda r: ','.join([c for c in flags.columns if c not in ('is_anomaly','reason') and r[c]]),
    axis=1)
flags['reason'] = flags['reason'].replace('', np.nan)

n_anom = flags['is_anomaly'].sum()
print(f"\nTotal anomalies: {n_anom:,} ({100*n_anom/len(races):.2f}%)")

Rule-based flags:
  sum_mismatch: 941,572
  bike_pct_extreme: 7
  run_pct_extreme: 113

Isolation Forest per distance...
  [70.3] 21,436 anomalies (1.0%)
  [140.6] 15,026 anomalies (1.0%)
  [olympic] 2,380 anomalies (1.0%)
  [sprint] 1,034 anomalies (1.0%)
  [100km] 5 anomalies (1.1%)

Total anomalies: 971,055 (23.54%)


## 5. Save Outputs

In [None]:
# Cluster assignments
cluster_out = df[['athlete_hash', 'cluster_id', 'cluster_name',
                   'gmm_cluster', 'gmm_max_prob']].copy()
if 'hdbscan_cluster' in df.columns:
    cluster_out['hdbscan_cluster'] = df['hdbscan_cluster']
cluster_out.to_csv(CLEANED / 'cluster_assignments.csv', index=False)
centroids.to_csv(CLEANED / 'cluster_centroids.csv', index=False)
print(f"cluster_assignments.csv: {len(cluster_out):,}")

# Pacing archetypes
pac_out = pd.DataFrame({
    'pacing_archetype': all_labels,
    'pacing_confidence': all_probs,
})
pac_out.to_csv(CLEANED / 'pacing_archetypes.csv', index=False)
print(f"pacing_archetypes.csv: {pac_out['pacing_archetype'].notna().sum():,} assigned")

# Anomaly flags
anomaly_out = flags[['is_anomaly', 'reason']]
anomaly_out.to_csv(CLEANED / 'anomaly_flags.csv', index=False)
print(f"anomaly_flags.csv: {flags['is_anomaly'].sum():,} anomalies")

print("\n✅ UNSUPERVISED COMPLETE")

cluster_assignments.csv: 301,730
pacing_archetypes.csv: 3,494,015 assigned
anomaly_flags.csv: 971,055 anomalies

✅ UNSUPERVISED COMPLETE
