In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

# ----------------------------------------------------
# 1. Load your dataset
# ----------------------------------------------------
df = pd.read_csv("data.csv")

# ----------------------------------------------------
# 2. Define prediction targets (multi-output regression)
# ----------------------------------------------------
targets = ["target_x", "target_y"]

y = df[targets]

# ----------------------------------------------------
# 3. Define features and feature-sets to evaluate (KNN section)
# ----------------------------------------------------
numeric_features = [
    "x", "y", "s", "a", "o", "dir", "ball_land_x", "ball_land_y",
    "distance_to_land", "angle_to_land", "movement_angle_difference", "orientation_angle_difference",
    "x_mean", "x_std", "x_first", "x_last", "x_min", "x_max",
    "y_mean", "y_std", "y_first", "y_last", "y_min", "y_max",
    "speed_mean", "speed_std", "accel_mean", "accel_std",
    "dir_mean", "dir_std", "dir_first", "dir_last",
    "distance_to_closest_sideline", "dx_to_closest_opponent", "dy_to_closest_opponent", "frame_id"
]

# Role columns -> convert explicitly to nullable integers (1/0) if present.
# The lookup accepts boolean-like text ('True'/'False', any case) as well as
# values that are already encoded as 0/1. Without the numeric spellings an
# already-encoded column would map to all-<NA>, and every row would later be
# discarded by dropna() for any feature-set that includes the role columns.
role_value_map = {
    'true': 1, 'false': 0,
    '1': 1, '0': 0,
    '1.0': 1, '0.0': 0,
}
role_cols = [c for c in ["role_Defensive Coverage", "role_Targeted Receiver"] if c in df.columns]
for c in role_cols:
    # Anything not in role_value_map (including missing values, which astype(str)
    # renders as text such as 'nan') becomes <NA>.
    df[c] = (
        df[c]
        .astype(str)
        .str.strip()
        .str.lower()
        .map(role_value_map)
        .astype('Int64')
    )

# 'Clean' and 'Clean Hypertuned' share exactly the same columns; the only
# difference is that the latter has its KNN hyperparameters tuned downstream.
# Define the list once so the two entries cannot drift apart.
clean_features = [
    'x', 'y', 'distance_to_closest_sideline', 'frame_id', 'ball_land_x', 'ball_land_y',
    'dir', 'dir_mean', 'movement_angle_difference', 's',
]

# Build a dictionary of named feature-sets (user example names).
# Insertion order is the order in which the sets are evaluated and reported.
feature_sets = {
    'All': numeric_features + role_cols,
    'Minimal': ['x','y','distance_to_closest_sideline','frame_id','ball_land_x','ball_land_y','dir'],
    'Clean': list(clean_features),
    'Robust': ['x','y','frame_id','dir','dir_mean','movement_angle_difference','s','x_std','angle_to_land','o','distance_to_land','y_std','x_min','x_max','speed_mean','orientation_angle_difference','y_min','y_max'],
    'Clean Hypertuned': list(clean_features),
}

# ----------------------------------------------------
# Hypertune function for KNN using GridSearchCV
# ----------------------------------------------------

def hypertune_knn(X_train, y_train, cv=3, n_jobs=-1, param_grid=None):
    """
    Grid-search KNeighborsRegressor hyperparameters for multioutput regression.

    Parameters
    ----------
    X_train : array-like
        Training features; handed to ``GridSearchCV.fit`` unchanged.
    y_train : array-like
        Training targets; handed to ``GridSearchCV.fit`` unchanged.
    cv : int, default 3
        Forwarded as the ``cv`` argument of GridSearchCV.
    n_jobs : int, default -1
        Forwarded as the ``n_jobs`` argument of GridSearchCV.
    param_grid : dict or None, default None
        Hyperparameter grid to search. When None, the built-in grid is used:
        ``n_neighbors`` in [3, 5, 7, 9, 11], ``weights`` in
        ['uniform', 'distance'] and ``p`` in [1, 2].

    Returns
    -------
    tuple
        ``(best_estimator, best_params, best_score)`` taken from the fitted
        search's ``best_estimator_``, ``best_params_`` and ``best_score_``.
        Scoring is 'neg_mean_absolute_error', so a higher (closer to zero)
        score is better.
    """
    if param_grid is None:
        param_grid = {
            'n_neighbors': [3, 5, 7, 9, 11],
            'weights': ['uniform', 'distance'],
            'p': [1, 2],
        }

    # Use neg_mean_absolute_error as scoring (averaged across outputs)
    search = GridSearchCV(
        KNeighborsRegressor(),
        param_grid,
        scoring='neg_mean_absolute_error',
        cv=cv,
        n_jobs=n_jobs,
        verbose=0,
    )
    search.fit(X_train, y_train)
    return search.best_estimator_, search.best_params_, search.best_score_

# ----------------------------------------------------
# 3b. Evaluate KNN for each feature-set and collect metrics
# ----------------------------------------------------
# One summary row (dict) per evaluated feature-set; turned into a table afterwards.
results = []
for name, fset in feature_sets.items():
    # keep only columns that exist in df
    cols = [c for c in fset if c in df.columns]
    if not cols:
        print(f'Skipping {name}: no matching columns in dataframe')
        continue

    # Coerce features and targets to numeric FIRST, then drop incomplete rows.
    # errors='coerce' turns unparseable values into NaN, so coercing after
    # dropna() could re-introduce NaNs that the scaler / KNN would reject.
    use_cols = cols + targets
    sub = df[use_cols].apply(pd.to_numeric, errors='coerce').dropna()
    if sub.shape[0] < 2:
        print(f'Skipping {name}: not enough rows after dropna')
        continue

    # No missing values remain, so nullable Int64 columns can be cast to plain
    # float64, giving scikit-learn an ordinary numeric array.
    X_sub = sub[cols].astype('float64')
    y_sub = sub[targets].astype('float64')

    # split
    X_train, X_test, y_train, y_test = train_test_split(X_sub, y_sub, test_size=0.2, random_state=42)

    # scale features (statistics are learned from the training split only)
    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(X_train)
    X_test_s = scaler.transform(X_test)

    # hypertune for the Clean Hypertuned set, default KNN for everything else
    hypertuned_info = ''
    if name == 'Clean Hypertuned':
        # hypertune_knn returns an estimator that GridSearchCV has already fitted
        best_est, best_params, best_score = hypertune_knn(X_train_s, y_train, cv=3, n_jobs=-1)
        knn = best_est
        hypertuned_info = str(best_params)
    else:
        # Clamp k so a very small training split cannot make the neighbour
        # query fail by asking for more neighbours than there are samples.
        knn = KNeighborsRegressor(n_neighbors=min(5, len(X_train)))
        knn.fit(X_train_s, y_train)

    # predictions: column 0 is target_x, column 1 is target_y (order of `targets`)
    y_train_pred = knn.predict(X_train_s)
    y_test_pred = knn.predict(X_test_s)

    # metrics per-axis
    r2_train_x = float(r2_score(y_train['target_x'], y_train_pred[:, 0]))
    r2_train_y = float(r2_score(y_train['target_y'], y_train_pred[:, 1]))
    r2_test_x = float(r2_score(y_test['target_x'], y_test_pred[:, 0]))
    r2_test_y = float(r2_score(y_test['target_y'], y_test_pred[:, 1]))

    mae_train_x = float(mean_absolute_error(y_train['target_x'], y_train_pred[:, 0]))
    mae_train_y = float(mean_absolute_error(y_train['target_y'], y_train_pred[:, 1]))
    mae_test_x = float(mean_absolute_error(y_test['target_x'], y_test_pred[:, 0]))
    mae_test_y = float(mean_absolute_error(y_test['target_y'], y_test_pred[:, 1]))

    rmse_test_x = float(np.sqrt(mean_squared_error(y_test['target_x'], y_test_pred[:, 0])))
    rmse_test_y = float(np.sqrt(mean_squared_error(y_test['target_y'], y_test_pred[:, 1])))

    # Key order here defines the column order of the summary table.
    results.append({
        'Features': name,
        'Training X R^2': r2_train_x,
        'Training Y R^2': r2_train_y,
        'Testing X R^2': r2_test_x,
        'Testing Y R^2': r2_test_y,
        'Training X MAE': mae_train_x,
        'Training Y MAE': mae_train_y,
        'Testing X MAE': mae_test_x,
        'Testing Y MAE': mae_test_y,
        'Testing X RMSE': rmse_test_x,
        'Testing Y RMSE': rmse_test_y,
        'Hypertuned Params': hypertuned_info,
    })

# Collect the per-feature-set rows into a single table.
res_df = pd.DataFrame(results)

# Round every metric column to 3 decimals; the two text columns are left alone.
if not res_df.empty:
    num_cols = [
        col for col in res_df.columns
        if col not in ('Features', 'Hypertuned Params')
    ]
    res_df[num_cols] = res_df[num_cols].round(3)

# Fixed-width rendering for reading in the notebook.
summary_text = res_df.to_string(index=False)
print('\nKNN evaluation summary:')
print(summary_text)

# Tab-separated rendering: pasting it into a spreadsheet puts each column in its own cell.
tsv_text = res_df.to_csv(index=False, sep='\t')
print('\nTSV output (paste into spreadsheet):')
print(tsv_text)



KNN evaluation summary:
        Features  Training X R^2  Training Y R^2  Testing X R^2  Testing Y R^2  Training X MAE  Training Y MAE  Testing X MAE  Testing Y MAE  Testing X RMSE  Testing Y RMSE                                 Hypertuned Params
             All           1.000           0.998          0.999          0.996           0.277           0.343          0.487          0.566           0.829           0.857                                                  
         Minimal           0.998           0.996          0.996          0.993           0.788           0.525          1.065          0.715           1.513           1.096                                                  
           Clean           0.999           0.998          0.997          0.997           0.558           0.337          0.924          0.531           1.324           0.777                                                  
          Robust           1.000           0.999          0.999          0.997     

Features||Training X R^2||Training Y R^2||Testing X R^2||Testing Y R^2||Training X MAE||Training Y MAE||Testing X MAE||Testing Y MAE||Testing X RMSE||Testing Y RMSE||Hypertuned Params
All||1.0||0.998||0.999||0.996||0.277||0.343||0.487||0.566||0.829||0.857	
Minimal||0.998||0.996||0.996||0.993||0.788||0.525||1.065||0.715||1.513||1.096	
Clean||0.999||0.998||0.997||0.997||0.558||0.337||0.924||0.531||1.324||0.777	
Robust||1.0||0.999||0.999||0.997||0.293||0.313||0.518||0.492||0.853||0.734	
Clean Hypertuned||1.0||1.0||1.0||0.999||0.0||0.0||0.225||0.272||0.362||0.41||{'n_neighbors': 3, 'p': 1, 'weights': 'distance'}

In [None]:
# Hard-coded text snapshot of a KNN evaluation summary table (space-aligned columns,
# without the 'Hypertuned Params' column). Nothing in this cell computes the values.
# NOTE(review): the "Clean Hypertuned" row below is identical to the "Clean" row, while
# the output printed by the evaluation cell in this notebook reports different numbers
# for "Clean Hypertuned" (e.g. Testing X MAE 0.225) - confirm which run this reflects.
output = """
KNN evaluation summary:
        Features  Training X R^2  Training Y R^2  Testing X R^2  Testing Y R^2  Training X MAE  Training Y MAE  Testing X MAE  Testing Y MAE  Testing X RMSE  Testing Y RMSE
             All           1.000           0.998          0.999          0.996           0.277           0.343          0.487          0.566           0.829           0.857
         Minimal           0.998           0.996          0.996          0.993           0.788           0.525          1.065          0.715           1.513           1.096
           Clean           0.999           0.998          0.997          0.997           0.558           0.337          0.924          0.531           1.324           0.777
          Robust           1.000           0.999          0.999          0.997           0.293           0.313          0.518          0.492           0.853           0.734
Clean Hypertuned           0.999           0.998          0.997          0.997           0.558           0.337          0.924          0.531           1.324           0.777"""

In [None]:
# Hard-coded text snapshot of a KNN evaluation summary table, with columns separated by
# \t escapes (tabs once the string is evaluated). This reassigns the name `output`, so it
# replaces any value bound to that name by an earlier cell.
# NOTE(review): the "Clean Hypertuned" row below is identical to the "Clean" row, while
# the output printed by the evaluation cell in this notebook reports different numbers
# for "Clean Hypertuned" (e.g. Testing X MAE 0.225) - confirm which run this reflects.
output = """
KNN evaluation summary:
        Features\tTraining X R^2\tTraining Y R^2\tTesting X R^2\tTesting Y R^2\tTraining X MAE\tTraining Y MAE\tTesting X MAE\tTesting Y MAE\tTesting X RMSE\tTesting Y RMSE
             All\t1.000\t0.998\t0.999\t0.996\t0.277\t0.343\t0.487\t0.566\t0.829\t0.857
         Minimal\t0.998\t0.996\t0.996\t0.993\t0.788\t0.525\t1.065\t0.715\t1.513\t1.096
           Clean\t0.999\t0.998\t0.997\t0.997\t0.558\t0.337\t0.924\t0.531\t1.324\t0.777
          Robust\t1.000\t0.999\t0.999\t0.997\t0.293\t0.313\t0.518\t0.492\t0.853\t0.734
Clean Hypertuned\t0.999\t0.998\t0.997\t0.997\t0.558\t0.337\t0.924\t0.531\t1.324\t0.777"""