In [59]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV

## Loading Data

In [3]:
weeks = []
for i in range(1, 19):
    df = pd.read_csv(f"train/input_2023_w{i:02d}.csv")
    weeks.append(df)

prethrow = pd.concat(weeks, ignore_index=True)

In [4]:
weeks = []
for i in range(1, 19):
    df = pd.read_csv(f"train/output_2023_w{i:02d}.csv")
    weeks.append(df)

postthrow = pd.concat(weeks, ignore_index=True)

## Getting Relevant Features

In [5]:
relevant_prethrow = prethrow[['game_id','play_id','nfl_id','frame_id','x','y','s','a','o','dir','ball_land_x','ball_land_y','player_position','player_role']]

Calculating distance to ball

In [6]:
relevant_prethrow['distance_to_land'] = np.sqrt((relevant_prethrow['ball_land_x'] - relevant_prethrow['x'])**2 + (relevant_prethrow['ball_land_y'] - relevant_prethrow['y'])**2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_prethrow['distance_to_land'] = np.sqrt((relevant_prethrow['ball_land_x'] - relevant_prethrow['x'])**2 + (relevant_prethrow['ball_land_y'] - relevant_prethrow['y'])**2)


Engineering Other Features

In [7]:
dx = relevant_prethrow['x'] - relevant_prethrow['ball_land_x']
dy = relevant_prethrow['y'] - relevant_prethrow['ball_land_y']

angles = np.degrees(np.arctan2(dy, dx))
angles = (angles + 90) % 360

relevant_prethrow['angle_to_land'] = angles
def angle_diff(a, b):
    diff = (a - b + 180) % 360 - 180
    return abs(diff)
relevant_prethrow['movement_angle_difference'] = relevant_prethrow.apply(
    lambda row: angle_diff(row['angle_to_land'], row['dir']), axis=1
)

relevant_prethrow['orientation_angle_difference'] = relevant_prethrow.apply(
    lambda row: angle_diff(row['angle_to_land'], row['o']), axis=1
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_prethrow['angle_to_land'] = angles
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_prethrow['movement_angle_difference'] = relevant_prethrow.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_prethrow['orientation_angle_difference'] = relevant_prethrow.apply(


Create rolling summary stats

In [8]:
prethrow = relevant_prethrow.sort_values(['game_id', 'play_id', 'nfl_id', 'frame_id'])

group_cols = ['game_id', 'play_id', 'nfl_id']

In [9]:
def summarize_last10(group):
    roll = group[['x', 'y', 's', 'a', 'dir']].rolling(window=10, min_periods=1)
    
    # Compute stats
    summary = pd.DataFrame({
        'x_mean': roll['x'].mean(),
        'x_std': roll['x'].std(),
        'x_first': group['x'].shift(9).fillna(group['x'].iloc[0]),   # first of last 10 frames
        'x_last': group['x'],
        'x_min': roll['x'].min(),
        'x_max': roll['x'].max(),

        'y_mean': roll['y'].mean(),
        'y_std': roll['y'].std(),
        'y_first': group['y'].shift(9).fillna(group['y'].iloc[0]),
        'y_last': group['y'],
        'y_min': roll['y'].min(),
        'y_max': roll['y'].max(),

        'speed_mean': roll['s'].mean(),
        'speed_std': roll['s'].std(),

        'accel_mean': roll['a'].mean(),
        'accel_std': roll['a'].std(),

        'dir_mean': roll['dir'].mean(),
        'dir_std': roll['dir'].std(),
        'dir_first': group['dir'].shift(9).fillna(group['dir'].iloc[0]),
        'dir_last': group['dir'],
    })

    # Add identifying columns back
    summary[group_cols + ['frame_id']] = group[group_cols + ['frame_id']].values

    # Return only the last frame of this group
    return summary.iloc[[-1]]

In [10]:
prethrow = prethrow.groupby(group_cols, group_keys=False).apply(summarize_last10).reset_index(drop=True)

Merging Back

In [11]:
lastframe = relevant_prethrow.loc[
        lambda df: df.groupby(['game_id', 'play_id', 'nfl_id'])['frame_id'].idxmax()
    ]

In [12]:
lastframe = pd.merge(
    lastframe,
    prethrow,
    how = "left",
    on = ['game_id','play_id','nfl_id']
)

In [13]:
lastframe = lastframe.drop(columns=['frame_id_x','frame_id_y'])

## Getting Post Throw Frames

In [14]:
firstframe = postthrow.loc[
        lambda df: df.groupby(['game_id', 'play_id', 'nfl_id'])['frame_id'].idxmin()
    ]

In [15]:
data = pd.merge(
    lastframe,
    postthrow.rename(columns={'x': 'target_x', 'y': 'target_y'}),
    how = "inner",
    on = ['game_id','play_id','nfl_id']
)

## Modeling

In [16]:
data.columns

Index(['game_id', 'play_id', 'nfl_id', 'x', 'y', 's', 'a', 'o', 'dir',
       'ball_land_x', 'ball_land_y', 'player_position', 'player_role',
       'distance_to_land', 'angle_to_land', 'movement_angle_difference',
       'orientation_angle_difference', 'x_mean', 'x_std', 'x_first', 'x_last',
       'x_min', 'x_max', 'y_mean', 'y_std', 'y_first', 'y_last', 'y_min',
       'y_max', 'speed_mean', 'speed_std', 'accel_mean', 'accel_std',
       'dir_mean', 'dir_std', 'dir_first', 'dir_last', 'frame_id', 'target_x',
       'target_y'],
      dtype='object')

In [17]:
data = pd.get_dummies(data, columns=['player_role'], prefix='role')

In [18]:
subset, _ = train_test_split(data, train_size=0.1, random_state=42)

In [19]:
X = subset.drop(columns=['game_id','play_id','nfl_id','target_x','target_y', 'x_last','y_last','dir_last', 'player_position'])
y = subset[['target_x','target_y']]

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
model = RandomForestRegressor(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

In [22]:
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

In [23]:
r2_train_x = r2_score(y_train['target_x'], y_train_pred[:, 0])
r2_train_y = r2_score(y_train['target_y'], y_train_pred[:, 1])

r2_test_x = r2_score(y_test['target_x'], y_test_pred[:, 0])
r2_test_y = r2_score(y_test['target_y'], y_test_pred[:, 1])

print(f"Training R²: x = {r2_train_x:.3f}, y = {r2_train_y:.3f}")
print(f"Testing  R²: x = {r2_test_x:.3f}, y = {r2_test_y:.3f}")

Training R²: x = 0.999, y = 0.998
Testing  R²: x = 0.996, y = 0.985


In [24]:
mae_train_x = mean_absolute_error(y_train['target_x'], y_train_pred[:, 0])
mae_train_y = mean_absolute_error(y_train['target_y'], y_train_pred[:, 1])
mae_test_x = mean_absolute_error(y_test['target_x'], y_test_pred[:, 0])
mae_test_y = mean_absolute_error(y_test['target_y'], y_test_pred[:, 1])

print(f"Training MAE: x = {mae_train_x:.3f}, y = {mae_train_y:.3f}")
print(f"Testing  MAE: x = {mae_test_x:.3f}, y = {mae_test_y:.3f}")

Training MAE: x = 0.371, y = 0.396
Testing  MAE: x = 1.000, y = 1.082


In [25]:
rmse_x = np.sqrt(np.mean((y_test['target_x'] - y_test_pred[:, 0])**2))
rmse_y = np.sqrt(np.mean((y_test['target_y'] - y_test_pred[:, 1])**2))
print("X: ", rmse_x, "\n Y: ", rmse_y)

X:  1.5530257628136492 
 Y:  1.6385495519573139


In [26]:
importances = model.feature_importances_
feature_names = X_train.columns

feat_imp = pd.DataFrame({
    "feature": feature_names,
    "importance": importances
}).sort_values("importance", ascending=False)

feat_imp

Unnamed: 0,feature,importance
0,x,0.749394
1,y,0.204352
29,frame_id,0.011678
6,ball_land_x,0.011596
7,ball_land_y,0.005382
5,dir,0.005342
26,dir_mean,0.001077
10,movement_angle_difference,0.001058
2,s,0.001036
13,x_std,0.000756


#  Feature Selection

### Minimal Model

In [38]:
X_minimal = subset[['x','y','frame_id','ball_land_x','ball_land_y','dir']]
y = subset[['target_x','target_y']]

In [39]:
X_minimal_train, X_minimal_test, y_train, y_test = train_test_split(X_minimal, y, test_size=0.2, random_state=42)

In [40]:
model_minimal = RandomForestRegressor(n_estimators=200, random_state=42)
model_minimal.fit(X_minimal_train, y_train)

In [41]:
y_train_pred_minimal = model_minimal.predict(X_minimal_train)
y_test_pred_minimal = model_minimal.predict(X_minimal_test)

In [42]:
r2_train_x_minimal = r2_score(y_train['target_x'], y_train_pred_minimal[:, 0])
r2_train_y_minimal = r2_score(y_train['target_y'], y_train_pred_minimal[:, 1])

r2_test_x_minimal = r2_score(y_test['target_x'], y_test_pred_minimal[:, 0])
r2_test_y_minimal = r2_score(y_test['target_y'], y_test_pred_minimal[:, 1])

print(f"Training R²: x = {r2_train_x_minimal:.3f}, y = {r2_train_y_minimal:.3f}")
print(f"Testing  R²: x = {r2_test_x_minimal:.3f}, y = {r2_test_y_minimal:.3f}")

Training R²: x = 0.999, y = 0.998
Testing  R²: x = 0.996, y = 0.986


In [43]:
mae_train_x_minimal = mean_absolute_error(y_train['target_x'], y_train_pred_minimal[:, 0])
mae_train_y_minimal = mean_absolute_error(y_train['target_y'], y_train_pred_minimal[:, 1])
mae_test_x_minimal = mean_absolute_error(y_test['target_x'], y_test_pred_minimal[:, 0])
mae_test_y_minimal = mean_absolute_error(y_test['target_y'], y_test_pred_minimal[:, 1])

print(f"Training MAE: x = {mae_train_x_minimal:.3f}, y = {mae_train_y_minimal:.3f}")
print(f"Testing  MAE: x = {mae_test_x_minimal:.3f}, y = {mae_test_y_minimal:.3f}")

Training MAE: x = 0.376, y = 0.384
Testing  MAE: x = 1.012, y = 1.045


In [44]:
rmse_x_minimal = np.sqrt(np.mean((y_test['target_x'] - y_test_pred_minimal[:, 0])**2))
rmse_y_minimal = np.sqrt(np.mean((y_test['target_y'] - y_test_pred_minimal[:, 1])**2))
print("X: ", rmse_x_minimal, "\n Y: ", rmse_y_minimal)

X:  1.5909612155660997 
 Y:  1.5960264315829193


### Clean Model

In [45]:
X_clean = subset[['x','y','frame_id','ball_land_x','ball_land_y','dir','dir_mean','movement_angle_difference','s']]
y = subset[['target_x','target_y']]

In [46]:
X_clean_train, X_clean_test, y_train, y_test = train_test_split(X_clean, y, test_size=0.2, random_state=42)

In [47]:
model_clean = RandomForestRegressor(n_estimators=200, random_state=42)
model_clean.fit(X_clean_train, y_train)

In [48]:
y_train_pred_clean = model_clean.predict(X_clean_train)
y_test_pred_clean = model_clean.predict(X_clean_test)

In [49]:
r2_train_x_clean = r2_score(y_train['target_x'], y_train_pred_clean[:, 0])
r2_train_y_clean = r2_score(y_train['target_y'], y_train_pred_clean[:, 1])

r2_test_x_clean = r2_score(y_test['target_x'], y_test_pred_clean[:, 0])
r2_test_y_clean = r2_score(y_test['target_y'], y_test_pred_clean[:, 1])

print(f"Training R²: x = {r2_train_x_clean:.3f}, y = {r2_train_y_clean:.3f}")
print(f"Testing  R²: x = {r2_test_x_clean:.3f}, y = {r2_test_y_clean:.3f}")

Training R²: x = 0.999, y = 0.998
Testing  R²: x = 0.996, y = 0.986


In [50]:
mae_train_x_clean = mean_absolute_error(y_train['target_x'], y_train_pred_clean[:, 0])
mae_train_y_clean = mean_absolute_error(y_train['target_y'], y_train_pred_clean[:, 1])
mae_test_x_clean = mean_absolute_error(y_test['target_x'], y_test_pred_clean[:, 0])
mae_test_y_clean = mean_absolute_error(y_test['target_y'], y_test_pred_clean[:, 1])

print(f"Training MAE: x = {mae_train_x_clean:.3f}, y = {mae_train_y_clean:.3f}")
print(f"Testing  MAE: x = {mae_test_x_clean:.3f}, y = {mae_test_y_clean:.3f}")

Training MAE: x = 0.357, y = 0.376
Testing  MAE: x = 0.964, y = 1.025


In [51]:
rmse_x_clean = np.sqrt(np.mean((y_test['target_x'] - y_test_pred_clean[:, 0])**2))
rmse_y_clean = np.sqrt(np.mean((y_test['target_y'] - y_test_pred_clean[:, 1])**2))
print("X: ", rmse_x_clean, "\n Y: ", rmse_y_clean)

X:  1.5041925735028108 
 Y:  1.5646200940456734


### Robust Model

In [52]:
X_robust = subset[['x','y','frame_id','dir','dir_mean','movement_angle_difference','s', "x_std", "angle_to_land", "o", "distance_to_land","y_std","x_min", "x_max","speed_mean","orientation_angle_difference","y_min", "y_max"]]
y = subset[['target_x','target_y']]

In [53]:
X_robust_train, X_robust_test, y_train, y_test = train_test_split(X_robust, y, test_size=0.2, random_state=42)

In [54]:
model_robust = RandomForestRegressor(n_estimators=200, random_state=42)
model_robust.fit(X_robust_train, y_train)

In [55]:
y_train_pred_robust = model_robust.predict(X_robust_train)
y_test_pred_robust = model_robust.predict(X_robust_test)

In [56]:
r2_train_x_robust = r2_score(y_train['target_x'], y_train_pred_robust[:, 0])
r2_train_y_robust = r2_score(y_train['target_y'], y_train_pred_robust[:, 1])

r2_test_x_robust = r2_score(y_test['target_x'], y_test_pred_robust[:, 0])
r2_test_y_robust = r2_score(y_test['target_y'], y_test_pred_robust[:, 1])

print(f"Training R²: x = {r2_train_x_robust:.3f}, y = {r2_train_y_robust:.3f}")
print(f"Testing  R²: x = {r2_test_x_robust:.3f}, y = {r2_test_y_robust:.3f}")

Training R²: x = 0.999, y = 0.998
Testing  R²: x = 0.996, y = 0.985


In [57]:
mae_train_x_robust = mean_absolute_error(y_train['target_x'], y_train_pred_robust[:, 0])
mae_train_y_robust = mean_absolute_error(y_train['target_y'], y_train_pred_robust[:, 1])
mae_test_x_robust = mean_absolute_error(y_test['target_x'], y_test_pred_robust[:, 0])
mae_test_y_robust = mean_absolute_error(y_test['target_y'], y_test_pred_robust[:, 1])

print(f"Training MAE: x = {mae_train_x_robust:.3f}, y = {mae_train_y_robust:.3f}")
print(f"Testing  MAE: x = {mae_test_x_robust:.3f}, y = {mae_test_y_robust:.3f}")

Training MAE: x = 0.360, y = 0.392
Testing  MAE: x = 0.980, y = 1.070


In [58]:
rmse_x_robust = np.sqrt(np.mean((y_test['target_x'] - y_test_pred_robust[:, 0])**2))
rmse_y_robust = np.sqrt(np.mean((y_test['target_y'] - y_test_pred_robust[:, 1])**2))
print("X: ", rmse_x_robust, "\n Y: ", rmse_y_robust)

X:  1.5591133246499242 
 Y:  1.6487959696629488


Out of the full model, minimal model, clean model, and robust model, the clean model performs best in all metrics

## Hypertune Robust Model

In [60]:
param_dist = {
    "n_estimators": [200, 400, 600, 800, 1000],
    "max_depth": [None, 10, 20, 30, 40, 50],
    "min_samples_split": [2, 5, 10, 20],
    "min_samples_leaf": [1, 2, 4, 6],
    "max_features": ["auto", "sqrt", 0.3, 0.5, 0.7],
    "bootstrap": [True, False]
}

In [61]:
rf = RandomForestRegressor(random_state=42, n_jobs=-1)
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=40,           # number of combinations to try
    cv=5,                # 5-fold cross-validation
    verbose=2,
    random_state=42,
    n_jobs=-1,
    scoring="neg_mean_squared_error"  # or accuracy for classifier
)

In [62]:
rf_random.fit(X_clean_train, y_train)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


KeyboardInterrupt: 

In [None]:
best_model = rf_random.best_estimator_

print("Best Parameters:")
print(rf_random.best_params_)

print("\nBest CV Score:")
print(rf_random.best_score_)