In [88]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV

## Loading Data

In [89]:
weeks = []
for i in range(1, 19):
    df = pd.read_csv(f"train/input_2023_w{i:02d}.csv")
    weeks.append(df)

prethrow = pd.concat(weeks, ignore_index=True)

In [90]:
weeks = []
for i in range(1, 19):
    df = pd.read_csv(f"train/output_2023_w{i:02d}.csv")
    weeks.append(df)

postthrow = pd.concat(weeks, ignore_index=True)

## Getting Relevant Features

In [91]:
relevant_prethrow = prethrow[['game_id','play_id','nfl_id','frame_id','x','y','s','a','o','dir','ball_land_x','ball_land_y','player_position','player_role']]

Calculating distance to ball

In [92]:
relevant_prethrow['distance_to_land'] = np.sqrt((relevant_prethrow['ball_land_x'] - relevant_prethrow['x'])**2 + (relevant_prethrow['ball_land_y'] - relevant_prethrow['y'])**2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_prethrow['distance_to_land'] = np.sqrt((relevant_prethrow['ball_land_x'] - relevant_prethrow['x'])**2 + (relevant_prethrow['ball_land_y'] - relevant_prethrow['y'])**2)


Engineering Other Features

In [93]:
dx = relevant_prethrow['x'] - relevant_prethrow['ball_land_x']
dy = relevant_prethrow['y'] - relevant_prethrow['ball_land_y']

angles = np.degrees(np.arctan2(dy, dx))
angles = (angles + 90) % 360

relevant_prethrow['angle_to_land'] = angles
def angle_diff(a, b):
    diff = (a - b + 180) % 360 - 180
    return abs(diff)
relevant_prethrow['movement_angle_difference'] = relevant_prethrow.apply(
    lambda row: angle_diff(row['angle_to_land'], row['dir']), axis=1
)

relevant_prethrow['orientation_angle_difference'] = relevant_prethrow.apply(
    lambda row: angle_diff(row['angle_to_land'], row['o']), axis=1
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_prethrow['angle_to_land'] = angles
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_prethrow['movement_angle_difference'] = relevant_prethrow.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_prethrow['orientation_angle_difference'] = relevant_prethrow.apply(


Create rolling summary stats

In [94]:
prethrow = relevant_prethrow.sort_values(['game_id', 'play_id', 'nfl_id', 'frame_id'])

group_cols = ['game_id', 'play_id', 'nfl_id']

In [95]:
def summarize_last10(group):
    roll = group[['x', 'y', 's', 'a', 'dir']].rolling(window=10, min_periods=1)
    
    # Compute stats
    summary = pd.DataFrame({
        'x_mean': roll['x'].mean(),
        'x_std': roll['x'].std(),
        'x_first': group['x'].shift(9).fillna(group['x'].iloc[0]),   # first of last 10 frames
        'x_last': group['x'],
        'x_min': roll['x'].min(),
        'x_max': roll['x'].max(),

        'y_mean': roll['y'].mean(),
        'y_std': roll['y'].std(),
        'y_first': group['y'].shift(9).fillna(group['y'].iloc[0]),
        'y_last': group['y'],
        'y_min': roll['y'].min(),
        'y_max': roll['y'].max(),

        'speed_mean': roll['s'].mean(),
        'speed_std': roll['s'].std(),

        'accel_mean': roll['a'].mean(),
        'accel_std': roll['a'].std(),

        'dir_mean': roll['dir'].mean(),
        'dir_std': roll['dir'].std(),
        'dir_first': group['dir'].shift(9).fillna(group['dir'].iloc[0]),
        'dir_last': group['dir'],
    })

    # Add identifying columns back
    summary[group_cols + ['frame_id']] = group[group_cols + ['frame_id']].values

    # Return only the last frame of this group
    return summary.iloc[[-1]]

In [96]:
prethrow = prethrow.groupby(group_cols, group_keys=False).apply(summarize_last10).reset_index(drop=True)

Merging Back

In [97]:
lastframe = relevant_prethrow.loc[
        lambda df: df.groupby(['game_id', 'play_id', 'nfl_id'])['frame_id'].idxmax()
    ]

In [98]:
lastframe = pd.merge(
    lastframe,
    prethrow,
    how = "left",
    on = ['game_id','play_id','nfl_id']
)

In [99]:
lastframe = lastframe.drop(columns=['frame_id_x','frame_id_y'])

Distance to Closest Sideline

In [100]:
lastframe["distance_to_closest_sideline"] = np.where(
    lastframe["y"] <= 53.3 - lastframe["y"],
    -lastframe["y"],
    53.3 - lastframe["y"]
)

Distance to Closest Attacker/Defender

In [101]:
df = lastframe.copy()

# Identify the target receiver for each play
target_info = df[df["player_role"] == "Targeted Receiver"][["game_id","play_id","x","y","nfl_id"]]
target_info = target_info.rename(columns={"x":"tgt_x", "y":"tgt_y", "nfl_id":"tgt_nfl_id"})

# Merge the target receiver location onto all rows in the play
df = df.merge(target_info, on=["game_id","play_id"], how="left")

In [102]:
df.loc[df["player_role"] == "Defensive Coverage", "dx_to_target"] = \
    df["x"] - df["tgt_x"]

df.loc[df["player_role"] == "Defensive Coverage", "dy_to_target"] = \
    df["y"] - df["tgt_y"]

In [103]:
def compute_closest_defender(group):
    # Separate targeted receiver & defenders
    tgt = group[group["player_role"] == "Targeted Receiver"]
    defs = group[group["player_role"] == "Defensive Coverage"]
    
    # If no defender or no target, return group unchanged
    if len(tgt) == 0 or len(defs) == 0:
        return group
    
    tgt_x, tgt_y = float(tgt["x"]), float(tgt["y"])
    
    # Compute distances
    defs = defs.assign(
        dist = np.sqrt((defs["x"] - tgt_x)**2 + (defs["y"] - tgt_y)**2)
    )
    
    # Identify closest defender
    closest_def = defs.loc[defs["dist"].idxmin()]
    
    # Assign dx/dy to target receiver only
    group.loc[group["player_role"] == "Targeted Receiver", "dx_to_closest_defender"] = \
        tgt_x - closest_def["x"]
    group.loc[group["player_role"] == "Targeted Receiver", "dy_to_closest_defender"] = \
        tgt_y - closest_def["y"]

    return group

In [104]:
df = df.groupby(["game_id","play_id"], group_keys=False).apply(compute_closest_defender)

  tgt_x, tgt_y = float(tgt["x"]), float(tgt["y"])
  tgt_x, tgt_y = float(tgt["x"]), float(tgt["y"])
  tgt_x, tgt_y = float(tgt["x"]), float(tgt["y"])
  tgt_x, tgt_y = float(tgt["x"]), float(tgt["y"])
  tgt_x, tgt_y = float(tgt["x"]), float(tgt["y"])
  tgt_x, tgt_y = float(tgt["x"]), float(tgt["y"])
  tgt_x, tgt_y = float(tgt["x"]), float(tgt["y"])
  tgt_x, tgt_y = float(tgt["x"]), float(tgt["y"])
  tgt_x, tgt_y = float(tgt["x"]), float(tgt["y"])
  tgt_x, tgt_y = float(tgt["x"]), float(tgt["y"])
  tgt_x, tgt_y = float(tgt["x"]), float(tgt["y"])
  tgt_x, tgt_y = float(tgt["x"]), float(tgt["y"])
  tgt_x, tgt_y = float(tgt["x"]), float(tgt["y"])
  tgt_x, tgt_y = float(tgt["x"]), float(tgt["y"])
  tgt_x, tgt_y = float(tgt["x"]), float(tgt["y"])
  tgt_x, tgt_y = float(tgt["x"]), float(tgt["y"])
  tgt_x, tgt_y = float(tgt["x"]), float(tgt["y"])
  tgt_x, tgt_y = float(tgt["x"]), float(tgt["y"])
  tgt_x, tgt_y = float(tgt["x"]), float(tgt["y"])
  tgt_x, tgt_y = float(tgt["x"]), float(tgt["y"])


In [105]:
df["dx_to_closest_opponent"] = df["dx_to_target"].combine_first(df["dx_to_closest_defender"])
df["dy_to_closest_opponent"] = df["dy_to_target"].combine_first(df["dy_to_closest_defender"])

In [107]:
lastframe = pd.merge(
    lastframe,
    df[['game_id','play_id','nfl_id','dx_to_closest_opponent','dy_to_closest_opponent']],
    on = ['game_id','play_id','nfl_id']
)

## Getting Post Throw Frames

In [14]:
firstframe = postthrow.loc[
        lambda df: df.groupby(['game_id', 'play_id', 'nfl_id'])['frame_id'].idxmin()
    ]

In [127]:
data = pd.merge(
    lastframe,
    postthrow.rename(columns={'x': 'target_x', 'y': 'target_y'}),
    how = "inner",
    on = ['game_id','play_id','nfl_id']
)

In [136]:
data

Unnamed: 0,game_id,play_id,nfl_id,x,y,s,a,o,dir,ball_land_x,...,dir_mean,dir_std,dir_first,dir_last,distance_to_closest_sideline,dx_to_closest_opponent,dy_to_closest_opponent,frame_id,target_x,target_y
0,2023090700,101,44930,52.43,14.14,7.90,2.68,106.80,99.25,63.259998,...,87.542,7.987770,76.18,99.25,-14.14,4.42,1.70,1,53.20,13.98
1,2023090700,101,44930,52.43,14.14,7.90,2.68,106.80,99.25,63.259998,...,87.542,7.987770,76.18,99.25,-14.14,4.42,1.70,2,53.96,13.78
2,2023090700,101,44930,52.43,14.14,7.90,2.68,106.80,99.25,63.259998,...,87.542,7.987770,76.18,99.25,-14.14,4.42,1.70,3,54.70,13.54
3,2023090700,101,44930,52.43,14.14,7.90,2.68,106.80,99.25,63.259998,...,87.542,7.987770,76.18,99.25,-14.14,4.42,1.70,4,55.41,13.27
4,2023090700,101,44930,52.43,14.14,7.90,2.68,106.80,99.25,63.259998,...,87.542,7.987770,76.18,99.25,-14.14,4.42,1.70,5,56.09,12.95
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
562931,2024010713,4018,52647,44.40,13.09,3.03,3.76,142.21,208.21,32.139999,...,226.784,9.905507,234.02,208.21,-13.09,5.95,-4.15,14,44.30,13.33
562932,2024010713,4018,52647,44.40,13.09,3.03,3.76,142.21,208.21,32.139999,...,226.784,9.905507,234.02,208.21,-13.09,5.95,-4.15,15,44.23,13.47
562933,2024010713,4018,52647,44.40,13.09,3.03,3.76,142.21,208.21,32.139999,...,226.784,9.905507,234.02,208.21,-13.09,5.95,-4.15,16,44.15,13.59
562934,2024010713,4018,52647,44.40,13.09,3.03,3.76,142.21,208.21,32.139999,...,226.784,9.905507,234.02,208.21,-13.09,5.95,-4.15,17,44.06,13.70


## Modeling

In [137]:
data.columns

Index(['game_id', 'play_id', 'nfl_id', 'x', 'y', 's', 'a', 'o', 'dir',
       'ball_land_x', 'ball_land_y', 'player_position', 'player_role',
       'distance_to_land', 'angle_to_land', 'movement_angle_difference',
       'orientation_angle_difference', 'x_mean', 'x_std', 'x_first', 'x_last',
       'x_min', 'x_max', 'y_mean', 'y_std', 'y_first', 'y_last', 'y_min',
       'y_max', 'speed_mean', 'speed_std', 'accel_mean', 'accel_std',
       'dir_mean', 'dir_std', 'dir_first', 'dir_last',
       'distance_to_closest_sideline', 'dx_to_closest_opponent',
       'dy_to_closest_opponent', 'frame_id', 'target_x', 'target_y'],
      dtype='object')

In [138]:
data = pd.get_dummies(data, columns=['player_role'], prefix='role')

In [139]:
subset, _ = train_test_split(data, train_size=0.1, random_state=42)

In [140]:
X = subset.drop(columns=['game_id','play_id','nfl_id','target_x','target_y', 'x_last','y_last','dir_last', 'player_position'])
y = subset[['target_x','target_y']]

In [143]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [144]:
model = RandomForestRegressor(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

In [145]:
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

In [146]:
r2_train_x = r2_score(y_train['target_x'], y_train_pred[:, 0])
r2_train_y = r2_score(y_train['target_y'], y_train_pred[:, 1])

r2_test_x = r2_score(y_test['target_x'], y_test_pred[:, 0])
r2_test_y = r2_score(y_test['target_y'], y_test_pred[:, 1])

print(f"Training R²: x = {r2_train_x:.3f}, y = {r2_train_y:.3f}")
print(f"Testing  R²: x = {r2_test_x:.3f}, y = {r2_test_y:.3f}")

Training R²: x = 0.999, y = 0.998
Testing  R²: x = 0.996, y = 0.986


In [147]:
mae_train_x = mean_absolute_error(y_train['target_x'], y_train_pred[:, 0])
mae_train_y = mean_absolute_error(y_train['target_y'], y_train_pred[:, 1])
mae_test_x = mean_absolute_error(y_test['target_x'], y_test_pred[:, 0])
mae_test_y = mean_absolute_error(y_test['target_y'], y_test_pred[:, 1])

print(f"Training MAE: x = {mae_train_x:.3f}, y = {mae_train_y:.3f}")
print(f"Testing  MAE: x = {mae_test_x:.3f}, y = {mae_test_y:.3f}")

Training MAE: x = 0.381, y = 0.399
Testing  MAE: x = 1.026, y = 1.069


In [148]:
rmse_x = np.sqrt(np.mean((y_test['target_x'] - y_test_pred[:, 0])**2))
rmse_y = np.sqrt(np.mean((y_test['target_y'] - y_test_pred[:, 1])**2))
print("X: ", rmse_x, "\n Y: ", rmse_y)

X:  1.5835327006450235 
 Y:  1.5994626040354822


In [149]:
importances = model.feature_importances_
feature_names = X_train.columns

feat_imp = pd.DataFrame({
    "feature": feature_names,
    "importance": importances
}).sort_values("importance", ascending=False)

feat_imp

Unnamed: 0,feature,importance
0,x,0.750476
1,y,0.191024
29,distance_to_closest_sideline,0.01293
32,frame_id,0.011542
6,ball_land_x,0.011134
5,dir,0.005331
7,ball_land_y,0.005251
2,s,0.001067
26,dir_mean,0.001041
10,movement_angle_difference,0.000986


#  Feature Selection

### Minimal Model

In [150]:
X_minimal = subset[['x','y','distance_to_closest_sideline','frame_id','ball_land_x','ball_land_y','dir']]
y = subset[['target_x','target_y']]

In [151]:
X_minimal_train, X_minimal_test, y_train, y_test = train_test_split(X_minimal, y, test_size=0.2, random_state=42)

In [152]:
model_minimal = RandomForestRegressor(n_estimators=200, random_state=42)
model_minimal.fit(X_minimal_train, y_train)

In [153]:
y_train_pred_minimal = model_minimal.predict(X_minimal_train)
y_test_pred_minimal = model_minimal.predict(X_minimal_test)

In [154]:
r2_train_x_minimal = r2_score(y_train['target_x'], y_train_pred_minimal[:, 0])
r2_train_y_minimal = r2_score(y_train['target_y'], y_train_pred_minimal[:, 1])

r2_test_x_minimal = r2_score(y_test['target_x'], y_test_pred_minimal[:, 0])
r2_test_y_minimal = r2_score(y_test['target_y'], y_test_pred_minimal[:, 1])

print(f"Training R²: x = {r2_train_x_minimal:.3f}, y = {r2_train_y_minimal:.3f}")
print(f"Testing  R²: x = {r2_test_x_minimal:.3f}, y = {r2_test_y_minimal:.3f}")

Training R²: x = 0.999, y = 0.998
Testing  R²: x = 0.996, y = 0.986


In [155]:
mae_train_x_minimal = mean_absolute_error(y_train['target_x'], y_train_pred_minimal[:, 0])
mae_train_y_minimal = mean_absolute_error(y_train['target_y'], y_train_pred_minimal[:, 1])
mae_test_x_minimal = mean_absolute_error(y_test['target_x'], y_test_pred_minimal[:, 0])
mae_test_y_minimal = mean_absolute_error(y_test['target_y'], y_test_pred_minimal[:, 1])

print(f"Training MAE: x = {mae_train_x_minimal:.3f}, y = {mae_train_y_minimal:.3f}")
print(f"Testing  MAE: x = {mae_test_x_minimal:.3f}, y = {mae_test_y_minimal:.3f}")

Training MAE: x = 0.384, y = 0.385
Testing  MAE: x = 1.032, y = 1.037


In [156]:
rmse_x_minimal = np.sqrt(np.mean((y_test['target_x'] - y_test_pred_minimal[:, 0])**2))
rmse_y_minimal = np.sqrt(np.mean((y_test['target_y'] - y_test_pred_minimal[:, 1])**2))
print("X: ", rmse_x_minimal, "\n Y: ", rmse_y_minimal)

X:  1.6203342612949412 
 Y:  1.5779041086916237


### Clean Model

In [157]:
X_clean = subset[['x','y','frame_id','distance_to_closest_sideline','ball_land_x','ball_land_y','dir','dir_mean','movement_angle_difference','s']]
y = subset[['target_x','target_y']]

In [158]:
X_clean_train, X_clean_test, y_train, y_test = train_test_split(X_clean, y, test_size=0.2, random_state=42)

In [159]:
model_clean = RandomForestRegressor(n_estimators=200, random_state=42)
model_clean.fit(X_clean_train, y_train)

In [160]:
y_train_pred_clean = model_clean.predict(X_clean_train)
y_test_pred_clean = model_clean.predict(X_clean_test)

In [161]:
r2_train_x_clean = r2_score(y_train['target_x'], y_train_pred_clean[:, 0])
r2_train_y_clean = r2_score(y_train['target_y'], y_train_pred_clean[:, 1])

r2_test_x_clean = r2_score(y_test['target_x'], y_test_pred_clean[:, 0])
r2_test_y_clean = r2_score(y_test['target_y'], y_test_pred_clean[:, 1])

print(f"Training R²: x = {r2_train_x_clean:.3f}, y = {r2_train_y_clean:.3f}")
print(f"Testing  R²: x = {r2_test_x_clean:.3f}, y = {r2_test_y_clean:.3f}")

Training R²: x = 0.999, y = 0.998
Testing  R²: x = 0.996, y = 0.987


In [162]:
mae_train_x_clean = mean_absolute_error(y_train['target_x'], y_train_pred_clean[:, 0])
mae_train_y_clean = mean_absolute_error(y_train['target_y'], y_train_pred_clean[:, 1])
mae_test_x_clean = mean_absolute_error(y_test['target_x'], y_test_pred_clean[:, 0])
mae_test_y_clean = mean_absolute_error(y_test['target_y'], y_test_pred_clean[:, 1])

print(f"Training MAE: x = {mae_train_x_clean:.3f}, y = {mae_train_y_clean:.3f}")
print(f"Testing  MAE: x = {mae_test_x_clean:.3f}, y = {mae_test_y_clean:.3f}")

Training MAE: x = 0.367, y = 0.378
Testing  MAE: x = 0.984, y = 1.012


In [163]:
rmse_x_clean = np.sqrt(np.mean((y_test['target_x'] - y_test_pred_clean[:, 0])**2))
rmse_y_clean = np.sqrt(np.mean((y_test['target_y'] - y_test_pred_clean[:, 1])**2))
print("X: ", rmse_x_clean, "\n Y: ", rmse_y_clean)

X:  1.5314722196942887 
 Y:  1.5347983034122634


### Robust Model

In [164]:
X_robust = subset[['x','y','frame_id','dir','dir_mean','movement_angle_difference','s', "x_std", "angle_to_land", "o", "distance_to_land","y_std","x_min", "x_max","speed_mean","orientation_angle_difference","y_min", "y_max", "dx_to_closest_opponent", "dy_to_closest_opponent", "a",'accel_mean']]
y = subset[['target_x','target_y']]

In [165]:
X_robust_train, X_robust_test, y_train, y_test = train_test_split(X_robust, y, test_size=0.2, random_state=42)

In [166]:
model_robust = RandomForestRegressor(n_estimators=200, random_state=42)
model_robust.fit(X_robust_train, y_train)

In [167]:
y_train_pred_robust = model_robust.predict(X_robust_train)
y_test_pred_robust = model_robust.predict(X_robust_test)

In [168]:
r2_train_x_robust = r2_score(y_train['target_x'], y_train_pred_robust[:, 0])
r2_train_y_robust = r2_score(y_train['target_y'], y_train_pred_robust[:, 1])

r2_test_x_robust = r2_score(y_test['target_x'], y_test_pred_robust[:, 0])
r2_test_y_robust = r2_score(y_test['target_y'], y_test_pred_robust[:, 1])

print(f"Training R²: x = {r2_train_x_robust:.3f}, y = {r2_train_y_robust:.3f}")
print(f"Testing  R²: x = {r2_test_x_robust:.3f}, y = {r2_test_y_robust:.3f}")

Training R²: x = 0.999, y = 0.998
Testing  R²: x = 0.996, y = 0.985


In [169]:
mae_train_x_robust = mean_absolute_error(y_train['target_x'], y_train_pred_robust[:, 0])
mae_train_y_robust = mean_absolute_error(y_train['target_y'], y_train_pred_robust[:, 1])
mae_test_x_robust = mean_absolute_error(y_test['target_x'], y_test_pred_robust[:, 0])
mae_test_y_robust = mean_absolute_error(y_test['target_y'], y_test_pred_robust[:, 1])

print(f"Training MAE: x = {mae_train_x_robust:.3f}, y = {mae_train_y_robust:.3f}")
print(f"Testing  MAE: x = {mae_test_x_robust:.3f}, y = {mae_test_y_robust:.3f}")

Training MAE: x = 0.375, y = 0.399
Testing  MAE: x = 1.019, y = 1.077


In [170]:
rmse_x_robust = np.sqrt(np.mean((y_test['target_x'] - y_test_pred_robust[:, 0])**2))
rmse_y_robust = np.sqrt(np.mean((y_test['target_y'] - y_test_pred_robust[:, 1])**2))
print("X: ", rmse_x_robust, "\n Y: ", rmse_y_robust)

X:  1.5937097628768309 
 Y:  1.6363219590914726


Out of the full model, minimal model, clean model, and robust model, the clean model performs best in all metrics

## Hypertune Robust Model

In [171]:
param_dist = {
    "n_estimators": [200, 400],           
    "max_depth": [10, 20, None],          
    "min_samples_split": [2, 5],          
    "min_samples_leaf": [1, 2],           
    "max_features": ["sqrt", 0.5],        
    "bootstrap": [True]               
}

In [172]:
rf = RandomForestRegressor(random_state=42, n_jobs=-1)
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=40,           # number of combinations to try
    cv=5,                # 5-fold cross-validation
    verbose=2,
    random_state=42,
    n_jobs=-1,
    scoring="neg_mean_squared_error"  # or accuracy for classifier
)

In [173]:
rf_random.fit(X_clean_train, y_train)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


In [174]:
best_model = rf_random.best_estimator_

print("Best Parameters:")
print(rf_random.best_params_)

print("\nBest CV Score:")
print(rf_random.best_score_)

Best Parameters:
{'n_estimators': 400, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 0.5, 'max_depth': None, 'bootstrap': True}

Best CV Score:
-2.7236308576261044


In [175]:
best_model_clean = RandomForestRegressor(
    rf_random.best_params_,
    random_state=42,
    n_jobs=-1
)

In [176]:
model_clean_best = RandomForestRegressor(n_estimators=200, random_state=42)
model_clean_best.fit(X_clean_train, y_train)

In [177]:
y_train_pred_clean_best = model_clean_best.predict(X_clean_train)
y_test_pred_clean_best = model_clean_best.predict(X_clean_test)

In [178]:
r2_train_x_clean_best = r2_score(y_train['target_x'], y_train_pred_clean_best[:, 0])
r2_train_y_clean_best = r2_score(y_train['target_y'], y_train_pred_clean_best[:, 1])

r2_test_x_clean_best = r2_score(y_test['target_x'], y_test_pred_clean_best[:, 0])
r2_test_y_clean_best = r2_score(y_test['target_y'], y_test_pred_clean_best[:, 1])

print(f"Training R²: x = {r2_train_x_clean_best:.3f}, y = {r2_train_y_clean_best:.3f}")
print(f"Testing  R²: x = {r2_test_x_clean_best:.3f}, y = {r2_test_y_clean_best:.3f}")

Training R²: x = 0.999, y = 0.998
Testing  R²: x = 0.996, y = 0.987


In [179]:
mae_train_x_clean_best = mean_absolute_error(y_train['target_x'], y_train_pred_clean_best[:, 0])
mae_train_y_clean_best = mean_absolute_error(y_train['target_y'], y_train_pred_clean_best[:, 1])
mae_test_x_clean_best = mean_absolute_error(y_test['target_x'], y_test_pred_clean_best[:, 0])
mae_test_y_clean_best = mean_absolute_error(y_test['target_y'], y_test_pred_clean_best[:, 1])

print(f"Training MAE: x = {mae_train_x_clean_best:.3f}, y = {mae_train_y_clean_best:.3f}")
print(f"Testing  MAE: x = {mae_test_x_clean_best:.3f}, y = {mae_test_y_clean_best:.3f}")

Training MAE: x = 0.367, y = 0.378
Testing  MAE: x = 0.984, y = 1.012


In [180]:
rmse_x_clean_best = np.sqrt(np.mean((y_test['target_x'] - y_test_pred_clean_best[:, 0])**2))
rmse_y_clean_best = np.sqrt(np.mean((y_test['target_y'] - y_test_pred_clean_best[:, 1])**2))
print("X: ", rmse_x_clean_best, "\n Y: ", rmse_y_clean_best)

X:  1.5314722196942887 
 Y:  1.5347983034122634
