In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import numpy as np
import pandas as pd

df = pd.read_csv("nba_salaries_2.csv")

# Initial setup
all_features = [
    "Age",
    "GP",
    "GS",
    "MP",
    "FG",
    "FGA",
    "FG%",
    "3P",
    "3PA",
    "3P%",
    "2P",
    "2PA",
    "2P%",
    "eFG%",
    "FT",
    "FTA",
    "FT%",
    "ORB",
    "DRB",
    "TRB",
    "AST",
    "STL",
    "BLK",
    "TOV",
    "PF",
    "PTS",
    "Linear_PER",
]

# Target
y = df["Salary"]

# Start with initial features
selected_features = ["Age", "PTS"]
remaining_features = [f for f in all_features if f not in selected_features]

best_rmse = float("inf")
rmse_progression = []

# Forward Selection Loop
while remaining_features:
    best_candidate = None
    best_candidate_rmse = float("inf")

    for feature in remaining_features:
        current_features = selected_features + [feature]
        X = df[current_features]

        # Train/test split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        model = XGBRegressor()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))

        if rmse < best_candidate_rmse:
            best_candidate_rmse = rmse
            best_candidate = feature

    # Stop if adding a new feature doesn't improve RMSE
    if best_candidate_rmse >= best_rmse:
        print("No further RMSE improvement. Stopping.")
        break

    # Update
    selected_features.append(best_candidate)
    remaining_features.remove(best_candidate)
    best_rmse = best_candidate_rmse
    rmse_progression.append((selected_features.copy(), best_rmse))

    print(f"Added '{best_candidate}' ➜ RMSE: {best_rmse:.2f}")

# Final result
print("\nBest feature set found:")
for i, (features, rmse) in enumerate(rmse_progression):
    print(f"{i+1}. Features: {features} ➜ RMSE: {rmse:.2f}")

Added 'GP' ➜ RMSE: 4994110.25
Added 'FT%' ➜ RMSE: 4641424.33
No further RMSE improvement. Stopping.

Best feature set found:
1. Features: ['Age', 'PTS', 'GP'] ➜ RMSE: 4994110.25
2. Features: ['Age', 'PTS', 'GP', 'FT%'] ➜ RMSE: 4641424.33
