In [1]:
import os
import tensorflow as tf
import pandas as pd
import numpy as np
import random
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, ndcg_score
import matplotlib.pyplot as plt

# Set seeds for reproducibility
seed = 42
os.environ['TF_DETERMINISTIC_OPS'] = '1'
tf.random.set_seed(seed)
np.random.seed(seed)
random.seed(seed)

# Load and preprocess data
data = pd.read_csv('2024_2025_season.csv')  # Replace with your actual file name

# Remove rows with missing values and reset the index
data.dropna(inplace=True)
data.reset_index(drop=True, inplace=True)

# Calculate the fantasy score, scaled by the number of games played (GP)
data['fantasy_score'] = (
    (2 * data['FGM'] - data['FGA'] + data['FTM'] - data['FTA'] +
     data['3P'] + data['PPG'] + 2 * data['APG'] + data['RPG'] +
     4 * (data['STL'] + data['BLK']) - 2 * data['TOV']) * data['GP']
)

# Normalize features
features = ['FGM', 'FGA', 'FTM', 'FTA', '3P', 'PPG', 'APG', 'RPG', 'STL', 'BLK', 'TOV', 'GP']
data[features] = (data[features] - data[features].mean()) / data[features].std()

# Group data for listwise ranking (grouped by 'Team' for example)
grouped_data = data.groupby('Team')
X, y = [], []
for _, group in grouped_data:
    X.append(group[features].values)
    y.append(group['fantasy_score'].values)

# Pad sequences for TensorFlow model
max_group_size = max(len(group) for group in X)
X_padded = tf.keras.preprocessing.sequence.pad_sequences(
    X, maxlen=max_group_size, padding='post', dtype='float32'
)
y_padded = tf.keras.preprocessing.sequence.pad_sequences(
    y, maxlen=max_group_size, padding='post', dtype='float32'
)

# Prepare TensorFlow dataset
def prepare_dataset(X, y):
    dataset = tf.data.Dataset.from_tensor_slices((X, y))
    dataset = dataset.shuffle(buffer_size=100, seed=seed).batch(32)  # Fixed seed for shuffling
    return dataset

train_dataset = prepare_dataset(X_padded, y_padded)

# Define a neural network model
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(max_group_size, len(features))),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)  # Output relevance score
])

# Custom listwise loss function
def listwise_loss(y_true, y_pred):
    y_pred = tf.nn.softmax(y_pred, axis=1)
    y_true = tf.nn.softmax(y_true, axis=1)
    return -tf.reduce_sum(y_true * tf.math.log(y_pred + 1e-9))

# Compile model with AdamW optimizer
model.compile(
    optimizer=tf.keras.optimizers.AdamW(learning_rate=0.001, weight_decay=1e-4),
    loss=listwise_loss,
    metrics=['accuracy']
)

# Train the model
model.fit(train_dataset, epochs=20)

# Predict rankings using the trained model
predictions = model.predict(X_padded)

# Flatten predictions and align with original data
predictions_flat = []
for pred, group in zip(predictions, X):  # Match group sizes
    predictions_flat.extend(pred[:len(group)])  # Keep only valid predictions

# Ensure arrays are aligned
predicted_scores_flat = np.array(predictions_flat).reshape(-1, 1)
true_fantasy_scores = data['fantasy_score'].values.reshape(-1, 1)

# Sanity check
assert len(predicted_scores_flat) == len(true_fantasy_scores), "Lengths do not match!"

# Add predictions to the dataset
data['predicted_fantasy_points'] = predicted_scores_flat

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    predicted_scores_flat, true_fantasy_scores, test_size=0.2, random_state=seed
)

# Train the Random Forest Regressor
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
rf_regressor = RandomForestRegressor(random_state=seed)
grid_search = GridSearchCV(rf_regressor, param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train.ravel())
best_rf = grid_search.best_estimator_

# Evaluate the Random Forest model
mapped_fantasy_points = best_rf.predict(predicted_scores_flat)
mae = mean_absolute_error(y_test, best_rf.predict(X_test))
ndcg = ndcg_score([y_test.ravel()], [best_rf.predict(X_test)])
print(f"Mean Absolute Error: {mae}")
print(f"NDCG Score: {ndcg}")

# Sort and print all players by predicted fantasy points
ranked_players = data.sort_values(by='predicted_fantasy_points', ascending=False)
all_players = ranked_players[['Name', 'Team', 'Position', 'predicted_fantasy_points']]

print("Full List of Fantasy Basketball Players (Mapped Fantasy Points, Adjusted for Games Played):")
print(all_players.to_string(index=False))

# Feature importance visualization
importances = best_rf.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': features, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Plot feature importance
plt.figure(figsize=(10, 6))
plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
plt.gca().invert_yaxis()
plt.title('Feature Importance (Random Forest)')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()


Epoch 1/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 342ms/step - accuracy: 0.3931 - loss: 46.2589
Epoch 2/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.3931 - loss: 42.1337
Epoch 3/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.3931 - loss: 40.1201
Epoch 4/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.3931 - loss: 36.2986
Epoch 5/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.3931 - loss: 34.4387
Epoch 6/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.3931 - loss: 30.8177
Epoch 7/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.3931 - loss: 27.5496
Epoch 8/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.3931 - loss: 25.7915
Epoch 9/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

ValueError: All arrays must be of the same length