# NBA Draft Prediction

## Data Preprocessing

### Load the data

In [1]:
import pandas as pd
import sys
sys.path.append("../src")

metadata = pd.read_csv('../data/raw/metadata.csv')
train_data = pd.read_csv('../data/raw//train.csv')
test_data = pd.read_csv('../data/raw//test.csv')

metadata

  train_data = pd.read_csv('../data/raw//train.csv')


Unnamed: 0,feature,name,description
0,1,team,Name of team
1,2,conf,Name of conference
2,3,GP,Games played
3,4,Min_per,Player's percentage of available team minutes ...
4,5,ORtg,ORtg - Offensive Rating (available since the 1...
...,...,...,...
59,66,stl,STL - Steals (available since the 1973-74 seas...
60,67,blk,BLK - Blocks (available since the 1973-74 seas...
61,68,pts,PTS - Points
62,69,player_id,Unique identifier of player


In [None]:
train_data

In [None]:
train_data.shape

In [None]:
train_data.info()

In [None]:
train_data.describe(include="all")

## Data Cleaning

### Handle missing values

In [2]:
original_train_data = train_data.copy()

In [3]:
from data.data_preprocessing import preprocess_func

preprocessed_train_data, training_statistics = preprocess_func(train_data, original_df=original_train_data)

## Feature Engineering

In [4]:
from sklearn.model_selection import train_test_split

# Split the data into train and validation sets
X = preprocessed_train_data.drop(['player_id', 'drafted'], axis=1)
y = preprocessed_train_data['drafted']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Identify numerical and categorical columns
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()

# Preprocessing for numerical data
numerical_transformer = StandardScaler()

# Preprocessing for categorical data
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [6]:
def columns_with_nan(df):
    """Return columns in the DataFrame that contain NaN values."""
    nan_columns = df.columns[df.isnull().any()].tolist()
    return nan_columns
columns_with_nan_values = columns_with_nan(X_train)
print(columns_with_nan_values)


[]


## Model Training

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Define the model
model = LogisticRegression(max_iter=1000, random_state=42)

# Bundle preprocessing and modeling code in a pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])

# Preprocessing of training data, train model
pipeline.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
val_preds = pipeline.predict_proba(X_val)[:, 1]

# Get the AUROC score
val_score = roc_auc_score(y_val, val_preds)
val_score

0.9914168665067945

## Generate Predictions

In [8]:
# 1. Preprocess the test data using the train_stats from the training data preprocessing
preprocessed_test_data, _ = preprocess_func(test_data, train_stats=training_statistics)

# 2. Make predictions using the trained pipeline
test_preds = pipeline.predict_proba(preprocessed_test_data.drop('player_id', axis=1))[:, 1]

# 3. Create a submission DataFrame
submission = pd.DataFrame({'player_id': preprocessed_test_data['player_id'], 'drafted': test_preds})

# 4. Save the submission DataFrame to a CSV file
submission_path = 'week2_logistic_regression.csv'
submission.to_csv(submission_path, index=False)

## XGBOOST

In [21]:
preprocessed_train_data.head()

Unnamed: 0,team,conf,GP,Min_per,Ortg,usg,eFG,TS_per,ORB_per,DRB_per,AST_per,TO_per,FTM,FTA,FT_per,twoPM,twoPA,twoP_per,TPM,TPA,TP_per,blk_per,stl_per,ftr,yr,ht,porpag,adjoe,pfr,year,type,ast_tov,rimmade,rimmade_rimmiss,midmade,midmade_midmiss,rim_ratio,mid_ratio,dunksmade,dunksmiss_dunksmade,dunks_ratio,drtg,adrtg,dporpag,stops,bpm,obpm,dbpm,gbpm,mp,ogbpm,dgbpm,oreb,dreb,treb,ast,stl,blk,pts,player_id,drafted
0,South Alabama,SB,26,29.5,97.3,16.6,42.5,44.43,1.6,4.6,15.8,16.3,10,14,0.714,26,68,0.382,13,39,0.333,0.3,1.5,13.1,2,74.0,0.258086,89.3938,2.5,2009,all,1.823646,18.0,33.0,8.0,25.0,0.5806,0.3333,0.0,1.0,1.0,108.321,108.527,0.893017,49.9644,-4.99514,-1.62336,-3.37178,-4.72315,14.5769,-2.78199,-1.94115,0.1923,0.6154,0.8077,1.1923,0.3462,0.0385,3.8846,7be2aead-da4e-4d13-a74b-4c1e692e2368,0.0
1,Utah St.,WAC,34,60.9,108.3,14.9,52.4,54.48,3.8,6.3,13.6,19.8,30,45,0.667,56,113,0.496,20,51,0.392,1.0,1.1,27.4,2,76.0,1.33492,100.066,3.4,2009,all,1.631621,18.0,33.0,8.0,25.0,0.5806,0.3333,0.0,1.0,1.0,105.078,104.017,1.88003,111.929,0.593024,1.38549,-0.792469,-0.300196,24.5294,-0.052263,-0.247934,0.6765,1.2647,1.9412,1.8235,0.4118,0.2353,5.9412,61de55d9-1582-4ea4-b593-44f6aa6524a6,0.0
2,South Florida,BE,27,72.0,96.2,21.8,45.7,47.98,2.1,8.0,14.7,15.9,45,67,0.672,67,157,0.427,50,154,0.325,0.0,0.9,21.5,4,76.0,2.27407,104.107,1.7,2009,all,1.081662,18.0,33.0,8.0,25.0,0.5806,0.3333,0.0,1.0,1.0,107.556,102.154,2.76387,115.021,0.003161,1.02477,-1.02161,0.665065,33.1852,1.54823,-0.883163,0.6296,2.3333,2.963,1.963,0.4815,0.0,12.1852,efdc4cfc-9dd0-4bf8-acef-7273e4d5b655,0.0
3,Pepperdine,WCC,30,44.5,97.7,16.0,53.6,53.69,4.1,9.4,13.7,23.8,14,27,0.519,25,63,0.397,28,62,0.452,0.8,1.9,21.6,4,76.0,0.552857,93.2086,2.0,2009,all,0.94283,18.0,33.0,8.0,25.0,0.5806,0.3333,0.0,1.0,1.0,108.858,106.556,1.21871,84.1698,-0.977798,-0.502574,-0.475224,-0.736233,17.9667,-0.342775,-0.393459,0.7,1.4333,2.1333,1.1,0.5667,0.1333,4.9333,14f05660-bb3c-4868-b3dd-09bcdb64279d,0.0
4,Pacific,BW,33,56.2,96.5,22.0,52.8,54.31,8.3,18.6,8.2,22.7,64,114,0.561,93,176,0.528,0,0,0.0,1.9,1.3,64.8,4,80.0,1.06013,97.8554,3.8,2009,all,0.491229,18.0,33.0,8.0,25.0,0.5806,0.3333,0.0,1.0,1.0,101.052,100.724,1.99279,128.028,-1.83606,-1.94604,0.109983,-2.35318,22.9091,-1.68486,-0.668318,1.4242,3.303,4.7273,0.8485,0.4545,0.3333,7.5758,a58db52f-fbba-4e7b-83d0-371efcfed039,0.0


In [20]:
pd.set_option('display.max_columns', None)
metadata.loc[31, 'description']

'Type of metrics displayed: `All` for all types, `C` for conference`, `NC` for non-conference, `PC` for pre-conference tour, `R` for regular season, `P` for post-season, `T` for NCAA'

In [23]:
# Feature Engineering

# One-hot encoding for categorical variables
X_features = preprocessed_train_data.drop(columns=['drafted', 'team', 'player_id', 'conf'])
X_encoded = pd.get_dummies(X_features, drop_first=True)

# Standard Scaling for numerical features
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_encoded)
y_target = preprocessed_train_data['drafted']

# Data Splitting
from sklearn.model_selection import train_test_split

X_train_xgb, X_val_xgb, y_train_xgb, y_val_xgb = train_test_split(X_scaled, y_target, test_size=0.2, random_state=42)

# Model Training
import xgboost as xgb

xgb_model = xgb.XGBClassifier(objective='binary:logistic', n_estimators=1000, learning_rate=0.01, n_jobs=-1)
xgb_model.fit(X_train_xgb, y_train_xgb, 
              early_stopping_rounds=10, 
              eval_metric='auc', 
              eval_set=[(X_val_xgb, y_val_xgb)], 
              verbose=True)

# Model Evaluation
from sklearn.metrics import roc_auc_score

y_val_preds_xgb = xgb_model.predict_proba(X_val_xgb)[:, 1]
auc_roc = roc_auc_score(y_val_xgb, y_val_preds_xgb)
print(f"Validation AUC-ROC Score: {auc_roc:.4f}")





[0]	validation_0-auc:0.93281
[1]	validation_0-auc:0.93325
[2]	validation_0-auc:0.93322
[3]	validation_0-auc:0.93319
[4]	validation_0-auc:0.93774
[5]	validation_0-auc:0.93822
[6]	validation_0-auc:0.94626
[7]	validation_0-auc:0.94599
[8]	validation_0-auc:0.94605
[9]	validation_0-auc:0.94609
[10]	validation_0-auc:0.94615
[11]	validation_0-auc:0.94612
[12]	validation_0-auc:0.94610
[13]	validation_0-auc:0.94642
[14]	validation_0-auc:0.94638
[15]	validation_0-auc:0.94644
[16]	validation_0-auc:0.94652
[17]	validation_0-auc:0.95057
[18]	validation_0-auc:0.95052
[19]	validation_0-auc:0.95051
[20]	validation_0-auc:0.95047
[21]	validation_0-auc:0.95060
[22]	validation_0-auc:0.95065
[23]	validation_0-auc:0.95064
[24]	validation_0-auc:0.95843
[25]	validation_0-auc:0.95853
[26]	validation_0-auc:0.95851
[27]	validation_0-auc:0.95831
[28]	validation_0-auc:0.95851
[29]	validation_0-auc:0.95864
[30]	validation_0-auc:0.95884
[31]	validation_0-auc:0.95884
[32]	validation_0-auc:0.95888
[33]	validation_0-au

## SMOTE & XGBOOST

In [4]:
preprocessed_train_data.head()

Unnamed: 0,team,conf,GP,Min_per,Ortg,usg,eFG,TS_per,ORB_per,DRB_per,...,dgbpm,oreb,dreb,treb,ast,stl,blk,pts,player_id,drafted
0,South Alabama,SB,26,29.5,97.3,16.6,42.5,44.43,1.6,4.6,...,-1.94115,0.1923,0.6154,0.8077,1.1923,0.3462,0.0385,3.8846,7be2aead-da4e-4d13-a74b-4c1e692e2368,0.0
1,Utah St.,WAC,34,60.9,108.3,14.9,52.4,54.48,3.8,6.3,...,-0.247934,0.6765,1.2647,1.9412,1.8235,0.4118,0.2353,5.9412,61de55d9-1582-4ea4-b593-44f6aa6524a6,0.0
2,South Florida,BE,27,72.0,96.2,21.8,45.7,47.98,2.1,8.0,...,-0.883163,0.6296,2.3333,2.963,1.963,0.4815,0.0,12.1852,efdc4cfc-9dd0-4bf8-acef-7273e4d5b655,0.0
3,Pepperdine,WCC,30,44.5,97.7,16.0,53.6,53.69,4.1,9.4,...,-0.393459,0.7,1.4333,2.1333,1.1,0.5667,0.1333,4.9333,14f05660-bb3c-4868-b3dd-09bcdb64279d,0.0
4,Pacific,BW,33,56.2,96.5,22.0,52.8,54.31,8.3,18.6,...,-0.668318,1.4242,3.303,4.7273,0.8485,0.4545,0.3333,7.5758,a58db52f-fbba-4e7b-83d0-371efcfed039,0.0


In [4]:
# Splitting the preprocessed data
from sklearn.model_selection import train_test_split

# Ensure all non-numeric columns are encoded
X_features_smote = pd.get_dummies(preprocessed_train_data.drop(columns=['drafted', 'player_id', 'team', 'conf']), drop_first=True)

y_smote = preprocessed_train_data['drafted']
X_train_smote, X_val_smote, y_train_smote, y_val_smote = train_test_split(X_features_smote, y_smote, test_size=0.2, random_state=42)

# Applying SMOTE to balance the dataset
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_smote, y_train_smote)

# Training XGBoost on the resampled data
import xgboost as xgb

xgb_model_smote_corrected = xgb.XGBClassifier(objective='binary:logistic', n_estimators=1000, learning_rate=0.01, n_jobs=-1)
xgb_model_smote_corrected.fit(X_train_resampled, y_train_resampled, 
                              early_stopping_rounds=10, 
                              eval_metric='auc', 
                              eval_set=[(X_val_smote, y_val_smote)], 
                              verbose=True)

# Model Evaluation
from sklearn.metrics import roc_auc_score

y_val_preds_smote_corrected = xgb_model_smote_corrected.predict_proba(X_val_smote)[:, 1]
auc_roc_smote_corrected = roc_auc_score(y_val_smote, y_val_preds_smote_corrected)
print(f"Validation AUC-ROC Score using SMOTE (corrected): {auc_roc_smote_corrected:.4f}")




[0]	validation_0-auc:0.95514
[1]	validation_0-auc:0.95529
[2]	validation_0-auc:0.95511
[3]	validation_0-auc:0.95515
[4]	validation_0-auc:0.95504
[5]	validation_0-auc:0.95525
[6]	validation_0-auc:0.95516
[7]	validation_0-auc:0.96237
[8]	validation_0-auc:0.96238
[9]	validation_0-auc:0.96289
[10]	validation_0-auc:0.96285
[11]	validation_0-auc:0.96274
[12]	validation_0-auc:0.96364
[13]	validation_0-auc:0.96428
[14]	validation_0-auc:0.97050
[15]	validation_0-auc:0.97103
[16]	validation_0-auc:0.97095
[17]	validation_0-auc:0.97100
[18]	validation_0-auc:0.97101
[19]	validation_0-auc:0.97110
[20]	validation_0-auc:0.97101
[21]	validation_0-auc:0.97132
[22]	validation_0-auc:0.97148
[23]	validation_0-auc:0.97158
[24]	validation_0-auc:0.97152
[25]	validation_0-auc:0.97162
[26]	validation_0-auc:0.97165
[27]	validation_0-auc:0.97198
[28]	validation_0-auc:0.97186
[29]	validation_0-auc:0.97201
[30]	validation_0-auc:0.97196
[31]	validation_0-auc:0.97203
[32]	validation_0-auc:0.97212
[33]	validation_0-au

## Random Forest & XGBoost

In [24]:
X_try1 = preprocessed_train_data.copy()

# Drop the 'type' column
X_try1.drop(columns=['type'], inplace=True)

# Target encode the 'team' column
mean_encode = X_try1.groupby('team')['drafted'].mean()
X_try1['team_encoded'] = X_try1['team'].map(mean_encode)

# Drop the original 'team' and 'player_id' columns
X_try1.drop(columns=['team', 'player_id'], inplace=True)

# One-hot encode the remaining categorical columns
X_try1_encoded = pd.get_dummies(X_try1, drop_first=True)

# Split the data into features (X) and target (y)
X = X_try1_encoded.drop(columns=['drafted'])
y = X_try1_encoded['drafted']

X.shape, y.shape


((55655, 92), (55655,))

In [25]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Train a RandomForest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf_model.fit(X_train_scaled, y_train)
rf_predictions = rf_model.predict_proba(X_val_scaled)[:, 1]
rf_auc = roc_auc_score(y_val, rf_predictions)

# Train a GradientBoosting model
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_model.fit(X_train_scaled, y_train)
gb_predictions = gb_model.predict_proba(X_val_scaled)[:, 1]
gb_auc = roc_auc_score(y_val, gb_predictions)

rf_auc, gb_auc


(0.9729820578381237, 0.9869748925030181)

## Attempt #1

In [4]:
from sklearn.model_selection import train_test_split

# Split the data into train and validation sets
X = preprocessed_train_data.drop(['player_id', 'drafted'], axis=1)
y = preprocessed_train_data['drafted']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify numerical and categorical columns
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()

In [5]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Identify numerical and categorical columns
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()

# Preprocessing for numerical data
numerical_transformer = StandardScaler()

# Preprocessing for categorical data
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Define the model
model = LogisticRegression(max_iter=1000, random_state=42)

# Bundle preprocessing and modeling code in a pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])

# Preprocessing of training data, train model
pipeline.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
val_preds = pipeline.predict_proba(X_val)[:, 1]

# Get the AUROC score
val_score = roc_auc_score(y_val, val_preds)
val_score

0.9911871944539821

## Attempt #2

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Split the data into train and validation sets
X = preprocessed_train_data.drop(['player_id', 'drafted'], axis=1)
y = preprocessed_train_data['drafted']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify numerical and categorical columns
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()

# Preprocessing for numerical data
numerical_transformer = StandardScaler()

# Preprocessing for categorical data
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

In [5]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Extend the preprocessor to include polynomial features for numerical columns
polynomial_transformer = Pipeline(steps=[
    ('num', numerical_transformer),
    ('poly', PolynomialFeatures(degree=2, interaction_only=False, include_bias=False))
])

# Update the column transformer to use the polynomial transformer for numerical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', polynomial_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Define the model
model = LogisticRegression(max_iter=2000, random_state=42)

# Update the pipeline to use the new preprocessor
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])

# Preprocessing of training data, train model
pipeline.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
val_preds_poly = pipeline.predict_proba(X_val)[:, 1]

# Get the AUROC score
val_score_poly = roc_auc_score(y_val, val_preds_poly)
val_score_poly


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9855471624074165

## Attempt 3

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score
import xgboost as xgb

# TODO: Replace this with loading your dataset
# train_data = pd.read_csv('YOUR_DATASET_PATH_HERE.csv')

# Split the data into training and validation sets
X = preprocessed_train_data.drop(['drafted'], axis=1)
y = preprocessed_train_data['drafted']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify numerical and categorical columns
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()

# Preprocessing for numerical data
numerical_transformer = StandardScaler()

# Preprocessing for categorical data
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Apply preprocessing
X_train_scaled = preprocessor.fit_transform(X_train)
X_val_scaled = preprocessor.transform(X_val)

# Define the XGBoost model and hyperparameters for tuning
xgb_model = xgb.XGBClassifier(objective='binary:logistic', random_state=42, n_jobs=-1, eval_metric='auc')
param_dist = {
    'learning_rate': [0.01, 0.05, 0.1, 0.5, 1],
    'n_estimators': [50, 100, 200, 500],
    'max_depth': [3, 5, 7, 10],
    'subsample': [0.5, 0.7, 0.9, 1],
    'colsample_bytree': [0.5, 0.7, 0.9, 1],
    'gamma': [0, 0.25, 0.5, 1]
}

# Use RandomizedSearchCV for hyperparameter tuning
random_search = RandomizedSearchCV(xgb_model, param_distributions=param_dist, n_iter=50, scoring='roc_auc', 
                                   n_jobs=-1, cv=3, random_state=42)
random_search.fit(X_train_scaled, y_train)

# Get the best model
best_xgb_model = random_search.best_estimator_

# Predict on the validation set
xgb_preds = best_xgb_model.predict_proba(X_val_scaled)[:, 1]

# Get the AUROC score
xgb_val_score = roc_auc_score(y_val, xgb_preds)

print(f"Best AUROC Score: {xgb_val_score}")
print(f"Best Hyperparameters: {random_search.best_params_}")


Best AUROC Score: 0.990820947989732
Best Hyperparameters: {'subsample': 0.7, 'n_estimators': 500, 'max_depth': 10, 'learning_rate': 0.05, 'gamma': 0.25, 'colsample_bytree': 0.9}


In [6]:
# 1. Preprocess the test data using the train_stats from the training data preprocessing
preprocessed_test_data, _ = preprocess_func(test_data, train_stats=training_statistics)

In [9]:
preprocessed_test_data.head()

Unnamed: 0,team,conf,GP,Min_per,Ortg,usg,eFG,TS_per,ORB_per,DRB_per,...,ogbpm,dgbpm,oreb,dreb,treb,ast,stl,blk,pts,player_id
0,Morgan St.,MEAC,2,3.0,115.1,4.7,50.0,50.0,0.0,4.6,...,-2.46774,-2.27566,0.0,0.3333,0.3333,0.0,0.0,0.0,1.0,cf302b4d-84f7-4124-a25d-a75eed31978b
1,South Carolina St.,MEAC,11,17.6,61.1,18.6,34.7,35.18,2.5,15.7,...,-7.49472,-4.41253,0.2727,1.4545,1.7273,0.4545,0.1818,0.0,2.3636,f91837cd-4f49-4b70-963d-aeb82c6ce3da
2,Binghamton,AE,9,28.6,91.9,23.8,54.1,52.49,6.4,22.5,...,-2.92495,1.71789,1.3333,4.4444,5.7778,1.0,0.6667,1.8889,8.8889,53ec2a29-1e7d-4c6d-86d7-d60d02af8916
3,Illinois,B10,7,1.3,111.0,10.4,83.3,83.33,0.0,13.4,...,-0.767911,0.962469,0.0,0.2857,0.2857,0.0,0.0,0.0,0.7143,32402798-471c-4a54-8cb4-29cd95199014
4,Iowa St.,B12,23,78.5,103.1,21.5,54.0,56.12,3.6,10.2,...,2.89361,-1.019,1.0435,2.8696,3.913,1.1739,0.8261,0.087,14.3043,73b960f9-27b8-4431-9d23-a760e9bbc360


In [11]:
preprocessed_test_data_without = preprocessed_test_data.drop('player_id', axis=1)

In [13]:
# 1. Load your test data (if not already loaded)
# test_data = pd.read_csv('PATH_TO_YOUR_TEST_DATA.csv')

# 2. Apply preprocessing to the test data
X_test_scaled = preprocessor.transform(preprocessed_test_data)

# 3. Use the best XGBoost model to make predictions
test_preds_xgb = best_xgb_model.predict_proba(X_test_scaled)[:, 1]

# 4. Create a submission DataFrame
submission_xgb = pd.DataFrame({'player_id': preprocessed_test_data['player_id'], 'drafted': test_preds_xgb})

# 5. Save the submission DataFrame to a CSV file
submission_path_xgb = 'submissions_xgboostmodel.csv'
submission_xgb.to_csv(submission_path_xgb, index=False)
submission_path_xgb

'submissions_xgboostmodel.csv'