# NBA Draft Prediction with XGBoost

## Data Preprocessing

### Load the data

In [1]:
import pandas as pd

metadata = pd.read_csv('../data/raw/metadata.csv')
train_data = pd.read_csv('../data/raw/train.csv')
test_data = pd.read_csv('../data/raw/test.csv')

  train_data = pd.read_csv('../data/raw/train.csv')


### Handle missing values

In [2]:
# Drop columns with more than 50% missing data
columns_to_drop = ['pick', 'Rec_Rank', 'dunks_ratio', 'ht', 'num']
train_data.drop(columns_to_drop, axis=1, inplace=True)
test_data.drop(columns_to_drop, axis=1, inplace=True)

# Impute missing values with median for remaining columns with missing data
columns_to_impute = ['mid_ratio', 'rim_ratio', 'rimmade', 'rimmade_rimmiss', 'midmade', 'midmade_midmiss',
                     'dunksmade', 'dunksmiss_dunksmade', 'ast_tov', 'obpm', 'dgbpm',
                     'ogbpm', 'gbpm', 'dbpm', 'adrtg', 'bpm', 'stops', 'dporpag', 'drtg', 'mp', 'oreb',
                     'dreb', 'treb', 'ast', 'stl', 'blk', 'pts']

for column in columns_to_impute:
    median_value = train_data[column].median()
    train_data[column].fillna(median_value, inplace=True)
    test_data[column].fillna(median_value, inplace=True)
    
year_mapping = {'Fr': 1, 'So': 2, 'Jr': 3, 'Sr': 4}
train_data['yr'] = train_data['yr'].map(year_mapping)
test_data['yr'] = test_data['yr'].map(year_mapping)

## Feature Selection

In [3]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

# Identify the categorical columns
categorical_cols = train_data.select_dtypes(include=['object']).columns.tolist()

# Remove 'player_id' from the list of categorical columns
categorical_cols.remove('player_id')

# Apply one-hot encoding to the categorical columns
encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
train_encoded = pd.DataFrame(encoder.fit_transform(train_data[categorical_cols]))

# Concatenate the one-hot encoded columns with the original numerical columns
train_data_preprocessed = pd.concat([train_data.drop(categorical_cols, axis=1), train_encoded], axis=1)

# Define the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the Random Forest model
rf_model.fit(train_data_preprocessed.drop(['player_id', 'drafted'], axis=1), train_data['drafted'])

# Get feature importances from the Random Forest model
feature_importances = rf_model.feature_importances_

# Get sorted indices of feature importances in descending order
sorted_indices = feature_importances.argsort()[::-1]

# Select top features
num_features = 50
top_features = train_data_preprocessed.drop(['player_id', 'drafted'], axis=1).columns[sorted_indices[:num_features]]

# Select top features in training and test data
train_data_selected = train_data_preprocessed[top_features]



TypeError: Feature names are only supported if all input features have string names, but your input has ['int', 'str'] as feature name / column name types. If you want feature names to be stored and validated, you must convert them all to strings, by using X.columns = X.columns.astype(str) for example. Otherwise you can remove feature / column names from your input data, or convert them all to a non-string data type.

## Model Training

In [None]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import roc_auc_score

# Split the data into train and validation sets
X = train_data_selected
y = train_data['drafted']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the XGBoost model with reduced number of estimators and max depth
xgb_model = xgb.XGBClassifier(objective='binary:logistic', n_estimators=50, max_depth=5, random_state=42, n_jobs=-1)

# Train the XGBoost model on the selected features
xgb_model.fit(X_train, y_train)

# Get predictions on validation data
xgb_val_preds = xgb_model.predict_proba(X_val)[:, 1]

# Get the AUROC score
xgb_val_score = roc_auc_score(y_val, xgb_val_preds)
xgb_val_score

## Generate Predictions

In [None]:
# Make predictions on the test data
test_preds = xgb_model.predict_proba(test_data_selected)[:, 1]

# Create a submission DataFrame
submission = pd.DataFrame({'player_id': test_data['player_id'], 'drafted': test_preds})

# Save the submission DataFrame to a CSV file
submission.to_csv('submissions_xgb.csv', index=False)