# NBA Draft Prediction

## Data Preprocessing

### Load the data

In [5]:
import pandas as pd

metadata = pd.read_csv('../data/raw/metadata.csv')
train_data = pd.read_csv('../data/raw//train.csv')
test_data = pd.read_csv('../data/raw//test.csv')

  train_data = pd.read_csv('../data/raw//train.csv')


### Handle non-numeric values

In [6]:
# Handle non-numeric values in 'num' column
median_value = pd.to_numeric(train_data['num'], errors='coerce').median()
train_data['num'] = pd.to_numeric(train_data['num'], errors='coerce').fillna(median_value)
test_data['num'] = pd.to_numeric(test_data['num'], errors='coerce').fillna(median_value)

# Convert 'yr' column to ordinal numbers
year_mapping = {'Fr': 1, 'So': 2, 'Jr': 3, 'Sr': 4}
train_data['yr'] = train_data['yr'].map(year_mapping)
test_data['yr'] = test_data['yr'].map(year_mapping)

### Handle missing values

In [7]:
# Drop columns with more than 50% missing data
columns_to_drop = ['pick', 'Rec_Rank', 'dunks_ratio', 'ht']
train_data.drop(columns_to_drop, axis=1, inplace=True)
test_data.drop(columns_to_drop, axis=1, inplace=True)

# Impute missing values with median for remaining columns with missing data
columns_to_impute = ['mid_ratio', 'rim_ratio', 'rimmade', 'rimmade_rimmiss', 'midmade', 'midmade_midmiss',
                     'dunksmade', 'dunksmiss_dunksmade', 'num', 'ast_tov', 'yr', 'obpm', 'dgbpm',
                     'ogbpm', 'gbpm', 'dbpm', 'adrtg', 'bpm', 'stops', 'dporpag', 'drtg', 'mp', 'oreb',
                     'dreb', 'treb', 'ast', 'stl', 'blk', 'pts']

for column in columns_to_impute:
    median_value = train_data[column].median()
    train_data[column].fillna(median_value, inplace=True)
    test_data[column].fillna(median_value, inplace=True)

## Model Training

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Split the data into train and validation sets
X = train_data.drop(['player_id', 'drafted'], axis=1)
y = train_data['drafted']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify numerical and categorical columns
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()

# Preprocessing for numerical data
numerical_transformer = StandardScaler()

# Preprocessing for categorical data
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Define the model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Bundle preprocessing and modeling code in a pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])

# Preprocessing of training data, train model
pipeline.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
val_preds = pipeline.predict_proba(X_val)[:, 1]

# Get the AUROC score
val_score = roc_auc_score(y_val, val_preds)
val_score

0.9836798742823925

## Generate Predictions

In [10]:
# Make predictions on the test data
test_preds = pipeline.predict_proba(test_data.drop('player_id', axis=1))[:, 1]

# Create a submission DataFrame
submission = pd.DataFrame({'player_id': test_data['player_id'], 'drafted': test_preds})

# Save the submission DataFrame to a CSV file
submission_path = 'submissions.csv'
submission.to_csv(submission_path, index=False)
submission_path

'submissions.csv'