In [1]:
# import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [2]:
# load the raw draftee table from the project's data directory
nba_draftees = pd.read_csv(filepath_or_buffer='../data/nba_draftees.csv')

In [3]:
# rows usable for modelling: drafted in 2014 or earlier and not flagged 'Exclude'
in_model_seasons = nba_draftees['draft_season'] <= 2014
meets_requirements = nba_draftees['classification'] != 'Exclude'

# apply both filters, renumber the rows, and discard draft_season,
# which is no longer needed once the season filter has been applied
player_data = (
    nba_draftees.loc[in_model_seasons & meets_requirements]
    .reset_index(drop=True)
    .drop(columns=['draft_season'])
)

In [4]:
# create dummy values for position, conference
# dtype is pinned explicitly: pandas >= 2.0 emits bool dummies by default, and
# select_dtypes(include=[np.number]) below would then silently drop them from
# the model features
player_data = pd.get_dummies(player_data, columns=['position', 'conference'], dtype=np.uint8)

# drop data points correlated with existing ones
player_data = player_data.drop(columns=[
    'assist_percentage', 'block_percentage', 'defensive_rebounds', 'offensive_rebounds',
    'total_rebound_percentage', 'defensive_win_shares', 'offensive_win_shares',
    'win_shares_per_40_minutes', 'effective_field_goal_percentage', 'field_goals',
    'free_throws', 'games_played', 'three_pointers', 'true_shooting_percentage',
    'turnover_percentage', 'two_pointers', 'standing_vertical'])

# shuffle the whole frame ONCE, before anything is derived from it.
# train_test_split pairs rows by position, so the features, the target and the
# descriptive columns must all share one row order; shuffling only the numeric
# subset (as was done previously) attached each feature row to another player's label
player_data = player_data.sample(frac = 1, random_state = 23).reset_index(drop = True)

# create target variable (taken after the shuffle so it lines up with model_data)
target = player_data['classification']

# subset remaining columns with numeric values
model_data = player_data.select_dtypes(include=[np.number])

In [5]:
from sklearn.model_selection import train_test_split

# descriptive (non-feature) columns: everything in player_data that is not in model_data
info_cols = player_data.columns.difference(model_data.columns)
player_info = player_data[info_cols]

# a single call partitions the features, the descriptive columns and the labels
# with the same seeded row selection
(X_train, X_test,
 train_info, test_info,
 y_train, y_test) = train_test_split(model_data, player_info, target, random_state = 23)

# the label splits are also exposed under the names paired with the info frames
train_class, test_class = y_train, y_test

In [None]:
# RandomForestClassifier was used below without ever being imported (NameError on a fresh kernel)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Create the parameter grid based on the results of random search
# Grid size: 8 * 1 * 5 * 5 * 9 = 1,800 combinations, i.e. 9,000 fits with cv = 5
param_grid = {
    'bootstrap': [True],
    'max_depth': [20, 30, 40, 50, 60, 70, 80, 90],
    # 'auto' was dropped: for classifiers it was only an alias of 'sqrt' (so it
    # doubled the search for identical models) and scikit-learn >= 1.3 rejects it
    'max_features': ['sqrt'],
    'min_samples_leaf': [2, 3, 4, 5, 6],
    'min_samples_split': [2, 4, 6, 8, 10],
    'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800]
}

# Create a base model (seeded so the forests are reproducible)
rfc = RandomForestClassifier(random_state = 23)

# Instantiate the grid search model: 5-fold CV, verbose progress, all CPU cores
rf_GS = GridSearchCV(estimator = rfc, param_grid = param_grid,
                     cv = 5, verbose = 2, n_jobs = -1)

# Fit the grid search to the data
rf_GS.fit(X_train, y_train)

In [None]:
# take the best estimator found by the grid search and score it on the test split
best_GS = rf_GS.best_estimator_
holdout_score = best_GS.score(X_test, y_test)

print(f'Grid Search Score: {holdout_score}')

# display the winning hyper-parameter combination
rf_GS.best_params_

In [None]:
# one row per test player: (name, class-probability vector, predicted label)
test_names = test_info['name']
test_probabilities = best_GS.predict_proba(X_test)
test_labels = best_GS.predict(X_test)
test_raw = pd.DataFrame(zip(test_names, test_probabilities, test_labels))

In [None]:
# unpack test_raw df
# column 0 = player name, column 1 = per-class probability vector (predict_proba),
# column 2 = label returned by predict
# NOTE(review): the positions used below assume best_GS.classes_ is ordered
# [All-Star, Bust, Role Player, Starter] — confirm against best_GS.classes_
class_probabilities = test_raw[1]

player = list(test_raw[0])
asg = [probs[0] for probs in class_probabilities]
bust = [probs[1] for probs in class_probabilities]
rp = [probs[2] for probs in class_probabilities]
st = [probs[3] for probs in class_probabilities]

# test_raw only carries the model's predictions; the actual labels come from y_test.
# Previously the predictions were written out under the 'True Classification' header.
pClass = list(test_raw[2])
tClass = list(y_test)

test_predictions = pd.DataFrame({'Name': player,
                                 'All-Star %': asg,
                                 'Starter %': st,
                                 'Role Player %': rp,
                                 'Bust %': bust,
                                 'True Classification': tClass,
                                 'Predicted Classification': pClass})

test_predictions.to_csv('../data/rf_gs.csv', index = False)