In [1]:
# import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [2]:
# load the raw draftee table from the project's data directory
draftees_csv = '../data/nba_draftees.csv'
nba_draftees = pd.read_csv(draftees_csv)

In [3]:
# keep draft seasons up to and including 2014, minus the players whose
# classification is 'Exclude' (they did not meet the model-data requirements)
in_model_seasons = nba_draftees['draft_season'] <= 2014
meets_requirements = nba_draftees['classification'] != 'Exclude'

# renumber the surviving rows from 0 and discard draft_season, which is no
# longer needed once the season filter has been applied
player_data = (
    nba_draftees[in_model_seasons & meets_requirements]
    .reset_index(drop = True)
    .drop(['draft_season'], axis = 1)
)

In [4]:
# create dummy values for position, conference
# dtype is pinned to uint8 so the dummy columns are still picked up by the
# numeric select_dtypes() below on pandas versions whose default dummy dtype is bool
player_data = pd.get_dummies(player_data, columns=['position', 'conference'], dtype=np.uint8)

# shuffle data to prepare for cross validation
# The shuffle is applied to player_data itself, BEFORE the target and the
# numeric feature table are derived from it. Shuffling only the feature table
# afterwards would leave the labels (and the player info columns) in the old
# row order, so features and labels would no longer describe the same player.
player_data = player_data.sample(frac = 1, random_state = 23).reset_index(drop = True)

# create target variable (same row order as player_data / model_data)
target = player_data['classification']

# drop data points correlated with existing ones
player_data = player_data.drop(
    ['assist_percentage', 'block_percentage', 'defensive_rebounds', 'offensive_rebounds',
     'total_rebound_percentage', 'defensive_win_shares', 'offensive_win_shares',
     'win_shares_per_40_minutes', 'effective_field_goal_percentage', 'field_goals',
     'free_throws', 'games_played', 'three_pointers', 'true_shooting_percentage',
     'turnover_percentage', 'two_pointers', 'standing_vertical'], axis = 1)

# subset remaining columns with numeric values
model_data = player_data.select_dtypes(include=[np.number])

# features and labels are paired by position from here on
assert len(model_data) == len(target)

In [5]:
# test efficacy of remaining features on gridSearch model
#
# Mean-decrease-accuracy (permutation) importance: for each of 100 random
# 70/30 splits, fit the forest, record its test accuracy, then shuffle one
# feature column at a time in the test set and record the relative drop in
# accuracy. A feature whose shuffling costs accuracy is useful to the model.
#
# Original code comes from http://blog.datadive.net/selecting-good-features-part-iii-random-forests/
from collections import defaultdict
from sklearn.model_selection import ShuffleSplit
from sklearn.ensemble import RandomForestClassifier

r = np.random.RandomState(23)

features = model_data.columns.values
rs = ShuffleSplit(n_splits = 100, test_size = 0.3, random_state = 23)

# .values replaces the deprecated DataFrame.as_matrix()
X = model_data.values
Y = target

# NOTE(review): X and Y are paired purely by position, so model_data and
# target must be in the same row order for these scores to be meaningful.
assert len(X) == len(Y)

scores = defaultdict(list)

for train_idx, test_idx in rs.split(X):
    X_train, X_test = X[train_idx], X[test_idx]
    # ShuffleSplit yields positional indices, so index the labels positionally too
    Y_train, Y_test = Y.iloc[train_idx], Y.iloc[test_idx]

    # random_state makes the importances reproducible; n_jobs only affects speed
    rf_gs = RandomForestClassifier(bootstrap = True, max_depth = 20, max_features = 'sqrt',
                                   min_samples_leaf = 5, min_samples_split = 3, n_estimators = 1200,
                                   random_state = 23, n_jobs = -1)
    rf_gs = rf_gs.fit(X_train, Y_train)
    acc = rf_gs.score(X_test, Y_test)

    for i in range(X.shape[1]):
        # shuffle a copy so every feature is tested against the intact test set
        X_t = X_test.copy()
        r.shuffle(X_t[:, i])
        shuff_acc = rf_gs.score(X_t, Y_test)
        scores[features[i]].append((acc - shuff_acc) / acc)

# keys() and values() iterate in the same order, so the two lists stay paired
mda_features = list(scores.keys())
mda_importance = [np.mean(score) for score in scores.values()]
mda_indices = np.argsort(mda_importance)

mda_df = pd.DataFrame({'feature': mda_features,
                       'importance': mda_importance})

  # This is added back by InteractiveShellApp.init_path()


KeyboardInterrupt: 

In [None]:
# horizontal bar chart of the mean-decrease-accuracy score per feature,
# drawn in ascending order of importance (order given by mda_indices)
fig, ax = plt.subplots(figsize = (10,14))

bar_slots = range(len(mda_indices))
sorted_importance = [mda_importance[i] for i in mda_indices]
sorted_labels = [mda_features[i] for i in mda_indices]

ax.set_title('Feature Importances')
ax.barh(bar_slots, sorted_importance, color='green', alpha = 0.5, align='center')
ax.set_yticks(bar_slots)
ax.set_yticklabels(sorted_labels)
ax.set_xlabel('Mean decrease accuracy')
plt.show()

In [None]:
# keep only the features whose mean importance is strictly positive,
# i.e. shuffling them lowered accuracy on average
has_positive_importance = mda_df['importance'] > 0
acc_cols = mda_df.loc[has_positive_importance, 'feature'].tolist()

acc_data = model_data[acc_cols]

acc_data.head()

In [None]:
# split data into train & test sets
from sklearn.model_selection import train_test_split

# columns of player_data that are not part of the numeric model features
info_cols = player_data.columns.difference(model_data.columns)
player_info = player_data[info_cols]

# a single call partitions the features, the player info and the labels with
# the same seeded row selection
(X_train, X_test,
 train_info, test_info,
 y_train, y_test) = train_test_split(acc_data, player_info, target, random_state = 23)

# label splits that accompany train_info / test_info
train_class, test_class = y_train.copy(), y_test.copy()

In [None]:
# run preliminary model using grid search parameters
# random_state is fixed (23, like the other random steps in this notebook) so
# the score and the exported predictions are the same on every run;
# n_jobs = -1 only parallelises tree building and does not change the result
rf_ft_v2 = RandomForestClassifier(bootstrap = True, max_depth = 20, max_features = 'sqrt',
                                  min_samples_leaf = 5, min_samples_split = 3, n_estimators = 1200,
                                  random_state = 23, n_jobs = -1)
rf_ft_v2 = rf_ft_v2.fit(X_train, y_train)

# mean accuracy on the held-out test set (displayed as the cell output)
rf_ft_v2.score(X_test, y_test)

In [None]:
# pair each test player's name with the model's per-class probabilities and
# its predicted class; the resulting columns are 0 = name, 1 = probability
# array, 2 = predicted class
test_proba = rf_ft_v2.predict_proba(X_test)
test_pred = rf_ft_v2.predict(X_test)

# materialise the zip first: a bare iterator is not accepted by the DataFrame
# constructor in every pandas release, a list of tuples always is
test_raw = pd.DataFrame(list(zip(test_info['name'], test_proba, test_pred)))

In [None]:
# unpack test_raw df
# column 0 holds the player name, column 1 the per-class probability array
# and column 2 the predicted class
player = test_raw[0].tolist()
class_probs = test_raw[1].tolist()
tClass = test_raw[2].tolist()

# NOTE(review): the positions below assume predict_proba orders its columns as
# All-Star, Bust, Role Player, Starter - confirm against rf_ft_v2.classes_
asg = [probs[0] for probs in class_probs]
bust = [probs[1] for probs in class_probs]
rp = [probs[2] for probs in class_probs]
st = [probs[3] for probs in class_probs]

test_predictions = pd.DataFrame({'Name': player,
                                 'All-Star %': asg,
                                 'Starter %': st,
                                 'Role Player %': rp,
                                 'Bust %': bust,
                                 'Model Classification': tClass})

# write the per-player predictions next to the input data
test_predictions.to_csv('../data/rf_ft_v2.csv', index = False)