# nba_tune_model

### Uses repeated grid search to select optimal hyperparameters for the random forest model

In [1]:
# Import dependencies
import numpy as np
np.set_printoptions(suppress=True)
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split

In [2]:
# Preprocess data and grid-search random forest hyperparameters
# ---------------------------- #
# Module-level tallies: rfPreprocess() appends the winning value of each
# hyperparameter to the matching list every time it is called.
max_depth, max_features, min_samples_leaf = [], [], []

def _first_principal_component(train_frame, columns, name):
    """Fit a PCA on ``columns`` of ``train_frame`` and return the first component.

    The PCA keeps enough components to explain 75% of the variance, but only
    the first component's scores are returned, as a Series called ``name``
    with a fresh 0..n-1 index.
    """
    subset = train_frame.loc[:, columns]
    pca = PCA(n_components=0.75, random_state=1)
    scores = pca.fit(subset).transform(subset)
    return pd.Series(scores[:, 0], name=name)


def rfPreprocess(data_path='../data/results.csv', random_state=None):
    """Run one grid search over random forest hyperparameters on a fresh split.

    Steps:
      1. Read the results CSV and encode ``teamRslt`` (Win -> 1, else 0) and
         ``teamLoc`` (Home -> 1, else 0).
      2. Draw a 75/25 train/test split; only the training part is used here.
      3. Replace the starter WS / MP-per-game / GS% columns, and likewise the
         bench ones, with the first principal component of each group.
      4. Grid-search ``max_depth``, ``max_features`` and ``min_samples_leaf``
         (27 combinations, 5-fold CV, 251 trees).
      5. Append the winning value of each hyperparameter to the module-level
         lists ``max_depth``, ``max_features`` and ``min_samples_leaf``.

    Parameters
    ----------
    data_path : str, optional
        Location of the results CSV. The default uses forward slashes so the
        same relative path resolves on Windows, Linux and macOS.
    random_state : int or None, optional
        Seed forwarded to the train/test split and to the random forest.
        ``None`` (the default) leaves both unseeded, so every call draws a
        different split. Pass a different integer per call for reproducible
        but still distinct repetitions.

    Returns
    -------
    dict
        Copy of the grid search's ``best_params_`` for this run.
    """
    # Read data and turn binary columns to 0/1
    results = pd.read_csv(data_path)
    results['teamRslt'] = results['teamRslt'].eq('Win').astype(int)   # Win = 1, Loss = 0
    results['teamLoc'] = results['teamLoc'].eq('Home').astype(int)    # Home = 1, Away = 0

    # X/Y and train/test split
    feature_cols = ['teamLoc', 'diff_starting_WS', 'diff_starting_BPM',
                    'diff_starting_MP_per_game', 'diff_starting_GSpct', 'diff_bench_WS',
                    'diff_bench_BPM', 'diff_bench_MP_per_game', 'diff_bench_GSpct',
                    'diff_WinPct2', 'timeSincePrev', 'distSincePrev']
    X = results.loc[:, feature_cols]
    Y = results['teamRslt']

    # The held-out quarter is not needed for tuning, so it is discarded.
    X_train, _, Y_train, _ = train_test_split(
        X, Y, test_size=0.25, random_state=random_state)

    # Collapse each correlated metric group into one principal component
    PCA_cols_starting = ['diff_starting_WS', 'diff_starting_MP_per_game', 'diff_starting_GSpct']
    PCA_cols_bench = ['diff_bench_WS', 'diff_bench_MP_per_game', 'diff_bench_GSpct']
    pc_starters = _first_principal_component(X_train, PCA_cols_starting, 'PC_starters')
    pc_bench = _first_principal_component(X_train, PCA_cols_bench, 'PC_bench')

    # Combine PCA cols with non-PCA cols (training). The index is reset so the
    # rows line up positionally with the freshly indexed PCA Series.
    X_train_noPCA = (X_train
                     .drop(PCA_cols_starting + PCA_cols_bench, axis=1)
                     .reset_index(drop=True))
    X_train_new = pd.concat([X_train_noPCA, pc_starters, pc_bench], axis=1)

    # Fit random forest model on PCA'd data
    # Random forest (w/ Grid Search)
    params = {
        'max_depth': [3, 4, 5],
        'max_features': [3, 4, 5],
        'min_samples_leaf': [5, 7, 9],
        'n_estimators': [251],
    }

    RF_classifier = RandomForestClassifier(random_state=random_state)

    grid_search_RF = GridSearchCV(estimator=RF_classifier, param_grid=params,
                                  cv=5, verbose=1, n_jobs=-1)
    grid_search_RF.fit(X_train_new, Y_train)

    # Save best parameters to the module-level lists
    best = grid_search_RF.best_params_
    max_depth.append(best['max_depth'])
    max_features.append(best['max_features'])
    min_samples_leaf.append(best['min_samples_leaf'])

    return dict(best)

In [3]:
# Perform grid search 25 times
N_RUNS = 25

# Empty the tallies first so that re-running this cell starts a fresh batch
# instead of stacking another N_RUNS results on top of the previous ones.
for tally in (max_depth, max_features, min_samples_leaf):
    tally.clear()

for i in range(N_RUNS):
    print(i)
    rfPreprocess()

0
Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:   37.3s finished


1
Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.7s
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:   37.2s finished


2
Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.8s
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:   37.7s finished


3
Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   12.0s
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:   40.4s finished


4
Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.8s
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:   40.4s finished


5
Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   12.0s
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:   40.4s finished


6
Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:   42.5s finished


7
Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:   41.1s finished


8
Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.7s
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:   40.1s finished


9
Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   12.2s
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:   40.5s finished


10
Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:   39.9s finished


11
Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   12.7s
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:   41.1s finished


12
Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:   40.4s finished


13
Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.7s
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:   39.9s finished


14
Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:   40.2s finished


15
Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.8s
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:   40.2s finished


16
Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   12.6s
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:   41.0s finished


17
Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:   41.3s finished


18
Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:   40.3s finished


19
Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   12.2s
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:   43.4s finished


20
Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   12.3s
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:   43.5s finished


21
Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   13.2s
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:   41.7s finished


22
Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   12.7s
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:   42.4s finished


23
Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:   41.2s finished


24
Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:   42.1s finished


In [4]:
# Print optimal parameters: for each hyperparameter, show how often each
# candidate value won across the repeated grid searches.
tuning_tallies = [
    ('Max Depth', max_depth),
    ('Max Features', max_features),
    ('Min Samples Leaf', min_samples_leaf),
]

for position, (label, winners) in enumerate(tuning_tallies):
    if position > 0:
        print('\n')   # blank-line separator between sections only
    print(label)
    print(pd.Series(winners).value_counts())

Max Depth
4    12
3     7
5     6
dtype: int64


Max Features
5    11
4     9
3     5
dtype: int64


Min Samples Leaf
5    12
7     8
9     5
dtype: int64
