In [None]:
# load packages
import pandas as pd
import numpy as np
from IPython import display

import warnings
warnings.filterwarnings('ignore')
from sklearn import linear_model, ensemble, tree, feature_selection, model_selection, metrics
from xgboost import XGBClassifier, XGBRegressor

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Plot appearance: ggplot base style, seaborn "white" theme on top, large default figure size.
plt.style.use('ggplot')
sns.set_style('white')
plt.rcParams['figure.figsize'] = (15, 10)

# Render (practically) every DataFrame column and longer sequences in cell output.
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_seq_items', 200)

In [None]:
# Read the model-ready player table, then keep only the rows that have a target value.
df = pd.read_csv('ModelReadyPlayerData.csv')
df = df.loc[df['ep_PIPM'].notna()]

In [None]:
# `make_scorer` is imported from the public `sklearn.metrics` namespace; the private
# `sklearn.metrics.scorer` module was deprecated and later removed from scikit-learn.
from sklearn.metrics import make_scorer, r2_score


def custom_scorer(true_y, pred_y, greater_is_better=True):
    """Return R^2 computed only on rows whose true ``ep_PIPM`` is not -6.

    Parameters
    ----------
    true_y : pandas.DataFrame or array-like
        True target values. A DataFrame is read through its ``'ep_PIPM'``
        column; any other input is treated as a plain array of targets.
    pred_y : array-like
        Predictions aligned position-by-position with ``true_y``.
        Flattened to 1-D before filtering.
    greater_is_better : bool, default True
        Not used by the computation; kept so existing calls keep working.

    Returns
    -------
    float
        ``r2_score`` of the rows that remain after filtering.
    """
    if isinstance(true_y, pd.DataFrame):
        arr_true_y = np.asarray(true_y['ep_PIPM'])
    else:
        arr_true_y = np.asarray(true_y).ravel()
    arr_pred_y = np.asarray(pred_y).ravel()

    # -6 is presumably a placeholder target for players without a real ep_PIPM
    # value -- TODO confirm against the data-preparation step.
    keep = arr_true_y != -6

    return r2_score(arr_true_y[keep], arr_pred_y[keep])


# Scorer object usable as `scoring=filtered_r2` in cross_validate / GridSearchCV.
filtered_r2 = make_scorer(custom_scorer, greater_is_better=True)

In [None]:
# Compare several regressors, all with library-default hyper-parameters,
# on the same repeated train/test splits.
MLA = [
    ensemble.AdaBoostRegressor(),
    ensemble.BaggingRegressor(),
    ensemble.ExtraTreesRegressor(),
    ensemble.GradientBoostingRegressor(),
    ensemble.RandomForestRegressor(),

    linear_model.RidgeCV(),

    XGBRegressor()
]

# Predictor columns used by every model in this notebook.
x= ['age_AST%', 'age_BLK%', 'age_DRB%', 'age_DRtg', 'age_ORtg', 'age_PER',
        'age_PPR', 'age_PPS', 'age_STL%', 'age_TOV%', 'age_TRB%',
       'age_TS%', 'age_Total S %', 'age_USG%', 'age_eFG%', 'age_BPM',
       'age_MIN:GP', 'age_USGxTS', 'age_FTRate', 'age_STLK%', 'age_PF:MIN', 'age_PF:STLK', 'age_AST%_ppctl',
       'age_BLK%_ppctl', 'age_DRB%_ppctl', 'age_DRtg_ppctl', 'age_ORtg_ppctl',
        'age_PPR_ppctl', 'age_PPS_ppctl', 'age_STL%_ppctl',
       'age_TOV%_ppctl', 'age_TRB%_ppctl', 'age_TS%_ppctl',
       'age_Total S %_ppctl', 'age_USG%_ppctl', 'age_eFG%_ppctl',
       'age_BPM_ppctl', 'age_USGxTS_ppctl', 'age_STLK%_ppctl',
       'age_PF:MIN_ppctl', 'age_PF:STLK_ppctl', 'age_rFG%', 'age_r3P%',
       'age_rFT%', 'Height', 'Weight', 'Height_ppctl', 'Weight_ppctl', 'Age',
       'RSCI', 'age_pDBPM_ppctl', 'age_pDBPM', 'age_PER_ppctl', 'age_FTRate_ppctl']

# Target column (kept as a one-element list, so df[y] is a single-column DataFrame).
y= ['ep_PIPM']

# Cross-validation splitter shared by the rest of the notebook:
# 10 random 60/30 train/test splits, intentionally leaving 10% of the rows out of each split.
cv_split = model_selection.ShuffleSplit(n_splits = 10, test_size = .3, train_size = .6, random_state = 0 )

# Columns of the comparison table.
MLA_columns = ['MLA Name', 'MLA Train Accuracy Mean', 'MLA Test Accuracy Mean', 'MLA Test Accuracy 3*STD']

# Table of true target values (kept for comparing model predictions).
MLA_predict = df[y]
print(MLA_predict.shape)

# Score each algorithm and collect one record per model; the table is built once
# at the end instead of being grown row by row.
MLA_records = []
for alg in MLA:
    MLA_name = alg.__class__.__name__
    print(MLA_name)

    # return_train_score=True is required for the 'train_score' key read below;
    # recent scikit-learn releases do not compute training scores by default.
    # (Pass scoring=filtered_r2 here to use the custom scorer instead of the default.)
    cv_results = model_selection.cross_validate(
        alg, df[x], df[y], cv=cv_split, return_train_score=True
    )

    MLA_records.append({
        'MLA Name': MLA_name,
        'MLA Train Accuracy Mean': cv_results['train_score'].mean(),
        'MLA Test Accuracy Mean': cv_results['test_score'].mean(),
        # For an unbiased random sample, +/- 3 standard deviations around the mean
        # should cover roughly 99.7% of the splits (a rough worst-case indicator).
        'MLA Test Accuracy 3*STD': cv_results['test_score'].std()*3,
    })

# Build the table and show it sorted by mean test score, best first.
MLA_compare = pd.DataFrame(MLA_records, columns = MLA_columns).sort_values(
    by = ['MLA Test Accuracy Mean'], ascending = False
)
MLA_compare

## Optimizing Individual Models

### Ridge Regression

In [None]:
# Ridge regression: default RidgeCV baseline, then a grid search over single-value alpha candidates.

# Baseline model scored on the shared ShuffleSplit.
rdg = linear_model.RidgeCV()
# return_train_score=True is required for the 'train_score' key read below;
# recent scikit-learn releases do not compute training scores by default.
base_results = model_selection.cross_validate(
    rdg, df[x], df[y], cv=cv_split, n_jobs=-1, return_train_score=True
)
rdg.fit(df[x], df[y])

print('BEFORE Parameters: ', rdg.get_params())
print("BEFORE Training  score mean: {:.2f}".format(base_results['train_score'].mean()*100))
print("BEFORE Test score mean: {:.2f}".format(base_results['test_score'].mean()*100))
print("BEFORE Test score 3*std: +/- {:.2f}".format(base_results['test_score'].std()*100*3))
print('-'*10)

# Each grid entry is a one-element `alphas` tuple, so every candidate RidgeCV is pinned to a single alpha.
param_grid = {'alphas': ((165.0,), (170.0,), (175.0,), (180.0,), (190.0,), (200.0,))
             }
# return_train_score=True so that cv_results_ contains 'mean_train_score'.
tune_model = model_selection.GridSearchCV(
    linear_model.RidgeCV(), param_grid=param_grid, cv=cv_split, n_jobs=-1, return_train_score=True
)
tune_model.fit(df[x], df[y])

best_idx = tune_model.best_index_
print('AFTER Parameters: ', tune_model.best_params_)
print("AFTER Training score mean: {:.2f}".format(tune_model.cv_results_['mean_train_score'][best_idx]*100))
print("AFTER Test score mean: {:.2f}".format(tune_model.cv_results_['mean_test_score'][best_idx]*100))
print("AFTER Test score 3*std: +/- {:.2f}".format(tune_model.cv_results_['std_test_score'][best_idx]*100*3))
print('-'*10)

In [None]:
### Feature Tuning
# Recursive feature elimination (RFECV) for the ridge model with alpha fixed at 175.
rdg = linear_model.RidgeCV(alphas=(175.0,))
# return_train_score=True is required for the 'train_score' key read below;
# recent scikit-learn releases do not compute training scores by default.
base_results = model_selection.cross_validate(
    rdg, df[x], df[y], cv=cv_split, n_jobs=-1, return_train_score=True
)
rdg.fit(df[x], df[y])

print('BEFORE RFE Training Shape Old: ', df[x].shape)
print('BEFORE RFE Training Columns Old: ', df[x].columns.values)

print("BEFORE RFE Training score mean: {:.2f}".format(base_results['train_score'].mean()*100))
print("BEFORE RFE Test score mean: {:.2f}".format(base_results['test_score'].mean()*100))
print("BEFORE RFE Test score 3*std: +/- {:.2f}".format(base_results['test_score'].std()*100*3))
print('-'*10)

# Feature selection: drop one feature per step, scored with the same CV splitter.
rdg_rfe = feature_selection.RFECV(rdg, step = 1, cv = cv_split, n_jobs=-1)
rdg_rfe.fit(df[x], df[y])

# Keep only the columns RFECV selected and re-score the model on them.
X_rfe = df[x].columns.values[rdg_rfe.get_support()]
rfe_results = model_selection.cross_validate(
    rdg, df[X_rfe], df[y], cv=cv_split, n_jobs=-1, return_train_score=True
)

print('AFTER RFE Training Shape New: ', df[X_rfe].shape)
print('AFTER RFE Training Columns New: ', X_rfe)

print("AFTER RFE Training score mean: {:.2f}".format(rfe_results['train_score'].mean()*100))
print("AFTER RFE Test score mean: {:.2f}".format(rfe_results['test_score'].mean()*100))
print("AFTER RFE Test score 3*std: +/- {:.2f}".format(rfe_results['test_score'].std()*100*3))
print('-'*10)

In [None]:
# Reduced feature set for the ridge model (hard-coded; presumably copied from the RFE output above).
rdg_x = ['age_BLK%', 'age_DRB%', 'age_DRtg', 'age_PER', 'age_PPR', 'age_STL%', 'age_TRB%',
 'age_BPM', 'age_MIN:GP', 'age_USGxTS', 'age_FTRate', 'age_STLK%',
 'age_DRB%_ppctl', 'age_TRB%_ppctl', 'age_rFG%', 'Height', 'Weight_ppctl', 'Age',
 'age_pDBPM_ppctl', 'age_pDBPM']

# Baseline on the reduced features, using the previously chosen alpha of 175.
rdg = linear_model.RidgeCV(alphas=(175.0,))
# return_train_score=True is required for the 'train_score' key read below;
# recent scikit-learn releases do not compute training scores by default.
base_results = model_selection.cross_validate(
    rdg, df[rdg_x], df[y], cv=cv_split, n_jobs=-1, return_train_score=True
)
rdg.fit(df[rdg_x], df[y])

print('BEFORE Parameters: ', rdg.get_params())
print("BEFORE Training  score mean: {:.2f}".format(base_results['train_score'].mean()*100))
print("BEFORE Test score mean: {:.2f}".format(base_results['test_score'].mean()*100))
print("BEFORE Test score 3*std: +/- {:.2f}".format(base_results['test_score'].std()*100*3))
print('-'*10)

# Re-tune alpha for the smaller feature set; each candidate is a one-element `alphas` tuple.
param_grid = {'alphas': ((110.0,), (115.0,), (90.0,), (95.0,), (100.0,), (105.0,))
             }
# return_train_score=True so that cv_results_ contains 'mean_train_score'.
tune_model = model_selection.GridSearchCV(
    linear_model.RidgeCV(), param_grid=param_grid, cv=cv_split, n_jobs=-1, return_train_score=True
)
tune_model.fit(df[rdg_x], df[y])

best_idx = tune_model.best_index_
print('AFTER Parameters: ', tune_model.best_params_)
print("AFTER Training score mean: {:.2f}".format(tune_model.cv_results_['mean_train_score'][best_idx]*100))
print("AFTER Test score mean: {:.2f}".format(tune_model.cv_results_['mean_test_score'][best_idx]*100))
print("AFTER Test score 3*std: +/- {:.2f}".format(tune_model.cv_results_['std_test_score'][best_idx]*100*3))
print('-'*10)

In [None]:
# final ridge
# Reduced feature set for the final ridge model.
rdg_x = ['age_BLK%', 'age_DRB%', 'age_DRtg', 'age_PER', 'age_PPR', 'age_STL%', 'age_TRB%',
 'age_BPM', 'age_MIN:GP', 'age_USGxTS', 'age_FTRate', 'age_STLK%',
 'age_DRB%_ppctl', 'age_TRB%_ppctl', 'age_rFG%', 'Height', 'Weight_ppctl', 'Age',
 'age_pDBPM_ppctl', 'age_pDBPM']

# Final model: alpha fixed at 110, cross-validated on the shared splitter and then fit on all rows.
rdg = linear_model.RidgeCV(alphas=(110.0,))
# return_train_score=True is required for the 'train_score' key read below;
# recent scikit-learn releases do not compute training scores by default.
base_results = model_selection.cross_validate(
    rdg, df[rdg_x], df[y], cv=cv_split, n_jobs=-1, return_train_score=True
)
rdg.fit(df[rdg_x], df[y])

# Report the scores of the model defined in THIS cell (base_results), not the
# grid-search object left over from an earlier cell, so the cell does not depend
# on hidden kernel state and the numbers describe the final model.
print('Parameters: ', rdg.get_params())
print("Training score mean: {:.2f}".format(base_results['train_score'].mean()*100))
print("Test score mean: {:.2f}".format(base_results['test_score'].mean()*100))
print("Test score 3*std: +/- {:.2f}".format(base_results['test_score'].std()*100*3))
print('-'*10)

### Gradient Boosting

In [None]:
# Gradient boosting: default baseline, then a grid search over tree and sampling parameters.

# Cross-validation splitter: 10 random 60/30 train/test splits,
# intentionally leaving 10% of the rows out of each split.
cv_split = model_selection.ShuffleSplit(n_splits = 10, test_size = .3, train_size = .6, random_state = 0 )

# Baseline model (seeded so repeated runs give the same scores).
grb = ensemble.GradientBoostingRegressor(random_state=2)
# return_train_score=True is required for the 'train_score' key read below;
# recent scikit-learn releases do not compute training scores by default.
base_results = model_selection.cross_validate(
    grb, df[x], df[y], cv=cv_split, n_jobs=-1, return_train_score=True
)
grb.fit(df[x], df[y])

print('BEFORE Parameters: ', grb.get_params())
print("BEFORE Training score mean: {:.2f}".format(base_results['train_score'].mean()*100))
print("BEFORE Test score mean: {:.2f}".format(base_results['test_score'].mean()*100))
print("BEFORE Test score 3*std: +/- {:.2f}".format(base_results['test_score'].std()*100*3))
print("BEFORE Test set score min: {:.2f}".format(base_results['test_score'].min()*100))
print('-'*10)

# Hyper-parameter grid.
param_grid = {
                'max_depth': [3, 4, 5],
                'n_estimators': [35, 40, 45],
                'min_samples_split': [2],
                'min_samples_leaf': [9, 10, 11],
                'subsample': [.7, .75, .8],
                'max_features': [None, .1, .2, .3]
             }

# Pick the best combination; the seeded `grb` is the template estimator, so every
# candidate keeps random_state=2. return_train_score=True so that cv_results_
# contains 'mean_train_score'.
tune_model = model_selection.GridSearchCV(
    grb, param_grid=param_grid, cv=cv_split, n_jobs=-1, return_train_score=True
)
tune_model.fit(df[x], df[y])

best_idx = tune_model.best_index_
print('AFTER Parameters: ', tune_model.best_params_)
print("AFTER Training score mean: {:.2f}".format(tune_model.cv_results_['mean_train_score'][best_idx]*100))
print("AFTER Test score mean: {:.2f}".format(tune_model.cv_results_['mean_test_score'][best_idx]*100))
print("AFTER Test score 3*std: +/- {:.2f}".format(tune_model.cv_results_['std_test_score'][best_idx]*100*3))
print('-'*10)

In [None]:
### Feature Tuning
# Recursive feature elimination (RFECV) for the tuned gradient boosting model.

# Tuned model. random_state=2 matches the seed used during the grid search; with
# subsample < 1 and max_features < 1 the model is stochastic, so without a seed
# the scores and the selected features change from run to run.
grb = ensemble.GradientBoostingRegressor(max_depth=3, min_samples_leaf=10, n_estimators=40,
                                         max_features=.1, subsample=.75, random_state=2)
# return_train_score=True is required for the 'train_score' key read below;
# recent scikit-learn releases do not compute training scores by default.
base_results = model_selection.cross_validate(
    grb, df[x], df[y], cv=cv_split, n_jobs=-1, return_train_score=True
)
grb.fit(df[x], df[y])

print('BEFORE RFE Training Shape Old: ', df[x].shape)
print('BEFORE RFE Training Columns Old: ', df[x].columns.values)

print("BEFORE RFE Training score mean: {:.2f}".format(base_results['train_score'].mean()*100))
print("BEFORE RFE Test score mean: {:.2f}".format(base_results['test_score'].mean()*100))
print("BEFORE RFE Test score 3*std: +/- {:.2f}".format(base_results['test_score'].std()*100*3))
print('-'*10)

# Feature selection: drop one feature per step, scored with the same CV splitter.
grb_rfe = feature_selection.RFECV(grb, step = 1, cv = cv_split, n_jobs=-1)
grb_rfe.fit(df[x], df[y])

# Keep only the columns RFECV selected and re-score the model on them.
X_rfe = df[x].columns.values[grb_rfe.get_support()]
rfe_results = model_selection.cross_validate(
    grb, df[X_rfe], df[y], cv=cv_split, n_jobs=-1, return_train_score=True
)

print('AFTER RFE Training Shape New: ', df[X_rfe].shape)
print('AFTER RFE Training Columns New: ', X_rfe)

print("AFTER RFE Training score mean: {:.2f}".format(rfe_results['train_score'].mean()*100))
print("AFTER RFE Test score mean: {:.2f}".format(rfe_results['test_score'].mean()*100))
print("AFTER RFE Test score 3*std: +/- {:.2f}".format(rfe_results['test_score'].std()*100*3))
print('-'*10)

In [None]:
# Reduced feature set for gradient boosting (hard-coded; presumably copied from the RFE output above).
grb_x = ['age_AST%', 'age_DRB%', 'age_DRtg', 'age_ORtg', 'age_PER', 'age_PPR', 'age_PPS',
 'age_STL%', 'age_TOV%', 'age_TRB%', 'age_TS%', 'age_Total S %', 'age_eFG%',
 'age_BPM', 'age_MIN:GP', 'age_USGxTS', 'age_FTRate', 'age_PF:STLK',
 'age_AST%_ppctl', 'age_BLK%_ppctl', 'age_DRB%_ppctl', 'age_DRtg_ppctl',
 'age_ORtg_ppctl', 'age_PPR_ppctl', 'age_PPS_ppctl', 'age_STL%_ppctl',
 'age_TS%_ppctl', 'age_Total S %_ppctl', 'age_USG%_ppctl', 'age_eFG%_ppctl',
 'age_BPM_ppctl', 'age_USGxTS_ppctl', 'age_STLK%_ppctl', 'age_PF:STLK_ppctl',
 'age_rFG%', 'age_rFT%', 'Height_ppctl', 'Weight_ppctl', 'Age',
 'age_pDBPM_ppctl', 'age_PER_ppctl']

# final grb
# random_state=2 matches the seed used during the grid search; the model is
# stochastic (subsample / max_features < 1), so the seed makes the reported
# scores and the fitted model reproducible.
grb = ensemble.GradientBoostingRegressor(max_depth=3, min_samples_leaf=10, n_estimators=40,
                                         max_features=.1, subsample=.75, random_state=2)
# return_train_score=True is required for the 'train_score' key read below;
# recent scikit-learn releases do not compute training scores by default.
base_results = model_selection.cross_validate(
    grb, df[grb_x], df[y], cv=cv_split, n_jobs=-1, return_train_score=True
)
grb.fit(df[grb_x], df[y])

print('Parameters: ', grb.get_params())
print("Training score mean: {:.2f}".format(base_results['train_score'].mean()*100))
print("Test score mean: {:.2f}".format(base_results['test_score'].mean()*100))
print("Test score 3*std: +/- {:.2f}".format(base_results['test_score'].std()*100*3))
print("Test set score min: {:.2f}".format(base_results['test_score'].min()*100))
print('-'*10)

### XGBoost

In [None]:
# XGBoost: default baseline, then a grid search over tree size and row/column sampling.

# Cross-validation splitter: 10 random 60/30 train/test splits,
# intentionally leaving 10% of the rows out of each split.
cv_split = model_selection.ShuffleSplit(n_splits = 10, test_size = .3, train_size = .6, random_state = 0 )

# Baseline model with default parameters.
xgb = XGBRegressor()
# return_train_score=True is required for the 'train_score' key read below;
# recent scikit-learn releases do not compute training scores by default.
base_results = model_selection.cross_validate(
    xgb, df[x], df[y], cv=cv_split, n_jobs=-1, return_train_score=True
)
xgb.fit(df[x], df[y])

print('BEFORE Parameters: ', xgb.get_params())
print("BEFORE Training score mean: {:.2f}".format(base_results['train_score'].mean()*100))
print("BEFORE Test score mean: {:.2f}".format(base_results['test_score'].mean()*100))
print("BEFORE Test score 3*std: +/- {:.2f}".format(base_results['test_score'].std()*100*3))
print("BEFORE Test set score min: {:.2f}".format(base_results['test_score'].min()*100))
print('-'*10)

# Hyper-parameter grid.
param_grid = {
                'max_depth': [3, 4, 5],
                'n_estimators': [25, 30, 35],
                'subsample': [.8, .85, .9, .95, 1.0],
                'colsample_bylevel': [.2, .25, .3],
                'colsample_bytree': [.2, .25, .3]
             }

# Pick the best combination; return_train_score=True so that cv_results_
# contains 'mean_train_score'.
tune_model = model_selection.GridSearchCV(
    XGBRegressor(), param_grid=param_grid, cv=cv_split, n_jobs=-1, return_train_score=True
)
tune_model.fit(df[x], df[y])

best_idx = tune_model.best_index_
print('AFTER Parameters: ', tune_model.best_params_)
print("AFTER Training score mean: {:.2f}".format(tune_model.cv_results_['mean_train_score'][best_idx]*100))
print("AFTER Test score mean: {:.2f}".format(tune_model.cv_results_['mean_test_score'][best_idx]*100))
print("AFTER Test score 3*std: +/- {:.2f}".format(tune_model.cv_results_['std_test_score'][best_idx]*100*3))
print('-'*10)

In [None]:
# XGBoost with the chosen hyper-parameters, cross-validated and then fit on all rows.
xgb = XGBRegressor(colsample_bylevel=0.25, colsample_bytree=0.25, max_depth=4, n_estimators=30)
# return_train_score=True is required for the 'train_score' key read below;
# recent scikit-learn releases do not compute training scores by default.
base_results = model_selection.cross_validate(
    xgb, df[x], df[y], cv=cv_split, n_jobs=-1, return_train_score=True
)
xgb.fit(df[x], df[y])

print('Parameters: ', xgb.get_params())
print("Training score mean: {:.2f}".format(base_results['train_score'].mean()*100))
print("Test score mean: {:.2f}".format(base_results['test_score'].mean()*100))
print("Test score 3*std: +/- {:.2f}".format(base_results['test_score'].std()*100*3))
print("Test set score min: {:.2f}".format(base_results['test_score'].min()*100))
print('-'*10)

### Bagging Regressor

In [None]:
# Bagging: default baseline, then a one-point "grid" that bags gradient-boosting regressors.

# Baseline model with default parameters.
bag = ensemble.BaggingRegressor()
# return_train_score=True is required for the 'train_score' key read below;
# recent scikit-learn releases do not compute training scores by default.
base_results = model_selection.cross_validate(
    bag, df[x], df[y], cv=cv_split, n_jobs=-1, return_train_score=True
)
bag.fit(df[x], df[y])

print('BEFORE Parameters: ', bag.get_params())
print("BEFORE Training  score mean: {:.2f}".format(base_results['train_score'].mean()*100))
print("BEFORE Test score mean: {:.2f}".format(base_results['test_score'].mean()*100))
print("BEFORE Test score 3*std: +/- {:.2f}".format(base_results['test_score'].std()*100*3))
print("BEFORE Test set score min: {:.2f}".format(base_results['test_score'].min()*100))
print('-'*10)

# Hyper-parameter grid (a single combination).
# NOTE: `base_estimator` was renamed to `estimator` in scikit-learn 1.2; the key is
# left unchanged here to stay compatible with the version this notebook targets.
param_grid = {
                'base_estimator': [ensemble.GradientBoostingRegressor(n_estimators=125, max_depth=3, min_samples_split=7, min_samples_leaf=1)],
                'n_estimators': [35],
                'max_samples': [.4],
                'bootstrap_features': [False],
                'random_state': [2]
             }

# Score the combination; return_train_score=True so that cv_results_
# contains 'mean_train_score'.
tune_model = model_selection.GridSearchCV(
    ensemble.BaggingRegressor(), param_grid=param_grid, cv=cv_split, n_jobs=-1, return_train_score=True
)
tune_model.fit(df[x], df[y])

best_idx = tune_model.best_index_
print('AFTER DT Parameters: ', tune_model.best_params_)
print("AFTER DT Training score mean: {:.2f}".format(tune_model.cv_results_['mean_train_score'][best_idx]*100))
print("AFTER DT Test score mean: {:.2f}".format(tune_model.cv_results_['mean_test_score'][best_idx]*100))
print("AFTER DT Test score 3*std: +/- {:.2f}".format(tune_model.cv_results_['std_test_score'][best_idx]*100*3))
print('-'*10)

In [None]:
# Final bagging model: 35 gradient-boosting regressors, each trained on 40% of the rows.
# random_state=2 matches the seed used in the grid search, so the reported scores
# and the fitted model are reproducible and correspond to the tuned configuration.
bag = ensemble.BaggingRegressor(ensemble.GradientBoostingRegressor(n_estimators=125, max_depth=3, min_samples_split=7),
                                n_estimators=35,
                                max_samples=.4,
                                bootstrap_features=False,
                                random_state=2)
# return_train_score=True is required for the 'train_score' key read below;
# recent scikit-learn releases do not compute training scores by default.
base_results = model_selection.cross_validate(
    bag, df[x], df[y], cv=cv_split, n_jobs=-1, return_train_score=True
)
bag.fit(df[x], df[y])

print("Training score mean: {:.2f}".format(base_results['train_score'].mean()*100))
print("Test score mean: {:.2f}".format(base_results['test_score'].mean()*100))
print("Test score 3*std: +/- {:.2f}".format(base_results['test_score'].std()*100*3))
print('-'*10)

### AdaBoost (base: decision tree regressor)

In [None]:
# AdaBoost: default baseline, then a one-point "grid" with a depth-5 decision tree as base learner.

# Baseline model with default parameters.
ada = ensemble.AdaBoostRegressor()
# return_train_score=True is required for the 'train_score' key read below;
# recent scikit-learn releases do not compute training scores by default.
base_results = model_selection.cross_validate(
    ada, df[x], df[y], cv=cv_split, n_jobs=-1, return_train_score=True
)
ada.fit(df[x], df[y])

print('BEFORE Parameters: ', ada.get_params())
print("BEFORE Training  score mean: {:.2f}".format(base_results['train_score'].mean()*100))
print("BEFORE Test score mean: {:.2f}".format(base_results['test_score'].mean()*100))
print("BEFORE Test score 3*std: +/- {:.2f}".format(base_results['test_score'].std()*100*3))
print("BEFORE Test set score min: {:.2f}".format(base_results['test_score'].min()*100))
print('-'*10)

# Hyper-parameter grid (a single combination).
# NOTE: `base_estimator` was renamed to `estimator` in scikit-learn 1.2; the key is
# left unchanged here to stay compatible with the version this notebook targets.
param_grid = {
                'base_estimator': [tree.DecisionTreeRegressor(max_depth=5)],
                'n_estimators': [60],
                'random_state': [2]
             }

# Score the combination; return_train_score=True so that cv_results_
# contains 'mean_train_score'.
tune_model = model_selection.GridSearchCV(
    ensemble.AdaBoostRegressor(), param_grid=param_grid, cv=cv_split, n_jobs=-1, return_train_score=True
)
tune_model.fit(df[x], df[y])

best_idx = tune_model.best_index_
print('AFTER Parameters: ', tune_model.best_params_)
print("AFTER Training score mean: {:.2f}".format(tune_model.cv_results_['mean_train_score'][best_idx]*100))
print("AFTER Test score mean: {:.2f}".format(tune_model.cv_results_['mean_test_score'][best_idx]*100))
print("AFTER Test score 3*std: +/- {:.2f}".format(tune_model.cv_results_['std_test_score'][best_idx]*100*3))
print('-'*10)

In [None]:
### Feature Tuning
# Recursive feature elimination (RFECV) for the tuned AdaBoost model.

# Tuned model. random_state=2 matches the seed used in the grid search, so the
# scores and the selected features are reproducible across runs.
ada = ensemble.AdaBoostRegressor(tree.DecisionTreeRegressor(max_depth=5), n_estimators=60, random_state=2)
# return_train_score=True is required for the 'train_score' key read below;
# recent scikit-learn releases do not compute training scores by default.
base_results = model_selection.cross_validate(
    ada, df[x], df[y], cv=cv_split, n_jobs=-1, return_train_score=True
)
ada.fit(df[x], df[y])

print('BEFORE RFE Training Shape Old: ', df[x].shape)
print('BEFORE RFE Training Columns Old: ', df[x].columns.values)

print("BEFORE RFE Training score mean: {:.2f}".format(base_results['train_score'].mean()*100))
print("BEFORE RFE Test score mean: {:.2f}".format(base_results['test_score'].mean()*100))
print("BEFORE RFE Test score 3*std: +/- {:.2f}".format(base_results['test_score'].std()*100*3))
print('-'*10)

# Feature selection: drop one feature per step, scored with the same CV splitter.
ada_rfe = feature_selection.RFECV(ada, step = 1, cv = cv_split, n_jobs=-1)
ada_rfe.fit(df[x], df[y])

# Keep only the columns RFECV selected and re-score the model on them.
X_rfe = df[x].columns.values[ada_rfe.get_support()]
rfe_results = model_selection.cross_validate(
    ada, df[X_rfe], df[y], cv=cv_split, n_jobs=-1, return_train_score=True
)

print('AFTER RFE Training Shape New: ', df[X_rfe].shape)
print('AFTER RFE Training Columns New: ', X_rfe)

print("AFTER RFE Training score mean: {:.2f}".format(rfe_results['train_score'].mean()*100))
print("AFTER RFE Test score mean: {:.2f}".format(rfe_results['test_score'].mean()*100))
print("AFTER RFE Test score 3*std: +/- {:.2f}".format(rfe_results['test_score'].std()*100*3))
print('-'*10)

In [None]:
# Reduced feature set for AdaBoost (hard-coded; presumably copied from the RFE output above).
ada_x = ['age_AST%', 'age_DRB%', 'age_DRtg', 'age_ORtg', 'age_PER', 'age_PPR', 'age_STL%',
 'age_BPM', 'age_MIN:GP', 'age_USGxTS', 'age_STLK%', 'age_BLK%_ppctl',
 'age_DRtg_ppctl', 'age_PPR_ppctl', 'age_PPS_ppctl', 'age_STL%_ppctl',
 'age_TOV%_ppctl', 'age_TRB%_ppctl', 'age_USG%_ppctl', 'age_eFG%_ppctl',
 'age_BPM_ppctl', 'age_PF:STLK_ppctl', 'age_rFG%', 'age_r3P%', 'age_rFT%',
 'Height_ppctl', 'Weight_ppctl', 'Age', 'RSCI', 'age_pDBPM_ppctl', 'age_pDBPM',
 'age_PER_ppctl', 'age_FTRate_ppctl']

# Baseline on the reduced features with the previously tuned settings.
# random_state=2 matches the seed used in the grid search below, so the
# BEFORE/AFTER numbers are reproducible and comparable.
ada = ensemble.AdaBoostRegressor(tree.DecisionTreeRegressor(max_depth=5), n_estimators=60, random_state=2)
# return_train_score=True is required for the 'train_score' key read below;
# recent scikit-learn releases do not compute training scores by default.
base_results = model_selection.cross_validate(
    ada, df[ada_x], df[y], cv=cv_split, n_jobs=-1, return_train_score=True
)
ada.fit(df[ada_x], df[y])

print('BEFORE Parameters: ', ada.get_params())
print("BEFORE Training score mean: {:.2f}".format(base_results['train_score'].mean()*100))
print("BEFORE Test score mean: {:.2f}".format(base_results['test_score'].mean()*100))
print("BEFORE Test score 3*std: +/- {:.2f}".format(base_results['test_score'].std()*100*3))
print("BEFORE Test set score min: {:.2f}".format(base_results['test_score'].min()*100))
print('-'*10)

# Hyper-parameter grid: depth-5 trees that differ only in `max_features`
# (None, then 0.1 through 0.9).
# NOTE: `base_estimator` was renamed to `estimator` in scikit-learn 1.2; the key is
# left unchanged here to stay compatible with the version this notebook targets.
param_grid = {
                'base_estimator': [tree.DecisionTreeRegressor(max_depth=5, max_features=frac)
                                   for frac in (None, .1, .2, .3, .4, .5, .6, .7, .8, .9)],
                'n_estimators': [60],
                'random_state': [2]
             }

# Pick the best base learner; return_train_score=True so that cv_results_
# contains 'mean_train_score'.
tune_model = model_selection.GridSearchCV(
    ensemble.AdaBoostRegressor(), param_grid=param_grid, cv=cv_split, n_jobs=-1, return_train_score=True
)
tune_model.fit(df[ada_x], df[y])

best_idx = tune_model.best_index_
print('AFTER DT Parameters: ', tune_model.best_params_)
print("AFTER DT Training score mean: {:.2f}".format(tune_model.cv_results_['mean_train_score'][best_idx]*100))
print("AFTER DT Test score mean: {:.2f}".format(tune_model.cv_results_['mean_test_score'][best_idx]*100))
print("AFTER DT Test score 3*std: +/- {:.2f}".format(tune_model.cv_results_['std_test_score'][best_idx]*100*3))
print('-'*10)

In [None]:
# Reduced feature set for the final AdaBoost model.
ada_x = ['age_AST%', 'age_DRB%', 'age_DRtg', 'age_ORtg', 'age_PER', 'age_PPR', 'age_STL%',
 'age_BPM', 'age_MIN:GP', 'age_USGxTS', 'age_STLK%', 'age_BLK%_ppctl',
 'age_DRtg_ppctl', 'age_PPR_ppctl', 'age_PPS_ppctl', 'age_STL%_ppctl',
 'age_TOV%_ppctl', 'age_TRB%_ppctl', 'age_USG%_ppctl', 'age_eFG%_ppctl',
 'age_BPM_ppctl', 'age_PF:STLK_ppctl', 'age_rFG%', 'age_r3P%', 'age_rFT%',
 'Height_ppctl', 'Weight_ppctl', 'Age', 'RSCI', 'age_pDBPM_ppctl', 'age_pDBPM',
 'age_PER_ppctl', 'age_FTRate_ppctl']

# Final model: depth-5 trees seeing 20% of the features per split, 60 boosting rounds.
# random_state=2 matches the seed used in the grid search, so the reported
# scores and the fitted model are reproducible.
ada = ensemble.AdaBoostRegressor(tree.DecisionTreeRegressor(max_depth=5, max_features=0.2),
                                 n_estimators=60, random_state=2)
# return_train_score=True is required for the 'train_score' key read below;
# recent scikit-learn releases do not compute training scores by default.
base_results = model_selection.cross_validate(
    ada, df[ada_x], df[y], cv=cv_split, n_jobs=-1, return_train_score=True
)
ada.fit(df[ada_x], df[y])

print('Parameters: ', ada.get_params())
print("Training score mean: {:.2f}".format(base_results['train_score'].mean()*100))
print("Test score mean: {:.2f}".format(base_results['test_score'].mean()*100))
print("Test score 3*std: +/- {:.2f}".format(base_results['test_score'].std()*100*3))
print("Test set score min: {:.2f}".format(base_results['test_score'].min()*100))
print('-'*10)

### Random Forest

In [None]:
# Random forest: seeded default baseline, then a grid search over forest size and tree constraints.

# Baseline model.
rfr = ensemble.RandomForestRegressor(random_state=2, n_jobs=-1)
# return_train_score=True is required for the 'train_score' key read below;
# recent scikit-learn releases do not compute training scores by default.
base_results = model_selection.cross_validate(
    rfr, df[x], df[y], cv=cv_split, n_jobs=-1, return_train_score=True
)
rfr.fit(df[x], df[y])

# Labels aligned with the other model sections (the stray "DT" / "w/bin" wording
# did not describe this random-forest baseline).
print('BEFORE Parameters: ', rfr.get_params())
print("BEFORE Training score mean: {:.2f}".format(base_results['train_score'].mean()*100))
print("BEFORE Test score mean: {:.2f}".format(base_results['test_score'].mean()*100))
print("BEFORE Test score 3*std: +/- {:.2f}".format(base_results['test_score'].std()*100*3))
print("BEFORE Test set score min: {:.2f}".format(base_results['test_score'].min()*100))
print('-'*10)

# Hyper-parameter grid.
param_grid = {
                'n_estimators': [750, 800, 850],
                'max_features': [.25, .3, .35],
                'min_samples_split': [10, 12, 14],
                'min_samples_leaf': [1],
                'max_leaf_nodes': [17, 18, 19],
                'bootstrap': [True],
                'random_state': [2]
             }

# Pick the best combination; return_train_score=True so that cv_results_
# contains 'mean_train_score'.
tune_model = model_selection.GridSearchCV(
    rfr, param_grid=param_grid, cv=cv_split, n_jobs=-1, return_train_score=True
)
tune_model.fit(df[x], df[y])

best_idx = tune_model.best_index_
print('AFTER Parameters: ', tune_model.best_params_)
print("AFTER Training score mean: {:.2f}".format(tune_model.cv_results_['mean_train_score'][best_idx]*100))
print("AFTER Test score mean: {:.2f}".format(tune_model.cv_results_['mean_test_score'][best_idx]*100))
print("AFTER Test score 3*std: +/- {:.2f}".format(tune_model.cv_results_['std_test_score'][best_idx]*100*3))
print('-'*10)

In [None]:
# feature tuning
# Recursive feature elimination (RFECV) for the tuned random forest.

# Tuned model.
rfr = ensemble.RandomForestRegressor(bootstrap=True,
                                     max_features=0.3,
                                     max_leaf_nodes=18,
                                     min_samples_split=12,
                                     n_estimators=800,
                                     random_state=2, n_jobs=-1)
# return_train_score=True is required for the 'train_score' key read below;
# recent scikit-learn releases do not compute training scores by default.
base_results = model_selection.cross_validate(
    rfr, df[x], df[y], cv=cv_split, n_jobs=-1, return_train_score=True
)
rfr.fit(df[x], df[y])

print('BEFORE RFE Training Shape Old: ', df[x].shape)
print('BEFORE RFE Training Columns Old: ', df[x].columns.values)

print("BEFORE RFE Training score mean: {:.2f}".format(base_results['train_score'].mean()*100))
print("BEFORE RFE Test score mean: {:.2f}".format(base_results['test_score'].mean()*100))
print("BEFORE RFE Test score 3*std: +/- {:.2f}".format(base_results['test_score'].std()*100*3))
print('-'*10)

# Feature selection: drop one feature per step, scored with the same CV splitter.
rfr_rfe = feature_selection.RFECV(rfr, step = 1, cv = cv_split, n_jobs=-1)
rfr_rfe.fit(df[x], df[y])

# Keep only the columns RFECV selected and re-score the model on them.
X_rfe = df[x].columns.values[rfr_rfe.get_support()]
rfe_results = model_selection.cross_validate(
    rfr, df[X_rfe], df[y], cv=cv_split, n_jobs=-1, return_train_score=True
)

print('AFTER RFE Training Shape New: ', df[X_rfe].shape)
print('AFTER RFE Training Columns New: ', X_rfe)

print("AFTER RFE Training score mean: {:.2f}".format(rfe_results['train_score'].mean()*100))
print("AFTER RFE Test score mean: {:.2f}".format(rfe_results['test_score'].mean()*100))
print("AFTER RFE Test score 3*std: +/- {:.2f}".format(rfe_results['test_score'].std()*100*3))
print('-'*10)

In [None]:
# Reduced feature set for the random forest (hard-coded; presumably copied from the RFE output above).
rfr_x = ['age_DRtg', 'age_ORtg', 'age_PER', 'age_PPR', 'age_STL%', 'age_BPM',
 'age_DRB%_ppctl', 'age_DRtg_ppctl', 'age_ORtg_ppctl', 'age_PPR_ppctl',
 'age_STL%_ppctl', 'age_TS%_ppctl', 'age_eFG%_ppctl', 'age_BPM_ppctl',
 'age_rFT%', 'Height_ppctl', 'Age', 'age_pDBPM_ppctl', 'age_pDBPM',
 'age_PER_ppctl']

# Baseline on the reduced features: a seeded forest with otherwise default parameters.
rfr = ensemble.RandomForestRegressor(random_state=2, n_jobs=-1)
# return_train_score=True is required for the 'train_score' key read below;
# recent scikit-learn releases do not compute training scores by default.
base_results = model_selection.cross_validate(
    rfr, df[rfr_x], df[y], cv=cv_split, n_jobs=-1, return_train_score=True
)
rfr.fit(df[rfr_x], df[y])

print('BEFORE Parameters: ', rfr.get_params())
print("BEFORE Training score mean: {:.2f}".format(base_results['train_score'].mean()*100))
print("BEFORE Test score mean: {:.2f}".format(base_results['test_score'].mean()*100))
print("BEFORE Test score 3*std: +/- {:.2f}".format(base_results['test_score'].std()*100*3))
print("BEFORE Test set score min: {:.2f}".format(base_results['test_score'].min()*100))
print('-'*10)

# Hyper-parameter grid for the reduced feature set.
param_grid = {
                'n_estimators': [850, 900, 1000],
                'max_features': [.35, .4, .45],
                'min_samples_split': [6, 8, 10],
                'min_samples_leaf': [1],
                'max_leaf_nodes': [19, 20, 21],
                'bootstrap': [True],
                'random_state': [2]
             }

# Pick the best combination; return_train_score=True so that cv_results_
# contains 'mean_train_score'.
tune_model = model_selection.GridSearchCV(
    rfr, param_grid=param_grid, cv=cv_split, n_jobs=-1, return_train_score=True
)
tune_model.fit(df[rfr_x], df[y])

best_idx = tune_model.best_index_
print('AFTER Parameters: ', tune_model.best_params_)
print("AFTER Training score mean: {:.2f}".format(tune_model.cv_results_['mean_train_score'][best_idx]*100))
print("AFTER Test score mean: {:.2f}".format(tune_model.cv_results_['mean_test_score'][best_idx]*100))
print("AFTER Test score 3*std: +/- {:.2f}".format(tune_model.cv_results_['std_test_score'][best_idx]*100*3))
print('-'*10)

In [None]:
# Reduced feature set for the final random forest.
rfr_x = ['age_DRtg', 'age_ORtg', 'age_PER', 'age_PPR', 'age_STL%', 'age_BPM',
 'age_DRB%_ppctl', 'age_DRtg_ppctl', 'age_ORtg_ppctl', 'age_PPR_ppctl',
 'age_STL%_ppctl', 'age_TS%_ppctl', 'age_eFG%_ppctl', 'age_BPM_ppctl',
 'age_rFT%', 'Height_ppctl', 'Age', 'age_pDBPM_ppctl', 'age_pDBPM',
 'age_PER_ppctl']

# final model
rfr = ensemble.RandomForestRegressor(bootstrap=True,
                                     max_features=0.35,
                                     max_leaf_nodes=19,
                                     min_samples_split=10,
                                     n_estimators=850,
                                     random_state=2, n_jobs=-1)
# return_train_score=True is required for the 'train_score' key read below;
# recent scikit-learn releases do not compute training scores by default.
base_results = model_selection.cross_validate(
    rfr, df[rfr_x], df[y], cv=cv_split, n_jobs=-1, return_train_score=True
)
rfr.fit(df[rfr_x], df[y])

print('Parameters: ', rfr.get_params())
print("Training score mean: {:.2f}".format(base_results['train_score'].mean()*100))
print("Test score mean: {:.2f}".format(base_results['test_score'].mean()*100))
print("Test score 3*std: +/- {:.2f}".format(base_results['test_score'].std()*100*3))
print('-'*10)

### Extra Trees

In [None]:
# Extra trees: seeded default baseline, then a one-point "grid" with the chosen settings.

# Baseline model.
etr = ensemble.ExtraTreesRegressor(random_state=2, n_jobs=-1)
# return_train_score=True is required for the 'train_score' key read below;
# recent scikit-learn releases do not compute training scores by default.
base_results = model_selection.cross_validate(
    etr, df[x], df[y], cv=cv_split, n_jobs=-1, return_train_score=True
)
etr.fit(df[x], df[y])

print('BEFORE Parameters: ', etr.get_params())
print("BEFORE Training score mean: {:.2f}".format(base_results['train_score'].mean()*100))
print("BEFORE Test score mean: {:.2f}".format(base_results['test_score'].mean()*100))
print("BEFORE Test score 3*std: +/- {:.2f}".format(base_results['test_score'].std()*100*3))
print("BEFORE Test set score min: {:.2f}".format(base_results['test_score'].min()*100))
print('-'*10)

# Hyper-parameter grid (a single combination).
param_grid = {
                'n_estimators': [1800],
                'max_features': [.5],
                'min_samples_split': [8],
                'min_samples_leaf': [1],
                'bootstrap': [True],
                'random_state': [2]
             }

# Score the combination; return_train_score=True so that cv_results_
# contains 'mean_train_score'.
tune_model = model_selection.GridSearchCV(
    etr, param_grid=param_grid, cv=cv_split, n_jobs=-1, return_train_score=True
)
tune_model.fit(df[x], df[y])

best_idx = tune_model.best_index_
print('AFTER Parameters: ', tune_model.best_params_)
print("AFTER Training score mean: {:.2f}".format(tune_model.cv_results_['mean_train_score'][best_idx]*100))
print("AFTER Test score mean: {:.2f}".format(tune_model.cv_results_['mean_test_score'][best_idx]*100))
print("AFTER Test score 3*std: +/- {:.2f}".format(tune_model.cv_results_['std_test_score'][best_idx]*100*3))
print('-'*10)

In [None]:
# feature tuning

# base model: ExtraTrees with the tuned hyper-parameters, scored on the full
# feature list `x` (`x`, `y` and `cv_split` come from earlier cells).
etr = ensemble.ExtraTreesRegressor(bootstrap=True,
                                   max_features=0.5,
                                   min_samples_split=8,
                                   n_estimators=1800,
                                   random_state=2, n_jobs=-1)

# return_train_score=True is passed explicitly: since scikit-learn 0.21
# cross_validate leaves 'train_score' out of its result by default, and the
# reports below read that key.
base_results = model_selection.cross_validate(etr, df[x], df[y], cv=cv_split,
                                              return_train_score=True, n_jobs=-1)
etr.fit(df[x], df[y])

print('BEFORE RFE Training Shape Old: ', df[x].shape)
print('BEFORE RFE Training Columns Old: ', df[x].columns.values)

print("BEFORE RFE Training score mean: {:.2f}".format(base_results['train_score'].mean()*100))
print("BEFORE RFE Test score mean: {:.2f}".format(base_results['test_score'].mean()*100))
print("BEFORE RFE Test score 3*std: +/- {:.2f}".format(base_results['test_score'].std()*100*3))
print('-'*10)

# feature selection: cross-validated recursive feature elimination,
# dropping one feature per step.
etr_rfe = feature_selection.RFECV(etr, step = 1, cv = cv_split, n_jobs=-1)
etr_rfe.fit(df[x], df[y])

# keep only the columns RFECV selected and re-score the same estimator on them
X_rfe = df[x].columns.values[etr_rfe.get_support()]
rfe_results = model_selection.cross_validate(etr, df[X_rfe], df[y], cv=cv_split,
                                             return_train_score=True, n_jobs=-1)

print('AFTER RFE Training Shape New: ', df[X_rfe].shape)
print('AFTER RFE Training Columns New: ', X_rfe)

print("AFTER RFE Training score mean: {:.2f}".format(rfe_results['train_score'].mean()*100))
print("AFTER RFE Test score mean: {:.2f}".format(rfe_results['test_score'].mean()*100))
print("AFTER RFE Test score 3*std: +/- {:.2f}".format(rfe_results['test_score'].std()*100*3))
print('-'*10)

In [None]:
# Feature columns used for the ExtraTrees model in this cell.
etr_x = ['age_DRtg', 'age_ORtg', 'age_PER', 'age_PPR', 'age_STL%', 'age_BPM',
 'age_MIN:GP', 'age_USGxTS', 'age_BLK%_ppctl', 'age_DRB%_ppctl',
 'age_DRtg_ppctl', 'age_ORtg_ppctl', 'age_PPR_ppctl', 'age_PPS_ppctl',
 'age_STL%_ppctl', 'age_Total S %_ppctl', 'age_eFG%_ppctl', 'age_BPM_ppctl',
 'age_STLK%_ppctl', 'age_PF:STLK_ppctl', 'Height', 'Height_ppctl', 'Age',
 'age_pDBPM_ppctl', 'age_PER_ppctl']

# base model: default hyper-parameters, scored on the `etr_x` columns
# (`y` and `cv_split` come from earlier cells of the notebook).
etr = ensemble.ExtraTreesRegressor(random_state=2, n_jobs=-1)

# return_train_score=True is passed explicitly: since scikit-learn 0.21
# cross_validate leaves 'train_score' out of its result by default, and the
# "BEFORE" report below reads that key.
base_results = model_selection.cross_validate(etr, df[etr_x], df[y], cv=cv_split,
                                              return_train_score=True, n_jobs=-1)
etr.fit(df[etr_x], df[y])


print('BEFORE Parameters: ', etr.get_params())
print("BEFORE Training score mean: {:.2f}".format(base_results['train_score'].mean()*100))
print("BEFORE Test score mean: {:.2f}".format(base_results['test_score'].mean()*100))
print("BEFORE Test score 3*std: +/- {:.2f}".format(base_results['test_score'].std()*100*3))
print("BEFORE Test set score min: {:.2f}".format(base_results['test_score'].min()*100))
print('-'*10)


# tune hyper-parameters: 2 x 2 grid over max_features and min_samples_split,
# everything else held fixed.
param_grid = {
                'n_estimators': [1800],
                'max_features': [.55, .6],
                'min_samples_split': [7, 8],
                'bootstrap': [True],
                'random_state': [2]
             }


# choose best model with grid_search.
# return_train_score=True is needed for cv_results_['mean_train_score'],
# which GridSearchCV no longer computes by default (scikit-learn >= 0.21).
tune_model = model_selection.GridSearchCV(etr, param_grid=param_grid, cv=cv_split,
                                          return_train_score=True, n_jobs=-1)
tune_model.fit(df[etr_x], df[y])

print('AFTER Parameters: ', tune_model.best_params_)
print("AFTER Training score mean: {:.2f}".format(tune_model.cv_results_['mean_train_score'][tune_model.best_index_]*100))
print("AFTER Test score mean: {:.2f}".format(tune_model.cv_results_['mean_test_score'][tune_model.best_index_]*100))
print("AFTER Test score 3*std: +/- {:.2f}".format(tune_model.cv_results_['std_test_score'][tune_model.best_index_]*100*3))
print('-'*10)

In [None]:
# Feature columns fed to the final ExtraTrees model.
etr_x = ['age_DRtg', 'age_ORtg', 'age_PER', 'age_PPR', 'age_STL%', 'age_BPM',
 'age_MIN:GP', 'age_USGxTS', 'age_BLK%_ppctl', 'age_DRB%_ppctl',
 'age_DRtg_ppctl', 'age_ORtg_ppctl', 'age_PPR_ppctl', 'age_PPS_ppctl',
 'age_STL%_ppctl', 'age_Total S %_ppctl', 'age_eFG%_ppctl', 'age_BPM_ppctl',
 'age_STLK%_ppctl', 'age_PF:STLK_ppctl', 'Height', 'Height_ppctl', 'Age',
 'age_pDBPM_ppctl', 'age_PER_ppctl']

# final model
etr = ensemble.ExtraTreesRegressor(bootstrap=True,
                                   max_features=0.55,
                                   min_samples_split=7,
                                   n_estimators=1800,
                                   random_state=2, n_jobs=-1)

# `y` and `cv_split` come from earlier cells of the notebook.
# return_train_score=True is passed explicitly: since scikit-learn 0.21
# cross_validate leaves 'train_score' out of its result by default, and the
# report below reads that key.
base_results = model_selection.cross_validate(etr, df[etr_x], df[y], cv=cv_split,
                                              return_train_score=True, n_jobs=-1)

# Fit on the full data set so `etr` is usable after this cell.
etr.fit(df[etr_x], df[y])


# Scores are printed as percentages; the spread is reported as 3 standard deviations.
print("Training score mean: {:.2f}".format(base_results['train_score'].mean()*100))
print("Test score mean: {:.2f}".format(base_results['test_score'].mean()*100))
print("Test score 3*std: +/- {:.2f}".format(base_results['test_score'].std()*100*3))
print('-'*10)

## Blend Models

In [None]:
from mlxtend.regressor import StackingCVRegressor
from sklearn.model_selection import cross_val_score

# Seed NumPy's global random generator before any model is built or scored.
random_state = 2
np.random.seed(random_state)

# NOTE(review): this cell rebinds the notebook-level names `x` and `y`
# (feature list and target array). Earlier cells that index `df[y]` may no
# longer work if re-run after this one -- confirm before re-running out of order.
x = [
    'age_DRtg', 'age_ORtg', 'age_PER', 'age_PPR', 'age_STL%', 'age_BPM',
    'age_MIN:GP', 'age_USGxTS', 'age_BLK%_ppctl', 'age_DRB%_ppctl',
    'age_DRtg_ppctl', 'age_ORtg_ppctl', 'age_PPR_ppctl', 'age_PPS_ppctl',
    'age_STL%_ppctl', 'age_Total S %_ppctl', 'age_eFG%_ppctl', 'age_BPM_ppctl',
    'age_STLK%_ppctl', 'age_PF:STLK_ppctl', 'Height', 'Height_ppctl', 'Age',
    'age_pDBPM_ppctl', 'age_PER_ppctl',
]
X = np.array(df[x])
y = np.array(df['ep_PIPM'].tolist())

# Base learners (models before feature tuning).
etr = ensemble.ExtraTreesRegressor(
    n_estimators=1800,
    max_features=0.5,
    min_samples_split=8,
    bootstrap=True,
    random_state=2,
    n_jobs=-1,
)

rfr = ensemble.RandomForestRegressor(
    n_estimators=800,
    max_features=0.3,
    max_leaf_nodes=18,
    min_samples_split=12,
    bootstrap=True,
    random_state=2,
    n_jobs=-1,
)

# AdaBoost over shallow decision trees.
ada = ensemble.AdaBoostRegressor(
    tree.DecisionTreeRegressor(max_depth=5, max_features=0.2),
    n_estimators=60,
)

# Bagging whose base estimator is itself a gradient-boosting regressor.
bag = ensemble.BaggingRegressor(
    ensemble.GradientBoostingRegressor(n_estimators=125, max_depth=3, min_samples_split=7),
    n_estimators=35,
    max_samples=.4,
    bootstrap_features=False,
)

xgb = XGBRegressor(
    n_estimators=30,
    max_depth=4,
    colsample_bytree=0.25,
    colsample_bylevel=0.25,
)

grb = ensemble.GradientBoostingRegressor(
    n_estimators=40,
    max_depth=3,
    min_samples_leaf=10,
    max_features=.1,
    subsample=.75,
)

rdg = linear_model.RidgeCV(alphas=(175.0,))

# The meta-learner combines the base learners' predictions inside the stack.
meta_regr = ensemble.RandomForestRegressor()

models = [etr, rfr, ada, bag, xgb, grb, rdg]
stack = StackingCVRegressor(regressors=models, meta_regressor=meta_regr)
models_plus_stack = models + [stack]

# Display names, in the same order as `models_plus_stack`.
model_labels = [
    'ExtraTrees',
    'RandomForest',
    'AdaBoost',
    'Bagging',
    'XGBoost',
    'GradientBoost',
    'RidgeRegr',
    'StackingCVRegressor',
]

print('5-fold cross validation scores:\n')

# Score every base learner and the stacked model with 5-fold cross-validation.
for clf, label in zip(models_plus_stack, model_labels):
    scores = cross_val_score(clf, X, y, cv=5)
    print("R^2 Score: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))