#### TODO
- run larger grid search
- add in ALL golfers and train
- test against full 18 hole round (RF Classifier vs Poisson)
- Remove course features and see about differences
- Add in additional golfer features
- Run cumulative stats against last 450 HOLES played (25 rounds or 6.25 events) - future work: see what amount history is best
- visualize decision tree graph
- fill NA values for golfer features with mean values

#### RESULTS
- course features don't matter (63.7% best accuracy for both)
- most important features are par and yardage of hole
- SG approach and driving distance are next best predictors
- 10 out of 21 features in best model with course data
- 12 out of 12 features in best model without course data

In [17]:
import numpy as np
import pandas as pd
from IPython.display import display
import matplotlib.pyplot as plt
from time import time
import math
import pickle

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.ensemble import RandomForestClassifier

from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from scipy.stats import poisson,skellam
from scipy.stats import randint as sp_randint

from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
import statsmodels.formula.api as smf

%matplotlib inline
pd.set_option('display.max_columns', None)

import warnings
warnings.filterwarnings('ignore')

In [2]:
TRAIN_FILE = './data/holes-2016.csv'
VAL_FILE = './data/holes-2017.csv'
TEST_FILE = './data/holes-2018.csv'

In [142]:
def load_and_process_simple(file):
    columns = [
        'Id', 'Name', 'Tournament Year', 'Tournament Schedule', 'Event Name', 'Course Name', 'Round', 
        'Hole', 'Driving Accuracy', 'Driving Distance', 'Green in Regulation', 'Scrambling Success',
        'Putts Per Hole', 'Putts Per Hole GIR',
        'Total SG Putting', 'Avg SG Putting', 'Total SG OTT', 'Avg SG OTT', 'Total SG Approach', 
        'Avg SG Approach', 'Total SG Around the Green', 'Avg SG Around the Green', 'Par', 'Actual Yard', 
        'Fwy Firmness', 'Grn Firmness', 'Stimp', 'Fwy Height', 'Grn Height', 'Rough Height', 'Tee Grass', 
        'Fwy. Grass', 'Rough Grass', 'Actual 250 Distance', 'Actual 275 Distance', 'Actual 300 Distance', 
        'Actual 325 Distance', 'Actual 350 Distance', 'AM Wind Spd', 'AM Wind Dir', 'PM Wind Spd', 
        'PM Wind Dir', 'Time Hole Finished', 'Hit Fwy', 'Hit Green', 'Hit Greenside Bunker', 'Tee Shot Landing Loc', 
        'Score'
    ]    
    df = pd.read_csv(file, index_col=None, names=columns)
    
    # drop SG totals, use averages only
    df.drop(['Total SG Putting', 'Total SG OTT', 'Total SG Approach', 'Total SG Around the Green'], axis=1, inplace=True)
    
    # drop wind and time
    df.drop(['AM Wind Dir', 'AM Wind Spd', 'PM Wind Dir', 'PM Wind Spd'], axis=1, inplace=True)
    df.drop(['Time Hole Finished'], axis=1, inplace=True)
    
    # drop result columns (include later for potential simulations?)
    df.drop(['Hit Fwy', 'Hit Green', 'Hit Greenside Bunker', 'Tee Shot Landing Loc'], axis=1, inplace=True)
    
    # drop non-features (only for identification)
    df.drop(['Id', 'Name', 'Tournament Year', 'Tournament Schedule', 'Event Name', 'Course Name', 'Round', 'Hole'], axis=1, inplace=True)
    
    # drop categorical firmness and grass
    df.drop(['Fwy Firmness', 'Grn Firmness', 'Tee Grass', 'Fwy. Grass', 'Rough Grass'], axis=1, inplace=True)
    
    # replace na values with column mean
    df.fillna(df.mean(), inplace=True)
    
    # rename columns to remove non-alpha chars
    renamed_cols = {}
    for x in df.columns:
        renamed_cols[x] = x.replace('/','').replace('&','').replace('#','').replace(' ','').replace('.','')
        
    df.rename(columns=renamed_cols, inplace=True)
        
    return df

In [143]:
def load_and_process_no_course(file):
    columns = [
        'Id', 'Name', 'Tournament Year', 'Tournament Schedule', 'Event Name', 'Course Name', 'Round', 
        'Hole', 'Driving Accuracy', 'Driving Distance', 'Green in Regulation', 'Scrambling Success',
        'Putts Per Hole', 'Putts Per Hole GIR',
        'Total SG Putting', 'Avg SG Putting', 'Total SG OTT', 'Avg SG OTT', 'Total SG Approach', 
        'Avg SG Approach', 'Total SG Around the Green', 'Avg SG Around the Green', 'Par', 'Actual Yard', 
        'Fwy Firmness', 'Grn Firmness', 'Stimp', 'Fwy Height', 'Grn Height', 'Rough Height', 'Tee Grass', 
        'Fwy. Grass', 'Rough Grass', 'Actual 250 Distance', 'Actual 275 Distance', 'Actual 300 Distance', 
        'Actual 325 Distance', 'Actual 350 Distance', 'AM Wind Spd', 'AM Wind Dir', 'PM Wind Spd', 
        'PM Wind Dir', 'Time Hole Finished', 'Hit Fwy', 'Hit Green', 'Hit Greenside Bunker', 'Tee Shot Landing Loc', 
        'Score'
    ]    
    df = pd.read_csv(file, index_col=None, names=columns)
    
    # drop course features
    df.drop(['Stimp', 'Fwy Height', 'Grn Height', 'Rough Height', 'Actual 250 Distance', 'Actual 275 Distance', 'Actual 300 Distance', 'Actual 325 Distance', 'Actual 350 Distance'], axis=1, inplace=True)
    
    # drop SG totals, use averages only
    df.drop(['Total SG Putting', 'Total SG OTT', 'Total SG Approach', 'Total SG Around the Green'], axis=1, inplace=True)
    
    # drop wind and time
    df.drop(['AM Wind Dir', 'AM Wind Spd', 'PM Wind Dir', 'PM Wind Spd'], axis=1, inplace=True)
    df.drop(['Time Hole Finished'], axis=1, inplace=True)
    
    # drop result columns (include later for potential simulations?)
    df.drop(['Hit Fwy', 'Hit Green', 'Hit Greenside Bunker', 'Tee Shot Landing Loc'], axis=1, inplace=True)
    
    # drop non-features (only for identification)
    df.drop(['Id', 'Name', 'Tournament Year', 'Tournament Schedule', 'Event Name', 'Course Name', 'Round', 'Hole'], axis=1, inplace=True)
    
    # drop categorical firmness and grass
    df.drop(['Fwy Firmness', 'Grn Firmness', 'Tee Grass', 'Fwy. Grass', 'Rough Grass'], axis=1, inplace=True)
    
    # replace na values with column mean
    df.fillna(df.mean(), inplace=True)
    
    # rename columns to remove non-alpha chars
    renamed_cols = {}
    for x in df.columns:
        renamed_cols[x] = x.replace('/','').replace('&','').replace('#','').replace(' ','').replace('.','')
        
    df.rename(columns=renamed_cols, inplace=True)
        
    return df

In [5]:
def split_data(data):
    X = data.drop('Score', axis=1, inplace=False)
    y = data['Score']
    return X, y

# not used
def normalize_data(X):
    cols = X.columns
    return pd.DataFrame(MinMaxScaler().fit_transform(X), columns=cols)

In [16]:
THRESHOLD = 2

def scoring(model, X, y):
    return score(y, model.predict(X))

def score(y, y_hat):
    return np.mean((y_hat > THRESHOLD) == (y > 2))

def run_cross_val(model, X, y, s=scoring):
    return cross_val_score(model, X, y, scoring=s)

In [144]:
df_train = load_and_process_simple(TRAIN_FILE)
df_val = load_and_process_simple(VAL_FILE)
df_test = load_and_process_simple(TEST_FILE)

In [145]:
df_train.head()

Unnamed: 0,DrivingAccuracy,DrivingDistance,GreeninRegulation,ScramblingSuccess,PuttsPerHole,PuttsPerHoleGIR,AvgSGPutting,AvgSGOTT,AvgSGApproach,AvgSGAroundtheGreen,Par,ActualYard,Stimp,FwyHeight,GrnHeight,RoughHeight,Actual250Distance,Actual275Distance,Actual300Distance,Actual325Distance,Actual350Distance,Score
0,0.53073,290.38214,0.60222,0.60894,1.56667,1.73432,0.00886,-0.00866,-0.00952,0.01008,4,444,11.0,0.44,0.1,2.0,26.0,28.0,31.0,33.0,0.0,4.0
1,0.53073,290.38214,0.60222,0.60894,1.56667,1.73432,0.00886,-0.00866,-0.00952,0.01008,4,358,11.0,0.44,0.1,2.0,25.0,28.0,30.0,33.0,0.0,3.0
2,0.53073,290.38214,0.60222,0.60894,1.56667,1.73432,0.00886,-0.00866,-0.00952,0.01008,4,455,11.0,0.44,0.1,2.0,25.0,28.0,30.0,33.0,35.0,4.0
3,0.53073,290.38214,0.60222,0.60894,1.56667,1.73432,0.00886,-0.00866,-0.00952,0.01008,3,160,11.0,0.44,0.1,2.0,0.0,0.0,0.0,0.0,0.0,3.0
4,0.53073,290.38214,0.60222,0.60894,1.56667,1.73432,0.00886,-0.00866,-0.00952,0.01008,5,531,11.0,0.44,0.1,2.0,25.0,28.0,30.0,33.0,35.0,5.0


In [8]:
X_train, y_train = split_data(df_train)
X_val, y_val = split_data(df_val)
X_test, y_test = split_data(df_test)

In [9]:
print('Training X Shape', X_train.shape)
print('Training y Shape', y_train.shape)
print('Validation X Shape', X_val.shape)
print('Validation y Shape', y_val.shape)
print('Test X Shape', X_test.shape)
print('Test y Shape', y_test.shape)

Training X Shape (12244, 21)
Training y Shape (12244,)
Validation X Shape (13137, 21)
Validation y Shape (13137,)
Test X Shape (13184, 21)
Test y Shape (13184,)


In [120]:
def create_test_dataframe():
    return pd.DataFrame(data={
        'DrivingAccuracy': 0.48023,
        'DrivingDistance': 292.38235,
        'GreeninRegulation': 0.55333,
        'ScramblingSuccess': 0.51938,
        'PuttsPerHole': 1.31333,
        'PuttsPerHoleGIR': 1.67871,
        'AvgSGPutting': 0.03364, 
        'AvgSGOTT': -0.00313, 
        'AvgSGApproach': 0.00649, 
        'AvgSGAroundtheGreen': -0.00318,
        'Par': 5, 
        'ActualYard': 652, 
        'Stimp': 12.5, 
        'FwyHeight': 0.300000012, 
        'GrnHeight': 0.100000001, 
        'RoughHeight': 3,
        'Actual250Distance': 31, 
        'Actual275Distance': 33, 
        'Actual300Distance': 36,
        'Actual325Distance': 38, 
        'Actual350Distance': 41
    }, index=[1])

def create_test_nocourse_dataframe():
    return pd.DataFrame(data={
        'DrivingAccuracy': 0.48023,
        'DrivingDistance': 292.38235,
        'GreeninRegulation': 0.55333,
        'ScramblingSuccess': 0.51938,
        'PuttsPerHole': 1.31333,
        'PuttsPerHoleGIR': 1.67871,
        'AvgSGPutting': 0.03364, 
        'AvgSGOTT': -0.00313, 
        'AvgSGApproach': 0.00649, 
        'AvgSGAroundtheGreen': -0.00318,
        'Par': 5, 
        'ActualYard': 652
    }, index=[1])

def poisson_probabilities(model, df, max_score=9):
    hole_avg = model.predict(df).values[0]
    hole_pred = [poisson.pmf(i, hole_avg) for i in range(0, max_score+1)]
    return hole_pred

In [121]:
def run_test_hole(model, is_nocourse=False):
    # testing Phil Mickelson, 2018, Round 1, Hole 16 at Firestone South
    if is_nocourse:
        test_data = create_test_nocourse_dataframe()
    else:
        test_data = create_test_dataframe()
    
    print('Actual Score:', 5)
    print('Predicted Score:', model.predict(test_data))
    return test_data

In [35]:
def run_simple_model_eval(model, X, y, y_pred, is_rf=True):
    print('Actual Scores:',y[:5])
    print('Predicted Scores:', y_pred[:5])
    print('Mean Actual Scores:', y.mean())
    print('Mean Predicted Scores:', y_pred.mean())
    
    if is_rf:
        print('Cross Validation:', run_cross_val(model, X, y, s=scoring))
        print()
        feature_importances = pd.DataFrame(model.feature_importances_,
                                   index = X.columns,
                                   columns=['importance']).sort_values('importance', ascending=False)
        print(feature_importances.loc[feature_importances['importance'] >= 0.01])

In [19]:
def search_cv_report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [20]:
def run_search_cv(X, y, is_random=True):
    model = RandomForestClassifier(n_estimators=20, n_jobs=-1)
    
    if is_random:
        param_dist = {"max_depth": [3, None],
                      "max_features": sp_randint(1, len(X.columns)),
                      "min_samples_split": sp_randint(2, 11),
                      "bootstrap": [True, False],
                      "criterion": ["gini", "entropy"]}
        n_iter_search = 20
        random_search = RandomizedSearchCV(model, 
                                           param_distributions=param_dist, 
                                           n_iter=n_iter_search, cv=5)

        start = time()
        random_search.fit(X, y)
        print("RandomizedSearchCV took %.2f seconds for %d candidates"
              " parameter settings." % ((time() - start), n_iter_search))
        search_cv_report(random_search.cv_results_)
    else:
        param_grid = {"max_depth": [3, None],
                      "max_features": [1, 3, 10, len(X.columns)],
                      "min_samples_split": [2, 3, 10],
                      "bootstrap": [True, False],
                      "criterion": ["gini", "entropy"]}

        # run grid search
        grid_search = GridSearchCV(model, param_grid=param_grid, cv=5)
        start = time()
        grid_search.fit(X, y)

        print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
              % (time() - start, len(grid_search.cv_results_['params'])))
        search_cv_report(grid_search.cv_results_)

In [44]:
col_list = list(X_train.columns)
poisson_formula = ' + '.join(col_list)

poisson_model = smf.glm(formula="Score ~ " + poisson_formula, data=df_train, family=sm.families.Poisson()).fit()
poisson_model.summary()

0,1,2,3
Dep. Variable:,Score,No. Observations:,12244
Model:,GLM,Df Residuals:,12222
Model Family:,Poisson,Df Model:,21
Link Function:,log,Scale:,1.0000
Method:,IRLS,Log-Likelihood:,-20431.
Date:,"Thu, 28 Mar 2019",Deviance:,1502.1
Time:,21:43:09,Pearson chi2:,1.44e+03
No. Iterations:,4,Covariance Type:,nonrobust

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.9368,0.496,1.888,0.059,-0.036,1.909
DrivingAccuracy,-0.0076,0.149,-0.051,0.959,-0.299,0.284
DrivingDistance,0.0002,0.001,0.201,0.841,-0.002,0.002
GreeninRegulation,0.0803,0.224,0.358,0.720,-0.359,0.520
ScramblingSuccess,0.0484,0.164,0.295,0.768,-0.273,0.370
PuttsPerHole,-0.1557,0.275,-0.567,0.571,-0.694,0.382
PuttsPerHoleGIR,-0.0425,0.256,-0.166,0.868,-0.545,0.460
AvgSGPutting,-0.5075,0.389,-1.305,0.192,-1.269,0.254
AvgSGOTT,-0.3177,0.428,-0.743,0.458,-1.156,0.521


In [45]:
y_pred_poisson = poisson_model.predict(df_val)
y_pred_poisson

0        4.063705
1        3.117942
2        4.058668
3        4.054458
4        4.702102
5        4.074998
6        3.118848
7        3.943098
8        4.707962
9        3.827717
10       3.114449
11       4.052439
12       4.065764
13       4.056647
14       3.113544
15       4.709721
16       3.947684
17       4.709135
18       4.063368
19       3.116519
20       4.055468
21       4.054458
22       4.702298
23       4.072326
24       3.120661
25       3.915431
26       4.708939
27       3.826751
28       3.115484
29       4.050085
           ...   
13107    3.950658
13108    3.941832
13109    3.031079
13110    3.939051
13111    3.026176
13112    3.934455
13113    3.940981
13114    3.930097
13115    3.029569
13116    3.959190
13117    3.956377
13118    4.575259
13119    3.946756
13120    3.026929
13121    3.936252
13122    3.945270
13123    3.945741
13124    4.567858
13125    3.952462
13126    3.944451
13127    3.027055
13128    3.939051
13129    3.028940
13130    3.936743
13131    3

In [14]:
rf_default_model = RandomForestClassifier()
rf_default_model.fit(X_train, y_train)
y_pred_rf_default = rf_default_model.predict(X_val)
rf_default_model.score(X_val, y_val)

0.591078632868996

In [21]:
# run random search for best classifier
run_search_cv(X_train, y_train, True)

RandomizedSearchCV took 39.06 seconds for 20 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.635 (std: 0.011)
Parameters: {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 3, 'max_features': 13, 'min_samples_split': 9}

Model with rank: 2
Mean validation score: 0.635 (std: 0.008)
Parameters: {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 3, 'max_features': 13, 'min_samples_split': 6}

Model with rank: 3
Mean validation score: 0.635 (std: 0.008)
Parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 3, 'max_features': 13, 'min_samples_split': 10}



In [22]:
# run grid search for best classifier
run_search_cv(X_train, y_train, False)

GridSearchCV took 185.66 seconds for 96 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.635 (std: 0.008)
Parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 3, 'max_features': 10, 'min_samples_split': 2}

Model with rank: 2
Mean validation score: 0.635 (std: 0.008)
Parameters: {'bootstrap': True, 'criterion': 'gini', 'max_depth': 3, 'max_features': 21, 'min_samples_split': 2}

Model with rank: 3
Mean validation score: 0.635 (std: 0.010)
Parameters: {'bootstrap': True, 'criterion': 'gini', 'max_depth': 3, 'max_features': 21, 'min_samples_split': 10}

Model with rank: 3
Mean validation score: 0.635 (std: 0.008)
Parameters: {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 3, 'max_features': 21, 'min_samples_split': 10}



In [29]:
# create model for best model from searches
rf_best_model = RandomForestClassifier(bootstrap=True, criterion='entropy', max_depth=3,
                                      max_features=10, min_samples_split=2)
rf_best_model.fit(X_train, y_train)
y_pred_rf_best = rf_best_model.predict(X_val)
rf_best_model.score(X_val, y_val)

0.6295957981274264

In [46]:
pickle.dump(rf_default_model, open('./models/hole-prediction-rf-default.pkl', 'wb'))
pickle.dump(rf_best_model, open('./models/hole-prediction-rf-best.pkl', 'wb'))
pickle.dump(poisson_model, open('./models/hole-prediction-poisson.pkl', 'wb'))

In [32]:
run_simple_model_eval(rf_default_model, X_train, y_train, y_pred_rf_default)

Actual Scores: 0    4.0
1    3.0
2    4.0
3    3.0
4    5.0
Name: Score, dtype: float64
Predicted Scores: [4. 3. 4. 4. 4.]
Mean Actual Scores: 3.893988892518785
Mean Predicted Scores: 3.8093171957067824
Cross Validation: [0.9627907  0.90884587 0.96493379]

                     importance
ActualYard             0.484666
Par                    0.077043
Actual275Distance      0.050660
Actual325Distance      0.045333
Actual250Distance      0.041854
Actual300Distance      0.036939
Actual350Distance      0.028449
AvgSGApproach          0.020862
PuttsPerHoleGIR        0.020167
DrivingDistance        0.019994
ScramblingSuccess      0.019645
AvgSGAroundtheGreen    0.019306
AvgSGPutting           0.019096
DrivingAccuracy        0.018928
AvgSGOTT               0.018821
GreeninRegulation      0.018524
PuttsPerHole           0.016413
Stimp                  0.013600
FwyHeight              0.011421
RoughHeight            0.010966


In [33]:
run_simple_model_eval(rf_best_model, X_train, y_train, y_pred_rf_best)

Actual Scores: 0    4.0
1    3.0
2    4.0
3    3.0
4    5.0
Name: Score, dtype: float64
Predicted Scores: [4. 3. 4. 4. 4.]
Mean Actual Scores: 3.893988892518785
Mean Predicted Scores: 3.830707162974804
Cross Validation: [0.96768666 0.96814506 0.96812163]

                   importance
Par                  0.546671
ActualYard           0.225564
Actual250Distance    0.138425
Actual300Distance    0.067795


In [36]:
run_simple_model_eval(poisson_model, X_train, y_train, y_pred_poisson, False)

Actual Scores: 0    4.0
1    3.0
2    4.0
3    3.0
4    5.0
Name: Score, dtype: float64
Predicted Scores: 0    4.063705
1    3.117942
2    4.058668
3    4.054458
4    4.702102
dtype: float64
Mean Actual Scores: 3.893988892518785
Mean Predicted Scores: 3.8758726162133437


In [62]:
test_data = run_test_hole(rf_default_model)

Actual Score: 5
Predicted Score: [4.]


In [63]:
test_data = run_test_hole(rf_best_model)

Actual Score: 5
Predicted Score: [5.]


pandas.core.frame.DataFrame

In [66]:
test_data = run_test_hole(poisson_model)

Actual Score: 5
Predicted Score: 1    4.994491
dtype: float64


In [67]:
poisson_probabilities(poisson_model, test_data)

[0.006775170419789624,
 0.03383852604768396,
 0.08450310281016818,
 0.1406833220115902,
 0.1756603879091694,
 0.1754668368030591,
 0.14606124913409432,
 0.10421450842127776,
 0.06506229989911982,
 0.036105895062357146]

In [68]:
# run test of a full 18 holes
file = './data/holes-phil-test.csv'

df_sample = load_and_process_simple(file)
X_sample, y_sample = split_data(df_sample)

In [69]:
rf_default_model.score(X_sample, y_sample)

0.5555555555555556

In [70]:
rf_best_model.score(X_sample, y_sample)

0.7222222222222222

In [107]:
y_sample_rf_default = rf_default_model.predict(X_sample)
y_sample_rf_best = rf_best_model.predict(X_sample)
y_sample_poisson = np.round(poisson_model.predict(X_sample),0)

In [108]:
results = [
    ['Actual'] + list(y_sample),
    ['Default'] + list(y_sample_rf_default),
    ['Best'] + list(y_sample_rf_best),
    ['Poisson'] + list(y_sample_poisson)
]

In [109]:
df_results = pd.DataFrame(results, columns=['Type',1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18])
df_results.head()

Unnamed: 0,Type,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,Actual,4.0,3.0,4.0,4.0,2.0,5.0,2.0,4.0,4.0,3.0,4.0,3.0,4.0,4.0,3.0,5.0,4.0,4.0
1,Default,4.0,4.0,4.0,4.0,3.0,3.0,3.0,5.0,4.0,4.0,4.0,3.0,4.0,4.0,3.0,4.0,4.0,3.0
2,Best,4.0,4.0,4.0,4.0,3.0,4.0,3.0,4.0,4.0,4.0,4.0,3.0,4.0,4.0,3.0,5.0,4.0,4.0
3,Poisson,4.0,5.0,4.0,4.0,3.0,4.0,3.0,4.0,4.0,4.0,4.0,3.0,4.0,4.0,3.0,5.0,4.0,4.0


In [110]:
print('Round total (Actual):', y_sample.sum())
print('Round total (Default):', y_sample_rf_default.sum())
print('Round total (Best):', y_sample_rf_best.sum())
print('Round total (Poisson):', y_sample_poisson.sum())

Round total (Actual): 66.0
Round total (Default): 67.0
Round total (Best): 69.0
Round total (Poisson): 70.0


In [111]:
# full test data prediction
print('Accuracy (Default):', rf_default_model.score(X_test, y_test))
print('Accuracy (Best):', rf_best_model.score(X_test, y_test))

Accuracy (Default): 0.6007281553398058
Accuracy (Best): 0.6378944174757282


In [113]:
poisson_probabilities(poisson_model, df_sample)

[0.014973643202322985,
 0.06291121901923442,
 0.13215960287714323,
 0.1850879266044648,
 0.1944100532364599,
 0.16336135800006948,
 0.1143928037814641,
 0.06865960251127441,
 0.03605885382231366,
 0.016833329662039497]

In [129]:
df_train_nocourse = load_and_process_no_course(TRAIN_FILE)
df_val_nocourse = load_and_process_no_course(VAL_FILE)

X_train_nocourse, y_train_nocourse = split_data(df_train_nocourse)
X_val_nocourse, y_val_nocourse = split_data(df_val_nocourse)

In [130]:
rf_default_nocourse_model = RandomForestClassifier()
rf_default_nocourse_model.fit(X_train_nocourse, y_train_nocourse)
y_pred_rf_default_nocourse = rf_default_nocourse_model.predict(X_val_nocourse)
rf_default_nocourse_model.score(X_val_nocourse, y_val_nocourse)

0.5944279515871204

In [118]:
run_search_cv(X_train_nocourse, y_train_nocourse, True)

RandomizedSearchCV took 49.05 seconds for 20 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.635 (std: 0.009)
Parameters: {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 3, 'max_features': 9, 'min_samples_split': 10}

Model with rank: 2
Mean validation score: 0.633 (std: 0.010)
Parameters: {'bootstrap': False, 'criterion': 'gini', 'max_depth': 3, 'max_features': 10, 'min_samples_split': 4}

Model with rank: 3
Mean validation score: 0.631 (std: 0.011)
Parameters: {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 3, 'max_features': 7, 'min_samples_split': 5}



In [119]:
run_search_cv(X_train_nocourse, y_train_nocourse, False)

GridSearchCV took 225.21 seconds for 96 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.635 (std: 0.008)
Parameters: {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 3, 'max_features': 12, 'min_samples_split': 3}

Model with rank: 2
Mean validation score: 0.635 (std: 0.008)
Parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 3, 'max_features': 10, 'min_samples_split': 2}

Model with rank: 2
Mean validation score: 0.635 (std: 0.008)
Parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 3, 'max_features': 10, 'min_samples_split': 3}

Model with rank: 2
Mean validation score: 0.635 (std: 0.008)
Parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 3, 'max_features': 10, 'min_samples_split': 10}

Model with rank: 2
Mean validation score: 0.635 (std: 0.008)
Parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 3, 'max_features': 12, 'min_samples_split': 2}

Model with rank: 2
Mean validation 

In [132]:
# create model for best model from searches
rf_best_nocourse_model = RandomForestClassifier(bootstrap=True, criterion='entropy', max_depth=3,
                                      max_features=12, min_samples_split=3)
rf_best_nocourse_model.fit(X_train_nocourse, y_train_nocourse)
y_pred_rf_best_nocourse = rf_best_nocourse_model.predict(X_val_nocourse)
rf_best_nocourse_model.score(X_val_nocourse, y_val_nocourse)

0.6295196772474689

In [133]:
pickle.dump(rf_default_nocourse_model, open('./models/hole-prediction-rf-default-nocourse.pkl', 'wb'))
pickle.dump(rf_best_nocourse_model, open('./models/hole-prediction-rf-best-nocourse.pkl', 'wb'))

In [134]:
run_simple_model_eval(rf_default_nocourse_model, X_train_nocourse, y_train_nocourse, y_pred_rf_default_nocourse)

Actual Scores: 0    4.0
1    3.0
2    4.0
3    3.0
4    5.0
Name: Score, dtype: float64
Predicted Scores: [4. 3. 4. 3. 5.]
Mean Actual Scores: 3.893988892518785
Mean Predicted Scores: 3.8263682728172337
Cross Validation: [0.9630355  0.66723842 0.96248161]

                     importance
ActualYard             0.729075
Par                    0.090464
DrivingDistance        0.019564
AvgSGApproach          0.019094
PuttsPerHoleGIR        0.018558
AvgSGOTT               0.018350
AvgSGAroundtheGreen    0.018149
GreeninRegulation      0.018086
AvgSGPutting           0.017874
DrivingAccuracy        0.017459
ScramblingSuccess      0.017389
PuttsPerHole           0.015939


In [135]:
run_simple_model_eval(rf_best_nocourse_model, X_train_nocourse, y_train_nocourse, y_pred_rf_best_nocourse)

Actual Scores: 0    4.0
1    3.0
2    4.0
3    3.0
4    5.0
Name: Score, dtype: float64
Predicted Scores: [4. 3. 4. 4. 4.]
Mean Actual Scores: 3.893988892518785
Mean Predicted Scores: 3.829717591535358
Cross Validation: [0.96768666 0.96814506 0.96812163]

            importance
Par           0.937325
ActualYard    0.053690


In [136]:
test_data = run_test_hole(rf_default_nocourse_model, is_nocourse=True)

Actual Score: 5
Predicted Score: [5.]


In [137]:
test_data = run_test_hole(rf_best_nocourse_model, is_nocourse=True)

Actual Score: 5
Predicted Score: [5.]


In [138]:
# run test of a full 18 holes
file = './data/holes-phil-test.csv'

df_sample_nocourse = load_and_process_no_course(file)
X_sample_nocourse, y_sample_nocourse = split_data(df_sample_nocourse)

print('Accuracy (Default):', rf_default_nocourse_model.score(X_sample_nocourse, y_sample_nocourse))
print('Accuracy (Best):', rf_best_nocourse_model.score(X_sample_nocourse, y_sample_nocourse))

Accuracy (Default): 0.6666666666666666
Accuracy (Best): 0.7222222222222222


In [139]:
y_sample_rf_default_nocourse = rf_default_nocourse_model.predict(X_sample_nocourse)
y_sample_rf_best_nocourse = rf_best_nocourse_model.predict(X_sample_nocourse)

results_nocourse = [
    ['Actual'] + list(y_sample_nocourse),
    ['Default'] + list(y_sample_rf_default_nocourse),
    ['Best'] + list(y_sample_rf_best_nocourse)
]
df_results_nocourse = pd.DataFrame(results_nocourse, columns=['Type',1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18])
df_results_nocourse.head()

Unnamed: 0,Type,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,Actual,4.0,3.0,4.0,4.0,2.0,5.0,2.0,4.0,4.0,3.0,4.0,3.0,4.0,4.0,3.0,5.0,4.0,4.0
1,Default,4.0,5.0,4.0,4.0,3.0,4.0,3.0,4.0,4.0,4.0,4.0,3.0,4.0,4.0,3.0,5.0,3.0,4.0
2,Best,4.0,4.0,4.0,4.0,3.0,4.0,3.0,4.0,4.0,4.0,4.0,3.0,4.0,4.0,3.0,5.0,4.0,4.0


In [140]:
print('Round total (Actual):', y_sample_nocourse.sum())
print('Round total (Default):', y_sample_rf_default_nocourse.sum())
print('Round total (Best):', y_sample_rf_best_nocourse.sum())

Round total (Actual): 66.0
Round total (Default): 69.0
Round total (Best): 69.0


In [141]:
# full test data prediction
df_test_nocourse = load_and_process_no_course(TEST_FILE)
X_test_nocourse, y_test_nocourse = split_data(df_test_nocourse)
print('Accuracy (Default):', rf_default_nocourse_model.score(X_test_nocourse, y_test_nocourse))
print('Accuracy (Best):', rf_best_nocourse_model.score(X_test_nocourse, y_test_nocourse))

Accuracy (Default): 0.6014866504854369
Accuracy (Best): 0.6378185679611651


In [146]:
run_simple_model_eval(rf_best_nocourse_model, X_train_nocourse, y_train_nocourse, y_pred_rf_best_nocourse)

Actual Scores: 0    4.0
1    3.0
2    4.0
3    3.0
4    5.0
Name: Score, dtype: float64
Predicted Scores: [4. 3. 4. 4. 4.]
Mean Actual Scores: 3.893988892518785
Mean Predicted Scores: 3.829717591535358
Cross Validation: [0.96768666 0.96814506 0.96812163]

            importance
Par           0.937325
ActualYard    0.053690


In [148]:
run_simple_model_eval(rf_default_nocourse_model, X_train_nocourse, y_train_nocourse, y_pred_rf_best_nocourse)

Actual Scores: 0    4.0
1    3.0
2    4.0
3    3.0
4    5.0
Name: Score, dtype: float64
Predicted Scores: [4. 3. 4. 4. 4.]
Mean Actual Scores: 3.893988892518785
Mean Predicted Scores: 3.829717591535358
Cross Validation: [0.96621787 0.73462387 0.96125552]

                     importance
ActualYard             0.729075
Par                    0.090464
DrivingDistance        0.019564
AvgSGApproach          0.019094
PuttsPerHoleGIR        0.018558
AvgSGOTT               0.018350
AvgSGAroundtheGreen    0.018149
GreeninRegulation      0.018086
AvgSGPutting           0.017874
DrivingAccuracy        0.017459
ScramblingSuccess      0.017389
PuttsPerHole           0.015939


In [149]:
len(X_train_nocourse.columns)

12

In [150]:
len(X_train.columns)

21