### TODO

- Download OWGR historical information
- Apply OWGR as strength of field feature
- Gather golfer summary statistics as features
- Update score prediction for the hole after each shot?
- Gather SG stats for golfer and merge with course features

In [1]:
import numpy as np
import pandas as pd
from IPython.display import display
import matplotlib.pyplot as plt
import time
import math
import pickle

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import shuffle

from sklearn import linear_model
from sklearn import ensemble

from sklearn import metrics

%matplotlib inline
pd.set_option('display.max_columns', None)

In [2]:
TRAIN_FILE = './data/justin-thomas-holes-train.csv'
VALIDATION_FILE = './data/justin-thomas-holes-validation.csv'
TEST_FILE = './data/justin-thomas-holes-test.csv'

In [3]:
df = pd.read_csv(TRAIN_FILE, index_col=None)
df.head()

Unnamed: 0,Year,PlayerNumber,HoleNumber,CourseNumber,Course,AvgPuttsGained,AvgSGT2G,AvgSGTotal,AvgOTTSG,AvgAppSG,AvgARGSG,DrivingDistanceAvg,DrivingAccuracyAvg,AvgEagles,AvgBirdies,AvgPars,AvgBogies,AvgDoubles,AvgOthers,App5075ftAvg,App75100ftAvg,App100125ftAvg,App50125ftAvg,App125150ftAvg,App150175ftAvg,App175200ftAvg,App200ftAvg,AvgGIR,AvgProximityToHole,AvgScramblingProximityToHole,AvgSandProximityToHole,PuttingInside5Ft,PuttingInside10Ft,Putting5Ft10Ft,Putting4Ft8Ft,Putting10Ft15Ft,Putting15Ft20Ft,Putting20Ft25Ft,PuttingOver10Ft,PuttingOver25Ft,Yardage,Par,FairwayHeight,GreenHeight,RoughHeight,Stimp,FwyWidth250,FwyWidth275,FwyWidth300,FwyWidth325,FwyWidth350,Score
0,2016,33448,1,776,Sea Island Resort (Seaside),-0.185013,0.505507,0.320453,0.029067,0.493667,-0.017453,293.604762,0.534286,0,3,11,2,0,0,14.213,19.695299,18.040868,18.260482,22.183331,24.792382,31.326881,44.423923,0.657037,35.287862,8.578867,9.810045,1.029693,0.880661,1.745856,0.66879,0.247826,0.184211,0.102564,0.136476,0.042345,411,4,0.5,0.9,2.25,11.0,25,27,30,32,35,4
1,2016,33448,1,776,Sea Island Resort (Seaside),-0.185013,0.505507,0.320453,0.029067,0.493667,-0.017453,293.604762,0.534286,0,3,11,2,0,0,14.213,19.695299,18.040868,18.260482,22.183331,24.792382,31.326881,44.423923,0.657037,35.287862,8.578867,9.810045,1.029693,0.880661,1.745856,0.66879,0.247826,0.184211,0.102564,0.136476,0.042345,409,4,0.5,0.9,2.25,11.0,25,27,30,32,35,4
2,2016,33448,1,776,Sea Island Resort (Seaside),-0.185013,0.505507,0.320453,0.029067,0.493667,-0.017453,293.604762,0.534286,0,3,11,2,0,0,14.213,19.695299,18.040868,18.260482,22.183331,24.792382,31.326881,44.423923,0.657037,35.287862,8.578867,9.810045,1.029693,0.880661,1.745856,0.66879,0.247826,0.184211,0.102564,0.136476,0.042345,420,4,0.5,0.9,2.25,11.0,25,27,30,32,35,5
3,2016,33448,1,36,Firestone CC (South),-0.185013,0.505507,0.320453,0.029067,0.493667,-0.017453,293.604762,0.534286,0,3,11,2,0,0,14.213,19.695299,18.040868,18.260482,22.183331,24.792382,31.326881,44.423923,0.657037,35.287862,8.578867,9.810045,1.029693,0.880661,1.745856,0.66879,0.247826,0.184211,0.102564,0.136476,0.042345,395,4,0.3,0.1,3.0,12.5,26,28,31,33,36,4
4,2016,33448,1,36,Firestone CC (South),-0.185013,0.505507,0.320453,0.029067,0.493667,-0.017453,293.604762,0.534286,0,3,11,2,0,0,14.213,19.695299,18.040868,18.260482,22.183331,24.792382,31.326881,44.423923,0.657037,35.287862,8.578867,9.810045,1.029693,0.880661,1.745856,0.66879,0.247826,0.184211,0.102564,0.136476,0.042345,394,4,0.3,0.1,3.0,12.5,26,28,31,33,36,4


In [4]:
def load_and_process_data(file):
    df = pd.read_csv(file, index_col=None)
    
    df.drop(
        columns=['Year', 'PlayerNumber', 'HoleNumber', 'CourseNumber', 'Course'],
        axis=1,
        inplace=True
    )
    
    df.drop(
        columns=['AvgEagles', 'AvgBirdies', 'AvgPars', 'AvgBogies', 'AvgDoubles', 'AvgOthers'],
        axis=1,
        inplace=True
    )
    
    # drop nan
    df.dropna(inplace=True)
    
    return df

In [5]:
def split_data(df):
    #return X, y
    return df[df.columns[:-1]], df['Score']

In [6]:
def normalize_data(X):
    cols = X.columns
    return pd.DataFrame(MinMaxScaler().fit_transform(X), columns=cols)

In [7]:
df_train = load_and_process_data(TRAIN_FILE)
df_validation = load_and_process_data(VALIDATION_FILE)
df_test = load_and_process_data(TEST_FILE)

In [8]:
print('Training Shape', df_train.shape)
print('Validation Shape', df_validation.shape)
print('Test Shape', df_test.shape)

Training Shape (1544, 41)
Validation Shape (1436, 41)
Test Shape (1455, 41)


In [9]:
X_train, y_train = split_data(df_train)
X_validation, y_validation = split_data(df_validation)
X_test, y_test = split_data(df_test)

print('Training X Shape', X_train.shape)
print('Training y Shape', y_train.shape)
print('Validation X Shape', X_validation.shape)
print('Validation y Shape', y_validation.shape)
print('Test X Shape', X_test.shape)
print('Test y Shape', y_test.shape)

Training X Shape (1544, 40)
Training y Shape (1544,)
Validation X Shape (1436, 40)
Validation y Shape (1436,)
Test X Shape (1455, 40)
Test y Shape (1455,)


In [10]:
X_train = normalize_data(X_train)
X_validation = normalize_data(X_validation)
X_test = normalize_data(X_test)

In [11]:
from sklearn.ensemble import RandomForestRegressor

rf_reg = RandomForestRegressor()
rf_reg.fit(X_train, y_train)
rf_reg.score(X_validation, y_validation)

0.2061120239689861

In [12]:
feature_importances = pd.DataFrame(rf_reg.feature_importances_,
                                   index = X_train.columns,
                                   columns=['importance']).sort_values('importance', ascending=False)
feature_importances

Unnamed: 0,importance
Yardage,0.356629
Par,0.342533
RoughHeight,0.062235
Stimp,0.046489
FairwayHeight,0.045289
GreenHeight,0.03323
FwyWidth325,0.026703
FwyWidth350,0.022791
FwyWidth275,0.022741
FwyWidth250,0.021734


In [13]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
log_reg.score(X_validation, y_validation)

0.5675487465181058

In [14]:
df_train.corr()['Score']

AvgPuttsGained                  3.172651e-17
AvgSGT2G                       -3.069059e-16
AvgSGTotal                     -4.285325e-16
AvgOTTSG                        3.069059e-16
AvgAppSG                       -1.460200e-16
AvgARGSG                       -1.722737e-16
DrivingDistanceAvg              1.452779e-16
DrivingAccuracyAvg              5.345119e-16
App5075ftAvg                   -2.644032e-16
App75100ftAvg                  -2.143432e-16
App100125ftAvg                  6.025355e-16
App50125ftAvg                  -1.435696e-16
App125150ftAvg                 -3.547153e-16
App150175ftAvg                  2.544006e-16
App175200ftAvg                 -2.544006e-16
App200ftAvg                    -2.289375e-16
AvgGIR                         -7.809075e-17
AvgProximityToHole             -2.433081e-16
AvgScramblingProximityToHole    1.435696e-16
AvgSandProximityToHole          1.327085e-16
PuttingInside5Ft                7.429349e-17
PuttingInside10Ft              -3.371921e-16
Putting5Ft