In [1]:
import numpy as np
import pandas as pd
from IPython.display import display
import matplotlib.pyplot as plt
import time
import math
import pickle

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import shuffle

from sklearn import linear_model
from sklearn import ensemble

from sklearn import metrics

%matplotlib inline
# Lift pandas' column-count display limit so wide frames are shown in full.
pd.set_option('display.max_columns', None)

In [2]:
# Stroke CSVs split by season, going by the file names:
# 2014-2016 for training, 2017 for validation, 2018 for testing.
TRAIN_FILE = './data/phil-mickelson-strokes-2014-2016.csv'
VAL_FILE = './data/phil-mickelson-strokes-2017.csv'
TEST_FILE = './data/phil-mickelson-strokes-2018.csv'

In [3]:
def load_and_process_data(file):
    """Read one stroke CSV and return it with categories integer-coded.

    The file is parsed with a fixed list of 43 column names (so it is read as
    having no header row), the identifying columns (year, tournament, player,
    course, round, hole, time) are removed, the text values listed in the
    mapping below are replaced by integer codes, and every row that still
    contains a missing value is discarded.

    Parameters
    ----------
    file : str or file-like
        Anything ``pandas.read_csv`` accepts.

    Returns
    -------
    pandas.DataFrame
        The 34 remaining columns, with 'To Location(Scorer)' and 'Distance'
        first. Text values that are not listed in the mapping are left as-is.
    """
    column_names = [
        'To Location(Scorer)', 'Distance',
        'Year', 'Tourn.#', 'Player #', 'Player Name', 'Course #', 'Course Name',
        'Round', 'Hole', 'Time',
        'Shot', 'From Location(Scorer)', 'Distance To Pin',
        'Lie', 'Elevation', 'Slope',
        'Total SG Putting', 'Avg SG Putting',
        'Total SG OTT', 'Avg SG OTT',
        'Total SG Approach', 'Avg SG Approach',
        'Total SG Around the Green', 'Avg SG Around the Green',
        'Fwy Firmness', 'Fwy Height', 'Grn Firmness', 'Grn Height',
        'Rough Height', 'Stimp',
        'Fwy Width 250', 'Fwy Width 275', 'Fwy Width 300', 'Fwy Width 325', 'Fwy Width 350',
        'Actual 250 Distance', 'Actual 275 Distance', 'Actual 300 Distance',
        'Actual 325 Distance', 'Actual 350 Distance',
        'Par', 'Actual Ydg',
    ]
    identifier_columns = [
        'Year', 'Tourn.#', 'Player #', 'Player Name',
        'Course #', 'Course Name', 'Round', 'Hole', 'Time',
    ]

    # Fairway and green firmness use the same scale.
    firmness_codes = {'Soft': 0, 'Medium': 1, 'Firm': 2, 'Unknown': -1}

    # NOTE(review): 'Above Ball'/'Below Ball' and 'Uphill'/'Downhill' each
    # share code 2, so direction is not distinguished - confirm this is intended.
    category_codes = {
        'Lie': {'Good': 1, 'Buried': 2, '0': -1, 'Unknown': -1},
        'Elevation': {
            'With': 1,
            'Below Ball': 2, 'Above Ball': 2,
            '0': -1, 'Unmapped': -1, 'Unknown': -1,
        },
        'Slope': {
            'Level': 1,
            'Downhill': 2, 'Uphill': 2,
            '0': -1, 'Unknown': -1,
        },
        'Fwy Firmness': firmness_codes,
        'Grn Firmness': firmness_codes,
        'From Location(Scorer)': {
            'Tee Box': 0,
            'Fairway': 1,
            'Fringe': 2,
            'Green': 3,
            'Intermediate Rough': 4,
            'Primary Rough': 5,
            'Green Side Bunker': 6,
            'Fairway Bunker': 7,
            'Native Area': 8,
            'Other': 9,
            'Water': 10,
            'Unknown': 11,
            '0': -1,
        },
        # The landing location has extra surface names; several are merged
        # onto one code (bunkers -> 7, outlines -> 8, paths -> 9).
        'To Location(Scorer)': {
            'Tee Box': 0,
            'Fairway': 1,
            'Fringe': 2,
            'Green': 3,
            'Intermediate Rough': 4,
            'Primary Rough': 5,
            'Green Side Bunker': 6,
            'Fairway Bunker': 7, 'Waste Bunker': 7,
            'Tree Outline': 8, 'Rock Outline': 8, 'Dirt Outline': 8,
            'Cart Path': 9, 'Path': 9,
            'Native Area': 10,
            'Water': 11,
            'Unknown': 12,
            '0': -1,
        },
    }

    strokes_raw = pd.read_csv(file, index_col=None, names=column_names)

    return (
        strokes_raw
        .drop(columns=identifier_columns)
        .replace(category_codes)
        .dropna()
    )

In [4]:
def split_data(df, n_targets=2):
    """Split a processed frame into features and targets by column position.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose leading columns are the targets.
    n_targets : int, default 2
        Number of leading columns to treat as targets. The default matches
        the original behaviour of taking the first two columns.

    Returns
    -------
    (X, y) : tuple of pandas.DataFrame
        ``X`` holds every column after the first ``n_targets``; ``y`` holds
        the first ``n_targets`` columns.

    Raises
    ------
    ValueError
        If ``n_targets`` is negative.
    """
    if n_targets < 0:
        raise ValueError('n_targets must be non-negative, got %r' % (n_targets,))

    target_columns = df.columns[:n_targets]
    feature_columns = df.columns[n_targets:]
    return df[feature_columns], df[target_columns]

In [5]:
# Run the same load/clean step on each of the three season files.
df_train, df_val, df_test = map(
    load_and_process_data,
    (TRAIN_FILE, VAL_FILE, TEST_FILE),
)

In [6]:
# Row/column counts of each processed split.
for shape_label, split_frame in (
    ('Training Shape', df_train),
    ('Validation Shape', df_val),
    ('Test Shape', df_test),
):
    print(shape_label, split_frame.shape)

Training Shape (11589, 34)
Validation Shape (5041, 34)
Test Shape (4602, 34)


In [7]:
# Separate every split into its feature frame (X) and target frame (y).
(X_train, y_train), (X_val, y_val), (X_test, y_test) = map(
    split_data,
    (df_train, df_val, df_test),
)

for shape_label, split_part in (
    ('Training X Shape', X_train),
    ('Training y Shape', y_train),
    ('Validation X Shape', X_val),
    ('Validation y Shape', y_val),
    ('Test X Shape', X_test),
    ('Test y Shape', y_test),
):
    print(shape_label, split_part.shape)

Training X Shape (11589, 32)
Training y Shape (11589, 2)
Validation X Shape (5041, 32)
Validation y Shape (5041, 2)
Test X Shape (4602, 32)
Test y Shape (4602, 2)


In [8]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Linear baseline on both targets at once: fitted through the
# MultiOutputRegressor wrapper and directly with LinearRegression.
multi_reg = MultiOutputRegressor(LinearRegression())
multi_reg.fit(X_train, y_train)

lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

# Random forests, again wrapped and direct. random_state is fixed so the
# fitted trees (and the scores and feature importances derived from them)
# are the same on every Restart & Run All; without it each run differs.
multi_rf_reg = MultiOutputRegressor(
    RandomForestRegressor(n_estimators=100, max_depth=30, random_state=42)
)
multi_rf_reg.fit(X_train, y_train)

rf_reg = RandomForestRegressor(n_estimators=100, max_depth=30, random_state=42)
rf_reg.fit(X_train, y_train)

# Predictions on the validation split for each fitted model.
y_pred_multi_lin = multi_reg.predict(X_val)
y_pred_lin = lin_reg.predict(X_val)
y_pred_multi_rf = multi_rf_reg.predict(X_val)
y_pred_rf = rf_reg.predict(X_val)

In [9]:
# Score every fitted model on the held-out test split.
# NOTE(review): models are being compared on the test split while the
# validation split is only used for predictions - confirm this is intended.
# NOTE(review): the gap between the plain and MultiOutput variants may come
# from how R^2 is averaged over the two targets in the installed
# scikit-learn version rather than from model quality - confirm.
for score_label, fitted_model in (
    ('Linear Regression:', lin_reg),
    ('RandomForest Regression:', rf_reg),
    ('MultiOutput Linear Regression:', multi_reg),
    ('MultiOutput RandomForest Regression:', multi_rf_reg),
):
    print(score_label, fitted_model.score(X_test, y_test))

Linear Regression: 0.9296382335388943
RandomForest Regression: 0.9742266976367036
MultiOutput Linear Regression: 0.6025839535568916
MultiOutput RandomForest Regression: 0.6981226251776559


In [10]:
# Importance of each input column in the two-target random forest, largest first.
feature_importances = (
    pd.DataFrame({'importance': rf_reg.feature_importances_}, index=X_train.columns)
    .sort_values(by='importance', ascending=False)
)
feature_importances

Unnamed: 0,importance
Distance To Pin,0.963519
Elevation,0.006765
From Location(Scorer),0.006064
Slope,0.005354
Shot,0.003179
Actual Ydg,0.002393
Fwy Width 250,0.001129
Fwy Width 300,0.000935
Fwy Width 350,0.000883
Fwy Width 325,0.000877


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Model the two targets separately: a classifier for the landing-location
# code and a regressor for the distance. random_state is fixed so the
# scores and feature importances are the same on every run.
rf_clf2 = RandomForestClassifier(n_estimators=100, max_depth=30, random_state=42)
rf_clf2.fit(X_train, y_train['To Location(Scorer)'])

rf_reg2 = RandomForestRegressor(n_estimators=100, max_depth=30, random_state=42)
rf_reg2.fit(X_train, y_train['Distance'])

print('Split Classifier/Regressor')
print('Classification:', rf_clf2.score(X_test, y_test['To Location(Scorer)']))
print('RF Regression:', rf_reg2.score(X_test, y_test['Distance']))
print()

y_pred_rf_clf2 = rf_clf2.predict(X_val)
y_pred_rf_reg2 = rf_reg2.predict(X_val)

# Show the first validation row next to its predictions. .iloc[0] prints the
# bare value, matching the scalar predictions below; a [:1] slice would print
# a one-row Series with its index and dtype.
print('Predictions')
print('Actual Location', y_val['To Location(Scorer)'].iloc[0])
print('Actual Distance', y_val['Distance'].iloc[0])
print()
print('Predicted Location', y_pred_rf_clf2[0])
print('Predicted RF Distance', y_pred_rf_reg2[0])

In [None]:
# Importance of each input column in the location classifier, largest first.
feature_importances = (
    pd.DataFrame({'importance': rf_clf2.feature_importances_}, index=X_train.columns)
    .sort_values(by='importance', ascending=False)
)
feature_importances

In [None]:
# Importance of each input column in the distance regressor, largest first.
feature_importances = (
    pd.DataFrame({'importance': rf_reg2.feature_importances_}, index=X_train.columns)
    .sort_values(by='importance', ascending=False)
)
feature_importances

In [None]:
# Persist the fitted models. Each file is opened in a `with` block so the
# handle is flushed and closed even if pickling fails part-way through.
for fitted_model, model_path in (
    (rf_reg, './models/phil-mickelson-stroke-rf.pkl'),
    (rf_clf2, './models/phil-mickelson-stroke-rf-clf.pkl'),
    (rf_reg2, './models/phil-mickelson-stroke-rf-reg.pkl'),
):
    with open(model_path, 'wb') as model_file:
        pickle.dump(fitted_model, model_file)

In [None]:
# TODO
# - calculate the linear difficulty for each From Location and weight appropriately
# - add in golfer characteristics (cumulative up to the point of the tournament, or the previous year's end stats)
# - statistical independence - Bayesian networks


In [None]:
# Location 3
# Distance 7095.88 inches (~197 yards)
# Actual Location 5
# Actual Distance 7416 inches (206 yards)

##### Phil Mickelson - 2019 Genesis Open, Round 1, Hole 1
Shot 5 in the hole

Shot 4 putt 13 ft 5 in., 2 ft 11 in. to hole

Shot 3 65 ft 1 in. to green, 10 ft 3 in. to hole

Shot 2 182 yds to front center green side bunker, 74 ft 11 in. to hole

Shot 1 307 yds to left rough, 206 yds to hole
