In [1]:
import numpy as np
import pandas as pd
from IPython.display import display
import matplotlib.pyplot as plt
import time
import math
import pickle

%matplotlib inline
pd.set_option('display.max_columns', None)

In [2]:
model = pickle.load(open('./models/hole-prediction-rf.pkl', 'rb'))

In [3]:
features = [
    'Round', 'TotalSGPutting', 'AvgSGOTT', 'AvgSGApproach',
    'AvgSGAroundtheGreen', 'Actual350Distance', 'GrnFirmness_Medium',
    'GrnFirmness_Soft', 'GrnFirmness_Unknown',
    'TeeGrass_TifsportBermudagrass', 'FwyGrass_419BermudagrassRyegrass',
    'FwyGrass_419HybridBermudagrass', 'FwyGrass_Bermudagrass',
    'FwyGrass_BermudagrassRyegrass', 'FwyGrass_Bluegrass',
    'FwyGrass_CreepingBentgrass', 'FwyGrass_Paspalum',
    'FwyGrass_PenncrossBentgrassPoaa', 'FwyGrass_SouthshoreBentgrass',
    'FwyGrass_TifsportBermudagrass', 'FwyGrass_Zoysiagrass',
    'RoughGrass_328BermudagrassRyegrass',
    'RoughGrass_419BermudagrassBuffal',
    'RoughGrass_419BermudagrassRyegrass', 'RoughGrass_Bermudagrass',
    'RoughGrass_BermudagrassOverseededR', 'RoughGrass_Bluegrass',
    'RoughGrass_BluegrassFescue', 'RoughGrass_BluegrassRyegrass',
    'RoughGrass_BluegrassRyegrassFescue',
    'RoughGrass_Common419Bermudagrass', 'RoughGrass_KentuckyBluegrass',
    'RoughGrass_KikuyugrassRyegrass', 'RoughGrass_PerennialRyegrass',
    'RoughGrass_Ryegrass', 'RoughGrass_TallFescue',
    'RoughGrass_TifsportBermudagrass', 'WindDir_G', 'WindDir_0',
    'WindDir_C', 'WindDir_DW', 'WindDir_IW', 'WindDir_LR', 'WindDir_SW'
]

In [4]:
def parse_time_to_hour(x):
    return int(x[:2])

def is_morning(x):
    return x < 13

def get_wind(t, a, p):
    if is_morning(t):
        return a
    else:
        return p

In [22]:
def load_and_process(file, player=None, tournament=None, year=None, r=None):
    columns = [
        'Id', 'Name', 'Tournament Year', 'Tournament Schedule', 'Event Name', 'Course Name', 'Round', 
        'Hole', 'Total SG Putting', 'Avg SG Putting', 'Total SG OTT', 'Avg SG OTT', 'Total SG Approach', 
        'Avg SG Approach', 'Total SG Around the Green', 'Avg SG Around the Green', 'Par', 'Actual Yard', 
        'Fwy Firmness', 'Grn Firmness', 'Stimp', 'Fwy Height', 'Grn Height', 'Rough Height', 'Tee Grass', 
        'Fwy. Grass', 'Rough Grass', 'Actual 250 Distance', 'Actual 275 Distance', 'Actual 300 Distance', 
        'Actual 325 Distance', 'Actual 350 Distance', 'AM Wind Spd', 'AM Wind Dir', 'PM Wind Spd', 
        'PM Wind Dir', 'Time Hole Finished', 'Hit Fwy', 'Hit Green', 'Hit Greenside Bunker', 'Tee Shot Landing Loc', 
        'Score'
    ]    
    df = pd.read_csv(file, index_col=None, names=columns)
    
    if player != None:
        df = df.loc[(df['Id'] == player)]
        
    if tournament != None:
        df = df.loc[(df['Tournament Schedule'] == tournament)]
        
    if year != None:
        df = df.loc[(df['Tournament Year'] == year)]
        
    if r != None:
        df = df.loc[(df['Round'] == r)]
    
    if df.shape[0] > 0:
        # convert time to just the hour (24-hour)
        df['Time Hole Finished'] = df['Time Hole Finished'].apply(parse_time_to_hour)

        # determine wind speed and direction at time of hole played (am/pm)
        # drop old columns
        df['Wind Dir'] = np.vectorize(get_wind)(df['Time Hole Finished'], df['AM Wind Dir'], df['PM Wind Dir'])
        df['Wind Spd'] = np.vectorize(get_wind)(df['Time Hole Finished'], df['AM Wind Spd'], df['PM Wind Spd'])
        df.drop(['AM Wind Dir', 'AM Wind Spd', 'PM Wind Dir', 'PM Wind Spd'], axis=1, inplace=True)

        # drop result columns (include later for potential simulations?)
        df.drop(['Hit Fwy', 'Hit Green', 'Hit Greenside Bunker', 'Tee Shot Landing Loc'], axis=1, inplace=True)

        # drop non-features (only for identification)
        df.drop(['Id', 'Name', 'Tournament Year', 'Tournament Schedule', 'Event Name', 'Course Name', 'Hole'], axis=1, inplace=True)

        # one hot encode firmness and grasses
        df = pd.get_dummies(df, columns=['Fwy Firmness', 'Grn Firmness', 'Tee Grass', 'Fwy. Grass', 'Rough Grass'])

        # one hot encode wind dir ({' G', '0', 'C', 'DW', 'IW', 'LR', 'RL', 'SW'})
        df = pd.get_dummies(df, columns=['Wind Dir'])

        # drop wind spd for now until method of parsing can be determined
        df.drop(['Wind Spd'], axis=1, inplace=True)

        # rename columns to remove non-alpha chars
        renamed_cols = {}
        for x in df.columns:
            renamed_cols[x] = x.replace('/','').replace('&','').replace('#','').replace(' ','').replace('.','')

        df.rename(columns=renamed_cols, inplace=True)

        # run values through a standard scalar?

    return df

In [23]:
def split_data(data):
    X = data.drop('Score', axis=1, inplace=False)
    y = data['Score']
    return X, y

def normalize_data(X):
    cols = X.columns
    return pd.DataFrame(MinMaxScaler().fit_transform(X), columns=cols)

In [24]:
def load_and_process_filter(file, columns_filter, player=None, tournament=None, year=None, r=None):
    df = load_and_process(file, player=player, tournament=tournament, year=year, r=r)
    X, y = split_data(df)
    
    same_cols = list(set(columns_filter).intersection(set(X.columns)))
    diff_cols = list(set(columns_filter).difference(set(X.columns)))
    X = X[same_cols]
    for col in diff_cols:
        X[col] = 0
    return X, y

In [28]:
X, y = load_and_process_filter('./data/holes-2018.csv', columns_filter=features, player=1810, r=1, year=2018, tournament=200)
X.shape

(18, 44)

In [31]:
np.sum(y)

69.0

In [32]:
y_pred = model.predict(X)
np.sum(y_pred)

58.84059523809524

In [35]:
y_pred2 = np.round(y_pred,0)
np.sum(y_pred2)

56.0

In [33]:
y

324    4.0
325    3.0
326    3.0
327    4.0
328    4.0
329    4.0
330    2.0
331    4.0
332    4.0
333    4.0
334    5.0
335    3.0
336    3.0
337    6.0
338    4.0
339    4.0
340    3.0
341    5.0
Name: Score, dtype: float64

In [34]:
y_pred

array([3.08678571, 3.61666667, 3.18678571, 3.36666667, 3.36666667,
       3.11      , 3.08678571, 3.11      , 3.18678571, 3.61666667,
       3.36666667, 3.36666667, 3.3       , 3.36666667, 3.11      ,
       3.11      , 3.3       , 3.18678571])

In [None]:

#dict(a, **b)