### TODO

- Download historical OWGR (Official World Golf Ranking) information
- Apply OWGR as strength of field feature
- Gather golfer summary statistics as features
- Update score prediction for the hole after each shot?
- Gather SG (strokes gained) stats for the golfer and merge them with the course features

In [1]:
import numpy as np
import pandas as pd
from IPython.display import display
import matplotlib.pyplot as plt
import time
import math
import pickle

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import shuffle

from sklearn import linear_model
from sklearn import ensemble

from sklearn import metrics

%matplotlib inline
pd.set_option('display.max_columns', None)

In [2]:
# Paths (relative to the notebook's working directory) of the stroke-level
# CSV files; the file names indicate a train split and a test split.
TRAIN_FILE = './data/phil-mickelson-strokes-train.csv'
TEST_FILE = './data/phil-mickelson-strokes-test.csv'

In [3]:
# Load the raw training CSV untouched (index_col=None: no CSV column is used
# as the row index) and preview the first rows to inspect the available columns.
df = pd.read_csv(TRAIN_FILE, index_col=None)
df.head()

Unnamed: 0,Year,Tourn.#,Player #,Player Name,Course #,Course Name,Round,Hole,Par Value,Yardage,Shot,Shot Type(S/P/D),From Location(Scorer),From Location(Enhanced),To Location(Scorer),To Location(Enhanced),Distance to Pin,Around the Green Flag,1st Putt Flag,Time,Lie,Elevation,Slope,Distance from Center,Distance from Edge,Left/Right,Recovery Shot,Tee Grass,Fwy Firmness,Fwy Height,Fwy. Grass,Grn Firmness,Grn Height,Green Grass,Rough Height,Rough Grass,Stimp,AM Wind Spd,AM Wind Dir,PM Wind Spd,PM Wind Dir,Fwy Width 250,Fwy Width 275,Fwy Width 300,Fwy Width 325,Fwy Width 350,Actual 250 Distance,Actual 275 Distance,Actual 300 Distance,Actual 325 Distance,Actual 350 Distance,Par,Scorecard Ydg,Actual Ydg
0,2010,40,1810,Phil Mickelson,4,Torrey Pines GC (South),1,1,4,450,1,S,Tee Box,Tee Box,Fairway,Right Fairway,16164,N,0,901,Good,With,Level,92,274,R,No,Bermudagrass/Ryegrass,Medium,0.5,Kikuyugrass,Soft,0.11,Poa annua,3.5,Perennial Ryegrass,12.0,c,C,5,IW,25.0,24.0,23.0,23.0,22.0,27.0,30.0,32.0,35.0,37.0,4.0,450.0,449.0
1,2010,40,1810,Phil Mickelson,4,Torrey Pines GC (South),1,1,4,450,2,S,Fairway,Right Fairway,Green Side Bunker,Left Rear Green Side Bunker,5048,Y,0,907,Good,With,Level,205,146,0,No,Bermudagrass/Ryegrass,Medium,0.5,Kikuyugrass,Soft,0.11,Poa annua,3.5,Perennial Ryegrass,12.0,c,C,5,IW,25.0,24.0,23.0,23.0,22.0,27.0,30.0,32.0,35.0,37.0,4.0,450.0,449.0
2,2010,40,1810,Phil Mickelson,4,Torrey Pines GC (South),1,1,4,450,3,S,Green Side Bunker,Left Rear Green Side Bunker,Green,Green,352,N,0,910,Good,With,Level,1,167,L,No,Bermudagrass/Ryegrass,Medium,0.5,Kikuyugrass,Soft,0.11,Poa annua,3.5,Perennial Ryegrass,12.0,c,C,5,IW,25.0,24.0,23.0,23.0,22.0,27.0,30.0,32.0,35.0,37.0,4.0,450.0,449.0
3,2010,40,1810,Phil Mickelson,4,Torrey Pines GC (South),1,1,4,450,4,S,Green,Green,0,0,24,N,Y,913,Good,With,Level,0,0,0,No,Bermudagrass/Ryegrass,Medium,0.5,Kikuyugrass,Soft,0.11,Poa annua,3.5,Perennial Ryegrass,12.0,c,C,5,IW,25.0,24.0,23.0,23.0,22.0,27.0,30.0,32.0,35.0,37.0,4.0,450.0,449.0
4,2010,40,1810,Phil Mickelson,4,Torrey Pines GC (South),1,2,4,389,1,S,Tee Box,Tee Box,Fairway Bunker,Fairway Bunker,13680,N,0,916,Good,With,Level,585,185,L,No,Bermudagrass/Ryegrass,Medium,0.5,Kikuyugrass,Soft,0.11,Poa annua,3.5,Perennial Ryegrass,12.0,c,C,5,RL,27.0,23.0,24.0,21.0,14.0,25.0,27.0,30.0,32.0,35.0,4.0,389.0,380.0


In [None]:
# Columns discarded right after loading, kept in the three groups the
# notebook originally dropped them in. Note that 'Year' is NOT dropped.
_DROPPED_COLUMNS = [
    # names and wind readings
    'Player Name', 'Course Name', 'AM Wind Spd', 'AM Wind Dir', 'PM Wind Spd', 'PM Wind Dir',
    # grass types and the "Enhanced" location descriptions
    'Tee Grass', 'Fwy. Grass', 'Green Grass', 'Rough Grass', 'From Location(Enhanced)', 'To Location(Enhanced)',
    # tourn #, player #, course #, round, hole, shot, time, one par,
    # scorecard yrd, actual yrd, shot type, recovery shot
    'Tourn.#', 'Player #', 'Course #', 'Round', 'Hole', 'Shot', 'Time', 'Par', 'Scorecard Ydg', 'Actual Ydg', 'Shot Type(S/P/D)', 'Recovery Shot',
]

# Per-column string -> integer code tables applied with DataFrame.replace.
# NOTE(review): 'Above Ball'/'Below Ball' share code 2, as do
# 'Uphill'/'Downhill', and the From/To location tables use different codes
# for the same label (e.g. 'Native Area' is 8 vs 10, 'Water' is 10 vs 11).
# Kept exactly as in the original; confirm this is intentional.
_ENCODINGS = {
    'Lie': {
        'Good': 1,
        '0': -1,
        'Unknown': -1,
        'Buried': 2
    },
    'Elevation': {
        'With': 1,
        'Below Ball': 2,
        '0': -1,
        'Unmapped': -1,
        'Above Ball': 2,
        'Unknown': -1
    },
    'Slope': {
        'Level': 1,
        'Downhill': 2,
        '0': -1,
        'Unknown': -1,
        'Uphill': 2
    },
    'Fwy Firmness': {
        'Medium': 1,
        'Firm': 2,
        'Soft': 0,
        'Unknown': -1
    },
    'Grn Firmness': {
        'Soft': 0,
        'Medium': 1,
        'Firm': 2,
        'Unknown': -1
    },
    'From Location(Scorer)': {
        'Tee Box': 0,
        'Fairway': 1,
        'Fringe': 2,
        'Green': 3,
        'Intermediate Rough': 4,
        'Primary Rough': 5,
        'Green Side Bunker': 6,
        'Fairway Bunker': 7,
        'Native Area': 8,
        '0': -1,
        'Unknown': -1,
        'Other': 9,
        'Water': 10
    },
    'To Location(Scorer)': {
        'Tee Box': 0,
        'Fairway': 1,
        'Fringe': 2,
        'Green': 3,
        'Intermediate Rough': 4,
        'Primary Rough': 5,
        'Green Side Bunker': 6,
        'Fairway Bunker': 7,
        'Waste Bunker': 7,
        'Tree Outline': 8,
        'Rock Outline': 8,
        'Dirt Outline': 8,
        'Cart Path': 9,
        'Path': 9,
        'Native Area': 10,
        'Water': 11,
        '0': -1,
        'Unknown': -1
    }
}


def load_and_process_data(file):
    """Load a stroke-level CSV and turn it into a numeric feature frame.

    Steps, in order:
      1. read the CSV with ``pd.read_csv``;
      2. drop the columns listed in ``_DROPPED_COLUMNS``;
      3. one-hot encode 'Around the Green Flag', '1st Putt Flag' and
         'Left/Right' (the dummy columns are appended at the end);
      4. replace the string categories of the columns in ``_ENCODINGS`` with
         integer codes (values that are not in a table are left unchanged);
      5. move the columns at positions 4 and 5 to the front;
      6. drop every row that still contains a NaN.

    Parameters
    ----------
    file : str, path object or file-like object
        Anything ``pd.read_csv`` accepts.

    Returns
    -------
    pandas.DataFrame
        The processed frame. Row labels of the surviving rows are kept
        (the index is not reset after ``dropna``).

    Raises
    ------
    KeyError
        If any column that is dropped or one-hot encoded is missing from the file.
    """
    df = pd.read_csv(file, index_col=None)

    # A single non-in-place drop replaces three in-place calls; `columns=`
    # already implies the column axis, so no `axis` argument is needed.
    df = df.drop(columns=_DROPPED_COLUMNS)

    # NOTE(review): the dummy columns produced here depend on which category
    # values occur in *this* file, so two files (e.g. train and test) can end
    # up with different column sets; verify/align them in the caller.
    df = pd.get_dummies(df,
                        prefix=['Around_the_Green', '1st_Putt', 'Left_Right'],
                        columns=['Around the Green Flag', '1st Putt Flag', 'Left/Right'])

    # Older pandas silently downcast the replaced object columns to integers;
    # that implicit downcast is deprecated (FutureWarning in pandas 2.2).
    # infer_objects() performs the same soft conversion explicitly, so the
    # encoded columns are numeric on every pandas version. Columns that still
    # contain unmapped strings stay object dtype, exactly as before.
    df = df.replace(_ENCODINGS).infer_objects()

    # Rearrange columns: the two columns at positions 4-5 go first.
    # NOTE(review): this is positional, so which columns are moved depends on
    # the input schema (e.g. whether the CSV carries a leading unnamed index
    # column, which index_col=None keeps as a regular column) - confirm.
    cols = df.columns.tolist()
    df = df[cols[4:6] + cols[:4] + cols[6:]]

    # Drop rows with any missing value.
    return df.dropna()