In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as pl
import pickle
from sklearn.svm import SVC
from xgboost import XGBRegressor
import warnings

In [2]:
with open('Cut_Classifier.pkl', 'rb') as infile:
    svm = pickle.load(infile)
    
with open('Strokes_Regressor.pkl', 'rb') as infile:
    xgbr = pickle.load(infile)

In [3]:
with open('indices.pkl', 'rb') as infile:
    indices = pickle.load(infile)
    
with open('base_features.pkl', 'rb') as infile:
    features = pickle.load(infile)
    
with open('sg_lag_features.pkl', 'rb') as infile:
    sg_lag_features = pickle.load(infile)

In [4]:
with open('players.pkl', 'rb') as infile:
    players = pickle.load(infile)

with open('tournaments.pkl', 'rb') as infile:
    tournaments = pickle.load(infile)
    
with open('courses.pkl', 'rb') as infile:
    courses = pickle.load(infile)

## 5) Tournament Predictions

Now that the models are tuned and ready to go, they need some upcoming tournament data to make predictions for.  Before doing anything else, we need to be able to transform an incoming record of data to match the expected input format of the model.  To do that, we need the following items:

<ol>
    <li>The name of the player, course, and tournament we want to make a prediction for</li>
    <li>The existing records in our dataset for the player in question</li>
</ol>

In the first section below, we will write a set of functions to transform the data for an individual player at a particular tournament to feed through the model.  In the second, we will do the same for a full tournament field and generate a predicted finish line-up.

### Individual Player

To forecast data for an individual player, we will add a dummy record to the original data-cleansing dataset for the current season.  That way, we can let the existing logic do all of the data transformation for us instead of having to code some tricky logic to force a dataframe to look hoe we want.

Below I will import our original dataset and filter only for the records associated with our player.  Then I will add the dummy record and walk through all of our previous functions to create the new dataset for the player.

#### Data Transformation

In [134]:
df_clean = pd.read_excel('C:\\Users\\rbush\\Documents\\Projects\\PGA Finish Projections\\PGA Finish Projections_cleaned data.xlsx')
df_clean = df_clean[['player', 'tournament_name', 'course', 'season', 'days_from_today',
                     'sg_putt', 'sg_arg', 'sg_app', 'sg_ott', 'sg_t2g', 'sg_total', 
                     'strokes_rel_par', 'place_adj']]

In [135]:
df_clean.head()

Unnamed: 0,player,tournament_name,course,season,days_from_today,sg_putt,sg_arg,sg_app,sg_ott,sg_t2g,sg_total,strokes_rel_par,place_adj
0,Aaron Baddeley,AT&T Pebble Beach Pro-Am,Pebble Beach,2021,72,0.73,-0.41,-2.67,0.6,-2.47,-1.74,0,80
1,Aaron Baddeley,Farmers Insurance Open,Torrey Pines,2021,86,-0.59,0.38,0.42,-2.04,-1.25,-1.83,4,80
2,Aaron Baddeley,The American Express,La Quinta CC,2021,93,-3.14,-0.72,0.07,-0.22,-0.86,-4.0,-1,80
3,Aaron Baddeley,Sony Open in Hawaii,Waialae CC,2021,100,1.59,-0.94,0.19,-0.35,-1.09,0.5,-11,41
4,Aaron Baddeley,The RSM Classic,Sea Island,2021,156,-0.2,1.07,-1.81,-0.84,-1.58,-1.78,-4,57


In [136]:
player = 'Cameron Tringale'
tournament = 'PGA Championship'
course = 'Torrey Pines'

In [137]:
df_player = df_clean.loc[df_clean['player']==player]

In [138]:
df_player.head()

Unnamed: 0,player,tournament_name,course,season,days_from_today,sg_putt,sg_arg,sg_app,sg_ott,sg_t2g,sg_total,strokes_rel_par,place_adj
4819,Cameron Tringale,Valero Texas Open,TPC San Antonio,2021,23,0.82,0.51,0.19,0.62,1.32,2.15,-8,9
4820,Cameron Tringale,The Honda Classic,PGA National,2021,37,-0.04,1.3,0.13,0.72,2.15,2.11,-4,31
4821,Cameron Tringale,The Players Championship,TPC Sawgrass,2021,44,-0.84,-0.33,0.35,0.24,0.26,-0.58,3,80
4822,Cameron Tringale,Arnold Palmer Invitational,Bay Hill,2021,51,0.1,0.63,-0.72,0.79,0.7,0.8,1,13
4823,Cameron Tringale,The Genesis Invitational,Riviera,2021,65,-0.79,-0.24,1.88,0.0,1.65,0.85,-2,26


In [139]:
df_predict_record = df_player.iloc[0]

In [140]:
warnings.filterwarnings('ignore')

df_predict_record[1] = tournament
df_predict_record[2] = course
df_predict_record[3] = 'Predicted'
df_predict_record[4] = 1
df_predict_record[5:11] = 9.99
df_predict_record[11] = 99
df_predict_record[12] = 99

In [141]:
df_predict_record

player             Cameron Tringale
tournament_name    PGA Championship
course                 Torrey Pines
season                    Predicted
days_from_today                   1
sg_putt                        9.99
sg_arg                         9.99
sg_app                         9.99
sg_ott                         9.99
sg_t2g                         9.99
sg_total                       9.99
strokes_rel_par                  99
place_adj                        99
Name: 4819, dtype: object

In [142]:
df_clean.loc[999999] = df_predict_record
df_clean.sort_values(by = ['player', 'days_from_today'], inplace = True)

In [21]:
df_list = []
player_original = player

for player in players:
    dfi = df_clean.loc[df_clean['player']==player]
    df_list.append(dfi)

lag_features = set()

for feature in features:
    for df in df_list:
        for week_itrvl in range(1,4):
            df[feature+str(week_itrvl)] = df[feature].shift(-1*week_itrvl)
            lag_features.add(feature+str(week_itrvl))

df_clean = df_list[0]

for df in df_list[1:]:
    df_clean = df_clean.append(df)

df_clean.dropna(inplace = True)
player = player_original

In [22]:
df_clean.loc[df_clean['player']=='Cameron Tringale']

Unnamed: 0,player,tournament_name,course,season,days_from_today,sg_putt,sg_arg,sg_app,sg_ott,sg_t2g,...,sg_arg3,sg_app1,sg_app2,sg_app3,sg_ott1,sg_ott2,sg_ott3,sg_t2g1,sg_t2g2,sg_t2g3
999999,Cameron Tringale,PGA Championship,Torrey Pines,Predicted,1,9.990000,9.99,9.990000,9.990,9.99,...,-0.33,0.190000,0.130000,0.35,0.620,0.720,0.24,1.32,2.15,0.26
4819,Cameron Tringale,Valero Texas Open,TPC San Antonio,2021,23,0.820000,0.51,0.190000,0.620,1.32,...,0.63,0.130000,0.350000,-0.72,0.720,0.240,0.79,2.15,0.26,0.70
4820,Cameron Tringale,The Honda Classic,PGA National,2021,37,-0.040000,1.30,0.130000,0.720,2.15,...,-0.24,0.350000,-0.720000,1.88,0.240,0.790,0.00,0.26,0.70,1.65
4821,Cameron Tringale,The Players Championship,TPC Sawgrass,2021,44,-0.840000,-0.33,0.350000,0.240,0.26,...,-0.46,-0.720000,1.880000,1.42,0.790,0.000,0.12,0.70,1.65,1.08
4822,Cameron Tringale,Arnold Palmer Invitational,Bay Hill,2021,51,0.100000,0.63,-0.720000,0.790,0.70,...,0.19,1.880000,1.420000,0.13,0.000,0.120,0.38,1.65,1.08,0.70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4948,Cameron Tringale,Valero Texas Open,TPC San Antonio,2015,2221,-0.310000,1.02,0.360000,-0.280,1.09,...,0.38,0.320000,-0.143333,0.22,0.260,-0.165,-0.29,0.69,0.27,0.31
4949,Cameron Tringale,Valspar Championship,Innisbrook,2015,2235,0.800000,0.10,0.320000,0.260,0.69,...,0.78,-0.143333,0.220000,-0.19,-0.165,-0.290,-0.30,0.27,0.31,0.29
4950,Cameron Tringale,World Golf Championships - Cadillac Championship,Doral,2015,2242,0.213333,0.58,-0.143333,-0.165,0.27,...,0.45,0.220000,-0.190000,-1.42,-0.290,-0.300,0.32,0.31,0.29,-0.66
4951,Cameron Tringale,The Honda Classic,PGA National,2015,2248,0.020000,0.38,0.220000,-0.290,0.31,...,0.74,-0.190000,-1.420000,-1.15,-0.300,0.320,-0.18,0.29,-0.66,-0.59


In [23]:
indices = ['player', 'tournament_name', 'course', 'season']

In [24]:
df_id = df_clean[indices]
df_feat = df_clean[sg_lag_features]

In [16]:
def min_max_feat(df, features):
    features_minmax = []
    for feature in features:
        features_minmax = features_minmax+[feature+'_max', feature+'_med', feature+'_min']
        
    min_max_df = pd.DataFrame(0, index = df.index, columns = features_minmax)
    
    for feature in features:
        min_max_df[[feature+'_min', feature+'_med',feature+'_max']] = np.sort(df[[feature+'1',feature+'2',feature+'3']].values,1)
        
    return(min_max_df)

In [26]:
df_feat_minmax = min_max_feat(df_feat, features)

In [27]:
df_season = pd.get_dummies(df_id['season'], prefix = 'season')
drop_columns = [item for item in df_season.columns if item != 'season_Predicted']
df_season.drop(columns = drop_columns, inplace = True)
df_season.rename(columns = {'season_Predicted':'prediction_record'}, inplace = True)

df_oh_player = pd.get_dummies(df_id['player'], prefix = 'player')
df_oh_tournament = pd.get_dummies(df_id['tournament_name'], prefix = 'tournament')
df_oh_course = pd.get_dummies(df_id['course'], prefix = 'course')

dfm1 = pd.merge(df_season, df_oh_player, left_index = True, right_index = True)
dfm2 = pd.merge(dfm1, df_oh_tournament, left_index = True, right_index = True)
dfm3 = pd.merge(dfm2, df_oh_course, left_index = True, right_index = True)
df_model_oh = pd.merge(dfm3, df_feat_minmax, left_index = True, right_index = True)

df_transform = df_model_oh

In [28]:
df_transform.loc[df_transform['prediction_record']==1]

Unnamed: 0,prediction_record,player_Aaron Baddeley,player_Aaron Wise,player_Abraham Ancer,player_Adam Hadwin,player_Adam Long,player_Adam Schenk,player_Adam Scott,player_Adam Svensson,player_Akshay Bhatia,...,sg_arg_min,sg_app_max,sg_app_med,sg_app_min,sg_ott_max,sg_ott_med,sg_ott_min,sg_t2g_max,sg_t2g_med,sg_t2g_min
999999,1,0,0,0,0,0,0,0,0,0,...,-0.33,0.35,0.19,0.13,0.72,0.62,0.24,2.15,1.32,0.26


#### Feed Forward Function

Now that the data is represented in the same format as our models were trained on, I'm going to feed it through our models to determine how we would have expected or player to finish.  As a quick reminder, we are forecasting his finish at this year's Valspar Championship.  At this point, we know he finished 3rd, and had one of this best finishes in a while after some solid performance on a few recent tournaments.  Let's see how the model does.

In [29]:
df_predict_record = df_transform.loc[df_transform['prediction_record']==1]

In [30]:
df_predict_record.drop(columns = ['prediction_record'], inplace = True)

In [31]:
made_cut = svm.predict(df_predict_record)

In [32]:
if made_cut == 1:
    print('We predict that %s' % (player), ' made the cut.', sep='')
else:
    print('We predict that %s' % (player), ' did NOT make the cut.', sep='')

We predict that Cameron Tringale made the cut.


In [33]:
predicted_strokes = xgbr.predict(df_predict_record)

In [34]:
predicted_strokes

array([-2.7896543], dtype=float32)

### Field Projection

In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as pl
import pickle
from sklearn.svm import SVC
from xgboost import XGBRegressor
import warnings

In [29]:
def min_max_feat(df, features):
    features_minmax = []
    for feature in features:
        features_minmax = features_minmax+[feature+'_max', feature+'_med', feature+'_min']
        
    min_max_df = pd.DataFrame(0, index = df.index, columns = features_minmax)
    
    for feature in features:
        min_max_df[[feature+'_min', feature+'_med',feature+'_max']] = np.sort(df[[feature+'1',feature+'2',feature+'3']].values,1)
        
    return(min_max_df)

In [36]:
def field_projection(course, tournament, players_tournament, players_trained, features, indices, sg_lag_features, 
                     classification_model, strokes_regression_model):
    """
    The field_projection function takes the following inputs and predicts whether a player makes the cut and how he 
    will finish relative to par:
    
        - df: the cleaned dataframe that comes from the previous files
        - course: the name of the course the tournament is being played on
        - tournament: the name of the tournament being contested
        - players: the names of the players in the field
        - classification_model: the trained classification model object
        - strokes_regression_model: the trained strokes relative to par regression object
    """
    warnings.filterwarnings('ignore')
    player_count = len(players_tournament)
    
    field_projection_list = []
    missing_players = []
        
    i = 1
    for player in players_tournament:
        try:
            df_clean = pd.read_excel('C:\\Users\\rbush\\Documents\\Projects\\PGA Finish Projections\\PGA Finish Projections_cleaned data.xlsx')
            df_clean = df_clean[['player', 'tournament_name', 'course', 'season', 'days_from_today',
                                 'sg_putt', 'sg_arg', 'sg_app', 'sg_ott', 'sg_t2g', 'sg_total', 
                                 'strokes_rel_par', 'place_adj']]
            df_player = df_clean.loc[df_clean['player']==player]

            df_predict_record = df_player.iloc[0:1,:]

            df_predict_record.iloc[0,1] = tournament
            df_predict_record.iloc[0,2] = course
            df_predict_record.iloc[0,3] = 'Predicted'
            df_predict_record.iloc[0,4] = 1
            df_predict_record.iloc[0,5:11] = 9.99
            df_predict_record.iloc[0,11] = 99
            df_predict_record.iloc[0,12] = 99

            df_clean = df_clean.append(df_predict_record, ignore_index = True)
            df_clean.sort_values(by = ['player', 'days_from_today'], inplace = True)

            df_list = []
            player_original = player

            for player_trained in players_trained:
                dfi = df_clean.loc[df_clean['player']==player_trained]
                df_list.append(dfi)

            lag_features = set()

            for feature in features:
                for df in df_list:
                    for week_itrvl in range(1,4):
                        df[feature+str(week_itrvl)] = df[feature].shift(-1*week_itrvl)
                        lag_features.add(feature+str(week_itrvl))

            df_clean = df_list[0]

            for df in df_list[1:]:
                df_clean = df_clean.append(df)

            df_clean.dropna(inplace = True)
            player = player_original

            df_id = df_clean[indices]
            df_feat = df_clean[sg_lag_features]

            df_feat_minmax = min_max_feat(df_feat, features)

            df_season = pd.get_dummies(df_id['season'], prefix = 'season')
            drop_columns = [item for item in df_season.columns if item != 'season_Predicted']
            df_season.drop(columns = drop_columns, inplace = True)
            df_season.rename(columns = {'season_Predicted':'prediction_record'}, inplace = True)

            df_oh_player = pd.get_dummies(df_id['player'], prefix = 'player')
            df_oh_tournament = pd.get_dummies(df_id['tournament_name'], prefix = 'tournament')
            df_oh_course = pd.get_dummies(df_id['course'], prefix = 'course')

            dfm1 = pd.merge(df_season, df_oh_player, left_index = True, right_index = True)
            dfm2 = pd.merge(dfm1, df_oh_tournament, left_index = True, right_index = True)
            dfm3 = pd.merge(dfm2, df_oh_course, left_index = True, right_index = True)
            df_model_oh = pd.merge(dfm3, df_feat_minmax, left_index = True, right_index = True)

            df_transform = df_model_oh

            df_predict_record = df_transform.loc[df_transform['prediction_record']==1]

            df_predict_record.drop(columns = ['prediction_record'], inplace = True)

            made_cut = svm.predict(df_predict_record)
            predicted_strokes = xgbr.predict(df_predict_record)

            field_projection_list.append((player, made_cut[0], predicted_strokes[0]))

            pct_complete = round(100*i/player_count, 1)
            print(f'\r{pct_complete}% complete.  Missing players: {missing_players}', end = '')

            i += 1
            print(i)
            
        except:
            missing_players.append(player)
        
    pct_complete = round(100, 1)
    print(f'\r{pct_complete}% complete.  Missing players: {missing_players}', end = '')
    
    field_projection_df = pd.DataFrame(field_projection_list)
    field_projection_df = field_projection_df.rename(columns = {0:'player', 1:'made_cut', 2:'strokes_rel_par'})
    field_projection_df = field_projection_df.sort_values(by = ['strokes_rel_par'])
    field_projection_df['projected_finish'] = np.arange(len(field_projection_df['player']))
    
    return field_projection_df

In [37]:
with open('Cut_Classifier.pkl', 'rb') as infile:
    svm = pickle.load(infile)
    
with open('Strokes_Regressor.pkl', 'rb') as infile:
    xgbr = pickle.load(infile)
    
with open('base_features.pkl', 'rb') as infile:
    features = pickle.load(infile)
    
with open('sg_lag_features.pkl', 'rb') as infile:
    sg_lag_features = pickle.load(infile)

with open('players.pkl', 'rb') as infile:
    players_trained = pickle.load(infile)

with open('tournaments.pkl', 'rb') as infile:
    tournaments_trained = pickle.load(infile)
    
with open('courses.pkl', 'rb') as infile:
    courses_trained = pickle.load(infile)
    
indices = ['player', 'tournament_name', 'course', 'season']

In [38]:
#tournaments_trained

In [39]:
course = 'Muirfield Village'
tournament = 'The Memorial Tournament pres. by Nationwide'

In [52]:
players_field_df = pd.read_csv('C:\\Users\\rbush\\Documents\\Projects\\PGA Finish Projections\\Tournament Field.csv',
                              encoding = "ISO-8859-1")

players_tournament = [player for player in players_field_df.iloc[:,0]]
players_tournament = players_field_df.iloc[:,0].str.split().str.join(' ')
players_tournament = [player for player in players_tournament if player in players_trained]

In [1]:
predictions = field_projection(course, tournament, players_tournament, players_trained, features, indices, sg_lag_features, classification_model = svm, strokes_regression_model = xgbr)

In [23]:
predictions = predictions.loc[predictions['made_cut'] == 1]

In [24]:
predictions.to_csv('C:\\Users\\rbush\\Documents\\Projects\\PGA Finish Projections\\Output\\Weekly Predictions.csv')

In [25]:
predictions.head(20)

Unnamed: 0,player,made_cut,strokes_rel_par,projected_finish
89,Jordan Spieth,1,-8.647275,0
117,Jordan Spieth,1,-8.647275,1
3,Daniel Berger,1,-7.326073,2
125,Justin Thomas,1,-7.288437,3
100,Justin Thomas,1,-7.288437,4
62,Collin Morikawa,1,-6.409621,5
112,Gary Woodland,1,-5.569523,7
137,Gary Woodland,1,-5.569523,8
65,Joaquin Niemann,1,-5.533213,9
1,Abraham Ancer,1,-5.475438,10
