In [1]:
#!/Users/jim/anaconda3/envs/sb/bin/python

import pandas as pd
import numpy as np
from datetime import datetime as dt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
import plotly.express as px

In [14]:
pga = pd.read_csv('~/dev/sb/data/pga_stats.csv')

In [15]:
pga['event_completed'] = pd.to_datetime(pga.event_completed)
pga['event_year'] = pga.event_completed.dt.year
pga.set_index('event_completed',inplace=True)
pga['2Y_sg'] = pga.groupby('dg_id')['sg_total'].transform(lambda d: d.rolling('720D', min_periods=1).mean())
pga['2M_sg'] = pga.groupby('dg_id')['sg_total'].transform(lambda d: d.rolling('60D', min_periods=1).mean())
pga = pga.reset_index()

In [16]:
pga['course-year-event-round'] = pga['season'].astype('str') + pga['event_id'].astype('str') + pga['course_name'].astype('str') + pga['round'].astype('str')
pga['dg_id_'] = pga['dg_id']
pga_dummies = pd.get_dummies(data=pga,columns=['dg_id_','course-year-event-round'])

In [17]:
player_dummies = [x for x in pga_dummies.columns if 'dg_id_' in x and x != 'dg_id_']
co_ev_ye_rd_dummies = [x for x in pga_dummies.columns if 'course-year-event-round' in x and x != 'course-year-event-round']
dummies = player_dummies + co_ev_ye_rd_dummies

In [38]:
train = pga_dummies.loc[pga_dummies.event_year < 2023].copy()
test = pga_dummies.loc[pga_dummies.event_year >= 2023].copy()

In [19]:
slm = LinearRegression()
predictors = dummies
response = 'score'
X, y = train[predictors],train[[response]]
slm.fit(X,y)
print(f"""model
        intercept={slm.intercept_[0]:,.2f}
        R2={slm.score(X,y):,.2%}""")

In [39]:
test['predicted_score'] = slm.predict(test[predictors])
train['predicted_score'] = slm.predict(train[predictors])

model
        intercept=-33,431,900,340.48
        R2=30.48%


In [40]:
diff = train.groupby(['season','round','event_name'])['predicted_score'].mean().reset_index()
diff = diff.rename(columns={'predicted_score':'mean_predicted_score'})
train = train.merge(diff,how='inner',left_on=['season','round','event_name'],right_on=['season','round','event_name'])
train['score_diff'] = train.score - train.predicted_score

In [41]:
train.set_index('event_completed',inplace=True)
train['2Y_diff_avg'] = train.groupby('dg_id')['score_diff'].transform(lambda d: d.rolling('720D', min_periods=1).mean())
train['2M_diff_avg'] = train.groupby('dg_id')['score_diff'].transform(lambda d: d.rolling('60D', min_periods=1).mean())
train.reset_index(inplace=True)

In [42]:
train['prior_diff_avg'] = train.groupby('dg_id')['score_diff'].shift(1)
train['prior_event'] = train.groupby('dg_id')['event_completed'].shift(1)
train['days_since_last_event'] = (train.event_completed - train.prior_event).dt.days

In [52]:
predictors_p = ['2Y_diff_avg','2M_diff_avg','prior_diff_avg','days_since_last_event']
response = ['score']
train_cleaned = train.dropna(subset=predictors_p).copy()

In [53]:
slm_p = LinearRegression()
X,y = train_cleaned[predictors_p],train_cleaned[response]
slm_p.fit(X,y)
print(f"""model
        intercept={slm_p.intercept_[0]:,.2f}
        R2={slm_p.score(X,y):,.2%}""")

model
        intercept=70.73
        R2=11.42%
