In [1]:
import random
import string
import time
from datetime import datetime

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from dateutil import parser
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

# Get name, height, weight and birth date from the first data source.
# The site serves one listing page per initial letter, so walk A-Z.
alph = string.ascii_uppercase
players = {}

for letter in alph:

    url = ('https://newsday.sportsdirectinc.com/golf/pga-players.aspx'
           f'?page=/data/pga/players/{letter}_players.html')

    # A timeout stops one stalled connection from hanging the whole notebook,
    # and raise_for_status() fails loudly on an HTTP error instead of letting
    # an error page flow into the table parsing below.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    page = response.text
    soup = BeautifulSoup(page, 'html5')

    tables = soup.find_all('table')

    # The second table on the page is the one parsed here; its first row is
    # skipped (treated as the header).
    rows = tables[1].find_all('tr')[1:]

    for row in rows:
        items = row.find_all('td')
        # Key each player by the link in the first cell so repeated rows for
        # the same link collapse into one entry.
        player = items[0].find('a')['href']
        players[player] = [i.text for i in items]

    # Randomised pause between requests.
    time.sleep(.5+2*random.random())
    print(letter)


# One row per player, one positional (integer-labelled) column per table cell.
df = pd.DataFrame.from_dict(players).transpose()
df = df.reset_index(drop=True)
df = df.apply(lambda x: x.str.strip())
# Treat empty cells as missing, then discard players with any missing field.
df = df.replace('', np.nan)
df = df.dropna().reset_index(drop=True)
# Column 0 is split on ', ' and reversed, i.e. "Last, First" -> "First Last".
df['name'] = df[0].str.split(', ').str[::-1].str.join(' ')

def get_inches(height):
    '''
    Convert a feet-and-inches height string to total inches.

    Parameters
    ----------
    height : str
        Height written as feet, an apostrophe, then inches with an optional
        trailing double quote, e.g. ``6'1"`` or ``5' 11"``. The inches part
        may be missing (``6'`` or ``6``), in which case it counts as 0.

    Returns
    -------
    float
        ``12 * feet + inches``.

    Raises
    ------
    ValueError
        If the feet or inches part is not a valid number.
    '''
    parts = height.split("'")
    feet = float(parts[0])
    # Anything after the first apostrophe is the inches part; strip the
    # closing double quote and whitespace before converting.
    inch_text = parts[1].replace('"', '').strip() if len(parts) > 1 else ''
    # An empty inches part means "no extra inches" rather than a parse error.
    inches = float(inch_text) if inch_text else 0.0
    return (12*feet) + inches

# Turn the raw scraped text columns into typed features:
# column 1 -> height in inches, column 2 -> leading token of the weight text,
# column 3 -> year of the parsed date.
df['height_inches'] = df[1].map(get_inches)
df['weight_lbs'] = df[2].str.split(' ').str.get(0)
df['birthyear'] = df[3].map(lambda raw_date: parser.parse(raw_date).year)

# The date parser resolves some two-digit years into the future (e.g. 2065),
# so those need to be pulled back one century.
def convert_year(year, current_year=None):
    '''
    Correct a birth year that was parsed one century too late.

    A birth year cannot lie in the future, so any year later than
    ``current_year`` is shifted back by 100. Years up to and including
    ``current_year`` are returned unchanged; this keeps genuine 21st-century
    birth years (e.g. 2001) intact, which a fixed ``> 2000`` cutoff would not.

    Parameters
    ----------
    year : int
        Parsed birth year.
    current_year : int, optional
        Latest plausible birth year. Defaults to the current calendar year.

    Returns
    -------
    int
        ``year - 100`` if ``year`` is after ``current_year``, else ``year``.
    '''
    if current_year is None:
        current_year = datetime.now().year
    if year > current_year:
        return year-100
    return year

df['birthyear'] = df['birthyear'].map(convert_year)

# The positional scrape columns have been converted above; discard them.
df = df.drop(columns=[0, 1, 2, 3, 4])

#Read in dataset found on kaggle
d = pd.read_csv('pgaTourData.csv').rename(index=str, columns={'Player Name': 'name'})

# Inner-join scraped bios with the Kaggle stats on player name.
new_df = df.merge(d, on='name').drop(columns='Wins')
# A missing 'Top 10' value is filled with 0 before incomplete rows are dropped.
new_df['Top 10'] = new_df['Top 10'].fillna(0)
new_df = new_df.dropna().reset_index(drop=True)


#Obtain money earned data from pgatour.com
# One stats page per season; range() excludes the stop value, so this covers
# 2010 through 2018.
years = range(2010, 2019)
d = {}

for year in years:

    url = f'https://www.pgatour.com/stats/stat.109.{year}.html'

    # A timeout stops one stalled connection from hanging the whole notebook,
    # and raise_for_status() fails loudly on an HTTP error instead of letting
    # an error page flow into the table parsing below.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    page = response.text
    soup = BeautifulSoup(page, 'html5')

    tables = soup.find_all('table')

    # The second table on the page is the one parsed here; its first row is
    # skipped (treated as the header).
    rows = tables[1].find_all('tr')[1:]

    players = {}

    for row in rows:
        items = row.find_all('td')
        # Key by the link in the third cell; keep that cell's text (used as
        # the player name below), the fifth cell's text (used as the money
        # figure below) and the season.
        player = items[2].find('a')['href']
        players[player] = [items[2].text, items[4].text, year]

    # One frame per season: rows are players, columns are 0, 1, 2.
    d[year] = pd.DataFrame.from_dict(players).transpose()

    # Randomised pause between requests.
    time.sleep(1+2*random.random())
    print(year)

# Stack all seasons into a single frame with a fresh 0..n-1 index.
mdf = pd.concat(d.values(), ignore_index=True)

def money_to_int(mon):
    '''
    Parse a money string such as '$1,234,567' into an int by removing every
    dollar sign and comma before conversion.
    '''
    strip_symbols = str.maketrans('', '', '$,')
    return int(mon.translate(strip_symbols))

# Clean the scraped earnings columns: 0 -> player name, 1 -> money, 2 -> season.
mdf['name'] = mdf[0].apply(str.strip)
mdf['money'] = mdf[1].apply(money_to_int)
mdf = mdf.drop(columns=[0, 1]).rename(index=int, columns={2: 'Year'})

# Join per-season earnings onto the player stats, then drop the 'Money'
# column that came in from new_df in favour of the scraped 'money'.
g = new_df.merge(mdf, on=['name', 'Year']).drop(columns='Money')

# Move the target column 'money' to the front.
cols = g.columns.tolist()
cols.remove('money')
cols = ['money'] + cols
g = g.reindex(columns=cols)

# Cast the remaining text/number columns to int.
g['weight_lbs'] = g['weight_lbs'].apply(int)
g['Year'] = g['Year'].apply(int)
g['Points'] = g['Points'].apply(lambda pts: pts.replace(',', '')).apply(int)

A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
2010
2011
2012
2013
2014
2015
2016
2017
2018


In [4]:
# Persist the assembled frame `g` to "pga.pickle" (path is relative to the
# working directory).
g.to_pickle("pga.pickle")

In [2]:
# Preview the first rows of `g`.
g.head()

Unnamed: 0,money,name,height_inches,weight_lbs,birthyear,Rounds,Fairway Percentage,Year,Avg Distance,gir,Average Putts,Average Scrambling,Average Score,Points,Top 10,Average SG Putts,Average SG Total,SG:OTT,SG:APR,SG:ARG
0,25271,Robert Allenby,73.0,180,1971,51.0,55.06,2016,282.2,63.73,30.41,50.75,73.117,9,0.0,-0.491,-1.95,-0.631,-0.46,-0.368
1,271887,Robert Allenby,73.0,180,1971,63.0,63.06,2015,282.7,66.67,30.03,51.69,72.102,169,1.0,-0.754,-0.97,-0.262,0.26,-0.213
2,514288,Robert Allenby,73.0,180,1971,83.0,61.47,2014,284.9,67.63,29.85,59.96,70.92,438,0.0,-0.163,0.105,-0.05,0.552,-0.234
3,204272,Robert Allenby,73.0,180,1971,58.0,57.87,2013,284.9,62.4,29.64,52.77,72.056,129,1.0,-0.337,-0.859,-0.733,0.296,-0.085
4,808927,Robert Allenby,73.0,180,1971,76.0,59.34,2012,286.7,65.79,29.91,50.85,71.355,417,2.0,-0.428,-0.626,-0.449,0.537,-0.286


In [32]:
# Features are every column from position 2 onward; the target is column 0.
X = g.iloc[:, 2:]
y = g.iloc[:, 0]

# Baseline: ordinary least squares fitted on the full dataset.
lr = LinearRegression().fit(X, y)

# R^2 measured on the same rows the model was fitted on (in-sample).
lr.score(X, y)

0.9151755779899954

In [33]:
def split_and_validate(X, y):
    '''
    Hold out 20% of the rows of (X, y) for validation (fixed random_state=42),
    fit a linear regression on the remaining 80%, then print the validation
    R^2 followed by one "feature : coefficient" line per column of X.

    Returns nothing; results are only printed.
    '''
    features_fit, features_holdout, target_fit, target_holdout = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Fit on the 80% portion, score on the held-out 20%.
    model = LinearRegression().fit(features_fit, target_fit)
    holdout_r2 = model.score(features_holdout, target_holdout)

    print('\nValidation R^2 score was:', holdout_r2)
    print('Feature coefficient results: \n')
    # coef_ is paired positionally with the columns of X.
    for name, weight in zip(X.columns, model.coef_):
        print(name, ':', f'{weight:.2f}')

In [34]:
# Print the validation R^2 and per-feature coefficients for the current X, y.
split_and_validate(X, y)


Validation R^2 score was: 0.9031159788338169
Feature coefficient results: 

height_inches : -3642.62
weight_lbs : 1143.60
birthyear : 1705.53
Rounds : -9470.05
Fairway Percentage : 18478.83
Year : 26192.35
Avg Distance : 17527.43
gir : 17429.64
Average Putts : -61328.61
Average Scrambling : -9493.32
Average Score : 18215.55
Points : 2775.52
Top 10 : 40664.34
Average SG Putts : 391924.96
Average SG Total : -294489.25
SG:OTT : 202504.97
SG:APR : 369094.70
SG:ARG : 597736.70


In [29]:
# hold out 20% of the data for final testing
# Split straight from `g` (same slices that define X and y in the baseline
# cell) rather than from the current X / y. Because X and y are rebound to the
# non-test portion here, splitting from them would carve off another 20% every
# time this cell is re-run; sourcing from `g` makes the cell idempotent.
X, X_test, y, y_test = train_test_split(
    g.iloc[:, 2:], g.iloc[:, 0], test_size=.2, random_state=10
)

# Split the remaining rows 75/25 into train and validation sets.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.25, random_state=3)

In [35]:
#set up the 3 models we're choosing from:
lm = LinearRegression()
lm_reg = Ridge(alpha=1)
lm_poly = LinearRegression()

# Standardised features for the ridge model. The scaler is fitted on the
# training split only and then applied unchanged to validation and test.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.values)
X_val_scaled, X_test_scaled = (
    scaler.transform(split.values) for split in (X_val, X_test)
)

# Degree-2 polynomial features for the poly model, likewise fitted on the
# training split only.
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train.values)
X_val_poly, X_test_poly = (
    poly.transform(split.values) for split in (X_val, X_test)
)

In [36]:
#validate
# Each candidate is fitted on its own version of the training features and
# scored on the matching version of the validation features.
candidates = [
    ('Linear Regression', lm, X_train, X_val),
    ('Ridge Regression', lm_reg, X_train_scaled, X_val_scaled),
    ('Degree 2 polynomial regression', lm_poly, X_train_poly, X_val_poly),
]

for label, model, train_features, val_features in candidates:
    model.fit(train_features, y_train)
    print(f'{label} val R^2: {model.score(val_features, y_val):.3f}')

Linear Regression val R^2: 0.895
Ridge Regression val R^2: 0.896
Degree 2 polynomial regression val R^2: 0.907


In [37]:
# Final evaluation of the plain linear model on the held-out test set.
# Refit on train + validation built explicitly from those two splits, so that
# test rows can never leak into the fit even if `X` / `y` have been rebound to
# the full dataset by re-running an earlier cell.
X_trainval = pd.concat([X_train, X_val])
y_trainval = pd.concat([y_train, y_val])

lm.fit(X_trainval, y_trainval)
# The label names the model and data actually scored here (linear model, test
# set); the previous text described a polynomial model's validation score.
# NOTE(review): if the degree-2 polynomial model is the one meant to be
# reported, refit lm_poly on poly-transformed train+val features and score it
# on X_test_poly instead.
print(f'Linear Regression test R^2: {lm.score(X_test, y_test):.3f}')

Degree 2 polynomial regression val R^2: 0.906
