In [1]:
from pymongo import MongoClient
import pprint

from scipy import stats

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

# Requests sends and recieves HTTP requests.
import requests

# Beautiful Soup parses HTML documents in python.
from bs4 import BeautifulSoup

import json
import time
import copy

In [2]:
#Import URLs
off_the_tee_url = 'https://www.pgatour.com/content/pgatour/stats/stat.02567.y2019.html'
approach_url = 'https://www.pgatour.com/content/pgatour/stats/stat.02568.y2019.html'
around_the_green_url = 'https://www.pgatour.com/content/pgatour/stats/stat.02569.y2019.html'
putting_url = 'https://www.pgatour.com/content/pgatour/stats/stat.02564.y2019.html'
total_url = 'https://www.pgatour.com/content/pgatour/stats/stat.02675.y2020.html'
scoring_url = 'https://www.pgatour.com/stats/stat.120.y2019.html'

In [3]:
#Request HTML
tee = requests.get(off_the_tee_url)
med_long = requests.get(approach_url)
med_short = requests.get(around_the_green_url)
putt = requests.get(putting_url)
total = requests.get(total_url)
scoring_avg = requests.get(scoring_url)

In [4]:
#Check status == 200 :: 200 implies the request was successfully processed
requested = [tee, med_short, med_long, putt, total,scoring_avg]
for r in requested:
    print(r.status_code)

200
200
200
200
200
200


In [5]:
#Print sample HTML from each request
for r in requested:
    pprint.pprint(r.text[:1000])

('\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '<!DOCTYPE HTML>\n'
 '<html lang="en" >\n'
 '\n'
 '    \n'
 '    \n'
 '        <head>\n'
 '    <meta http-equiv="content-type" content="text/html; charset=UTF-8">\n'
 '    <meta name="viewport" content="width=device-width, initial-scale=1.0, '
 'maximum-scale=1.0, user-scalable=no, shrink-to-fit=no" />\n'
 '\n'
 '    <link rel="canonical" '
 'href="https://www.pgatour.com/stats/stat.02567.y2019.html"/>\n'
 '<meta name="title" content="SG: Off-the-Tee | PGA TOUR Stats" />\n'
 '<meta name="description" content="Strokes Gained | Distance (All Drives) | '
 'Distance (Measured Drives) | Accuracy | Scoring | Other | Radar" />\n'
 '<meta name="msApplication-ID" content="pga-tour/9wzdncrfhz2c"/>\n'
 '\n'
 '<meta name="google-site-verification" '
 'content="xp_dFTcmlaq2Fgq3gyZfIjWo0KjdEC4cfaO4nWx5yPA" />\n'
 '<!-- Google Authorship and Publisher Markup -->\n'
 '<!-- Schema.org markup for Google+ http://schema.org/Article -->\n'
 '<meta itemprop="name" conten

In [8]:
#Save HTML into a MongoDB
client = MongoClient()
golf_db = client.golf
long_html = golf_db.long
med_long_html = golf_db.med_long
med_short_html = golf_db.med_short
short_html = golf_db.short
total_html = golf_db.total
scoring_html = golf_db.scoring

long_html.insert_one({'link': off_the_tee_url, 'html': tee.text})
med_long_html.insert_one({'link': approach_url, 'html': med_long.text})
med_short_html.insert_one({'link': around_the_green_url, 'html': med_short.text})
short_html.insert_one({'link': putting_url, 'html': putt.text})
total_html.insert_one({'link': total_url, 'html': total.text})
scoring_html.insert_one({'link': scoring_url, 'html': scoring_avg.text})
client.list_database_names()

['admin', 'config', 'golf', 'local']

In [9]:
#Parse HTML
soup_tee = BeautifulSoup(tee.text, 'html.parser')
soup_med_long = BeautifulSoup(med_long.text, 'html.parser')
soup_med_short = BeautifulSoup(med_short.text, 'html.parser')
soup_putt = BeautifulSoup(putt.text, 'html.parser')
soup_total = BeautifulSoup(total.text, 'html.parser')
soup_scoreing = BeautifulSoup(scoring_avg.text, 'html.parser')

In [15]:
#Search HTML for table
table_search = soup_tee.find_all('table')
for table in table_search:
    print(table.attrs)

{}
{'class': ['table-styled'], 'id': 'statsTable'}


In [17]:
table_search = soup_med_long.find_all('table')
for table in table_search:
    print(table.attrs)

{}
{'class': ['table-styled'], 'id': 'statsTable'}


In [23]:
table_search = soup_med_short.find_all('table')
for table in table_search:
    print(table.attrs)

{}
{'class': ['table-styled'], 'id': 'statsTable'}


In [20]:
table_search = soup_putt.find_all('table')
for table in table_search:
    print(table.attrs)

{}
{'class': ['table-styled'], 'id': 'statsTable'}


In [21]:
table_search = soup_total.find_all('table')
for table in table_search:
    print(table.attrs)

{}
{'class': ['table-styled'], 'id': 'statsTable'}


In [22]:
table_search = soup_scoreing.find_all('table')
for table in table_search:
    print(table.attrs)

{}
{'class': ['table-styled'], 'id': 'statsTable'}


In [24]:
#All tables have class attribute 'table-styled'
#Functionalize getting data from each
def get_list_from_soup(soup_object):
    data = []
    table = soup_object.find('table', attrs={'class':'table-styled'})
    table_body = table.find('tbody')

    rows = table_body.find_all('tr')
    for row in rows:
        cols = row.find_all('td')
        cols = [ele.text.strip() for ele in cols]
        data.append([ele for ele in cols if ele])
        
    return data

In [25]:
#Plug soup objects into function above
tee_data = get_list_from_soup(soup_tee)
approach_data = get_list_from_soup(soup_med_long)
chip_data = get_list_from_soup(soup_med_short)
putt_data = get_list_from_soup(soup_putt)
total_data = get_list_from_soup(soup_total)
scoring_data = get_list_from_soup(soup_scoreing)

In [33]:
print (tee_data[:5])
print(/n)
print (tee_data[-5:-1])

SyntaxError: invalid syntax (<ipython-input-33-36623e3fbadd>, line 2)

In [None]:
approach_data

In [None]:
chip_data

In [None]:
putt_data

In [None]:
total_data

In [None]:
scoring_data

In [None]:
tee_trimmed = [[x[0], x[2], x[4]] for x in tee_data]
approach_trimmed = [[x[0], x[2], x[4]] for x in approach_data]
chip_trimmed = [[x[0], x[2], x[4]] for x in chip_data]
putt_trimmed = [[x[0], x[2], x[4]] for x in putt_data]
scoring_trimmed = [[x[2], x[4]] for x in scoring_data]

In [None]:
tee_trimmed

In [None]:
approach_trimmed

In [None]:
chip_trimmed

In [None]:
putt_trimmed

In [None]:
scoring_trimmed

In [None]:
print(f'{len(tee_trimmed)} values in tee')
print(f'{len(approach_trimmed)} values in approach')
print(f'{len(chip_trimmed)} values in chip')
print(f'{len(putt_trimmed)} values in putt')

In [None]:
tee_df = pd.DataFrame(tee_trimmed, columns = ['tee_rank', 'name', 'tee_strokes_gained'])
tee_df

In [None]:
approach_df = pd.DataFrame(approach_trimmed, columns = ['approach_rank', 'name', 'approach_strokes_gained'])
approach_df

In [None]:
chip_df = pd.DataFrame(chip_trimmed, columns = ['chip_rank', 'name', 'chip_strokes_gained'])
chip_df

In [None]:
putt_df = pd.DataFrame(putt_trimmed, columns = ['putt_rank', 'name', 'putt_strokes_gained'])
putt_df

In [None]:
scoring_df = pd.DataFrame(scoring_trimmed, columns = ['name', 'scoring_average'])
scoring_df

In [None]:
#Merge DataFrames
golf_df = pd.merge(tee_df, approach_df, how = 'left', on = 'name')
golf_df = pd.merge(golf_df, chip_df, how = 'left', on = 'name')
golf_df = pd.merge(golf_df, putt_df, how = 'left', on = 'name')
golf_df = pd.merge(golf_df, scoring_df, how = 'left', on = 'name')
golf_df

In [None]:
#Rearrange columns
cols = golf_df.columns.tolist()
cols = [cols[1],cols[0]] + cols[2:]
golf_df = golf_df[cols]
golf_df

In [None]:
golf_df.info()

In [None]:
#Strip rankings of T in ties
rankings = ['tee_rank', 'approach_rank', 'chip_rank', 'putt_rank']
for series in rankings:
    for string in series:
        golf_df[series] = golf_df[series].str.replace('T','')

In [None]:
golf_df

In [None]:
columns_sans_name = golf_df.columns.tolist()[1:]
for col in columns_sans_name:
    golf_df[col] = pd.to_numeric(golf_df[col])
golf_df.info()

In [None]:
bins = 187 ** .5
print(bins)



In [None]:
strokes_gained_columns = [golf_df.tee_strokes_gained, golf_df.approach_strokes_gained, golf_df.chip_strokes_gained, golf_df.putt_strokes_gained]

for series in strokes_gained_columns:
    fig, ax = plt.subplots()
    ax.hist(series, 14)
    ax.set_xlabel(series.name)
    ax.set_ylabel('frequency')

In [None]:
temp = golf_df.copy()
temp['LONG_STROKES_GAINED'] = (temp['tee_strokes_gained'] + temp['approach_strokes_gained']) /2
long_game_sorted = temp.sort_values('LONG_STROKES_GAINED', ascending = False)
long_game_sorted['LONG_GAME_RANK'] = long_game_sorted['LONG_STROKES_GAINED'].rank( ascending = False)

In [None]:
tempo = golf_df.copy()
tempo['SHORT_STROKES_GAINED'] = (tempo['chip_strokes_gained'] + tempo['putt_strokes_gained'])/2
short_game_sorted = tempo.sort_values('SHORT_STROKES_GAINED', ascending = False)
short_game_sorted['SHORT_GAME_RANK'] = short_game_sorted['SHORT_STROKES_GAINED'].rank(ascending = False)

In [None]:
golf_df['LONG_GAME_RANK'] = long_game_sorted['LONG_GAME_RANK']
golf_df['LONG_STROKES_GAINED'] = long_game_sorted['LONG_STROKES_GAINED']
golf_df['SHORT_GAME_RANK'] = short_game_sorted['SHORT_GAME_RANK']
golf_df['SHORT_STROKES_GAINED'] = short_game_sorted['SHORT_STROKES_GAINED']
golf_df

In [None]:
fig, ax = plt.subplots()
ax.hist(golf_df['LONG_STROKES_GAINED'])
ax.set_xlabel('Long Strokes Gained')
ax.set_ylabel('frequency')

In [None]:
fig, ax = plt.subplots()
ax.hist(golf_df['SHORT_STROKES_GAINED'])
ax.set_xlabel('Short Strokes Gained')
ax.set_ylabel('frequency')

In [None]:
short_game_bool = golf_df['SHORT_STROKES_GAINED'] > 0
short_gamers = golf_df[short_game_bool]
short_gamers

In [None]:
long_game_bool = golf_df['LONG_STROKES_GAINED'] > 0
long_gamers = golf_df[long_game_bool]
long_gamers

In [None]:
fig, ax = plt.subplots()
ax.hist(golf_df['scoring_average'])
ax.set_xlabel('Population scoring averages')
ax.set_ylabel('frequency')

In [None]:
long_gamers_scoring = long_gamers['scoring_average']
short_gamers_scoring = short_gamers['scoring_average']
long_game_mean = long_gamers_scoring.mean()
short_game_mean = short_gamers_scoring.mean()

In [None]:
fig, ax = plt.subplots()
ax.hist(long_gamers_scoring,alpha=0.3,bins=5,label='Long-gamers') #density = True
ax.hist(short_gamers_scoring,alpha=0.3,bins=5,label='Short-gamers') #density = True
ax.set_xlabel('Scoring Average')
ax.set_ylabel('Frequency ')
ax.legend();

In [None]:
#null 1, the averages are equal

In [None]:
def welch_test_statistic(sample_1, sample_2):
    numerator = np.mean(sample_1) - np.mean(sample_2)
    denominator_sq = (np.var(sample_1) / len(sample_1)) + \
                        (np.var(sample_2) / len(sample_2))
    return numerator / np.sqrt(denominator_sq)

In [None]:
long_vs_short_test_stat = welch_test_statistic(long_gamers_scoring, short_gamers_scoring)
print("Welch Test Statistic: {:2.2f}".format(long_vs_short_test_stat))

In [None]:
def welch_satterhwaithe_df(sample_1, sample_2):
    ss1 = len(sample_1)
    ss2 = len(sample_2)
    df = (
        ((np.var(sample_1)/ss1 + np.var(sample_2)/ss2)**(2.0)) / 
        ((np.var(sample_1)/ss1)**(2.0)/(ss1 - 1) + (np.var(sample_2)/ss2)**(2.0)/(ss2 - 1))
    )
    return df

In [None]:
deg_free = welch_satterhwaithe_df(long_gamers_scoring, short_gamers_scoring)
print("Degrees of Freedom for Welch's Test: {:2.2f}".format(deg_free))

In [None]:
#choose alpha .05

In [None]:
t_dist = stats.t(deg_free)
p_different_means = t_dist.cdf(long_vs_short_test_stat) + (1 - t_dist.cdf(-long_vs_short_test_stat))
print("p-value for different average score: {:2.2f}".format(p_different_means))

In [None]:
#fail to reject null hypothesis

In [None]:
drivers_bool = golf_df['tee_strokes_gained'] > 0
drivers = golf_df[drivers_bool]
drivers

In [None]:
putters_bool = golf_df['putt_strokes_gained'] > 0
putters = golf_df[putters_bool]
putters

In [None]:
drivers_scoring = drivers['scoring_average']
putters_scoring = putters['scoring_average']
drivers_mean = drivers_scoring.mean()
putters_mean = putters_scoring.mean()

In [None]:
fig, ax = plt.subplots()
ax.hist(drivers_scoring,alpha=0.3,bins=5,label='Drivers') #density = True
ax.hist(putters_scoring,alpha=0.3,bins=5,label='Putters') #density = True
ax.set_xlabel('Scoring Average')
ax.set_ylabel('Frequency ')
ax.legend();

In [None]:
#null 1 the means are equal

In [None]:
drive_vs_putt_test_stat = welch_test_statistic(drivers_scoring, putters_scoring)
print("Welch Test Statistic: {:2.2f}".format(drive_vs_putt_test_stat))

In [None]:
deg_free2 = welch_satterhwaithe_df(drivers_scoring, putters_scoring)
print("Degrees of Freedom for Welch's Test: {:2.2f}".format(deg_free2))

In [None]:
t_dist = stats.t(deg_free2)
p_different_means = t_dist.cdf(drive_vs_putt_test_stat) + (1 - t_dist.cdf(-drive_vs_putt_test_stat))
print("p-value for different average score: {:2.2f}".format(p_different_means))

In [7]:
#fail to reject null hypothesis