In [293]:
import csv
import re
from collections import defaultdict
from html.parser import HTMLParser

import requests

BASE_URL = "http://www.owgr.com"

In [335]:
def is_str_blank(s):
    """Return True when `s` is empty or made up only of space characters.

    Only " " counts as blank; tabs and other whitespace do not.
    """
    for ch in s:
        if ch != " ":
            return False
    return True

def clean_html(s):
    """Trim surrounding whitespace from `s`, then delete every newline and carriage return."""
    trimmed = s.strip()
    without_newlines = trimmed.replace("\n", "")
    return without_newlines.replace("\r", "")

def get_id_from_player_url(player_url):
    """Extract the player id from a player-profile URL.

    The id is the first run of digits found in `player_url`, e.g.
    "/en/Ranking/PlayerProfile.aspx?playerID=12577" -> "12577".

    Returns:
        The id as a string.

    Raises:
        ValueError: if `player_url` contains no digits.
    """
    # Raw string: "\d" in a plain string literal is an invalid escape sequence.
    m = re.search(r"\d+", player_url)
    if m is None:
        raise ValueError("no player id found in URL: {!r}".format(player_url))
    return m.group(0)

#  1 	 2-3 	 4-10 	 11-25 	 Made Cut	Missed Cut / WD / DQ
def get_bucket_from_standing(pos):
    """Map a finishing-position string to its bucket label.

    The first run of digits in `pos` is read as the place, so tie markers
    such as "T3" work. Buckets:

        place 1        -> "1"
        place below 4  -> "2-3"
        place below 11 -> "4-10"
        place below 26 -> "11-25"
        anything else  -> "CM"
        no digits      -> "boo"  (e.g. "MC" or "WD")
    """
    # Raw string: "\d" in a plain string literal is an invalid escape sequence.
    m = re.search(r"\d+", pos)
    if m is None:
        return "boo"

    place = int(m.group(0))
    if place == 1:
        return "1"
    if place < 4:
        return "2-3"
    if place < 11:
        return "4-10"
    if place < 26:
        return "11-25"
    return "CM"

## Parse Events

In [243]:
# Keys for one row of the events listing; they are zipped, in this order,
# with the values collected from each table row.
COLS = [
    "Week",
    "Year",
    "Tour",
    "EventUrl",
    "EventName",
    "PlayerUrl",
    "Winner",
    "WinnerPoints",
    "WorldRating",
    "HomeRating",
    "SoF",
]


class OwgrEventsHtmlParser(HTMLParser):
    """Collect table rows from an events listing page.

    While inside a <table>, each <tr> starts a fresh row. Every non-blank
    text node and the href of every <a> tag seen before the closing </tr> is
    appended to that row. On </tr> the values are stripped and zipped with
    COLS into a dict, which is appended to `all_rows`. Values beyond
    len(COLS) are dropped by the zip; short rows give a dict with fewer keys.
    """

    def __init__(self):
        self.debug = False     # when True, print tracing output while parsing
        self.at_table = False  # True between <table> and </table>
        self.row = None        # values of the row being read; None outside a <tr>
        self.all_rows = []     # one dict per completed row

        super().__init__()

    def handle_starttag(self, tag, attrs):
        """Track entry into a table/row and record link targets."""
        if not self.at_table and tag == "table":
            self.at_table = True

        if self.at_table:
            if tag == "tr":
                if self.debug:
                    print("starting new row")
                self.row = []

            if self.row is not None and tag == "a":
                # Look the link target up by name rather than assuming it is
                # the first attribute; an anchor without href adds nothing
                # (indexing attrs[0] would raise on an attribute-less <a>).
                href = dict(attrs).get("href")
                if href is not None:
                    self.row.append(href)

    def handle_endtag(self, tag):
        """Close the table on </table>; flush the current row on </tr>."""
        if self.at_table and tag == "table":
            if self.debug:
                print("Leaving the table :", tag)
            self.at_table = False

        # A </tr> with no open row has nothing to flush.
        if tag == "tr" and self.row is not None:
            r = [el.strip() for el in self.row]
            if self.debug:
                print("Leaving the row :", tag)
                print(r)
            d = dict(zip(COLS, r))
            self.all_rows.append(d)
            self.row = None

    def handle_data(self, data):
        """Append non-blank text found inside a table row to that row."""
        if self.at_table and self.row is not None:
            if not is_str_blank(data):
                self.row.append(data)


In [356]:
events_for_2018 = []
urls = ["http://www.owgr.com/events?pageNo=1&pageSize=400&tour=&year=2018"]
for url in urls:
    # A timeout keeps the cell from hanging forever on a stalled connection.
    r = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of parsing an error page.
    r.raise_for_status()
    t = clean_html(r.text)
    p = OwgrEventsHtmlParser()
    p.feed(t)
    # Skip the first parsed row of each page (presumably the header row).
    events_for_2018.extend(p.all_rows[1:])
len(events_for_2018)

395

In [357]:
# clean_html() trims the text, then drops newlines and carriage returns.
t = clean_html(r.text)

In [358]:
# Fresh events-listing parser instance.
p = OwgrEventsHtmlParser()

In [359]:
# Parse the cleaned page text; completed rows accumulate in p.all_rows.
p.feed(t)

In [360]:
# Shorter alias; both names refer to the same list object.
events = events_for_2018

In [361]:
# Event names, in the reverse of the order they appear in `events`.
all_event_names = [event["EventName"] for event in reversed(events)]
all_event_names

['Sentry Tournament of Champions',
 'Sony Open in Hawaii',
 'BMW SA Open hosted by the City of Ekurhuleni',
 'REBEL Sport Masters',
 'CareerBuilder Challenge',
 'The Bahamas Great Exuma Classic at Sandals Emerald Bay',
 'Red Sea Egyptian Classic',
 'SMBC Singapore Open',
 'Abu Dhabi HSBC Championship presented by EGA',
 'Farmers Insurance Open',
 'Leopalace21 Myanmar Open',
 'Omega Dubai Desert Classic',
 'The Bahamas Great Abaco Classic at The Abaco Club',
 'Red Sea Ain Sokhna Classic',
 'PGM Darulaman Championship',
 'Maybank Championship',
 'Oates Vic Open',
 'Panama Championship',
 'Waste Management Phoenix Open',
 'ISPS Handa World Super 6 Perth',
 'AT&T Pebble Beach Pro-Am',
 'Open Prestigia',
 'Eye of Africa PGA Championship',
 'Club Colombia Championship',
 'Open Casa Green Golf',
 'Ein Bay Open',
 'Genesis Open',
 'NBO Oman Open',
 'Coca-Cola QLD PGA Championship',
 'Dimension Data Pro-Am',
 'Mediter Real Estate Masters',
 'The Honda Classic',
 'Horizon Golf NZ PGA Championshi

## Parse single event

In [283]:
# Fetch the results page of the first event in `events`.
e = events[0]["EventUrl"]
# A timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(BASE_URL + e, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page.
r.raise_for_status()

In [272]:
# Trimmed page text with newlines and carriage returns removed, ready for parsing.
h = clean_html(r.text)

In [273]:
# Keys for one row of a single event's results table; they are zipped, in
# this order, with the values collected from each table row.
EVENT_COLS = [
    "Pos",
    "PlayerUrl",
    "Name",
    "R1",
    "R2",
    "R3",
    "R4",
    "Agg",
    "Ranking Points",
]


class OwgrSingleEventHtmlParser(HTMLParser):
    """Collect table rows from a single event's results page.

    While inside a <table>, each <tr> starts a fresh row. Every non-blank
    text node and the href of every <a> tag seen before the closing </tr> is
    appended to that row. On </tr> a non-empty row is stripped and zipped
    with EVENT_COLS into a dict, which is appended to `all_rows`; empty rows
    are discarded.
    """

    def __init__(self):
        self.at_table = False  # True between <table> and </table>
        self.row = None        # values of the row being read; None outside a <tr>
        self.all_rows = []     # one dict per completed, non-empty row
        self.debug = False     # when True, print tracing output while parsing

        super().__init__()

    def handle_starttag(self, tag, attrs):
        """Track entry into a table/row and record link targets."""
        if not self.at_table and tag == "table":
            if self.debug:
                print("Starting table")
            self.at_table = True

        if self.at_table:
            if tag == "tr":
                if self.debug:
                    print("starting new row")
                self.row = []

            if self.row is not None and tag == "a":
                # Look the link target up by name rather than assuming it is
                # the first attribute; an anchor without href adds nothing
                # (indexing attrs[0] would raise on an attribute-less <a>).
                href = dict(attrs).get("href")
                if href is not None:
                    self.row.append(href)

    def handle_endtag(self, tag):
        """Close the table on </table>; flush the current row on </tr>."""
        if self.at_table and tag == "table":
            if self.debug:
                print("Leaving the table :", tag)
            self.at_table = False

        # A </tr> with no open row has nothing to flush.
        if tag == "tr" and self.row is not None:
            r = [el.strip() for el in self.row]
            if self.debug:
                print(r)
                print("Leaving the row :", tag)
            if len(r) > 0:
                # Zip the stripped values; the unstripped self.row was being
                # stored before, leaving the strip above without effect.
                d = dict(zip(EVENT_COLS, r))
                self.all_rows.append(d)
            self.row = None

    def handle_data(self, data):
        """Append non-blank text found inside a table row to that row."""
        if self.at_table and self.row is not None:
            if not is_str_blank(data):
                self.row.append(data)


In [362]:
# Parse the cleaned results page; completed rows accumulate in p.all_rows.
p = OwgrSingleEventHtmlParser()
p.feed(h)

In [363]:
# Show the parsed rows, skipping the first one (presumably the header row; confirm).
p.all_rows[1:]

[{'Pos': '1',
  'PlayerUrl': '/en/Ranking/PlayerProfile.aspx?playerID=12577',
  'Name': 'Gary Woodland',
  'R1': '67',
  'R2': '68',
  'R3': '67',
  'R4': '64',
  'Agg': '266',
  'Ranking Points': '60'},
 {'Pos': '2',
  'PlayerUrl': '/en/Ranking/PlayerProfile.aspx?playerID=7961',
  'Name': 'Chez Reavie',
  'R1': '68',
  'R2': '65',
  'R3': '67',
  'R4': '66',
  'Agg': '266',
  'Ranking Points': '36'},
 {'Pos': 'T3',
  'PlayerUrl': '/en/Ranking/PlayerProfile.aspx?playerID=11019',
  'Name': 'Brendan Steele',
  'R1': '68',
  'R2': '67',
  'R3': '67',
  'R4': '67',
  'Agg': '269',
  'Ranking Points': '21'},
 {'Pos': 'T3',
  'PlayerUrl': '/en/Ranking/PlayerProfile.aspx?playerID=18662',
  'Name': 'Ollie Schniederjans',
  'R1': '68',
  'R2': '68',
  'R3': '68',
  'R4': '65',
  'Agg': '269',
  'Ranking Points': '21'},
 {'Pos': 'T5',
  'PlayerUrl': '/en/Ranking/PlayerProfile.aspx?playerID=19841',
  'Name': 'Bryson DeChambeau',
  'R1': '66',
  'R2': '66',
  'R3': '68',
  'R4': '70',
  'Agg': '27

## Find events of interest

## Get weights

In [364]:
# Map tournament name -> weight: column 0 of each line is the weight,
# column 1 the tournament name.
weights = {}
# `with` closes the file handle, which the bare open() in a for loop never did.
with open("data/2019 draft board for scrape.csv") as board_file:
    for line in board_file:
        fields = line.strip().split(",")
        try:
            weights[fields[1]] = float(fields[0])
        except (ValueError, IndexError):
            # Skip lines with a non-numeric first column or no second column
            # (e.g. a header or blank line); any other error now surfaces.
            continue

In [365]:
# Tournament names from the weights mapping, in insertion order.
OUR_TOURNEYS = list(weights)

In [366]:
# Display the parsed name -> weight mapping for a visual check.
weights

{'Waste Management Phoenix Open': 1.0,
 'AT&T Pebble Beach Pro-Am': 1.0,
 'Genesis Open': 1.0,
 'WGC - Mexico Championship': 1.5,
 'The Honda Classic': 1.0,
 'Arnold Palmer Invitational presented by Mastercard': 1.0,
 'THE PLAYERS Championship': 1.5,
 'Valspar Championship': 1.0,
 'WGC - Dell Technologies Match Play': 1.0,
 'Masters Tournament': 2.0,
 'Houston Open': 0.5,
 'RBC Heritage': 1.0,
 'HOLD FOR ZURICH': 0.0,
 'Wells Fargo Championship': 1.0,
 'AT&T Byron Nelson': 1.0,
 '100th PGA Championship': 2.0,
 'Fort Worth Invitational': 1.0,
 'the Memorial Tournament presented by Nationwide': 1.0,
 'RBC Canadian Open': 1.0,
 'U.S. Open': 2.0,
 'Travelers Championship': 1.0,
 'Aberdeen Standard Investments Scottish Open': 1.0,
 'Dubai Duty Free Irish Open Hosted by the Rory Foundation': 1.0,
 'John Deere Classic': 1.0,
 'A Military Tribute at The Greenbrier': 0.5,
 '147th Open Championship': 2.0,
 'FedEx St. Jude Classic': 1.0,
 'WGC - Bridgestone Invitational': 1.0,
 'Wyndham Champions

**Check: every tournament name in our weights file should appear in the OWGR data. Any name printed below was not found (here, only the `HOLD FOR ZURICH` entry).**

In [367]:
# Print every tournament of ours that does not appear among the scraped event names.
known_event_names = set(all_event_names)
missing_tourneys = [name for name in OUR_TOURNEYS if name not in known_event_names]
for name in missing_tourneys:
    print(name)

HOLD FOR ZURICH


## Parse each event list:

In [368]:
# player id -> player name, filled in the first time each id is seen
player_id_to_name = {}
# player id -> list of (position string, tournament weight) tuples, one per scraped result row
player_id_to_standings = defaultdict(list)

In [369]:
# For every event that has a weight, fetch its results page and record each
# player's finishing position together with that weight.
for event in events:
    event_name = event.get("EventName")
    # Skip events without a weight (and rows that carry no event name).
    if event_name not in weights:
        continue
    w = weights[event_name]
    print(event_name)
    # A timeout keeps the cell from hanging forever on a stalled connection.
    r = requests.get(BASE_URL + event["EventUrl"], timeout=30)
    # Fail loudly on an HTTP error instead of parsing an error page.
    r.raise_for_status()
    h = clean_html(r.text)
    p = OwgrSingleEventHtmlParser()
    p.feed(h)
    # Skip the first parsed row (presumably the header row).
    for el in p.all_rows[1:]:
        i = get_id_from_player_url(el["PlayerUrl"])
        if i not in player_id_to_name:
            player_id_to_name[i] = el["Name"]
        player_id_to_standings[i].append((el["Pos"], w))

TOUR Championship
BMW Championship
Dell Technologies Championship
The Northern Trust
Wyndham Championship
100th PGA Championship
WGC - Bridgestone Invitational
RBC Canadian Open
147th Open Championship
Aberdeen Standard Investments Scottish Open
John Deere Classic
Dubai Duty Free Irish Open Hosted by the Rory Foundation
A Military Tribute at The Greenbrier
Travelers Championship
U.S. Open
FedEx St. Jude Classic
the Memorial Tournament presented by Nationwide
Fort Worth Invitational
BMW PGA CHAMPIONSHIP
AT&T Byron Nelson
THE PLAYERS Championship
Wells Fargo Championship
RBC Heritage
Masters Tournament
Houston Open
WGC - Dell Technologies Match Play
Arnold Palmer Invitational presented by Mastercard
Valspar Championship
WGC - Mexico Championship
The Honda Classic
Genesis Open
AT&T Pebble Beach Pro-Am
Waste Management Phoenix Open


In [370]:
# Standings keyed by player name for display; ids that share a name collapse into one key.
{
    player_id_to_name[player_id]: standings
    for player_id, standings in player_id_to_standings.items()
}

{'Tiger Woods': [('1', 1.5),
  ('T6', 1.5),
  ('T24', 0.5),
  ('T40', 1.0),
  ('2', 2.0),
  ('T31', 1.0),
  ('T6', 2.0),
  ('MC', 2.0),
  ('T23', 1.0),
  ('T11', 1.5),
  ('T55', 1.0),
  ('T32', 2.0),
  ('T5', 1.0),
  ('T2', 1.0),
  ('12', 1.0),
  ('MC', 1.0)],
 'Billy Horschel': [('2', 1.5),
  ('T3', 1.5),
  ('WD', 0.5),
  ('T3', 1.0),
  ('T11', 1.0),
  ('T35', 2.0),
  ('MC', 1.0),
  ('MC', 1.0),
  ('T51', 1.0),
  ('MC', 1.0),
  ('T21', 1.0),
  ('T37', 1.5),
  ('T5', 1.0),
  ('MC', 2.0),
  ('T54', 1.0),
  ('MC', 1.0),
  ('MC', 1.0),
  ('MC', 1.0),
  ('MC', 1.0),
  ('T43', 1.0)],
 'Dustin Johnson': [('3', 1.5),
  ('T24', 1.5),
  ('T7', 0.5),
  ('T11', 1.0),
  ('T27', 2.0),
  ('T3', 1.0),
  ('1', 1.0),
  ('MC', 2.0),
  ('3', 2.0),
  ('1', 1.0),
  ('T8', 1.0),
  ('T17', 1.5),
  ('T16', 1.0),
  ('T10', 2.0),
  ('T59', 1.0),
  ('T7', 1.5),
  ('T16', 1.0),
  ('T2', 1.0)],
 'Webb Simpson': [('T4', 1.5),
  ('T6', 1.5),
  ('T49', 0.5),
  ('T28', 1.0),
  ('T2', 1.0),
  ('T19', 2.0),
  ('T24', 1.

In [371]:
# player name -> list of (bucket label, weight), one entry per recorded standing
results = {}
for player_id, standings in player_id_to_standings.items():
    player_name = player_id_to_name[player_id]
    results[player_name] = [
        (get_bucket_from_standing(pos), weight) for pos, weight in standings
    ]
results

{'Tiger Woods': [('1', 1.5),
  ('4-10', 1.5),
  ('11-25', 0.5),
  ('CM', 1.0),
  ('2-3', 2.0),
  ('CM', 1.0),
  ('4-10', 2.0),
  ('boo', 2.0),
  ('11-25', 1.0),
  ('11-25', 1.5),
  ('CM', 1.0),
  ('CM', 2.0),
  ('4-10', 1.0),
  ('2-3', 1.0),
  ('11-25', 1.0),
  ('boo', 1.0)],
 'Billy Horschel': [('2-3', 1.5),
  ('2-3', 1.5),
  ('boo', 0.5),
  ('2-3', 1.0),
  ('11-25', 1.0),
  ('CM', 2.0),
  ('boo', 1.0),
  ('boo', 1.0),
  ('CM', 1.0),
  ('boo', 1.0),
  ('11-25', 1.0),
  ('CM', 1.5),
  ('4-10', 1.0),
  ('boo', 2.0),
  ('CM', 1.0),
  ('boo', 1.0),
  ('boo', 1.0),
  ('boo', 1.0),
  ('boo', 1.0),
  ('CM', 1.0)],
 'Dustin Johnson': [('2-3', 1.5),
  ('11-25', 1.5),
  ('4-10', 0.5),
  ('11-25', 1.0),
  ('CM', 2.0),
  ('2-3', 1.0),
  ('1', 1.0),
  ('boo', 2.0),
  ('2-3', 2.0),
  ('1', 1.0),
  ('4-10', 1.0),
  ('11-25', 1.5),
  ('11-25', 1.0),
  ('4-10', 2.0),
  ('CM', 1.0),
  ('4-10', 1.5),
  ('11-25', 1.0),
  ('2-3', 1.0)],
 'Webb Simpson': [('4-10', 1.5),
  ('4-10', 1.5),
  ('CM', 0.5),
  ('

In [372]:
#         1 	2-3 	4-10	11-25	Made Cut	Missed Cut / WD / DQ

# Column index in each output row for every bucket label (index 0 is the player name).
bucket_column = {"1": 1, "2-3": 2, "4-10": 3, "11-25": 4, "CM": 5, "boo": 6}

final_rows = []
for player, standings in results.items():
    row = [player] + [0] * 6
    for group, weight in standings:
        column = bucket_column.get(group)
        # Labels outside the table are ignored.
        if column is not None:
            row[column] += weight

    final_rows.append(row)

final_rows

[['Tiger Woods', 1.5, 3.0, 4.5, 4.0, 5.0, 3.0],
 ['Billy Horschel', 0, 4.0, 1.0, 2.0, 6.5, 9.5],
 ['Dustin Johnson', 2.0, 5.5, 5.0, 6.0, 3.0, 2.0],
 ['Webb Simpson', 1.5, 1.0, 8.0, 8.0, 4.5, 3.0],
 ['Justin Rose', 1.0, 5.0, 6.5, 5.5, 2.0, 1.0],
 ['Hideki Matsuyama', 0, 0, 2.0, 9.5, 6.0, 5.5],
 ['Rickie Fowler', 0, 2.0, 5.0, 9.0, 4.0, 2.5],
 ['Justin Thomas', 2.0, 1.5, 7.5, 9.5, 1.0, 2.0],
 ['Xander Schauffele', 0, 5.0, 4.5, 4.0, 7.5, 4.0],
 ['Rory McIlroy', 1.0, 2.5, 7.0, 3.5, 5.0, 5.5],
 ['Tommy Fleetwood', 0, 2.0, 5.0, 11.0, 4.0, 1.0],
 ['Gary Woodland', 1.0, 0, 2.0, 6.5, 8.5, 6.5],
 ['Paul Casey', 1.0, 1.0, 2.0, 9.0, 5.0, 4.5],
 ['Jon Rahm', 0, 0, 6.0, 6.5, 4.0, 5.0],
 ['Aaron Wise', 1.0, 1.0, 2.0, 4.0, 4.0, 8.5],
 ['Tony Finau', 0, 2.0, 9.0, 5.5, 6.0, 3.0],
 ['Kyle Stanley', 0, 2.0, 1.0, 6.5, 6.5, 8.5],
 ['Jason Day', 1.0, 1.0, 2.5, 12.0, 2.0, 2.5],
 ['Bryson DeChambeau', 2.5, 2.0, 3.0, 5.0, 9.5, 4.0],
 ['Cameron Smith', 0, 1.5, 4.0, 2.5, 9.5, 6.5],
 ['Patton Kizzire', 0, 0, 0, 3.0

In [373]:
# newline="" hands line-ending control to the csv module; lineterminator="\n"
# keeps the one-"\n"-per-row layout of the previous hand-written output.
with open("data/2019_draft_prep_out.csv", "w", newline="") as wf:
    # csv.writer quotes fields containing commas or quotes, which a plain
    # ",".join would write unescaped and so corrupt the row.
    writer = csv.writer(wf, lineterminator="\n")
    writer.writerows(final_rows)