In [1]:
import re
import requests
from collections import defaultdict
from html.parser import HTMLParser

BASE_URL = "http://www.owgr.com"

In [2]:
def is_str_blank(s):
    return all([ch == " " for ch in s])

def clean_html(s):
    return s.strip().replace("\n","").replace("\r", "")

def get_id_from_player_url(player_url):
    r = re.compile("\d+")
    m = re.search(r, player_url)
    return m.group(0)

#  1 	 2-3 	 4-10 	 11-25 	 Made Cut	Missed Cut / WD / DQ
def get_bucket_from_standing(pos):
    r = re.compile("\d+")
    m = r.search(pos)
    if m is not None:
        place = int(m.group(0))
        if place == 1:
            return "1"
        elif place < 4:
            return "2-3"
        elif place < 11:
            return "4-10"
        elif place < 26:
            return "11-25"
        else:
            return "CM"
    else:
        return "boo"

## Parse Events

In [3]:
COLS = ["Week", "Year", "Tour", "EventUrl", "EventName", "PlayerUrl", "Winner", "WinnerPoints", "WorldRating", "HomeRating", "SoF"]


class OwgrEventsHtmlParser(HTMLParser):
    def __init__(self):
        self.debug = False
        self.at_table = False
        self.row = None
        self.all_rows = []
        
        super().__init__()
    
    def handle_starttag(self, tag, attrs):
        if not self.at_table and tag == "table":
            self.at_table = True
        
        if self.at_table:
            if tag == "tr":
                if self.debug:
                    print("starting new row")
                self.row = []
            
            if self.row is not None and tag == "a":
                self.row.append(attrs[0][1])

    def handle_endtag(self, tag):
        if self.at_table and tag == "table":
            if self.debug:
                print("Leaving the table :", tag)
            self.at_table = False

        if tag == "tr":
            r = [el.strip() for el in self.row]
            if self.debug:
                print("Leaving the row :", tag)
                print(r)
            d = dict(zip(COLS, r))
            self.all_rows.append(d)
            self.row = None

    def handle_data(self, data):
        if self.at_table and self.row is not None:
            if not is_str_blank(data):
                self.row.append(data)


In [4]:
events_for_2018 = []
urls = ["http://www.owgr.com/events?pageNo=1&pageSize=400&tour=&year=2018"]
for url in urls:
    r = requests.get(url)
    t = r.text.strip().replace("\n","").replace("\r", "")
    p = OwgrEventsHtmlParser()
    p.feed(t)
    events_for_2018.extend(p.all_rows[1:])
len(events_for_2018)

397

In [5]:
t = r.text.strip().replace("\n","").replace("\r", "")

In [6]:
p = OwgrEventsHtmlParser()

In [7]:
p.feed(t)

In [8]:
events = events_for_2018

In [9]:
all_event_names = [el["EventName"] for el in events]
all_event_names.reverse()
all_event_names

['56',
 '1st Points',
 'Sentry Tournament of Champions',
 'Sony Open in Hawaii',
 'BMW SA Open hosted by the City of Ekurhuleni',
 'REBEL Sport Masters',
 'CareerBuilder Challenge',
 'The Bahamas Great Exuma Classic at Sandals Emerald Bay',
 'Red Sea Egyptian Classic',
 'SMBC Singapore Open',
 'Abu Dhabi HSBC Championship presented by EGA',
 'Farmers Insurance Open',
 'Leopalace21 Myanmar Open',
 'Omega Dubai Desert Classic',
 'The Bahamas Great Abaco Classic at The Abaco Club',
 'Red Sea Ain Sokhna Classic',
 'PGM Darulaman Championship',
 'Maybank Championship',
 'Oates Vic Open',
 'Panama Championship',
 'Waste Management Phoenix Open',
 'ISPS Handa World Super 6 Perth',
 'AT&T Pebble Beach Pro-Am',
 'Open Prestigia',
 'Eye of Africa PGA Championship',
 'Club Colombia Championship',
 'Open Casa Green Golf',
 'Ein Bay Open',
 'Genesis Open',
 'NBO Oman Open',
 'Coca-Cola QLD PGA Championship',
 'Dimension Data Pro-Am',
 'Mediter Real Estate Masters',
 'The Honda Classic',
 'Horizon G

## Parse single event

In [10]:
e = events[0]["EventUrl"]
r = requests.get(BASE_URL + e)

In [11]:
h = clean_html(r.text)

In [12]:
EVENT_COLS = ['Pos', 'PlayerUrl', 'Name', 'R1', 'R2', 'R3', 'R4', 'Agg', 'Ranking Points']


class OwgrSingleEventHtmlParser(HTMLParser):
    def __init__(self):
        self.at_table = False
        self.row = None
        self.all_rows = []
        self.debug = False
        
        super().__init__()
    
    def handle_starttag(self, tag, attrs):
        if not self.at_table and tag == "table":
            if self.debug:
                print("Starting table")
            self.at_table = True
        
        if self.at_table:
            if tag == "tr":
                if self.debug:
                    print("starting new row")
                self.row = []
            
            if self.row is not None and tag == "a":
                self.row.append(attrs[0][1])

    def handle_endtag(self, tag):
        if self.at_table and tag == "table":
            if self.debug:
                print("Leaving the table :", tag)
            self.at_table = False

        if tag == "tr":
            r = [el.strip() for el in self.row]
            if self.debug:
                print(r)
                print("Leaving the row :", tag)
            if len(r) > 0:
                d = dict(zip(EVENT_COLS, self.row))
                self.all_rows.append(d)
            self.row = None

    def handle_data(self, data):
        if self.at_table and self.row is not None:
            if not is_str_blank(data):
                self.row.append(data)


In [13]:
p = OwgrSingleEventHtmlParser()
p.feed(h)

In [14]:
p.all_rows[1:]

[{'Agg': '268',
  'Name': 'Poom Saksansin',
  'PlayerUrl': '/en/Ranking/PlayerProfile.aspx?playerID=14951',
  'Pos': '1',
  'R1': '67',
  'R2': '63',
  'R3': '70',
  'R4': '68',
  'Ranking Points': '24'},
 {'Agg': '271',
  'Name': 'Jazz Janewattananond',
  'PlayerUrl': '/en/Ranking/PlayerProfile.aspx?playerID=14368',
  'Pos': '2',
  'R1': '68',
  'R2': '69',
  'R3': '69',
  'R4': '65',
  'Ranking Points': '14.4'},
 {'Agg': '273',
  'Name': 'Panuphol Pittayarat',
  'PlayerUrl': '/en/Ranking/PlayerProfile.aspx?playerID=14133',
  'Pos': '3',
  'R1': '71',
  'R2': '66',
  'R3': '70',
  'R4': '66',
  'Ranking Points': '9.6'},
 {'Agg': '274',
  'Name': 'Henrik Stenson',
  'PlayerUrl': '/en/Ranking/PlayerProfile.aspx?playerID=5994',
  'Pos': '4',
  'R1': '67',
  'R2': '68',
  'R3': '68',
  'R4': '71',
  'Ranking Points': '7.2'},
 {'Agg': '275',
  'Name': 'Thitiphun Chuayprakong',
  'PlayerUrl': '/en/Ranking/PlayerProfile.aspx?playerID=14129',
  'Pos': '5',
  'R1': '67',
  'R2': '70',
  'R3': 

## Find events of interest

## Get weights

In [15]:
weights = {}
for l in open("data/2019 draft board for scrape.csv"):
    r = l.strip().split(",")
    try:
        weights[r[1]] = float(r[0])
    except:
        pass

In [16]:
OUR_TOURNEYS = [k for k,v in weights.items()]

In [17]:
weights

{'100th PGA Championship': 2.0,
 '147th Open Championship': 2.0,
 'A Military Tribute at The Greenbrier': 0.5,
 'AT&T Byron Nelson': 1.0,
 'AT&T Pebble Beach Pro-Am': 1.0,
 'Aberdeen Standard Investments Scottish Open': 1.0,
 'Arnold Palmer Invitational presented by Mastercard': 1.0,
 'BMW Championship': 1.5,
 'BMW PGA CHAMPIONSHIP': 0.5,
 'Dell Technologies Championship': 0.5,
 'Dubai Duty Free Irish Open Hosted by the Rory Foundation': 1.0,
 'FedEx St. Jude Classic': 1.0,
 'Fort Worth Invitational': 1.0,
 'Genesis Open': 1.0,
 'HOLD FOR ZURICH': 0.0,
 'Houston Open': 0.5,
 'John Deere Classic': 1.0,
 'Masters Tournament': 2.0,
 'RBC Canadian Open': 1.0,
 'RBC Heritage': 1.0,
 'THE PLAYERS Championship': 1.5,
 'TOUR Championship': 1.5,
 'The Honda Classic': 1.0,
 'The Northern Trust': 1.0,
 'Travelers Championship': 1.0,
 'U.S. Open': 2.0,
 'Valspar Championship': 1.0,
 'WGC - Bridgestone Invitational': 1.0,
 'WGC - Dell Technologies Match Play': 1.0,
 'WGC - Mexico Championship': 1.5

**Check!  All the names of the tournaments are found in our data from OWGR.**

In [18]:
for el in OUR_TOURNEYS:
    if el not in all_event_names:
        print(el)

HOLD FOR ZURICH


## Parse each event list:

In [19]:
player_id_to_name = {}
player_id_to_standings = defaultdict(list)

In [20]:
for event in events:
    if event["EventName"] in OUR_TOURNEYS:
        w = weights[event["EventName"]]
        print(event["EventName"])
        r = requests.get(BASE_URL + event["EventUrl"])
        h = clean_html(r.text)
        p = OwgrSingleEventHtmlParser()
        p.feed(h)
        for el in p.all_rows[1:]:
            i = get_id_from_player_url(el["PlayerUrl"])
            if i not in player_id_to_name:
                player_id_to_name[i] = el["Name"]
            player_id_to_standings[i].append((el["Pos"], w))

TOUR Championship


AttributeError: 'NoneType' object has no attribute 'group'

In [None]:
{player_id_to_name[k]: v for k,v in  player_id_to_standings.items()}

In [None]:
results = {player_id_to_name[k]: [( get_bucket_from_standing(el[0]), el[1]) for el in v] for k,v in  player_id_to_standings.items()}
results

In [None]:
#  1 	2-3 	4-10	11-25	Made Cut	Missed Cut / WD / DQ

final_rows = []
for player, standings in results.items():
    row = [player, 0, 0, 0, 0, 0, 0]
    for standing in standings:
        group = standing[0]
        weight = standing[1]
        if group == '1':
            row[1] += weight
        elif group == '2-3':
            row[2] += weight
        elif group == '4-10':
            row[3] += weight
        elif group == '11-25':
            row[4] += weight
        elif group == 'CM':
            row[5] += weight
        elif group == "boo":
            row[6] += weight
    
    final_rows.append(row)

final_rows

In [None]:
with open("data/2019_draft_prep_out.csv", "w") as wf:
    for row in final_rows:
        wf.write(",".join([str(el) for el in row]) + "\n")