In [1]:
# Results-archive page for the 1975 season; fetched below with html_parse_tree.
YEAR_URL = "http://www.atpworldtour.com/en/scores/results-archive?year=1975"

In [186]:
import requests
from lxml import html
import re

def html_parse_tree(url, timeout=30):
    """Download ``url`` and parse the response body into an lxml HTML tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        The root element produced by ``lxml.html.fromstring``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    page = requests.get(url, timeout=timeout)
    # Fail loudly on an error response instead of parsing the error page,
    # which would make every later XPath query silently return an empty list.
    page.raise_for_status()
    return html.fromstring(page.content)

def xpath_parse(tree, xpath):
    """Evaluate the ``xpath`` expression on ``tree`` and return whatever it matches."""
    return tree.xpath(xpath)


def regex_strip_string(string):
    """Return ``string`` without newlines, carriage returns or tabs.

    Each of the three characters is deleted everywhere in the string, and
    surrounding whitespace is trimmed after every pass.
    """
    for pattern in ('\n', '\r', '\t'):
        string = re.sub(pattern, '', string).strip()
    return string


def regex_strip_array(array):
    """Clean every entry of ``array`` with ``regex_strip_string``.

    The sequence passed in is overwritten entry by entry and is also the
    return value, so callers holding the original list see the cleaned text.
    """
    for position, raw in enumerate(array):
        array[position] = regex_strip_string(raw).strip()
    return array

In [17]:
year_tree = html_parse_tree(YEAR_URL) 

In [18]:
year_tree

<Element html at 0x10b2658b8>

In [19]:
# Collect the text of every span whose class contains 'tourney-title' on the
# season page held in year_tree.
tourney_title_xpath = "//span[contains(@class, 'tourney-title')]/text()"
tourney_title_parsed = xpath_parse(year_tree, tourney_title_xpath)
# regex_strip_array cleans the list in place and returns that same list, so
# tourney_title_parsed and tourney_title_cleaned refer to one object.
tourney_title_cleaned = regex_strip_array(tourney_title_parsed)

In [50]:
list(enumerate(tourney_title_cleaned))

[(0, 'Australian Open'),
 (1, 'Auckland'),
 (2, 'San Juan WCT'),
 (3, 'Bahamas'),
 (4, 'Baltimore'),
 (5, 'Philadelphia WCT'),
 (6, 'Birmingham'),
 (7, 'Richmond-WCT'),
 (8, 'Dayton'),
 (9, 'Roanoke'),
 (10, 'St. Petersburg WCT'),
 (11, 'Little Rock'),
 (12, 'Basel'),
 (13, 'Bologna WCT'),
 (14, 'Toronto WCT'),
 (15, 'Salisbury'),
 (16, 'Barcelona WCT'),
 (17, 'Fort Worth WCT'),
 (18, 'La Costa WCT'),
 (19, 'Boca Raton'),
 (20, 'Algiers'),
 (21, 'Rotterdam WCT'),
 (22, 'San Antonio WCT'),
 (23, 'Fairfield'),
 (24, 'Cairo'),
 (25, 'London WCT'),
 (26, 'Shreveport'),
 (27, 'WCT Aetna World Cup'),
 (28, 'Munich WCT'),
 (29, 'Sao Paulo WCT'),
 (30, 'Washington WCT'),
 (31, 'Washington WCT'),
 (32, 'Dusseldorf Indoor'),
 (33, 'Hampton'),
 (34, 'Caracas WCT'),
 (35, 'Memphis WCT'),
 (36, 'New York'),
 (37, 'Atlanta WCT'),
 (38, 'Monte Carlo WCT'),
 (39, 'Orlando WCT'),
 (40, 'Jackson'),
 (41, 'Tuscon'),
 (42, 'Washington Indoor'),
 (43, 'St. Louis WCT'),
 (44, 'Barcelona'),
 (45, 'Johannesbu

In [53]:
# Results link of a single tournament row. The hard-coded [48] is an XPath
# position (1-based) among the matching rows of the same parent; the recorded
# output for this cell is the Tokyo 1975 results link.
tourney_details_url_xpath = "//tr[contains(@class, 'tourney-result')][48]/td[contains(@class, 'tourney-details')]/a/@href"
tourney_details_url_parsed = xpath_parse(year_tree, tourney_details_url_xpath)

In [54]:
tourney_details_url_parsed

['/en/scores/archive/tokyo/9037/1975/results']

In [59]:
# Gets all table rows whose class attribute is exactly 'tourney-result'
# (an exact match, unlike the contains(@class, ...) queries used above).
xpath = "//tr[@class='tourney-result']"
# NOTE(review): `w` is reassigned to unrelated objects in later scratch cells.
w = xpath_parse(year_tree, xpath)

In [62]:
w

[<Element tr at 0x10b3becc8>,
 <Element tr at 0x10b1dd368>,
 <Element tr at 0x10b2659a8>,
 <Element tr at 0x10b265ae8>,
 <Element tr at 0x10b265868>,
 <Element tr at 0x10b265bd8>,
 <Element tr at 0x10b265318>,
 <Element tr at 0x10b265a48>,
 <Element tr at 0x10b265818>,
 <Element tr at 0x10b265a98>,
 <Element tr at 0x10b265908>,
 <Element tr at 0x10b265b38>,
 <Element tr at 0x10b265c28>,
 <Element tr at 0x10b265c78>,
 <Element tr at 0x10b265cc8>,
 <Element tr at 0x10b265d18>,
 <Element tr at 0x10b265d68>,
 <Element tr at 0x10b265ea8>,
 <Element tr at 0x10b265ef8>,
 <Element tr at 0x10b265f48>,
 <Element tr at 0x10b265f98>,
 <Element tr at 0x10b281db8>,
 <Element tr at 0x10b281c78>,
 <Element tr at 0x10b281048>,
 <Element tr at 0x10b281098>,
 <Element tr at 0x10b2810e8>,
 <Element tr at 0x10b281138>,
 <Element tr at 0x10b281188>,
 <Element tr at 0x10b2811d8>,
 <Element tr at 0x10b281228>,
 <Element tr at 0x10b281278>,
 <Element tr at 0x10b2812c8>,
 <Element tr at 0x10b281318>,
 <Element 

In [496]:
import pandas as pd
import requests
from lxml import html


def xpath_parse(tree, xpath):
    """Run the XPath query ``xpath`` against ``tree`` and hand back the result."""
    return tree.xpath(xpath)


class TournamentScraper():
    """Extracts the fields of one tournament from a results-archive table row.

    The row is parsed during construction; every extracted value is stored as
    an instance attribute and most of them can be exported with
    ``state_to_dict``.
    """

    EXPECTED_ELEMS = 8  # We expect 8 elements per tournament row
    ELIGIBLE_SURFACES = [
        'Hard',
        'Carpet',
        'Clay',
        'Grass'
    ]

    def __init__(self, tr_elem):
        """Parse ``tr_elem`` and populate the tournament attributes.

        Args:
            tr_elem: Element for one ``tourney-result`` table row; it must
                offer an ``xpath`` method.

        Raises:
            ValueError: If the row does not contain ``EXPECTED_ELEMS`` cells
                or one of the cells has an unexpected layout.
        """
        self.tr_elem = tr_elem
        self._check_xpath_validity()
        self.table_entries = self.tr_elem.xpath('.//td')
        # Cells 0 and 1 are not read; parsing starts at the title cell.
        self._process_title_location_date(self.table_entries[2])
        self._process_draw_sizes(self.table_entries[3])
        self._process_surface(self.table_entries[4])
        self._process_fin_commit(self.table_entries[5])
        self._process_tourney_winners(self.table_entries[6])
        self._process_results_link(self.table_entries[7])

    @staticmethod
    def clean_text(to_parse, xpath_expr, unique=True):
        """Return the cleaned text of the single node matched by ``xpath_expr``.

        Args:
            to_parse: Element to query.
            xpath_expr: XPath expression expected to match one text node.
            unique: Accepted for compatibility; currently not consulted.

        Raises:
            ValueError: If more than one node matches.
            IndexError: If nothing matches.
        """
        _x = to_parse.xpath(xpath_expr)
        if len(_x) > 1:
            raise ValueError("Expected singleton, received array of length %d" % len(_x))
        return regex_strip_string(_x[0])

    def _check_xpath_validity(self):
        """Raise ``ValueError`` unless the row holds ``EXPECTED_ELEMS`` cells."""
        n_elems = len(self.tr_elem.xpath('.//td'))
        if n_elems != self.EXPECTED_ELEMS:
            # Format the message itself; calling .format on the exception
            # object would fail with AttributeError and hide the real problem.
            raise ValueError(
                "Expected {0} elements per tournament, received {1}".format(
                    self.EXPECTED_ELEMS, n_elems
                )
            )

    def _process_title_location_date(self, elem):
        """Store the tournament title, location and dates found in ``elem``."""
        self.tourney_title = self.clean_text(elem, ".//span[@class='tourney-title']/text()")
        self.tourney_location = self.clean_text(elem, ".//span[@class='tourney-location']/text()")
        self.tourney_dates = self.clean_text(elem, ".//span[@class='tourney-dates']/text()")

    def _process_draw_sizes(self, elem):
        """Store the singles and doubles draw sizes found in ``elem`` as ints.

        Raises:
            ValueError: If the cell does not contain exactly two draw sizes.
        """
        sgd_dbl = regex_strip_array(elem.xpath(".//div[@class='item-details']/text()"))
        # Sanity check on the cell layout: labels must read SGL then DBL.
        assert sgd_dbl == ['SGL', 'DBL', '']
        draw_sizes = regex_strip_array(elem.xpath(".//span[@class='item-value']/text()"))
        if len(draw_sizes) != 2:
            # Report the count; formatting the list itself with %d is a TypeError.
            raise ValueError("Expected two draw sizes, found %d instead" % len(draw_sizes))
        self.singles_draw_size = int(draw_sizes[0])
        self.doubles_draw_size = int(draw_sizes[1])

    def _process_surface(self, elem):
        """Store the indoor/outdoor flag and the court surface found in ``elem``.

        Raises:
            ValueError: If the flag is neither "Indoor" nor "Outdoor", or the
                surface is not listed in ``ELIGIBLE_SURFACES``.
        """
        in_out = regex_strip_string(elem.xpath(".//div[@class='item-details']/text()[1]")[0])
        if in_out not in ['Indoor', 'Outdoor']:
            raise ValueError('Expected to see "Indoor" or "Outdoor", instead saw "%s"' % in_out)
        self.in_out = in_out
        surface = regex_strip_string(elem.xpath(".//span[@class='item-value']/text()[1]")[0])
        if surface not in self.ELIGIBLE_SURFACES:
            raise ValueError("Unrecognized Surface %s" % surface)
        self.surface = surface

    @staticmethod
    def _first_if_present(to_parse, expr, default=None):
        """Return the cleaned first match of ``expr``, or ``default`` if none."""
        xpath_res = to_parse.xpath(expr)
        if len(xpath_res) > 0:
            return regex_strip_string(xpath_res[0])
        else:
            return default

    def _process_fin_commit(self, elem):
        """Store the first ``item-value`` text of ``elem`` (``None`` if absent)."""
        # TODO: Make a function to process if present...
        self.fin_commit = self._first_if_present(elem, ".//span[@class='item-value']/text()[1]")

    def _process_tourney_winners(self, elem):
        """Store names and profile links of the singles and doubles winners.

        The singles fields are ``None`` when the singles block has no link.
        The doubles fields are all ``None`` unless the doubles block holds
        exactly two links.

        Raises:
            ValueError: If ``elem`` does not contain exactly two winner blocks.
        """
        elems = elem.xpath(".//div[@class='tourney-detail-winner']")
        if len(elems) != 2:
            raise ValueError("Expected two winners, instead found %d" % len(elems))
        singles, doubles = elems
        # Sanity check on block order: singles first, doubles second.
        assert regex_strip_string(singles.xpath("./text()[1]")[0]) == 'SGL:'
        assert regex_strip_string(doubles.xpath("./text()[1]")[0]) == 'DBL:'

        singles_winner_html = singles.xpath("./a")
        doubles_winner_html = doubles.xpath("./a")
        if len(singles_winner_html) == 0:
            self.singles_winner_name = None
            self.singles_winner_link = None
        else:
            self.singles_winner_name = self._first_if_present(
                singles_winner_html[0], "./text()"
            )
            self.singles_winner_link = singles_winner_html[0].get('href')

        if len(doubles_winner_html) != 2:
            self.doubles_winner_first_name = None
            self.doubles_winner_first_link = None
            self.doubles_winner_second_name = None
            self.doubles_winner_second_link = None
        else:
            self.doubles_winner_first_name = self._first_if_present(
                doubles_winner_html[0], "./text()"
            )
            self.doubles_winner_second_name = self._first_if_present(
                doubles_winner_html[1], "./text()"
            )
            self.doubles_winner_first_link = doubles_winner_html[0].get('href')
            self.doubles_winner_second_link = doubles_winner_html[1].get('href')

    def _process_results_link(self, elem):
        """Store the ``href`` of the first link that is a direct child of ``elem``."""
        self.tourney_url_suffix = elem.xpath("./a")[0].get('href')

    def state_to_dict(self):
        """Return the parsed tournament fields as a flat dictionary."""
        # NOTE(review): ``fin_commit`` is parsed but not exported here —
        # confirm whether that omission is intentional.
        return {
            'tourney_title': self.tourney_title,
            'tourney_location': self.tourney_location,
            'tourney_dates': self.tourney_dates,
            'tourney_singles_draw_size': self.singles_draw_size,
            'tourney_doubles_draw_size': self.doubles_draw_size,
            'tourney_in_out': self.in_out,
            'tourney_surface': self.surface,
            'tourney_singles_winner_name': self.singles_winner_name,
            'tourney_singles_winner_link': self.singles_winner_link,
            'tourney_doubles_winner_first_name': self.doubles_winner_first_name,
            'tourney_doubles_winner_second_name': self.doubles_winner_second_name,
            'tourney_doubles_winner_first_link': self.doubles_winner_first_link,
            'tourney_doubles_winner_second_link': self.doubles_winner_second_link,
            'tourney_url_suffix': self.tourney_url_suffix
        }
        

class TennisScraper():
    """Downloads one season's results archive and parses each tournament row."""

    @staticmethod
    def _get_tourney_tree(base_url):
        """Fetch ``base_url`` and return the parsed lxml HTML tree."""
        response = requests.get(base_url)
        return html.fromstring(response.content)

    def __init__(self, year):
        """Fetch the archive page for ``year`` and build one scraper per row.

        Args:
            year: Season to download; substituted into the URL with ``%d``.
        """
        self.year = year
        self.base_url = "http://www.atpworldtour.com/en/scores/results-archive?year=%d" % self.year
        self.tourney_tree = self._get_tourney_tree(self.base_url)

        result_rows = xpath_parse(self.tourney_tree, "//tr[@class='tourney-result']")
        parsed_rows = []
        for row in result_rows:
            parsed_rows.append(TournamentScraper(row))
        self.tourneys = parsed_rows

    def tourney_df(self):
        """Return a DataFrame with one row per parsed tournament."""
        records = []
        for tourney in self.tourneys:
            records.append(tourney.state_to_dict())
        return pd.DataFrame(records)

In [497]:
ts = TennisScraper(1979)

In [498]:
ts.tourney_df()

Unnamed: 0,tourney_dates,tourney_doubles_draw_size,tourney_doubles_winner_first_link,tourney_doubles_winner_first_name,tourney_doubles_winner_second_link,tourney_doubles_winner_second_name,tourney_in_out,tourney_location,tourney_singles_draw_size,tourney_singles_winner_link,tourney_singles_winner_name,tourney_surface,tourney_title,tourney_url_suffix
0,1979.01.01,16,/en/players/phil-dent/d074/overview,Phil Dent,/en/players/bob-giltinan/g074/overview,Bob Giltinan,Outdoor,Hobart,32,/en/players/guillermo-vilas/v028/overview,Guillermo Vilas,Hard,Hobart,/en/scores/archive/hobart/713/1979/results
1,1979.01.02,16,/en/players/bernard-mitton/m072/overview,Bernard Mitton,/en/players/kim-warwick/w006/overview,Kim Warwick,Outdoor,Auckland,32,/en/players/tim-wilkison/w019/overview,Tim Wilkison,Hard,Auckland,/en/scores/archive/auckland/301/1979/results
2,1979.01.03,8,/en/players/peter-fleming/f030/overview,Peter Fleming,/en/players/john-mcenroe/m047/overview,John McEnroe,Indoor,London,0,,,Carpet,World Doubles WCT,/en/scores/archive/world-doubles-wct/601/1979/...
3,1979.01.15,16,/en/players/stan-smith/s060/overview,Stan Smith,/en/players/dick-stockton/s090/overview,Dick Stockton,Indoor,Birmingham,32,/en/players/jimmy-connors/c044/overview,Jimmy Connors,Carpet,Birmingham WCT,/en/scores/archive/birmingham/712/1979/results
4,1979.01.15,16,/en/players/marty-riessen/r055/overview,Marty Riessen,/en/players/sherwood-stewart/s082/overview,Sherwood Stewart,Indoor,Baltimore,32,/en/players/harold-solomon/s065/overview,Harold Solomon,Carpet,Baltimore,/en/scores/archive/baltimore/816/1979/results
5,1979.01.22,32,/en/players/wojtek-fibak/f020/overview,Wojtek Fibak,/en/players/tom-okker/o032/overview,Tom Okker,Indoor,Philadelphia,64,/en/players/jimmy-connors/c044/overview,Jimmy Connors,Carpet,Philadelphia WCT,/en/scores/archive/philadelphia/401/1979/results
6,1979.01.29,16,/en/players/brian-gottfried/g029/overview,Brian Gottfried,/en/players/john-mcenroe/m047/overview,John McEnroe,Indoor,Richmond,32,/en/players/bjorn-borg/b058/overview,Bjorn Borg,Carpet,Richmond WCT,/en/scores/archive/richmond/802/1979/results
7,1979.01.29,16,/en/players/vitas-gerulaitis/g008/overview,Vitas Gerulaitis,/en/players/vladimir-zednik/z023/overview,Vladimir Zednik,Indoor,Little Rock,32,/en/players/vitas-gerulaitis/g008/overview,Vitas Gerulaitis,Hard,Little Rock,/en/scores/archive/little-rock/660/1979/results
8,1979.02.10,0,,,,,Outdoor,Boca Raton,4,/en/players/bjorn-borg/b058/overview,Bjorn Borg,Clay,Pepsi Grand Slam,/en/scores/archive/boca-raton/1725/1979/results
9,1979.02.12,32,/en/players/gene-mayer/m038/overview,Gene Mayer,/en/players/sandy-mayer/m039/overview,Sandy Mayer,Outdoor,Rancho Mirage,64,/en/players/roscoe-tanner/t006/overview,Roscoe Tanner,Hard,Rancho Mirage,/en/scores/archive/rancho-mirage/404/1979/results


In [462]:
len(ts.tourneys)

108

In [459]:
ts.tourneys[0].doubles_winner_first_name

'John Alexander'

In [447]:
ts.tourneys[0].singles_winner_link

'/en/players/john-newcombe/n044/overview'

In [340]:
print(html.tostring(ts.tourneys[0].table_entries[0]))

b'<td class="when-heading"></td>\r\n\t\t\t\t\t\t\t    '


In [484]:
print(html.tostring(ts.tourneys[3].table_entries[7]))

b'<td class="tourney-details">\r\n\t\t\t<a href="/en/scores/archive/birmingham/712/1979/results" class="button-border">\r\n\t\t\t\tResults\r\n\t\t\t</a>\r\n\t</td>\r\n\r\n\t\t\t\t\t\t'


In [485]:
# Cell 7 of a tournament row is the one TournamentScraper.__init__ passes to
# _process_results_link (the "Results" link cell).
w = ts.tourneys[0].table_entries[7]
# Disabled leftover: TournamentScraper.__init__ looks for the
# 'tourney-detail-winner' blocks in table_entries[6], not in cell 7.
# winner1, winner2 = w.xpath(".//div[@class='tourney-detail-winner']")

In [489]:
w.xpath('./a')[0].get('href')

'/en/scores/archive/hobart/713/1979/results'

In [436]:
# NOTE(review): `winner1` has no live assignment in the visible cells (its
# only definition is commented out), so this presumably relies on leftover
# kernel state and would fail after Restart & Run All — confirm.
q = winner1.xpath(".//a")[0]

In [438]:
q.get('href')

'/en/players/john-newcombe/n044/overview'

In [425]:
winner2.xpath("./a[1]/text()")

['\r\n\t\t\tJohn Alexander\r\n\t\t']

In [236]:
regex_strip_array(w.xpath(".//span[@class='item-value']/text()"))

['64', '32']

In [97]:
w = ts.tourneys[0].tr_elem

In [130]:
xpath_parse(w, ".//span[@class='tourney-title']/text()")

['\r\n                Australian Open\r\n            ']

In [132]:
len(xpath_parse(w, ".//td"))

8

In [108]:
print(html.tostring(w))

b'<tr class="tourney-result">\r\n\t\t\t\t\t\t\t<td class="when-heading"></td>\r\n\t\t\t\t\t\t\t    <td class="tourney-badge-wrapper">\r\n                <img alt="tournament badge" src="/assets/atpwt/images/tournament/badges/categorystamps_grandslam.png">\r\n    </td>\r\n\r\n\t\t\t\t\t\t\t    <td class="title-content">\r\n            <span class="tourney-title">\r\n                Australian Open\r\n            </span>\r\n\r\n        <span class="tourney-location">\r\n            Melbourne\r\n        </span>\r\n        <span class="tourney-dates">\r\n1974.12.21        </span>\r\n    </td>\r\n\r\n\t\t\t\t\t\t\t    <td class="tourney-details">\r\n        <div class="info-area">\r\n            <div class="item-details">\r\n                SGL\r\n                <a href="/en/scores/archive/australian-open/580/1975/draws?matchtype=singles">\r\n                    <span class="item-value">\r\n                        64\r\n                    </span>\r\n                </a>\r\n               

#### Match Scores

In [499]:
# Results page used to exercise MatchResultsParser (Australian Open, 1976).
# Note this uses the atptour.com host over https, unlike the archive URLs above.
TEST_URL = 'https://www.atptour.com/en/scores/archive/australian-open/580/1976/results'

In [564]:
class MatchResultsParser():
    """Collects match results from the score table of a tournament results page.

    Results accumulate in ``match_score_list`` as dictionaries, each tagged
    with the round heading that most recently preceded it in the table.
    """

    def __init__(self, url):
        """Remember ``url``; nothing is downloaded until ``parse_scores`` runs."""
        self._cur_round_name = None
        self.match_score_list = []
        self.url = url

    def _parse_header_row(self, header_row):
        """Record the text of the row's header cell as the current round name."""
        header_texts = header_row.xpath("./th/text()")
        self._cur_round_name = regex_strip_string(header_texts[0])

    def _parse_result_row(self, result_row):
        """Append one match record built from the two player links in the row."""
        # Exactly two links are expected: the winner first, then the loser.
        winner_link, loser_link = result_row.xpath("./td[@class='day-table-name']/a")
        record = {
            'winner_name': regex_strip_string(winner_link.xpath('./text()')[0]),
            'loser_name': regex_strip_string(loser_link.xpath('./text()')[0]),
            'winner_url': winner_link.get('href'),
            'loser_url': loser_link.get('href'),
            'round': self._cur_round_name
        }
        self.match_score_list.append(record)

    def _parse_row(self, table_row):
        """Dispatch ``table_row`` to the result or header handler."""
        link_count = len(table_row.xpath(".//td[@class='day-table-name']/a"))
        if link_count == 2:
            # A result row carries two player links and no header cell.
            assert len(table_row.xpath("./th")) == 0
            self._parse_result_row(table_row)
            return
        # Anything else is probably a round header with a single <th>.
        assert len(table_row.xpath("./th")) == 1
        self._parse_header_row(table_row)

    def parse_scores(self):
        """Download the page and feed every row of its score table to ``_parse_row``."""
        response = requests.get(self.url)
        self.tree = html.fromstring(response.content)
        day_tables = self.tree.xpath(".//table[@class='day-table']")
        assert len(day_tables) == 1
        for row in day_tables[0].xpath('.//tr'):
            self._parse_row(row)

In [565]:
# Constructing the parser only stores the URL; parse_scores() performs the
# download and fills mr.match_score_list.
mr = MatchResultsParser(TEST_URL)
mr.parse_scores()

In [566]:
mr.match_score_list

[{'winner_name': 'Mark Edmondson',
  'loser_name': 'John Newcombe',
  'winner_url': '/en/players/mark-edmondson/e005/overview',
  'loser_url': '/en/players/john-newcombe/n044/overview',
  'round': 'Finals'},
 {'winner_name': 'Mark Edmondson',
  'loser_name': 'Ken Rosewall',
  'winner_url': '/en/players/mark-edmondson/e005/overview',
  'loser_url': '/en/players/ken-rosewall/r075/overview',
  'round': 'Semi-Finals'},
 {'winner_name': 'John Newcombe',
  'loser_name': 'Ray Ruffels',
  'winner_url': '/en/players/john-newcombe/n044/overview',
  'loser_url': '/en/players/ray-ruffels/r076/overview',
  'round': 'Semi-Finals'},
 {'winner_name': 'Ken Rosewall',
  'loser_name': 'Brad Drewett',
  'winner_url': '/en/players/ken-rosewall/r075/overview',
  'loser_url': '/en/players/brad-drewett/d040/overview',
  'round': 'Quarter-Finals'},
 {'winner_name': 'John Newcombe',
  'loser_name': 'Ross Case',
  'winner_url': '/en/players/john-newcombe/n044/overview',
  'loser_url': '/en/players/ross-case/c020

In [500]:
root = html.fromstring(requests.get(TEST_URL).content)

In [512]:
zubba = root.xpath(".//table[@class='day-table']")[0]

In [515]:
wubba = zubba.xpath('.//tr')

In [549]:
wubba[1].xpath("./td[@class='day-table-name']/a")

[<Element a at 0x1181fe278>, <Element a at 0x1181fe8b8>]

In [557]:
html.tostring(wubba[0])

b'<tr>\r\n                        <th colspan="10">Finals</th>\r\n                    </tr>\r\n                '