In [1]:
# Results-archive URL for the 1975 season.
# NOTE(review): no cell shown in this notebook reads YEAR_URL; TennisScraper builds its
# own URL from the `year` argument.
YEAR_URL = "http://www.atpworldtour.com/en/scores/results-archive?year=1975"

In [644]:
import pandas as pd
import requests
from lxml import html


def regex_strip_string(string):
    """Drop every newline, carriage return and tab from ``string`` and trim it.

    Implemented with ``str.translate`` so the helper has no dependency on the ``re``
    module, which the import cell of this notebook does not bring into scope.

    Args:
        string: Text to clean (for example a raw XPath ``text()`` result).

    Returns:
        ``string`` with all ``\\n``, ``\\r`` and ``\\t`` characters removed and any
        remaining leading/trailing whitespace stripped.
    """
    # Mapping a code point to None deletes it wherever it occurs in the string.
    without_controls = string.translate(str.maketrans('', '', '\n\r\t'))
    return without_controls.strip()


def regex_strip_array(array):
    """Clean every entry of ``array`` in place with ``regex_strip_string``.

    Args:
        array: Mutable sequence of strings; each slot is overwritten with its
            cleaned value.

    Returns:
        The same ``array`` object that was passed in.
    """
    for position, raw_value in enumerate(array):
        cleaned = regex_strip_string(raw_value)
        array[position] = cleaned.strip()
    return array

def clean_text(to_parse, xpath_expr, unique=True):
    """Evaluate ``xpath_expr`` on ``to_parse`` and return the cleaned first match.

    Args:
        to_parse: Object exposing an ``xpath(expr)`` method that returns a list.
        xpath_expr: XPath expression expected to select text.
        unique: When true (the default) more than one match is an error. When
            false, extra matches are tolerated and the first one is returned.

    Returns:
        The first match passed through ``regex_strip_string``.

    Raises:
        ValueError: If nothing matches, or if ``unique`` is true and more than one
            node matches.
    """
    matches = to_parse.xpath(xpath_expr)
    # An empty result used to surface as an IndexError from indexing below; name the
    # failing expression instead so the broken selector is easy to find.
    if len(matches) == 0:
        raise ValueError("Expected singleton, received no match for %r" % xpath_expr)
    if unique and len(matches) > 1:
        raise ValueError("Expected singleton, received array of length %d" % len(matches))
    return regex_strip_string(matches[0])


class MatchResultsParser():
    """Download one tournament results page and collect its matches.

    Parsing happens during construction. Afterwards ``match_result_list`` holds one
    dict per match with the keys ``winner_name``, ``loser_name``, ``winner_url``,
    ``loser_url``, ``round`` and ``score``.
    """

    URL_PREFIX = 'https://www.atptour.com/'

    def __init__(self, url):
        """Store ``url`` and immediately fetch and parse it."""
        self._cur_round_name = None
        self.match_result_list = []
        self.url = url
        self.parse_scores()

    def parse_scores(self):
        """Fetch ``self.url`` and feed every row of its ``day-table`` to the parser.

        A page without a ``day-table`` leaves ``match_result_list`` untouched; more
        than one such table trips the assertion.
        """
        response = requests.get(self.url)
        self.tree = html.fromstring(response.content)
        tables = self.tree.xpath(".//table[@class='day-table']")
        assert len(tables) <= 1
        if len(tables) == 0:
            return
        for row in tables[0].xpath('.//tr'):
            self._parse_row(row)

    def _parse_row(self, table_row):
        """Dispatch a ``<tr>``: two player links mean a match, anything else a header."""
        player_links = table_row.xpath(".//td[@class='day-table-name']/a")
        if len(player_links) == 2:
            # A match row must not carry a header cell.
            assert len(table_row.xpath("./th")) == 0
            self._parse_result_row(table_row)
            return
        # If this is the case, probably a header
        assert len(table_row.xpath("./th")) == 1
        self._parse_header_row(table_row)

    def _parse_header_row(self, header_row):
        """Remember the round name that applies to the match rows that follow."""
        round_name = clean_text(header_row, "./th/text()")
        self._cur_round_name = round_name

    def _parse_result_row(self, result_row):
        """Append one match dict built from a row holding exactly two player links."""
        winner_link, loser_link = result_row.xpath("./td[@class='day-table-name']/a")
        match_record = {
            'winner_name': clean_text(winner_link, './text()'),
            'loser_name': clean_text(loser_link, './text()'),
            'winner_url': winner_link.get('href'),
            'loser_url': loser_link.get('href'),
            'round': self._cur_round_name,
            # The score cell may yield several text nodes; they are cleaned and
            # concatenated without a separator.
            'score': ''.join(
                regex_strip_array(
                    result_row.xpath("./td[@class='day-table-score']/a/text()")
                )
            ),
        }
        self.match_result_list.append(match_record)


class TournamentScraper():
    """Parse one tournament row of the results-archive table plus its match results.

    The constructor reads the row's ``<td>`` cells (title/location/dates, draw sizes,
    surface, financial commitment, winners, results link) into attributes and then,
    when the row links to a results page, fetches that page through
    ``MatchResultsParser``.
    """

    EXPECTED_ELEMS = 8  # We expect 8 elements per tournament row
    ELIGIBLE_SURFACES = [
        'Hard',
        'Carpet',
        'Clay',
        'Grass'
    ]

    def __init__(self, tr_elem):
        """Parse ``tr_elem`` eagerly, including the linked results page if any.

        Args:
            tr_elem: Row element exposing ``xpath``; it must contain exactly
                ``EXPECTED_ELEMS`` ``<td>`` descendants.

        Raises:
            ValueError: If a cell does not have the expected content.
            AssertionError: If one of the layout sanity checks fails.
        """
        self.tr_elem = tr_elem
        self._check_xpath_validity()
        self.table_entries = self.tr_elem.xpath('.//td')
        # Cells 0 and 1 are not read by this class.
        self._process_title_location_date(self.table_entries[2])
        self._process_draw_sizes(self.table_entries[3])
        self._process_surface(self.table_entries[4])
        self._process_fin_commit(self.table_entries[5])
        self._process_tourney_winners(self.table_entries[6])
        self._process_results_link(self.table_entries[7])
        self._parse_year_id()
        self._parse_results()

    def _check_xpath_validity(self):
        """Raise ``ValueError`` unless the row has ``EXPECTED_ELEMS`` ``<td>`` cells."""
        n_elems = len(self.tr_elem.xpath('.//td'))
        if n_elems != self.EXPECTED_ELEMS:
            # Format the message string itself; calling .format on the exception
            # object would raise AttributeError instead of the intended error.
            raise ValueError(
                "Expected {0} elements per tournament, received {1}".format(
                    self.EXPECTED_ELEMS, n_elems
                )
            )

    def _process_title_location_date(self, elem):
        """Read the title, location and dates spans of the cell into attributes."""
        self.tourney_title = clean_text(elem, ".//span[@class='tourney-title']/text()")
        self.tourney_location = clean_text(elem, ".//span[@class='tourney-location']/text()")
        self.tourney_dates = clean_text(elem, ".//span[@class='tourney-dates']/text()")

    def _process_draw_sizes(self, elem):
        """Read the singles and doubles draw sizes as integers.

        Raises:
            ValueError: If the cell does not hold exactly two ``item-value`` texts.
        """
        sgd_dbl = regex_strip_array(elem.xpath(".//div[@class='item-details']/text()"))
        assert sgd_dbl == ['SGL', 'DBL', '']
        draw_sizes = regex_strip_array(elem.xpath(".//span[@class='item-value']/text()"))
        if len(draw_sizes) != 2:
            # Report the count; formatting the list itself with %d raises TypeError.
            raise ValueError("Expected two draw sizes, found %d instead" % len(draw_sizes))
        self.singles_draw_size = int(draw_sizes[0])
        self.doubles_draw_size = int(draw_sizes[1])

    def _process_surface(self, elem):
        """Read the indoor/outdoor flag and the surface, validating both.

        Raises:
            ValueError: If the flag is not ``Indoor``/``Outdoor`` or the surface is
                not listed in ``ELIGIBLE_SURFACES``.
        """
        in_out = regex_strip_string(elem.xpath(".//div[@class='item-details']/text()[1]")[0])
        if in_out not in ['Indoor', 'Outdoor']:
            raise ValueError('Expected to see "Indoor" or "Outdoor", instead saw "%s"' % in_out)
        self.in_out = in_out
        surface = regex_strip_string(elem.xpath(".//span[@class='item-value']/text()[1]")[0])
        if surface not in self.ELIGIBLE_SURFACES:
            raise ValueError("Unrecognized Surface %s" % surface)
        self.surface = surface

    @staticmethod
    def _first_if_present(to_parse, expr, default=None):
        """Return the cleaned first XPath match, or ``default`` when nothing matches."""
        xpath_res = to_parse.xpath(expr)
        if len(xpath_res) > 0:
            return regex_strip_string(xpath_res[0])
        else:
            return default

    def _process_fin_commit(self, elem):
        """Store the cell's first ``item-value`` text as ``fin_commit`` (``None`` if absent)."""
        self.fin_commit = self._first_if_present(elem, ".//span[@class='item-value']/text()[1]")

    def _process_tourney_winners(self, elem):
        """Read the singles winner and the doubles winning pair (names and links).

        Missing winner links leave the corresponding attributes as ``None``; the
        doubles attributes are only filled when exactly two links are present.

        Raises:
            ValueError: If the cell does not contain exactly two
                ``tourney-detail-winner`` blocks.
        """
        elems = elem.xpath(".//div[@class='tourney-detail-winner']")
        if len(elems) != 2:
            raise ValueError("Expected two winners, instead found %d" % len(elems))
        singles, doubles = elems
        assert regex_strip_string(singles.xpath("./text()[1]")[0]) == 'SGL:'
        assert regex_strip_string(doubles.xpath("./text()[1]")[0]) == 'DBL:'

        singles_winner_html = singles.xpath("./a")
        doubles_winner_html = doubles.xpath("./a")

        if len(singles_winner_html) == 0:
            self.singles_winner_name = None
            self.singles_winner_link = None
        else:
            self.singles_winner_name = self._first_if_present(
                singles_winner_html[0], "./text()"
            )
            self.singles_winner_link = singles_winner_html[0].get('href')

        if len(doubles_winner_html) != 2:
            self.doubles_winner_first_name = None
            self.doubles_winner_first_link = None
            self.doubles_winner_second_name = None
            self.doubles_winner_second_link = None
        else:
            self.doubles_winner_first_name = self._first_if_present(
                doubles_winner_html[0], "./text()"
            )
            self.doubles_winner_second_name = self._first_if_present(
                doubles_winner_html[1], "./text()"
            )
            self.doubles_winner_first_link = doubles_winner_html[0].get('href')
            self.doubles_winner_second_link = doubles_winner_html[1].get('href')

    def _process_results_link(self, elem):
        """Store the href of the cell's direct ``<a>`` child, or ``None`` without one."""
        tourney_url_elem = elem.xpath("./a")
        if len(tourney_url_elem) > 0:
            self.tourney_url_suffix = tourney_url_elem[0].get('href')
        else:
            self.tourney_url_suffix = None

    def _parse_year_id(self):
        """Derive ``tourney_id`` and ``year_id`` (``"<year>_<id>"``) from the results link.

        Both attributes are ``None`` when the row has no results link.
        """
        if self.tourney_url_suffix is None:
            # Define both attributes so later reads never hit AttributeError.
            self.tourney_id = None
            self.year_id = None
        else:
            # e.g. /en/scores/archive/hobart/713/1979/results
            split = self.tourney_url_suffix.split('/')
            assert split[0] == ''
            assert split[1] == 'en'
            assert split[2] == 'scores'
            assert split[3] == 'archive'
            self.tourney_id = split[5]
            url_year = split[6]
            self.year_id = '_'.join([url_year, self.tourney_id])

    def _parse_results(self):
        """Fetch and parse the linked results page into ``match_results``.

        Without a results link ``match_results`` is an empty list.
        """
        if self.tourney_url_suffix is not None:
            print("Parsing results for %s" % self.tourney_url_suffix)
            # The prefix ends with "/" and the suffix starts with "/"; join them
            # with exactly one separator instead of producing "...com//en/...".
            results_url = (
                MatchResultsParser.URL_PREFIX.rstrip('/')
                + '/'
                + self.tourney_url_suffix.lstrip('/')
            )
            mp = MatchResultsParser(results_url)
            self.match_results = mp.match_result_list
        else:
            self.match_results = []

    def state_to_dict(self):
        """Return the tournament-level attributes as a flat dict of ``tourney_*`` keys."""
        return {
            'tourney_title': self.tourney_title,
            'tourney_location': self.tourney_location,
            'tourney_dates': self.tourney_dates,
            'tourney_singles_draw_size': self.singles_draw_size,
            'tourney_doubles_draw_size': self.doubles_draw_size,
            'tourney_in_out': self.in_out,
            'tourney_surface': self.surface,
            'tourney_singles_winner_name': self.singles_winner_name,
            'tourney_singles_winner_link': self.singles_winner_link,
            'tourney_doubles_winner_first_name': self.doubles_winner_first_name,
            'tourney_doubles_winner_second_name': self.doubles_winner_second_name,
            'tourney_doubles_winner_first_link': self.doubles_winner_first_link,
            'tourney_doubles_winner_second_link': self.doubles_winner_second_link,
            'tourney_url_suffix': self.tourney_url_suffix,
            'tourney_year_id': self.year_id
        }

    def result_df(self):
        """Return the matches as a DataFrame with the tournament columns broadcast.

        Returns:
            ``None`` when the tournament has no parsed matches, otherwise one row
            per match plus every key of ``state_to_dict`` as a constant column.
        """
        if len(self.match_results) == 0:
            return None
        else:
            match_result_df = pd.DataFrame(self.match_results)
            for k, v in self.state_to_dict().items():
                match_result_df[k] = v
            return match_result_df
        

class TennisScraper():
    """Scrape every tournament listed on the results-archive page of one season.

    Construction downloads the archive page and builds one ``TournamentScraper`` per
    ``tourney-result`` row, which in turn fetches each linked results page.
    """

    @staticmethod
    def _get_tourney_tree(base_url):
        """Download ``base_url`` and return it parsed by ``lxml.html``."""
        return html.fromstring(requests.get(base_url).content)

    def __init__(self, year, challenger=False):
        """Fetch and parse the archive for ``year``.

        Args:
            year: Season to scrape; formatted into the URL with ``%d``.
            challenger: When true, ``&tournamentType=ch`` is appended to the URL.
        """
        self.year = year
        _base_url = "http://www.atpworldtour.com/en/scores/results-archive?year=%d"
        self.base_url = _base_url % self.year
        if challenger:
            self.base_url += "&tournamentType=ch"
        self.tourney_tree = self._get_tourney_tree(self.base_url)
        self.tourneys = [
            TournamentScraper(t) for t in self.tourney_tree.xpath("//tr[@class='tourney-result']")
        ]

    def tourney_df(self):
        """Return one row per tournament, built from ``state_to_dict``."""
        return pd.DataFrame([
            t.state_to_dict() for t in self.tourneys
        ])

    def match_df(self):
        """Return the matches of all tournaments stacked into one DataFrame.

        Tournaments whose ``result_df()`` is ``None`` are skipped. When nothing is
        left to stack an empty DataFrame is returned, because ``pd.concat`` raises
        ``ValueError`` on an empty or all-``None`` input.
        """
        frames = []
        for tourney in self.tourneys:
            frame = tourney.result_df()
            if frame is not None:
                frames.append(frame)
        if not frames:
            return pd.DataFrame()
        return pd.concat(frames)

In [645]:
# Scrape the 1979 season. Constructing TennisScraper downloads the archive page and
# every linked tournament results page, so this cell issues many HTTP requests.
ts = TennisScraper(1979)
# One row per match, with the tournament-level columns repeated on each row.
matches_1979 = ts.match_df()

Parsing results for /en/scores/archive/hobart/713/1979/results
Parsing results for /en/scores/archive/auckland/301/1979/results
Parsing results for /en/scores/archive/world-doubles-wct/601/1979/results
Parsing results for /en/scores/archive/birmingham/712/1979/results
Parsing results for /en/scores/archive/baltimore/816/1979/results
Parsing results for /en/scores/archive/philadelphia/401/1979/results
Parsing results for /en/scores/archive/richmond/802/1979/results
Parsing results for /en/scores/archive/little-rock/660/1979/results
Parsing results for /en/scores/archive/boca-raton/1725/1979/results
Parsing results for /en/scores/archive/rancho-mirage/404/1979/results
Parsing results for /en/scores/archive/sarasota/709/1979/results
Parsing results for /en/scores/archive/denver/389/1979/results
Parsing results for /en/scores/archive/dorado-beach/6880/1979/results
Parsing results for /en/scores/archive/memphis/402/1979/results
Parsing results for /en/scores/archive/lagos/205/1979/results
P

In [646]:
# Preview the first rows of the 1979 match table.
matches_1979.head()

Unnamed: 0,loser_name,loser_url,round,score,winner_name,winner_url,tourney_title,tourney_location,tourney_dates,tourney_singles_draw_size,...,tourney_in_out,tourney_surface,tourney_singles_winner_name,tourney_singles_winner_link,tourney_doubles_winner_first_name,tourney_doubles_winner_second_name,tourney_doubles_winner_first_link,tourney_doubles_winner_second_link,tourney_url_suffix,tourney_year_id
0,Mark Edmondson,/en/players/mark-edmondson/e005/overview,Finals,64 64,Guillermo Vilas,/en/players/guillermo-vilas/v028/overview,Hobart,Hobart,1979.01.01,32,...,Outdoor,Hard,Guillermo Vilas,/en/players/guillermo-vilas/v028/overview,Phil Dent,Bob Giltinan,/en/players/phil-dent/d074/overview,/en/players/bob-giltinan/g074/overview,/en/scores/archive/hobart/713/1979/results,1979_713
1,Phil Dent,/en/players/phil-dent/d074/overview,Semi-Finals,60 62,Guillermo Vilas,/en/players/guillermo-vilas/v028/overview,Hobart,Hobart,1979.01.01,32,...,Outdoor,Hard,Guillermo Vilas,/en/players/guillermo-vilas/v028/overview,Phil Dent,Bob Giltinan,/en/players/phil-dent/d074/overview,/en/players/bob-giltinan/g074/overview,/en/scores/archive/hobart/713/1979/results,1979_713
2,Allan Stone,/en/players/allan-stone/s148/overview,Semi-Finals,64 63,Mark Edmondson,/en/players/mark-edmondson/e005/overview,Hobart,Hobart,1979.01.01,32,...,Outdoor,Hard,Guillermo Vilas,/en/players/guillermo-vilas/v028/overview,Phil Dent,Bob Giltinan,/en/players/phil-dent/d074/overview,/en/players/bob-giltinan/g074/overview,/en/scores/archive/hobart/713/1979/results,1979_713
3,Terry Rocavert,/en/players/terry-rocavert/r072/overview,Quarter-Finals,60 62,Guillermo Vilas,/en/players/guillermo-vilas/v028/overview,Hobart,Hobart,1979.01.01,32,...,Outdoor,Hard,Guillermo Vilas,/en/players/guillermo-vilas/v028/overview,Phil Dent,Bob Giltinan,/en/players/phil-dent/d074/overview,/en/players/bob-giltinan/g074/overview,/en/scores/archive/hobart/713/1979/results,1979_713
4,Bob Carmichael,/en/players/bob--carmichael/c080/overview,Quarter-Finals,16 76 1412,Allan Stone,/en/players/allan-stone/s148/overview,Hobart,Hobart,1979.01.01,32,...,Outdoor,Hard,Guillermo Vilas,/en/players/guillermo-vilas/v028/overview,Phil Dent,Bob Giltinan,/en/players/phil-dent/d074/overview,/en/players/bob-giltinan/g074/overview,/en/scores/archive/hobart/713/1979/results,1979_713


In [647]:
# Same scrape with challenger=True, which appends "&tournamentType=ch" to the archive URL.
ts_challenger = TennisScraper(1979, challenger=True)
matches_1979_ch = ts_challenger.match_df()

Parsing results for /en/scores/archive/linz/351/1979/results
Parsing results for /en/scores/archive/nagoya/217/1979/results
Parsing results for /en/scores/archive/indian-river/2002/1979/results
Parsing results for /en/scores/archive/parioli/218/1979/results
Parsing results for /en/scores/archive/cuneo/282/1979/results
Parsing results for /en/scores/archive/raleigh/371/1979/results
Parsing results for /en/scores/archive/galatina/256/1979/results
Parsing results for /en/scores/archive/zell-am-see/286/1979/results
Parsing results for /en/scores/archive/montgomery/1504/1979/results
Parsing results for /en/scores/archive/green-bay/2003/1979/results
Parsing results for /en/scores/archive/concord/2001/1979/results
Parsing results for /en/scores/archive/san-diego/2005/1979/results
Parsing results for /en/scores/archive/porto-alegre/235/1979/results
Parsing results for /en/scores/archive/salvador/648/1979/results
Parsing results for /en/scores/archive/ribeiro-preto/782/1979/results
Parsing resu

In [648]:
# (rows, columns) of the challenger match table.
matches_1979_ch.shape

(1129, 21)

In [635]:
# First TournamentScraper of the season.
# NOTE(review): the name `w` is rebound to other kinds of objects in later cells
# (a <td> element, then a <tr> element), so results depend on execution order.
w = ts.tourneys[0]

In [637]:
# "<year>_<tournament id>" key built by TournamentScraper._parse_year_id.
w.year_id

'1979_713'

In [617]:
# Match rows of the first tournament with its metadata columns attached.
# NOTE(review): the recorded output has no tourney_year_id column although
# state_to_dict now emits one — it appears to predate that change; re-run to refresh.
ts.tourneys[0].result_df()

Unnamed: 0,loser_name,loser_url,round,score,winner_name,winner_url,tourney_title,tourney_location,tourney_dates,tourney_singles_draw_size,tourney_doubles_draw_size,tourney_in_out,tourney_surface,tourney_singles_winner_name,tourney_singles_winner_link,tourney_doubles_winner_first_name,tourney_doubles_winner_second_name,tourney_doubles_winner_first_link,tourney_doubles_winner_second_link,tourney_url_suffix
0,Mark Edmondson,/en/players/mark-edmondson/e005/overview,Finals,64 64,Guillermo Vilas,/en/players/guillermo-vilas/v028/overview,Hobart,Hobart,1979.01.01,32,16,Outdoor,Hard,Guillermo Vilas,/en/players/guillermo-vilas/v028/overview,Phil Dent,Bob Giltinan,/en/players/phil-dent/d074/overview,/en/players/bob-giltinan/g074/overview,/en/scores/archive/hobart/713/1979/results
1,Phil Dent,/en/players/phil-dent/d074/overview,Semi-Finals,60 62,Guillermo Vilas,/en/players/guillermo-vilas/v028/overview,Hobart,Hobart,1979.01.01,32,16,Outdoor,Hard,Guillermo Vilas,/en/players/guillermo-vilas/v028/overview,Phil Dent,Bob Giltinan,/en/players/phil-dent/d074/overview,/en/players/bob-giltinan/g074/overview,/en/scores/archive/hobart/713/1979/results
2,Allan Stone,/en/players/allan-stone/s148/overview,Semi-Finals,64 63,Mark Edmondson,/en/players/mark-edmondson/e005/overview,Hobart,Hobart,1979.01.01,32,16,Outdoor,Hard,Guillermo Vilas,/en/players/guillermo-vilas/v028/overview,Phil Dent,Bob Giltinan,/en/players/phil-dent/d074/overview,/en/players/bob-giltinan/g074/overview,/en/scores/archive/hobart/713/1979/results
3,Terry Rocavert,/en/players/terry-rocavert/r072/overview,Quarter-Finals,60 62,Guillermo Vilas,/en/players/guillermo-vilas/v028/overview,Hobart,Hobart,1979.01.01,32,16,Outdoor,Hard,Guillermo Vilas,/en/players/guillermo-vilas/v028/overview,Phil Dent,Bob Giltinan,/en/players/phil-dent/d074/overview,/en/players/bob-giltinan/g074/overview,/en/scores/archive/hobart/713/1979/results
4,Bob Carmichael,/en/players/bob--carmichael/c080/overview,Quarter-Finals,16 76 1412,Allan Stone,/en/players/allan-stone/s148/overview,Hobart,Hobart,1979.01.01,32,16,Outdoor,Hard,Guillermo Vilas,/en/players/guillermo-vilas/v028/overview,Phil Dent,Bob Giltinan,/en/players/phil-dent/d074/overview,/en/players/bob-giltinan/g074/overview,/en/scores/archive/hobart/713/1979/results
5,Chris Kachel,/en/players/chris-kachel/k044/overview,Quarter-Finals,63 76,Phil Dent,/en/players/phil-dent/d074/overview,Hobart,Hobart,1979.01.01,32,16,Outdoor,Hard,Guillermo Vilas,/en/players/guillermo-vilas/v028/overview,Phil Dent,Bob Giltinan,/en/players/phil-dent/d074/overview,/en/players/bob-giltinan/g074/overview,/en/scores/archive/hobart/713/1979/results
6,Bob Giltinan,/en/players/bob-giltinan/g074/overview,Quarter-Finals,62 76,Mark Edmondson,/en/players/mark-edmondson/e005/overview,Hobart,Hobart,1979.01.01,32,16,Outdoor,Hard,Guillermo Vilas,/en/players/guillermo-vilas/v028/overview,Phil Dent,Bob Giltinan,/en/players/phil-dent/d074/overview,/en/players/bob-giltinan/g074/overview,/en/scores/archive/hobart/713/1979/results
7,Peter Campbell,/en/players/peter-campbell/c079/overview,Round of 16,64 62,Guillermo Vilas,/en/players/guillermo-vilas/v028/overview,Hobart,Hobart,1979.01.01,32,16,Outdoor,Hard,Guillermo Vilas,/en/players/guillermo-vilas/v028/overview,Phil Dent,Bob Giltinan,/en/players/phil-dent/d074/overview,/en/players/bob-giltinan/g074/overview,/en/scores/archive/hobart/713/1979/results
8,Victor Eke,/en/players/victor-eke/e071/overview,Round of 16,63 63,Allan Stone,/en/players/allan-stone/s148/overview,Hobart,Hobart,1979.01.01,32,16,Outdoor,Hard,Guillermo Vilas,/en/players/guillermo-vilas/v028/overview,Phil Dent,Bob Giltinan,/en/players/phil-dent/d074/overview,/en/players/bob-giltinan/g074/overview,/en/scores/archive/hobart/713/1979/results
9,Ion Tiriac,/en/players/ion-tiriac/t040/overview,Round of 16,67 64 63,Phil Dent,/en/players/phil-dent/d074/overview,Hobart,Hobart,1979.01.01,32,16,Outdoor,Hard,Guillermo Vilas,/en/players/guillermo-vilas/v028/overview,Phil Dent,Bob Giltinan,/en/players/phil-dent/d074/overview,/en/players/bob-giltinan/g074/overview,/en/scores/archive/hobart/713/1979/results


In [608]:
# Raw per-match dicts of the first tournament, without the tournament-level columns.
pd.DataFrame(ts.tourneys[0].match_results)

Unnamed: 0,loser_name,loser_url,round,score,winner_name,winner_url
0,Mark Edmondson,/en/players/mark-edmondson/e005/overview,Finals,64 64,Guillermo Vilas,/en/players/guillermo-vilas/v028/overview
1,Phil Dent,/en/players/phil-dent/d074/overview,Semi-Finals,60 62,Guillermo Vilas,/en/players/guillermo-vilas/v028/overview
2,Allan Stone,/en/players/allan-stone/s148/overview,Semi-Finals,64 63,Mark Edmondson,/en/players/mark-edmondson/e005/overview
3,Terry Rocavert,/en/players/terry-rocavert/r072/overview,Quarter-Finals,60 62,Guillermo Vilas,/en/players/guillermo-vilas/v028/overview
4,Bob Carmichael,/en/players/bob--carmichael/c080/overview,Quarter-Finals,16 76 1412,Allan Stone,/en/players/allan-stone/s148/overview
5,Chris Kachel,/en/players/chris-kachel/k044/overview,Quarter-Finals,63 76,Phil Dent,/en/players/phil-dent/d074/overview
6,Bob Giltinan,/en/players/bob-giltinan/g074/overview,Quarter-Finals,62 76,Mark Edmondson,/en/players/mark-edmondson/e005/overview
7,Peter Campbell,/en/players/peter-campbell/c079/overview,Round of 16,64 62,Guillermo Vilas,/en/players/guillermo-vilas/v028/overview
8,Victor Eke,/en/players/victor-eke/e071/overview,Round of 16,63 63,Allan Stone,/en/players/allan-stone/s148/overview
9,Ion Tiriac,/en/players/ion-tiriac/t040/overview,Round of 16,67 64 63,Phil Dent,/en/players/phil-dent/d074/overview


In [459]:
# NOTE(review): the recorded output differs from the doubles winner shown for
# ts.tourneys[0] in the 1979 tables above, so it appears to come from an earlier
# run with a different `ts` — re-run to refresh.
ts.tourneys[0].doubles_winner_first_name

'John Alexander'

In [447]:
# NOTE(review): the recorded output differs from the singles winner link shown for
# ts.tourneys[0] in the 1979 tables above; it appears to be stale.
ts.tourneys[0].singles_winner_link

'/en/players/john-newcombe/n044/overview'

In [340]:
# Dump the raw HTML of the first <td> of the first tournament row.
print(html.tostring(ts.tourneys[0].table_entries[0]))

b'<td class="when-heading"></td>\r\n\t\t\t\t\t\t\t    '


In [484]:
# Dump the raw HTML of the last <td> (index 7, the cell _process_results_link reads)
# of the fourth tournament row.
print(html.tostring(ts.tourneys[3].table_entries[7]))

b'<td class="tourney-details">\r\n\t\t\t<a href="/en/scores/archive/birmingham/712/1979/results" class="button-border">\r\n\t\t\t\tResults\r\n\t\t\t</a>\r\n\t</td>\r\n\r\n\t\t\t\t\t\t'


In [485]:
# Results-link cell (index 7) of the first tournament row.
w = ts.tourneys[0].table_entries[7]
# NOTE(review): later cells use `winner1`/`winner2`, which only the commented-out line
# below would define. TournamentScraper looks for 'tourney-detail-winner' in
# table_entries[6], so the line presumably needs that cell rather than this one.
# winner1, winner2 = w.xpath(".//div[@class='tourney-detail-winner']")

In [489]:
# href of the first direct <a> child of `w`; raises IndexError if there is none.
w.xpath('./a')[0].get('href')

'/en/scores/archive/hobart/713/1979/results'

In [436]:
# NOTE(review): `winner1` is not defined by any cell shown in this notebook (its only
# assignment is commented out), so this cell fails on a fresh kernel.
q = winner1.xpath(".//a")[0]

In [438]:
# href attribute of the link element selected in the previous cell.
q.get('href')

'/en/players/john-newcombe/n044/overview'

In [425]:
# Raw (uncleaned) text of the first <a> child of `winner2`.
# NOTE(review): `winner2` is not defined by any cell shown in this notebook.
winner2.xpath("./a[1]/text()")

['\r\n\t\t\tJohn Alexander\r\n\t\t']

In [236]:
# Cleaned 'item-value' texts under `w`.
# NOTE(review): which element `w` refers to depends on which earlier cell ran last;
# the recorded output looks like a draw-sizes cell — confirm before relying on it.
regex_strip_array(w.xpath(".//span[@class='item-value']/text()"))

['64', '32']

In [97]:
# Rebind `w` to the whole <tr> element of the first tournament row.
w = ts.tourneys[0].tr_elem

In [130]:
# NOTE(review): `xpath_parse` is not defined in any cell shown in this notebook, so
# this cell fails on a fresh kernel; the recorded output comes from an earlier session.
xpath_parse(w, ".//span[@class='tourney-title']/text()")

['\r\n                Australian Open\r\n            ']

In [132]:
# Number of <td> cells in the row; the recorded 8 matches TournamentScraper.EXPECTED_ELEMS.
# NOTE(review): relies on `xpath_parse`, which no cell shown here defines.
len(xpath_parse(w, ".//td"))

8

#### Match Scores

In [499]:
# Results page used to exercise MatchResultsParser on its own.
TEST_URL = 'https://www.atptour.com/en/scores/archive/australian-open/580/1976/results'

In [580]:
class MatchResultsParser():
    """Stand-alone results-page parser that is driven explicitly.

    Unlike the ``MatchResultsParser`` defined near the top of this notebook, this
    variant does not parse during construction and stores its rows in
    ``match_score_list``: create it, call ``parse_scores()``, then read the list.
    Each entry is a dict with the keys ``winner_name``, ``loser_name``,
    ``winner_url``, ``loser_url``, ``round`` and ``score``.
    """

    def __init__(self, url):
        """Remember ``url``; no request is made until ``parse_scores`` is called."""
        self.url = url
        self.match_score_list = []
        self._cur_round_name = None

    def _fetch_tree(self):
        """Download ``self.url`` and return it parsed by ``lxml.html``."""
        return html.fromstring(requests.get(self.url).content)

    def _parse_header_row(self, header_row):
        """Remember the round name that applies to the match rows that follow."""
        self._cur_round_name = clean_text(
            header_row,
            "./th/text()"
        )

    def _parse_result_row(self, result_row):
        """Append one match dict built from a row holding exactly two player links."""
        winner_link, loser_link = result_row.xpath("./td[@class='day-table-name']/a")
        winner_name = clean_text(winner_link, './text()')
        loser_name = clean_text(loser_link, './text()')
        winner_url = winner_link.get('href')
        loser_url = loser_link.get('href')
        # The score link can yield several text nodes, which clean_text rejects as a
        # non-singleton; clean and concatenate them the way the other
        # MatchResultsParser definition in this notebook does.
        score = ''.join(
            regex_strip_array(result_row.xpath("./td[@class='day-table-score']/a/text()"))
        )
        self.match_score_list.append({
            'winner_name': winner_name,
            'loser_name': loser_name,
            'winner_url': winner_url,
            'loser_url': loser_url,
            'round': self._cur_round_name,
            'score': score
        })

    def _parse_row(self, table_row):
        """Dispatch a ``<tr>``: two player links mean a match, anything else a header."""
        player_names = table_row.xpath(".//td[@class='day-table-name']/a")
        if len(player_names) != 2:  # If this is the case, probably a header
            assert len(table_row.xpath("./th")) == 1
            self._parse_header_row(table_row)
        else:
            assert len(table_row.xpath("./th")) == 0
            self._parse_result_row(table_row)

    def parse_scores(self):
        """Fetch the page and parse every row of its ``day-table``.

        A page without a ``day-table`` leaves ``match_score_list`` unchanged instead
        of failing; more than one such table still trips the assertion.
        """
        self.tree = self._fetch_tree()
        score_table_elems = self.tree.xpath(".//table[@class='day-table']")
        assert len(score_table_elems) <= 1
        if len(score_table_elems) == 0:
            return
        for table_row in score_table_elems[0].xpath('.//tr'):
            self._parse_row(table_row)

In [581]:
# The MatchResultsParser defined in the cell above does not parse in __init__, so
# parse_scores() has to be called explicitly.
mr = MatchResultsParser(TEST_URL)
mr.parse_scores()

In [582]:
# Parsed matches as a list of dicts, in the order the rows were visited.
mr.match_score_list

[{'winner_name': 'Mark Edmondson',
  'loser_name': 'John Newcombe',
  'winner_url': '/en/players/mark-edmondson/e005/overview',
  'loser_url': '/en/players/john-newcombe/n044/overview',
  'round': 'Finals',
  'score': '67 63 76 61'},
 {'winner_name': 'Mark Edmondson',
  'loser_name': 'Ken Rosewall',
  'winner_url': '/en/players/mark-edmondson/e005/overview',
  'loser_url': '/en/players/ken-rosewall/r075/overview',
  'round': 'Semi-Finals',
  'score': '61 26 62 64'},
 {'winner_name': 'John Newcombe',
  'loser_name': 'Ray Ruffels',
  'winner_url': '/en/players/john-newcombe/n044/overview',
  'loser_url': '/en/players/ray-ruffels/r076/overview',
  'round': 'Semi-Finals',
  'score': '64 64 76'},
 {'winner_name': 'Ken Rosewall',
  'loser_name': 'Brad Drewett',
  'winner_url': '/en/players/ken-rosewall/r075/overview',
  'loser_url': '/en/players/brad-drewett/d040/overview',
  'round': 'Quarter-Finals',
  'score': '64 36 62 62'},
 {'winner_name': 'John Newcombe',
  'loser_name': 'Ross Case',


In [512]:
# First 'day-table' element under `root`.
# NOTE(review): `root` is not defined by any cell shown in this notebook, so this
# cell fails on a fresh kernel.
zubba = root.xpath(".//table[@class='day-table']")[0]

In [515]:
# All <tr> rows of that table.
wubba = zubba.xpath('.//tr')

In [549]:
# Player links of the second row, using the same XPath as _parse_result_row;
# the recorded output shows two <a> elements.
wubba[1].xpath("./td[@class='day-table-name']/a")

[<Element a at 0x1181fe278>, <Element a at 0x1181fe8b8>]