In [None]:
from tennis_new.fetch.atp_api.scrapers.updated_scraper.base import MatchResultsParser

mp = MatchResultsParser('https://www.atptour.com/en/scores/archive/como/3473/2010/results')

In [None]:
mp.match_result_list

In [None]:
from lxml import html

score_table_elems = mp.tree.xpath(".//table[@class='day-table']")

In [None]:
table_rows = score_table_elems[0].xpath('.//tr')

In [None]:
html.tostring(table_rows[1])

In [None]:
result_row = table_rows[1]

In [None]:
html.tostring(result_row)

In [None]:
w = result_row.xpath("./td[@class='day-table-score']/a")

In [None]:
w[0].attrib['href']

In [None]:
w[0].xpath('./text()')

In [None]:
stripped = regex_strip_array(w[0].xpath('./text()'))
stripped.remove("")
stripped

In [None]:
''.join(regex_strip_array(w[0].xpath('./text()')))

In [None]:
z = w[0]

In [None]:
from tennis_new.fetch.atp_api.scrapers.updated_scraper.base import regex_strip_string, regex_strip_array

regex_strip_string(z.text_content())

In [None]:
?z.text_content

In [None]:
z = w[0]

In [None]:
z.attrib['href']

In [None]:
html.tostring(z)

In [None]:
z.attrib

In [None]:
len(table_rows)

In [None]:
import pandas as pd
import requests
from lxml import html


def regex_strip_string(string):
    """Remove every newline, carriage return and tab from ``string`` and trim it.

    Args:
        string: the text to clean.

    Returns:
        A new string with all ``\\n``, ``\\r`` and ``\\t`` characters deleted and
        leading/trailing whitespace stripped.
    """
    # The characters removed are single literals, so plain ``str.replace`` gives the
    # same result as ``re.sub`` while not depending on the ``re`` module, which the
    # import lines of this cell do not bring into scope.
    for control_char in ('\n', '\r', '\t'):
        string = string.replace(control_char, '')
    return string.strip()


def regex_strip_array(array):
    """Clean every entry of ``array`` in place with ``regex_strip_string``.

    Args:
        array: a mutable sequence of strings; its items are overwritten.

    Returns:
        The same ``array`` object, now holding the cleaned strings.
    """
    for position, raw_text in enumerate(array):
        array[position] = regex_strip_string(raw_text).strip()
    return array

def clean_text(to_parse, xpath_expr, unique=True):
    """Evaluate ``xpath_expr`` on ``to_parse`` and return its single result, cleaned.

    Args:
        to_parse: object exposing an ``xpath(expr)`` method that returns a list.
        xpath_expr: XPath expression expected to match exactly one text node.
        unique: accepted for interface compatibility; currently not consulted.

    Returns:
        The single matched value passed through ``regex_strip_string``.

    Raises:
        ValueError: if the expression matches zero or more than one value.
    """
    matches = to_parse.xpath(xpath_expr)
    # An empty result is just as much "not a singleton" as several results; report it
    # with the same error instead of letting ``matches[0]`` raise a bare IndexError.
    if len(matches) != 1:
        raise ValueError("Expected singleton, received array of length %d" % len(matches))
    return regex_strip_string(matches[0])


class MatchResultsParser():
    """Download a tournament results page and collect one dict per match.

    Constructing an instance immediately fetches ``url`` and fills
    ``match_result_list``; the parsed lxml tree is kept on ``self.tree``.
    """

    URL_PREFIX = 'https://www.atptour.com/'

    def __init__(self, url):
        self.url = url
        self.match_result_list = []
        # Name of the round taken from the most recent header row seen.
        self._cur_round_name = None
        self.parse_scores()

    def _parse_header_row(self, header_row):
        """Store the text of the row's ``<th>`` as the current round name."""
        self._cur_round_name = clean_text(header_row, "./th/text()")

    def _parse_result_row(self, result_row):
        """Append a match dict built from a row holding exactly two player links."""
        # Unpacking enforces that there are exactly two links: first winner, then loser.
        winner_link, loser_link = result_row.xpath("./td[@class='day-table-name']/a")
        match_record = {
            'winner_name': clean_text(winner_link, './text()'),
            'loser_name': clean_text(loser_link, './text()'),
            'winner_url': winner_link.get('href'),
            'loser_url': loser_link.get('href'),
            'round': self._cur_round_name,
        }
        # The score link may contain several text nodes; clean each and glue them together.
        score_fragments = regex_strip_array(
            result_row.xpath("./td[@class='day-table-score']/a/text()")
        )
        match_record['score'] = ''.join(score_fragments)
        self.match_result_list.append(match_record)

    def _parse_row(self, table_row):
        """Dispatch a ``<tr>`` to the result-row or header-row handler."""
        player_link_count = len(table_row.xpath(".//td[@class='day-table-name']/a"))
        header_cell_count = len(table_row.xpath("./th"))
        if player_link_count == 2:
            assert header_cell_count == 0
            self._parse_result_row(table_row)
        else:
            # Anything without exactly two player links is treated as a round header.
            assert header_cell_count == 1
            self._parse_header_row(table_row)

    def parse_scores(self):
        """Fetch ``self.url`` and parse every row of its ``day-table``, if present."""
        self.tree = html.fromstring(requests.get(self.url).content)
        score_tables = self.tree.xpath(".//table[@class='day-table']")
        assert len(score_tables) <= 1
        if len(score_tables) == 0:
            # No score table on the page: leave match_result_list empty.
            return
        for table_row in score_tables[0].xpath('.//tr'):
            self._parse_row(table_row)


class TournamentScraper():
    """Parse one tournament table row and collect that tournament's match results.

    Construction reads the ``<td>`` cells of ``tr_elem`` (cells 2 through 7), stores
    each parsed value as an instance attribute, derives ``year_id`` from the results
    link, and instantiates ``MatchResultsParser`` when a results link is present.
    """

    EXPECTED_ELEMS = 8  # We expect 8 elements per tournament row
    ELIGIBLE_SURFACES = [
        'Hard',
        'Carpet',
        'Clay',
        'Grass'
    ]

    def __init__(self, tr_elem):
        """Parse ``tr_elem``; raises if the row does not have the expected layout."""
        self.tr_elem = tr_elem
        self._check_xpath_validity()
        self.table_entries = self.tr_elem.xpath('.//td')
        self._process_title_location_date(self.table_entries[2])
        self._process_draw_sizes(self.table_entries[3])
        self._process_surface(self.table_entries[4])
        self._process_fin_commit(self.table_entries[5])
        self._process_tourney_winners(self.table_entries[6])
        self._process_results_link(self.table_entries[7])
        self._parse_year_id()
        self._parse_results()

    def _check_xpath_validity(self):
        """Raise ValueError unless the row has exactly ``EXPECTED_ELEMS`` ``<td>`` cells."""
        n_elems = len(self.tr_elem.xpath('.//td'))
        if n_elems != self.EXPECTED_ELEMS:
            # Format the message string itself; calling .format on the exception
            # instance would fail with AttributeError before anything is raised.
            raise ValueError(
                "Expected {0} elements per tournament, received {1}".format(
                    self.EXPECTED_ELEMS, n_elems
                )
            )

    def _process_title_location_date(self, elem):
        """Read the tourney-title, tourney-location and tourney-dates spans."""
        self.tourney_title = clean_text(elem, ".//span[@class='tourney-title']/text()")
        self.tourney_location = clean_text(elem, ".//span[@class='tourney-location']/text()")
        self.tourney_dates = clean_text(elem, ".//span[@class='tourney-dates']/text()")

    def _process_draw_sizes(self, elem):
        """Read the singles and doubles draw sizes as integers.

        Raises:
            ValueError: if the cell does not contain exactly two draw-size values.
        """
        sgd_dbl = regex_strip_array(elem.xpath(".//div[@class='item-details']/text()"))
        assert sgd_dbl == ['SGL', 'DBL', '']
        draw_sizes = regex_strip_array(elem.xpath(".//span[@class='item-value']/text()"))
        if len(draw_sizes) != 2:
            # Report how many values were found; %d needs the count, not the list.
            raise ValueError("Expected two draw sizes, found %d instead" % len(draw_sizes))
        self.singles_draw_size = int(draw_sizes[0])
        self.doubles_draw_size = int(draw_sizes[1])

    def _process_surface(self, elem):
        """Read and validate the Indoor/Outdoor flag and the court surface.

        Raises:
            ValueError: if either value is not one of the recognised options.
        """
        in_out = regex_strip_string(elem.xpath(".//div[@class='item-details']/text()[1]")[0])
        if in_out not in ['Indoor', 'Outdoor']:
            raise ValueError('Expected to see "Indoor" or "Outdoor", instead saw "%s"' % in_out)
        self.in_out = in_out
        surface = regex_strip_string(elem.xpath(".//span[@class='item-value']/text()[1]")[0])
        if surface not in self.ELIGIBLE_SURFACES:
            raise ValueError("Unrecognized Surface %s" % surface)
        self.surface = surface

    @staticmethod
    def _first_if_present(to_parse, expr, default=None):
        """Return the cleaned first XPath match, or ``default`` when nothing matches."""
        xpath_res = to_parse.xpath(expr)
        if len(xpath_res) > 0:
            return regex_strip_string(xpath_res[0])
        return default

    def _process_fin_commit(self, elem):
        """Store the cell's first item-value text as ``fin_commit`` (None if absent)."""
        # TODO: Make a function to process if present...
        self.fin_commit = self._first_if_present(elem, ".//span[@class='item-value']/text()[1]")

    def _process_tourney_winners(self, elem):
        """Read singles and doubles winner names and profile links.

        Winner attributes are set to None when the expected links are missing
        (no singles link, or anything other than two doubles links).

        Raises:
            ValueError: if the cell does not hold exactly two winner blocks.
        """
        elems = elem.xpath(".//div[@class='tourney-detail-winner']")
        if len(elems) != 2:
            raise ValueError("Expected two winners, instead found %d" % len(elems))
        singles, doubles = elems
        assert regex_strip_string(singles.xpath("./text()[1]")[0]) == 'SGL:'
        assert regex_strip_string(doubles.xpath("./text()[1]")[0]) == 'DBL:'

        singles_winner_html = singles.xpath("./a")
        doubles_winner_html = doubles.xpath("./a")

        if len(singles_winner_html) == 0:
            self.singles_winner_name = None
            self.singles_winner_link = None
        else:
            self.singles_winner_name = self._first_if_present(
                singles_winner_html[0], "./text()"
            )
            self.singles_winner_link = singles_winner_html[0].get('href')

        if len(doubles_winner_html) != 2:
            self.doubles_winner_first_name = None
            self.doubles_winner_first_link = None
            self.doubles_winner_second_name = None
            self.doubles_winner_second_link = None
        else:
            self.doubles_winner_first_name = self._first_if_present(
                doubles_winner_html[0], "./text()"
            )
            self.doubles_winner_second_name = self._first_if_present(
                doubles_winner_html[1], "./text()"
            )
            self.doubles_winner_first_link = doubles_winner_html[0].get('href')
            self.doubles_winner_second_link = doubles_winner_html[1].get('href')

    def _process_results_link(self, elem):
        """Store the href of the cell's first direct ``<a>`` child, or None."""
        tourney_url_elem = elem.xpath("./a")
        if len(tourney_url_elem) > 0:
            self.tourney_url_suffix = tourney_url_elem[0].get('href')
        else:
            self.tourney_url_suffix = None

    def _parse_year_id(self):
        """Derive ``tourney_id`` and ``year_id`` ("<year>_<id>") from the results link.

        Both are None when the row has no results link.
        """
        if self.tourney_url_suffix is None:
            # Define tourney_id as well so the attribute exists on every instance.
            self.tourney_id = None
            self.year_id = None
        else:
            # Expected shape: /en/scores/archive/<slug>/<tourney id>/<year>/...
            split = self.tourney_url_suffix.split('/')
            assert split[0] == ''
            assert split[1] == 'en'
            assert split[2] == 'scores'
            assert split[3] == 'archive'
            self.tourney_id = split[5]
            url_year = split[6]
            self.year_id = '_'.join([url_year, self.tourney_id])

    def _parse_results(self):
        """Populate ``match_results`` via ``MatchResultsParser`` (empty list if no link)."""
        if self.tourney_url_suffix is not None:
            print("Parsing results for %s" % self.tourney_url_suffix)
            mp = MatchResultsParser(MatchResultsParser.URL_PREFIX + self.tourney_url_suffix)
            self.match_results = mp.match_result_list
        else:
            self.match_results = []

    def state_to_dict(self):
        """Return the tournament-level attributes as a flat dict of ``tourney_*`` keys."""
        return {
            'tourney_title': self.tourney_title,
            'tourney_location': self.tourney_location,
            'tourney_dates': self.tourney_dates,
            'tourney_singles_draw_size': self.singles_draw_size,
            'tourney_doubles_draw_size': self.doubles_draw_size,
            'tourney_in_out': self.in_out,
            'tourney_surface': self.surface,
            'tourney_singles_winner_name': self.singles_winner_name,
            'tourney_singles_winner_link': self.singles_winner_link,
            'tourney_doubles_winner_first_name': self.doubles_winner_first_name,
            'tourney_doubles_winner_second_name': self.doubles_winner_second_name,
            'tourney_doubles_winner_first_link': self.doubles_winner_first_link,
            'tourney_doubles_winner_second_link': self.doubles_winner_second_link,
            'tourney_url_suffix': self.tourney_url_suffix,
            'tourney_year_id': self.year_id
        }

    def result_df(self):
        """Return the matches as a DataFrame with tournament columns added.

        Returns:
            None when the tournament has no match results; otherwise a DataFrame
            with one row per match and every ``state_to_dict`` entry broadcast as
            a column.
        """
        if len(self.match_results) == 0:
            return None
        match_result_df = pd.DataFrame(self.match_results)
        for k, v in self.state_to_dict().items():
            match_result_df[k] = v
        return match_result_df
        

class TennisScraper():
    """Scrape every tournament listed on the results-archive page for one year.

    Construction fetches the archive page for ``year`` (optionally restricted with
    ``tournamentType=ch`` when ``challenger`` is true) and builds one
    ``TournamentScraper`` per ``tr`` element of class ``tourney-result``.
    """

    @staticmethod
    def _get_tourney_tree(base_url):
        """Fetch ``base_url`` and return its parsed lxml tree."""
        return html.fromstring(requests.get(base_url).content)

    def __init__(self, year, challenger=False):
        self.year = year
        _base_url = "http://www.atpworldtour.com/en/scores/results-archive?year=%d"
        self.base_url = _base_url % self.year
        if challenger:
            self.base_url += "&tournamentType=ch"
        self.tourney_tree = self._get_tourney_tree(self.base_url)
        self.tourneys = [
            TournamentScraper(t) for t in self.tourney_tree.xpath("//tr[@class='tourney-result']")
        ]

    def tourney_df(self):
        """Return one row per tournament built from ``state_to_dict``."""
        return pd.DataFrame([
            t.state_to_dict() for t in self.tourneys
        ])

    def match_df(self):
        """Return all matches of all tournaments stacked into one DataFrame.

        Tournaments whose ``result_df()`` is None (no matches) are skipped. When no
        tournament has any matches an empty DataFrame is returned.
        """
        frames = [t.result_df() for t in self.tourneys]
        frames = [frame for frame in frames if frame is not None]
        if not frames:
            # pd.concat raises ValueError on an empty or all-None list.
            return pd.DataFrame()
        return pd.concat(frames)

In [None]:
ts = TennisScraper(1979)
matches_1979 = ts.match_df()

In [None]:
matches_1979.head()

In [None]:
ts_challenger = TennisScraper(1979, challenger=True)
matches_1979_ch = ts_challenger.match_df()

In [None]:
matches_1979_ch.shape

In [None]:
w = ts.tourneys[0]

In [None]:
w.year_id

In [None]:
ts.tourneys[0].result_df()

In [None]:
pd.DataFrame(ts.tourneys[0].match_results)

In [None]:
ts.tourneys[0].doubles_winner_first_name

In [None]:
ts.tourneys[0].singles_winner_link

In [None]:
print(html.tostring(ts.tourneys[0].table_entries[0]))

In [None]:
print(html.tostring(ts.tourneys[3].table_entries[7]))

In [None]:
w = ts.tourneys[0].table_entries[7]
# winner1, winner2 = w.xpath(".//div[@class='tourney-detail-winner']")

In [None]:
w.xpath('./a')[0].get('href')

In [None]:
q = winner1.xpath(".//a")[0]

In [None]:
q.get('href')

In [None]:
winner2.xpath("./a[1]/text()")

In [None]:
regex_strip_array(w.xpath(".//span[@class='item-value']/text()"))

In [None]:
w = ts.tourneys[0].tr_elem

In [None]:
xpath_parse(w, ".//span[@class='tourney-title']/text()")

In [None]:
len(xpath_parse(w, ".//td"))

#### Match Scores

In [None]:
TEST_URL = 'https://www.atptour.com/en/scores/archive/australian-open/580/1976/results'

In [None]:
class MatchResultsParser():
    """Collect one dict per match from a tournament results page.

    Nothing is fetched at construction time; call ``parse_scores()`` to download
    ``url`` and fill ``match_score_list``.
    """

    def __init__(self, url):
        self.url = url
        self.match_score_list = []
        # Name of the round taken from the most recent header row seen.
        self._cur_round_name = None

    def _parse_header_row(self, header_row):
        """Store the text of the row's ``<th>`` as the current round name."""
        self._cur_round_name = clean_text(header_row, "./th/text()")

    def _parse_result_row(self, result_row):
        """Append a match dict built from a row holding exactly two player links."""
        # Unpacking enforces that there are exactly two links: first winner, then loser.
        winner_link, loser_link = result_row.xpath("./td[@class='day-table-name']/a")
        match_record = {
            'winner_name': clean_text(winner_link, './text()'),
            'loser_name': clean_text(loser_link, './text()'),
            'winner_url': winner_link.get('href'),
            'loser_url': loser_link.get('href'),
            'round': self._cur_round_name,
            # clean_text requires the score link to hold a single text node.
            'score': clean_text(result_row, "./td[@class='day-table-score']/a/text()"),
        }
        self.match_score_list.append(match_record)

    def _parse_row(self, table_row):
        """Dispatch a ``<tr>`` to the result-row or header-row handler."""
        player_link_count = len(table_row.xpath(".//td[@class='day-table-name']/a"))
        header_cell_count = len(table_row.xpath("./th"))
        if player_link_count == 2:
            assert header_cell_count == 0
            self._parse_result_row(table_row)
        else:
            # Anything without exactly two player links is treated as a round header.
            assert header_cell_count == 1
            self._parse_header_row(table_row)

    def parse_scores(self):
        """Fetch ``self.url`` and parse every row of its single ``day-table``."""
        self.tree = html.fromstring(requests.get(self.url).content)
        score_tables = self.tree.xpath(".//table[@class='day-table']")
        assert len(score_tables) == 1

        for table_row in score_tables[0].xpath('.//tr'):
            self._parse_row(table_row)

In [None]:
mr = MatchResultsParser(TEST_URL)
mr.parse_scores()

In [None]:
mr.match_score_list

In [None]:
zubba = root.xpath(".//table[@class='day-table']")[0]

In [None]:
wubba = zubba.xpath('.//tr')

In [None]:
wubba[1].xpath("./td[@class='day-table-name']/a")

In [None]:
import os
from tennis_new.fetch.atp_api.defs import API_RESULTS_DIR

mypath = os.path.join(API_RESULTS_DIR, 'updated_api_results', 'match_results_2019.csv')

In [None]:
import pandas as pd

df = pd.read_csv(mypath)

In [None]:
df.iloc[0]

In [None]:
df['match_stats_url'].isnull().mean()

In [None]:
sorted(df.columns)

In [None]:
df[df['match_stats_url'].isnull()][[
    'tourney_title',
    'score',
    'winner_name',
    'loser_name',
    'round'
]]

#### Scraping Match Stats

In [None]:
URL = 'https://www.atptour.com/en/scores/2019/339/MS003/match-stats?isLive=False'

In [None]:
import requests
from lxml import html

tree = html.fromstring(requests.get(URL).content)

In [None]:
tree.find_class('player-left-name')[0].find_class('first-name')[0].text

In [None]:
name_elem = tree.find_class('player-left-name')[0]
html.tostring(name_elem)

In [None]:
name_elem.xpath('./a')[0].attrib['href']

In [None]:
print(html.tostring(tree, pretty_print=True))

In [None]:
table = tree.find_class('match-stats-table')[0]

In [None]:
table_rows = table.xpath('.//tr')

In [None]:
html.tostring(table_rows[1])

In [None]:
left_numbers = tree.find_class('match-stats-number-left')

In [None]:
right_numbers = table.find_class('match-stats-number-right')

In [None]:
labels = [x.text for x in table.find_class('match-stats-label')]
labels

In [None]:
[_find_center(x) for x in left_numbers]

In [None]:
[_find_center(x) for x in right_numbers]

In [None]:
def _find_center(elem, sol=[]):
    l = list(elem.iterchildren())
    if len(l) == 0:
        return elem.text
    elif len(l) == 1:
        return _find_center(l[0])
    else:
        assert l[1].attrib['class'] == 'stat-breakdown'
        return (_find_center(l[0]), _find_center(l[1]))
        
_find_center(z)

In [None]:
q = z.iterchildren()

In [None]:
html.tostring(table_rows[1])

In [None]:
    html.tostring(tree.find_class('match-stats-table')[0])

In [None]:
print(html.tostring(tree, pretty_print=True))

In [None]:
import re

def regex_strip_string(string):
    """Delete newlines, carriage returns and tabs from ``string``, trimming as it goes.

    Returns the cleaned string; surrounding whitespace is stripped after each
    character class is removed.
    """
    for pattern in ('\n', '\r', '\t'):
        string = re.sub(pattern, '', string).strip()
    return string

def regex_strip_array(array):
    """Clean every string in ``array`` in place, recursing into nested arrays.

    Args:
        array: a mutable sequence whose items are strings, ``None``, or nested
            mutable sequences of the same kind.

    Returns:
        The same ``array`` object. Strings are replaced by their
        ``regex_strip_string`` result, nested arrays are cleaned recursively, and
        ``None`` entries are left as they are.
    """
    for i in range(0, len(array)):
        item = array[i]
        if item is None:
            # A missing text value (e.g. an element's ``.text`` when it has no
            # text) is neither a string nor a nested array; there is nothing to
            # strip, and recursing would fail on ``len(None)``.
            continue
        if isinstance(item, str):
            array[i] = regex_strip_string(item).strip()
        else:
            array[i] = regex_strip_array(item)
    return array

def _find_center(elem, sol=[]):
    l = list(elem.iterchildren())
    if len(l) == 0:
        return elem.text
    elif len(l) == 1:
        return _find_center(l[0])
    else:
        assert l[1].attrib['class'] == 'stat-breakdown'
        return [_find_center(l[0]), _find_center(l[1])]

class MatchStatsParser(object):
    """Download a match-stats page and extract both players' names and stats.

    Construction fetches ``url`` and parses it immediately. Names and profile
    URLs land on ``left_*`` / ``right_*`` attributes; stats land in
    ``left_stats`` / ``right_stats`` dicts keyed by lower-cased,
    underscore-joined stat labels.
    """

    def __init__(self, url):
        self.tree = html.fromstring(requests.get(url).content)
        self.parse_match()

    @staticmethod
    def _parse_name_part(name_elem, part):
        """Return the cleaned text of the single ``<part>-name`` node under ``name_elem``."""
        found = name_elem.find_class('%s-name' % part)
        if len(found) != 1:
            raise ValueError('Expecting exactly 1 %s name, found %d' % (part, len(found)))
        return regex_strip_string(found[0].text)

    def _parse_name(self, suffix):
        """Return ``(first_name, last_name, url)`` for the player on side ``suffix``.

        ``url`` is None when the name element has no direct ``<a>`` child.

        Raises:
            ValueError: if the name element, the link, or either name part is
                not unique.
        """
        candidates = self.tree.find_class('player-%s-name' % suffix)
        if len(candidates) != 1:
            raise ValueError('Expecting exactly 1 name elem with suffix %s' % suffix)
        name_elem = candidates[0]

        links = name_elem.xpath('./a')
        if len(links) > 1:
            raise ValueError('Expecting exactly 1 player url, found %d' % len(links))
        url = links[0].attrib['href'] if len(links) == 1 else None

        first_name = self._parse_name_part(name_elem, 'first')
        last_name = self._parse_name_part(name_elem, 'last')
        return first_name, last_name, url

    def _parse_stats(self, suffix):
        """Resolve every ``match-stats-number-<suffix>`` element with ``_find_center``."""
        number_elems = self.tree.find_class('match-stats-number-%s' % suffix)
        return [_find_center(number_elem) for number_elem in number_elems]

    def parse_stats(self):
        """Build ``left_stats`` and ``right_stats`` from the stat labels and numbers."""
        raw_labels = [x.text for x in self.tree.find_class('match-stats-label')]
        # Labels become dict keys: lower-case with spaces turned into underscores.
        stats_labels = [
            label.lower().replace(' ', '_') for label in regex_strip_array(raw_labels)
        ]

        self.left_stats_result = regex_strip_array(self._parse_stats('left'))
        assert len(stats_labels) == len(self.left_stats_result)
        self.left_stats = dict(zip(stats_labels, self.left_stats_result))

        self.right_stats_result = self._parse_stats('right')
        self.right_stats_result = regex_strip_array(self.right_stats_result)
        assert len(stats_labels) == len(self.right_stats_result)
        self.right_stats = dict(zip(stats_labels, self.right_stats_result))
        # TODO: Add results to object attributes with right prefix

    def parse_names(self):
        """Populate the name and URL attributes for both sides."""
        left = self._parse_name('left')
        self.left_first_name, self.left_last_name, self.left_url = left
        right = self._parse_name('right')
        self.right_first_name, self.right_last_name, self.right_url = right

    def parse_match(self):
        """Parse names first, then stats."""
        self.parse_names()
        self.parse_stats()

In [None]:
mrp = MatchStatsParser(URL)

In [None]:
mrp.left_stats

In [None]:
mrp.right_stats

In [None]:
mrp.left_stats_result

### Look at jd

In [4]:
from tennis_new.fetch.get_joined import read_joined

jd = read_joined()

  if (yield from self.run_code(code, result)):


In [None]:
jd['match_stats_url'].notnull().sum()

In [None]:
jd.iloc[-1]

In [None]:
jd['tourney_year_id'].tail(100)

In [None]:
jd.iloc[0]

In [None]:
jd['loser_url'].isnull().any()

In [None]:
jd['winner_url'].isnull().any()

In [None]:
jd['winner_name'].tail(100)

#### Joining Odds Data to My Data

In [181]:
from tennis_new.fetch.get_joined import read_joined

odds_df = pd.read_csv("/Users/siddhantjagadish/Downloads/2018.csv")
jd = read_joined()

  if (yield from self.run_code(code, result)):


#### Name Processing

Let's process the players' names so we can join on them.

#### Tournament Mapping

In [1]:
# Lookup table to help with joining odds data to the scraped match data.
# Every key and value is an upper-cased tournament name. Values are NOT uniform:
# some are a single string and some are a list of alternative names, so any
# consumer has to handle both shapes.
# NOTE(review): keys look like tournament names from the odds files and values like
# the corresponding titles in the scraped data -- confirm against the join code.
TOURNAMENT_MAPPING = {
    'BRISBANE INTERNATIONAL': ['BRISBANE', 'BRISBANE INTERNATIONAL'],
    'US OPEN': 'US OPEN',
    'WIMBLEDON': 'WIMBLEDON',
    'FRENCH OPEN': 'ROLAND GARROS',
    'AUSTRALIAN OPEN': ['AUSTRALIAN OPEN',  'AUSTRALIAN OPEN-2'],
    'SONY ERICSSON OPEN': [
        'ATP MASTERS 1000 MIAMI',
        'MIAMI OPEN PRESENTED BY ITAU',
        'MIAMI'
    ],
    'BNP PARIBAS OPEN': 'BNP PARIBAS OPEN',
    "INTERNAZIONALI BNL D'ITALIA": "INTERNAZIONALI BNL D'ITALIA",
    "WESTERN & SOUTHERN FINANCIAL GROUP MASTERS": "WESTERN & SOUTHERN OPEN",
    "AEGON CHAMPIONSHIPS": "LONDON / QUEEN'S CLUB",
    "MONTE CARLO MASTERS": "ATP MASTERS 1000 MONTE CARLO",
    "MUTUA MADRID OPEN": ["ATP MASTERS 1000 MADRID", "MUTUA MADRID OPEN"],
    "SHANGHAI MASTERS": ["ATP MASTERS 1000 SHANGHAI", "SHANGHAI"],
    "ROGERS MASTERS": ["ATP MASTERS 1000 CANADA", "COUPE ROGERS"],
    "CITI OPEN": ["WASHINGTON", "CITI OPEN"],
    "GERMAN OPEN TENNIS CHAMPIONSHIPS": [
        "HAMBURG",
        "ATP MASTERS 1000 HAMBURG",
        "HAMBURG EUROPEAN OPEN"
    ],
    "OPEN BANCO SABADELL": [
        "BARCELONA OPEN BANC SABADELL",
        "BARCELONA",
    ],
    
}

In [5]:
def tourney_stats(s):
    """Print summary stats for ATP tournaments whose title contains ``s``.

    Reads the global ``jd`` frame. Prints the per-title row counts, then the
    maximum ``tourney_dates`` value for each title. Returns None.
    """
    is_atp = jd['tour_type'] == 'atp'
    title_contains_s = jd['tourney_title'].map(lambda title: s in title)
    tourney_titles = jd[title_contains_s & is_atp]['tourney_title'].unique()

    # Re-select by exact title so every row of each matching tournament is included.
    rel = jd[jd['tourney_title'].isin(tourney_titles) & is_atp]
    print(rel['tourney_title'].value_counts())
    print(rel.groupby('tourney_title').apply(lambda grp: grp['tourney_dates'].max()))
    
tourney_stats("Salem")

Winston-Salem         547
Winston-Salem Open     59
Name: tourney_title, dtype: int64
tourney_title
Winston-Salem         2018.08.19
Winston-Salem Open    2019.08.18
dtype: object


In [3]:
import pandas as pd

odds_df = pd.read_csv('/Users/siddhantjagadish/Downloads/2013.csv')

In [None]:
odds_df.groupby(['Tournament', 'Location']).apply(
    lambda x: pd.Series({
        'n_matches': x.shape[0],
        'min_date': x['Date'].min()
    })
).sort_values('n_matches', ascending=False)

In [None]:
odds_df[['Winner', 'Loser']].head()

#### Score Parsing

In [82]:
# Substrings that mark a set score as a walkover.
WALKOVER_DEFS = ['W/O', 'DEF']


def parse_set_score(s):
    """Split one compact set score such as ``'63'`` or ``'1210'`` into two ints.

    Returns:
        A 2-tuple. For scores containing ``RET``, a walkover marker, or ``UNP``
        both items are the same label string (``'RETIRE'``, ``'WALKOVER'``,
        ``'MATCH_NOT_PLAYED'``). For a two-character score each character is one
        side's games. For longer scores the first split whose two numbers differ
        by at most 2 is returned; ``(None, None)`` when no such split exists.
    """
    if 'RET' in s:
        return ('RETIRE', 'RETIRE')
    if any(marker in s for marker in WALKOVER_DEFS):
        return ('WALKOVER', 'WALKOVER')
    if 'UNP' in s:
        return ('MATCH_NOT_PLAYED', 'MATCH_NOT_PLAYED')

    # str.strip takes a set of characters: this trims any run of ' ', '(', 'N',
    # 'A', ')' from both ends.
    digits = s.strip(' (NA)')
    if len(digits) == 2:
        return int(digits[0]), int(digits[1])

    # Multi-digit game counts (e.g. '1210'): try every cut point, left to right.
    for cut in range(1, len(digits)):
        first_games, second_games = int(digits[:cut]), int(digits[cut:])
        if abs(first_games - second_games) <= 2:
            return first_games, second_games
    return None, None

In [83]:
def parse_match_score(s):
    """Turn a ``';'``-separated match score into a dict of per-set columns.

    Returns:
        ``{}`` when ``s`` is null. Otherwise a dict with keys ``W1``, ``L1``,
        ``W2``, ``L2``, ... holding the two values ``parse_set_score`` returns
        for each set, in order.
    """
    if pd.isnull(s):
        return {}
    out = {}
    for set_number, set_text in enumerate(s.split(';'), start=1):
        first_value, second_value = parse_set_score(set_text)
        out['W%d' % set_number] = first_value
        out['L%d' % set_number] = second_value
    return out

In [84]:
score_df = pd.DataFrame(jd['score'].map(parse_match_score).tolist())

In [88]:
jd = pd.concat([jd, score_df], axis=1)
jd.shape

(373029, 39)

In [93]:
score_cols = ['W%d' % s for s in range(1, 6)] + ['L%d' % s for s in range(1, 6)]

In [148]:
merged = pd.merge(
    odds_df,
    jd[[
        'match_id',
        'tourney_url_suffix',
        'winner_altered',
        'loser_altered',
        'tourney_dates',
    ] + score_cols],
    left_on=['Winner', 'Loser'] + score_cols,
    right_on=[
        'winner_altered', 'loser_altered',
    ] + score_cols,
)

In [149]:
odds_df.shape

(2631, 43)

In [150]:
merged.shape

(2098, 48)

In [151]:
merged['tourney_dates'] = pd.to_datetime(merged['tourney_dates'])
merged['odds_date'] = pd.to_datetime(merged['Date'])

In [152]:
merged['date_diff'] = (merged['odds_date'] - merged['tourney_dates']).map(lambda x: x.days)

In [162]:
q = merged[
    (merged['date_diff'] >= 0) & 
    (merged['date_diff'] < 31)
]

In [163]:
q['id'].value_counts()

1753    2
2047    1
1264    1
1220    1
1222    1
1226    1
1230    1
1232    1
1234    1
1238    1
1242    1
1244    1
1246    1
1252    1
1256    1
1260    1
1266    1
1202    1
1268    1
1270    1
1272    1
1276    1
1278    1
1280    1
1282    1
1284    1
1286    1
1290    1
1292    1
1294    1
       ..
431     1
463     1
2480    1
433     1
2482    1
435     1
2484    1
437     1
2486    1
2488    1
441     1
2490    1
443     1
2492    1
2494    1
447     1
2496    1
449     1
451     1
2500    1
453     1
2502    1
455     1
2504    1
457     1
2506    1
459     1
2508    1
461     1
0       1
Name: id, Length: 1985, dtype: int64

In [166]:
q[q['id'] == 1753][[
    'tourney_dates',
    'odds_date',
    'id',
    'match_id',
    'tourney_url_suffix',
    'Tournament'
]]

Unnamed: 0,tourney_dates,odds_date,id,match_id,tourney_url_suffix,Tournament
1350,2013-07-08,2013-07-19,1753,Fabio Fognini*Tommy Haas*2013_321*Quarter-Finals,/en/scores/archive/stuttgart/321/2013/results,German Open Tennis Championships
1351,2013-07-15,2013-07-19,1753,Fabio Fognini*Tommy Haas*2013_414*Quarter-Finals,/en/scores/archive/hamburg/414/2013/results,German Open Tennis Championships


In [160]:
odds_df[(odds_df['Winner'] == 'Fognini F.') & (odds_df['Loser'] == 'Haas T.')][[
    'id',
    'Date',
    'Tournament'
] + score_cols]

Unnamed: 0,id,Date,Tournament,W1,W2,W3,W4,W5,L1,L2,L3,L4,L5
1679,1679,7/12/13,Mercedes Cup,6.0,6.0,,,,2.0,4.0,,,
1753,1753,7/19/13,German Open Tennis Championships,6.0,6.0,,,,2.0,4.0,,,


In [121]:
odds_df.shape

(2631, 43)

In [122]:
merged.shape

(2679, 46)

In [111]:
jd[
    (jd['winner_altered'] == 'Nishikori K.') &
    (jd['tourney_title'] == "Brisbane")
][['winner_altered', 'loser_altered', 'tourney_dates'] + score_cols].tail(40)

Unnamed: 0,winner_altered,loser_altered,tourney_dates,W1,W2,W3,W4,W5,L1,L2,L3,L4,L5
240588,Nishikori K.,Reynolds B.,2009.01.04,6,6,,,,3,2,,,
240593,Nishikori K.,Berdych T.,2009.01.04,7,6,,,,6,3,,,
277836,Nishikori K.,Stebe C.,2012.01.01,3,6,6.0,,,6,1,4.0,,
290028,Nishikori K.,Matosevic M.,2012.12.30,7,6,,,,5,2,,,
290044,Nishikori K.,Robredo T.,2012.12.30,6,6,,,,3,3,,,
290050,Nishikori K.,Dolgopolov A.,2012.12.30,6,7,,,,4,6,,,
302117,Nishikori K.,Ebden M.,2013.12.29,6,6,,,,2,4,,,
302125,Nishikori K.,Cilic M.,2013.12.29,6,5,6.0,,,4,7,2.0,,
314374,Nishikori K.,Johnson S.,2015.01.04,6,7,,,,4,5,,,
314382,Nishikori K.,Tomic B.,2015.01.04,6,6,,,,0,4,,,


In [123]:
merged['id'].value_counts().head()

1983    3
706     3
1974    3
2005    3
2630    3
Name: id, dtype: int64

In [131]:
merged[merged['id'] == 1983][['match_id', 'winner_altered', 'loser_altered', 'tourney_url_suffix'] + score_cols]

Unnamed: 0,match_id,winner_altered,loser_altered,tourney_url_suffix,W1,W2,W3,W4,W5,L1,L2,L3,L4,L5
2013,Feliciano Lopez*Kei Nishikori*2011_425*Round o...,Lopez F.,Nishikori K.,/en/scores/archive/barcelona/425/2011/results,6,7,,,,4,6,,,
2014,Feliciano Lopez*Kei Nishikori*2013_422*Round o...,Lopez F.,Nishikori K.,/en/scores/archive/cincinnati/422/2013/results,6,7,,,,4,6,,,
2015,Feliciano Lopez*Kei Nishikori*2015_404*Round o...,Lopez F.,Nishikori K.,/en/scores/archive/indian-wells/404/2015/results,6,7,,,,4,6,,,


In [116]:
merged[merged['winner_altered'].isnull()].iloc[6]

ATP                          2
Location               Chennai
Tournament        Chennai Open
Date                    1/1/13
Series                  ATP250
Court                  Outdoor
Surface                   Hard
Round                1st Round
Best of                      3
Winner             Bautista R.
Loser                Kavcic B.
WRank                       80
LRank                       94
WPts                       648
LPts                       569
W1                           7
L1                           6
W2                           6
L2                           2
W3                         NaN
L3                         NaN
W4                         NaN
L4                         NaN
W5                         NaN
L5                         NaN
Wsets                        2
Lsets                        0
Comment              Completed
B365W                     1.44
B365L                     2.62
EXW                       1.55
EXL                       2.35
LBW     

In [90]:
jd.iloc[0]

loser_id                                                                         lh23
loser_name                                                                 F. Langham
loser_url                                        /en/players/f.-langham/lh23/overview
match_stats_url                                                                   NaN
round                                                                  Quarter-Finals
round_order                                                                         2
score                                                                     63;62;56;61
winner_id                                                                        gi91
winner_name                                                              Spencer Gore
winner_url                                     /en/players/spencer-gore/gi91/overview
tourney_title                                                               Wimbledon
tourney_location                                      

In [28]:
%pdb
pd.DataFrame(jd[
    (jd['winner_name'] == "Novak Djokovic") &
    (jd['loser_name'] == "Roger Federer")
]['score'].map(parse_match_score).tolist())

Automatic pdb calling has been turned ON


Unnamed: 0,L1,L2,L3,L4,L5,W1,W2,W3,W4,W5
0,6,6.0,6.0,,,7,2.0,7.0,,
1,5,3.0,6.0,,,7,6.0,7.0,,
2,6,2.0,3.0,,,3,6.0,6.0,,
3,6,3.0,3.0,,,4,6.0,6.0,,
4,4,6.0,2.0,,,6,4.0,6.0,,
5,7,1.0,7.0,2.0,5.0,5,6.0,5.0,6.0,7.0
6,6,5.0,4.0,,,7,7.0,6.0,,
7,3,3.0,,,,6,6.0,,,
8,3,6.0,2.0,,,6,3.0,6.0,,
9,7,6.0,3.0,2.0,5.0,6,4.0,6.0,6.0,7.0


In [None]:
jd['score'].tail(30).map(parse_match_score)

In [None]:
jd['score'].tail()

In [177]:
def reformat_name(n):
    """Rewrite ``'First Rest Of Name'`` as ``'Rest Of Name F.'``.

    Everything after the first space comes first, followed by a space, the first
    character of the first word, and a period.
    """
    first_word, _, remainder = n.partition(' ')
    return remainder + ' ' + first_word[0] + '.'

def last_name_jd(n):
    """Return everything after the first space of ``n`` ('' if there is no space)."""
    _, _, after_first_space = n.partition(' ')
    return after_first_space

def last_name_odds(n):
    """Return the part of ``n`` before its first space (all of ``n`` if none)."""
    before_first_space, _, _ = n.partition(' ')
    return before_first_space

jd['winner_altered'] = jd['winner_name'].map(reformat_name)
jd['loser_altered'] = jd['loser_name'].map(reformat_name)
jd['winner_last_name'] = jd['winner_name'].map(last_name_jd)
jd['loser_last_name'] = jd['loser_name'].map(last_name_jd)

In [None]:
q = pd.merge(
    odds_df,
    jd[['winner_altered', 'loser_altered']],
    left_on=['Winner', 'Loser'],
    right_on=['winner_altered', 'loser_altered']
)

In [None]:
(odds_df['B365W'] < odds_df['B365L']).mean()

Bet365 had 69.1% accuracy in 2013 for these matches... We can beat this! Let's see how it was in 2018.

In [168]:
odds_2018 = pd.read_csv("/Users/siddhantjagadish/Downloads/2018.csv")

In [171]:
(odds_2018['B365W'] <= odds_2018['B365L']).mean()

0.6890405764125901

In [172]:
(odds_2018['B365W'] < odds_2018['B365L']).mean()

0.6617368221463784

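In 2018, 69% accuracy, but lots of cases with even odds that I'm counting as correct here (the `<=` comparison); with a strict `<` comparison the accuracy drops to 66%.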

In [173]:
rel = odds_2018[odds_2018['B365W'] != odds_2018['B365L']]
(rel['B365W'] < rel['B365L']).mean()

0.6803118908382066

Removing these, 68% accuracy

In [176]:
ranked = odds_2018[
    odds_2018['WRank'].notnull() &
    odds_2018['LRank'].notnull()
]
(ranked['WRank'] < ranked['LRank']).mean()

0.6339150227617603

The rankings have 63% accuracy over the same period

#### Odds Scraping

In [2]:
URL = 'https://www.oddsportal.com/tennis/australia/canberra-challenger-men/results/'

In [3]:
from lxml import html
import requests

tree = html.fromstring(requests.get(URL).content)

In [4]:
html.tostring(tree)

b'<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="cs">\n            <head>\n\t\t        <meta http-equiv="content-type" content="text/html; charset=utf-8">\n\t\t        <meta http-equiv="expires" content="86400">\n\t\t        <meta name="description" content="">\n\t\t        <meta name="keywords" content="">\n\t\t        <title>OddsPortal: Page not found</title>\n\t\t        <style type="text/css" media="screen,projection">\n\t\t\t\t\tbody {background: url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAQAAAAEAgMAAADUn3btAAAADFBMVEUqKiokJCQbGxsfHx/MYYG6AAAAGUlEQVR4XgXAgRAAAAwCwDeY3/zyC6FzXmUFqAH/YS8jCAAAAABJRU5ErkJggg%3D%3D") repeat scroll 0 0 #2A2A2A;font-family: Tahoma,Verdana,Arial;}body {background-color: #000000;}body {   color: #333333;font-size: 75%;margin: 0;padding: 0;}.wrap {padding-top: 1px;}           #mother {margin: 0 auto 25px;padding: 22px 0 1px;position: relative;text-align: left;width: 790px;}       #wrap {background: url("data:image/png;base64,iVBORw0KGgoAAAANS