In this notebook, we look into some weird stuff in our data fetching, such as:

* String vs. bytes in match_stats data 

#### String vs. bytes in match stats data

In [None]:
import os
from tennis_new.fetch.atp_api.defs import (
    API_RESULTS_DIR,
    MATCH_STATS_HEADER,
    MATCH_SCORES_HEADER
)
import pandas as pd

In [None]:
# Locate the 2016 match-scores CSV under the API results directory and load it.
match_scores_path = os.path.join(API_RESULTS_DIR, 'match_scores', 'match_scores_2016-2016.csv')
match_scores = pd.read_csv(match_scores_path)

In [None]:
# Sanity check: the most frequent match_id occurs exactly once, i.e. no match_id repeats.
assert match_scores['match_id'].value_counts().max() == 1

In [None]:
# Preview the first ten rows, limited to the player, tournament and score columns.
preview_columns = [
    'winner_name', 'loser_name',
    'winner_player_id', 'loser_player_id',
    'tourney_slug', 'tourney_round_name',
    'match_score_tiebreaks',
]
match_scores[preview_columns].head(10)

In [None]:
# Names of the columns that pandas stored with the object ('O') dtype.
[name for name, col_dtype in match_scores.dtypes.items() if col_dtype == 'O']

In [None]:
# Inspect the dtype pandas assigned to the winner_name column.
match_scores['winner_name'].dtype

#### Look into match stats

In [None]:
# Locate the first 2018 match-stats CSV under the API results directory and load it.
match_stats_path = os.path.join(API_RESULTS_DIR, 'match_stats', 'match_stats_2018_0.csv')
match_stats = pd.read_csv(match_stats_path)

In [None]:
# Look at the match_stats_url_suffix value of the first row.
match_stats.iloc[0]['match_stats_url_suffix']

In [None]:
# Show every field of the first match-stats row.
match_stats.iloc[0]

#### Tourney Data

In [None]:
# Locate the 2017-2019 tournaments CSV under the API results directory and load it.
tourney_path = os.path.join(API_RESULTS_DIR, 'tournaments', 'tournaments_2017-2019.csv')
tourneys = pd.read_csv(tourney_path)

In [None]:
# Show every field of the first tournament row.
tourneys.iloc[0]

I can probably decode `tourney_name` and `tourney_location` in the tournaments data when parsing it...

In [None]:
# Names of the tournament columns that pandas stored with the object ('O') dtype.
[name for name, col_dtype in tourneys.dtypes.items() if col_dtype == 'O']

#### Tourneys Combined...

In [None]:
from tennis_new.fetch.defs import STORED_DATA_PATH

# Load the combined tournaments CSV from the stored-data directory.
# Note: this rebinds `tourneys`, replacing the 2017-2019 frame loaded earlier.
tourneys_combined_path = os.path.join(
    STORED_DATA_PATH,
    'tournaments',
    'combined.csv',
)
tourneys = pd.read_csv(tourneys_combined_path)

In [None]:
# Show every field of the first row of the combined tournaments data.
tourneys.iloc[0]

#### Match Scores Combined...

In [None]:
from tennis_new.fetch.defs import STORED_DATA_PATH

# Load the combined match-scores CSV from the stored-data directory.
# Note: this rebinds `match_scores`, replacing the 2016 frame loaded earlier.
ms_combined_path = os.path.join(
    STORED_DATA_PATH,
    'match_scores',
    'combined.csv',
)
match_scores = pd.read_csv(ms_combined_path)

In [None]:
# Name of the column at position 23.
# NOTE(review): presumably the column a pandas mixed-dtype warning pointed at -- TODO confirm.
match_scores.columns[23]

In [None]:
# Show every field of the first row of the combined match scores.
match_scores.iloc[0]

In [None]:
# (rows, columns) of the combined match-scores frame.
match_scores.shape

#### Read Joined


In [None]:
from tennis_new.fetch.defs import STORED_DATA_PATH

# Read the tab-separated joined dataset from the stored-data directory.
joined_path = os.path.join(STORED_DATA_PATH, 'joined.tsv')
jd = pd.read_csv(joined_path, sep='\t')

In [None]:
# Name of the column at position 19 of the joined data.
jd.columns[19]

In [None]:
# Tally the values of tourney_fin_commit, counting missing values as their own bucket.
jd['tourney_fin_commit'].value_counts(dropna=False)

#### Missing Tournaments

We are missing a number of tournaments from our dataset. For example, Soonwoo Kwon has played many more matches than appear in our dataset. These matches are listed on the ATP website, but their tournaments don't show up in the way we currently search for them...

In [1]:
# Player-activity page (year=all) for Soonwoo Kwon on atptour.com.
BASE_URL = "https://www.atptour.com/en/players/soonwoo-kwon/kf17/player-activity?year=all"


class PlayerParser():
    """Collect tournament names and dates from a player-activity page.

    On construction the page at ``base_url`` is turned into a parse tree with
    ``html_parse_tree`` and immediately scanned for tournaments.

    Attributes set by the constructor:
        base_parse_tree: value returned by ``html_parse_tree(base_url)``.
        tourney_elems: elements matched by the ``title-content`` XPath.
        tourneys: list of ``(tourney_name, tourney_dates)`` tuples, one per
            entry of ``tourney_elems``; either member may be ``None``.
    """

    def __init__(self, base_url):
        """Build the parse tree for ``base_url`` and parse its tournaments."""
        self.base_parse_tree = html_parse_tree(base_url)
        self.parse_tourneys()

    @staticmethod
    def return_first_elem(l):
        """Return the single element of ``l``, or ``None`` if ``l`` is empty.

        Raises:
            ValueError: if ``l`` holds more than one element, because the
                callers expect each XPath query to match at most once.
        """
        if len(l) > 1:
            # Used for both names and dates, so keep the message generic.
            raise ValueError(
                "Expected at most one element, got {}".format(len(l))
            )
        if len(l) == 0:
            return None
        return l[0]

    def _get_tourney_name(self, tourney_elem):
        """Return the tournament title found under ``tourney_elem``.

        The title is looked up in an ``<a>`` whose class contains
        ``tourney-title`` first; when no such link exists, a ``<span>`` with
        the same class is tried. Returns ``None`` if neither matches.
        """
        rel_tourney_xpath = ".//a[contains(@class, 'tourney-title')]/text()"
        tn = self.return_first_elem(tourney_elem.xpath(rel_tourney_xpath))
        if tn is None:
            # Fall back to a <span> title when there is no <a> title.
            rel_tourney_name_path = ".//span[contains(@class, 'tourney-title')]/text()"
            tn = self.return_first_elem(tourney_elem.xpath(rel_tourney_name_path))
        return tn

    def _parse_tourney(self, tourney_elem):
        """Return ``(name, dates)`` for one tournament element.

        Both values are the raw text nodes (no whitespace stripping) and
        either may be ``None`` when the corresponding node is absent.
        """
        tn = self._get_tourney_name(tourney_elem)

        rel_tourney_dates_xpath = ".//span[contains(@class, 'tourney-dates')]/text()"
        td = self.return_first_elem(tourney_elem.xpath(rel_tourney_dates_xpath))
        return (tn, td)

    def parse_tourneys(self):
        """Populate ``tourney_elems`` and ``tourneys`` from the parse tree."""
        title_content_xpath = "//td[contains(@class, 'title-content')]"
        # Query this instance's own tree (not a notebook-level global) so the
        # result always corresponds to the URL given to the constructor.
        self.tourney_elems = xpath_parse(self.base_parse_tree, title_content_xpath)
        self.tourneys = [self._parse_tourney(t) for t in self.tourney_elems]

In [2]:
from tennis_new.fetch.atp_api.scrapers.functions import html_parse_tree, xpath_parse
import lxml

# Same player-activity URL as the one assigned next to PlayerParser.
BASE_URL = "https://www.atptour.com/en/players/soonwoo-kwon/kf17/player-activity?year=all"

# Parse tree for that page; the exploratory cells below query it through xpath_parse.
q = html_parse_tree(BASE_URL)

In [3]:
# Constructing the parser builds its parse tree and fills pp.tourneys.
pp = PlayerParser(BASE_URL)

In [4]:
import pandas as pd

# Tabulate the (name, dates) tuples collected by the parser.
tourney_columns = ['tourney_name', 'tourney_dates']
tourney_df = pd.DataFrame(pp.tourneys, columns=tourney_columns)

In [5]:
# Preview the first rows of the parsed tournaments.
tourney_df.head()

Unnamed: 0,tourney_name,tourney_dates
0,Wimbledon,\r\n2019.07.01 - 2019.07.14
1,Ilkley Trophy,\r\n2019.06.17 - 2019.06.23
2,Nature Valley Open Nottingham,\r\n2019.06.10 - 2019.06.16
3,Surbiton Trophy,\r\n2019.06.03 - 2019.06.09
4,Roland Garros,\r\n2019.05.27 - 2019.06.09


In [6]:
# Serialize the parsed tree back to bytes, then print it as text to inspect the raw HTML.
w = lxml.etree.tostring(q)
print(w.decode())

<html class="no-js ">&#13;
<!--<![endif]-->&#13;
	<head>&#13;
		<!-- disable auto format for telephone numbers -->&#13;
		<meta name="format-detection" content="telephone=no"/>&#13;
		&#13;
&#13;
<title>&#13;
	Soonwoo Kwon | Player Activity | ATP Tour | Tennis&#13;
</title>&#13;
&#13;
<meta name="viewport" content="initial-scale=1.0, width=768, user-scalable=yes, minimum-scale=1.0, maximum-scale=1.25"/>&#13;
&#13;
<meta charset="UTF-8"/>&#13;
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"/>&#13;
&#13;
<meta name="keywords" content=""/>&#13;
<meta name="description" content=""/>&#13;
&#13;
<meta name="SKYPE_TOOLBAR" content="SKYPE_TOOLBAR_PARSER_COMPATIBLE"/>&#13;
<meta name="thumbnail" content="www.atptour.com/-/media/tennis/players/head-shot/2018/soonwon_kwon_head_ao18.png"/>&#13;
&#13;
<meta property="pageTransitionTitle" content="Soonwoo Kwon"/>&#13;
<meta property="fb:app_id" content="132901818553"/>&#13;
<meta property="og:locale" content="en"/>&#13;
<meta property=

In [38]:
# Select every <table> whose class attribute contains 'mega-table'.
table_xpath = "//table[contains(@class, 'mega-table')]"
tables = xpath_parse(q, table_xpath)
# Serialize the second match (index 1) to a pretty-printed HTML string and show it.
html_table = lxml.etree.tostring(tables[1], pretty_print=True).decode()
print(html_table)

<table class="mega-table">&#13;
                                        <thead>&#13;
                                        <th>&#13;
                                            Round&#13;
                                        </th>&#13;
                                        <th>Rank</th>&#13;
                                        <th>Opponent</th>&#13;
                                        <th>W-L</th>&#13;
                                        <th>Score</th>&#13;
                                        </thead>&#13;
                                        <tbody>&#13;
                                                        <tr>&#13;
                                                            <td>Round of 16</td>&#13;
                                                            <td>&#13;
64                                                            </td>&#13;
                                                            <td>&#13;
&#13;
                                         

In [39]:
# pd.read_html returns a list of DataFrames, one per <table> found in the HTML.
# NOTE(review): newer pandas releases deprecate passing a literal HTML string here;
# wrapping it in io.StringIO may be needed -- confirm against the installed version.
z = pd.read_html(html_table)

In [40]:
# Display the first (index 0) DataFrame parsed from the table HTML.
z[0]

Unnamed: 0,Round,Rank,Opponent,W-L,Score
0,Round of 16,64,Ugo Humbert,L,26 57
1,Round of 32,129,Gianluca Mager,W,46 62 60
2,Round of 64,499,Evan Hoyt,W,60 64


In [None]:
# Find <ul> elements whose data-value attribute contains 'year' and show the first match.
dropdown_xpath = "//ul[contains(@data-value, 'year')]"
year_dropdowns = xpath_parse(q, dropdown_xpath)
year_dropdowns[0]

In [None]:
# Check what kind of object z currently holds.
type(z)

In [None]:
# Select every <td> whose class attribute contains 'title-content'.
title_content_xpath = "//td[contains(@class, 'title-content')]"
tourney_contents = xpath_parse(q, title_content_xpath)

In [None]:
# Rebind z to the second title-content element and print its HTML for inspection.
z = tourney_contents[1]
print(lxml.etree.tostring(z, pretty_print=True).decode())

In [None]:
# Relative XPath (leading '.'): text of <a> descendants whose class contains 'tourney-title'.
rel_tourney_xpath = ".//a[contains(@class, 'tourney-title')]/text()"

In [None]:
# Evaluate the relative XPath against the single element held in z.
z.xpath(rel_tourney_xpath)

In [None]:
# tourney_title_xpath is assigned here so that this cell does not depend on a
# cell further down having been run first (a fresh-kernel Run All would
# otherwise hit a NameError).
tourney_title_xpath = "//a[contains(@class, 'tourney-title')]/text()"
tourney_title = xpath_parse(z, tourney_title_xpath)

In [None]:
?xpath_parse

In [None]:
# Absolute XPath: text of every <a> in the document whose class contains 'tourney-title'.
tourney_title_xpath = "//a[contains(@class, 'tourney-title')]/text()"
tourney_titles = xpath_parse(q, tourney_title_xpath)
tourney_titles

In [None]:
# Number of <a>-based tournament titles found on the page.
len(tourney_titles)

In [None]:
# Absolute XPath: text of every <span> in the document whose class contains 'tourney-dates'.
tourney_dates_xpath = "//span[contains(@class, 'tourney-dates')]/text()"
tourney_dates = xpath_parse(q, tourney_dates_xpath)
# Disabled length check. NOTE(review): presumably the counts differ because some
# tournaments have no <a> title (PlayerParser falls back to a <span>) -- TODO confirm.
# assert len(tourney_dates) == len(tourney_titles)


#### Some Tourney Months are Missing

In some rare cases in the old data, the tourney month is missing...

In [28]:
from tennis_new.fetch.defs import STORED_DATA_PATH
from tennis_new.fetch.atp_api.defs import TOURNAMENTS_HEADER
from datetime import datetime
import pandas as pd
import os

# Read the tab-separated joined dataset, parsing the 'date' column as datetimes.
# Note: this rebinds `q`, which earlier cells used for the HTML parse tree.
joined_tsv_path = os.path.join(STORED_DATA_PATH, 'joined.tsv')
q = pd.read_csv(joined_tsv_path, sep='\t', parse_dates=['date'])

  interactivity=interactivity, compiler=compiler, result=result)


In [29]:
# (rows, columns) of the joined frame.
q.shape

(198106, 49)