In this notebook, we look into some weird stuff in our data fetching, such as:

* String vs. bytes in match_stats data 

#### String vs. bytes in match stats data

In [None]:
import os
from tennis_new.fetch.atp_api.defs import (
    API_RESULTS_DIR,
    MATCH_STATS_HEADER,
    MATCH_SCORES_HEADER
)
import pandas as pd

In [None]:
# Read the 2016 match-scores CSV from the API results directory.
match_scores_path = os.path.join(API_RESULTS_DIR, 'match_scores', 'match_scores_2016-2016.csv')
match_scores = pd.read_csv(match_scores_path)

In [None]:
assert match_scores['match_id'].value_counts().max() == 1

In [None]:
match_scores[[
    'winner_name',
    'loser_name',
    'winner_player_id',
    'loser_player_id',
    'tourney_slug',
    'tourney_round_name',
    'match_score_tiebreaks'
]].head(10)

In [None]:
# Names of the match_scores columns whose dtype is object ('O').
[name for name, dtype in match_scores.dtypes.items() if dtype == 'O']

In [None]:
match_scores['winner_name'].dtype

#### Look into match stats

In [None]:
match_stats_path = os.path.join(
    API_RESULTS_DIR,
    'match_stats',
    'match_stats_2018_0.csv'
)
match_stats = pd.read_csv(match_stats_path)

In [None]:
match_stats.iloc[0]['match_stats_url_suffix']

In [None]:
match_stats.iloc[0]

#### Tourney Data

In [None]:
tourney_path = os.path.join(
    API_RESULTS_DIR,
    'tournaments',
    'tournaments_2017-2019.csv'
)
tourneys = pd.read_csv(tourney_path)

In [None]:
tourneys.iloc[0]

I can probably decode tourney_name and tourney_location in tournaments upon parsing... 

In [None]:
# Names of the tourneys columns whose dtype is object ('O').
[name for name, dtype in tourneys.dtypes.items() if dtype == 'O']

#### Tourneys Combined...

In [None]:
from tennis_new.fetch.defs import STORED_DATA_PATH

tourneys_combined_path = os.path.join(STORED_DATA_PATH, 'tournaments', 'combined.csv')
tourneys = pd.read_csv(tourneys_combined_path)

In [None]:
tourneys.iloc[0]

#### Match Scores Combined...

In [None]:
from tennis_new.fetch.defs import STORED_DATA_PATH

ms_combined_path = os.path.join(STORED_DATA_PATH, 'match_scores', 'combined.csv')
match_scores = pd.read_csv(
    ms_combined_path
)

In [None]:
match_scores.columns[23]

In [None]:
match_scores.iloc[0]

In [None]:
match_scores.shape

#### Read Joined


In [None]:
from tennis_new.fetch.defs import STORED_DATA_PATH

jd = pd.read_csv(os.path.join(STORED_DATA_PATH, 'joined.tsv'), sep='\t')

In [None]:
jd.columns[19]

In [None]:
jd['tourney_fin_commit'].value_counts(dropna=False)

#### Missing Tournaments

We are missing a number of tournaments from our dataset. For example, Soonwoo Kwon has played many more matches than appear in our data. These matches are listed on the ATP website, but their tournaments don't show up in the way we're searching for them...

In [1]:
BASE_URL = "https://www.atptour.com/en/players/soonwoo-kwon/kf17/player-activity?year=all"


class PlayerParser():
    """Collect tournament names and dates from a player-activity page.

    Constructing an instance fetches ``base_url`` with ``html_parse_tree`` and
    immediately parses every ``td`` whose class contains ``title-content``.

    Attributes set during construction:
        base_parse_tree: the value returned by ``html_parse_tree(base_url)``.
        tourney_elems: the elements matched by the title-content XPath.
        tourneys: one ``(tourney_name, tourney_dates)`` tuple per element;
            either item is ``None`` when no matching text node was found.
    """

    def __init__(self, base_url):
        self.base_parse_tree = html_parse_tree(base_url)
        self.parse_tourneys()

    @staticmethod
    def return_first_elem(l):
        """Return the single item of ``l``, or ``None`` if ``l`` is empty.

        Raises:
            ValueError: if ``l`` contains more than one item.
        """
        if len(l) > 1:
            # Used for both names and dates, so keep the message generic.
            raise ValueError(
                "Expected at most one element, found {}".format(len(l))
            )
        return l[0] if len(l) > 0 else None

    def _get_tourney_name(self, tourney_elem):
        """Return the tournament title text found under ``tourney_elem``.

        The title is looked up in an ``a`` tag first and, if none is found,
        in a ``span`` tag with the same ``tourney-title`` class.
        """
        for tag in ('a', 'span'):
            title_xpath = ".//%s[contains(@class, 'tourney-title')]/text()" % tag
            tn = self.return_first_elem(tourney_elem.xpath(title_xpath))
            if tn is not None:
                return tn
        return None

    def _parse_tourney(self, tourney_elem):
        """Return ``(name, dates)`` for one title-content element."""
        tn = self._get_tourney_name(tourney_elem)

        rel_tourney_dates_xpath = ".//span[contains(@class, 'tourney-dates')]/text()"
        td = self.return_first_elem(tourney_elem.xpath(rel_tourney_dates_xpath))
        return (tn, td)

    def parse_tourneys(self):
        """Populate ``tourney_elems`` and ``tourneys`` from this instance's tree."""
        title_content_xpath = "//td[contains(@class, 'title-content')]"
        # Query the tree fetched for this instance, not a notebook-level
        # global, so the result does not depend on other cells' variables.
        self.tourney_elems = xpath_parse(self.base_parse_tree, title_content_xpath)
        self.tourneys = [self._parse_tourney(t) for t in self.tourney_elems]

In [2]:
from tennis_new.fetch.atp_api.scrapers.functions import html_parse_tree, xpath_parse
import lxml

BASE_URL = "https://www.atptour.com/en/players/soonwoo-kwon/kf17/player-activity?year=all"

q = html_parse_tree(BASE_URL)

In [3]:
pp = PlayerParser(BASE_URL)

In [4]:
import pandas as pd

tourney_df = pd.DataFrame(
    pp.tourneys,
    columns=[
        'tourney_name',
        'tourney_dates'
    ]
)

In [5]:
tourney_df.head()

Unnamed: 0,tourney_name,tourney_dates
0,Wimbledon,\r\n2019.07.01 - 2019.07.14
1,Ilkley Trophy,\r\n2019.06.17 - 2019.06.23
2,Nature Valley Open Nottingham,\r\n2019.06.10 - 2019.06.16
3,Surbiton Trophy,\r\n2019.06.03 - 2019.06.09
4,Roland Garros,\r\n2019.05.27 - 2019.06.09


In [6]:
w = lxml.etree.tostring(q)
print(w.decode())

<html class="no-js ">&#13;
<!--<![endif]-->&#13;
	<head>&#13;
		<!-- disable auto format for telephone numbers -->&#13;
		<meta name="format-detection" content="telephone=no"/>&#13;
		&#13;
&#13;
<title>&#13;
	Soonwoo Kwon | Player Activity | ATP Tour | Tennis&#13;
</title>&#13;
&#13;
<meta name="viewport" content="initial-scale=1.0, width=768, user-scalable=yes, minimum-scale=1.0, maximum-scale=1.25"/>&#13;
&#13;
<meta charset="UTF-8"/>&#13;
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"/>&#13;
&#13;
<meta name="keywords" content=""/>&#13;
<meta name="description" content=""/>&#13;
&#13;
<meta name="SKYPE_TOOLBAR" content="SKYPE_TOOLBAR_PARSER_COMPATIBLE"/>&#13;
<meta name="thumbnail" content="www.atptour.com/-/media/tennis/players/head-shot/2018/soonwon_kwon_head_ao18.png"/>&#13;
&#13;
<meta property="pageTransitionTitle" content="Soonwoo Kwon"/>&#13;
<meta property="fb:app_id" content="132901818553"/>&#13;
<meta property="og:locale" content="en"/>&#13;
<meta property=

In [38]:
table_xpath = "//table[contains(@class, 'mega-table')]"
tables = xpath_parse(q, table_xpath)
html_table = lxml.etree.tostring(tables[1], pretty_print=True).decode()
print(html_table)

<table class="mega-table">&#13;
                                        <thead>&#13;
                                        <th>&#13;
                                            Round&#13;
                                        </th>&#13;
                                        <th>Rank</th>&#13;
                                        <th>Opponent</th>&#13;
                                        <th>W-L</th>&#13;
                                        <th>Score</th>&#13;
                                        </thead>&#13;
                                        <tbody>&#13;
                                                        <tr>&#13;
                                                            <td>Round of 16</td>&#13;
                                                            <td>&#13;
64                                                            </td>&#13;
                                                            <td>&#13;
&#13;
                                         

In [39]:
# Passing literal HTML text straight to read_html is deprecated in recent
# pandas releases; a file-like wrapper is accepted by old and new versions.
from io import StringIO

z = pd.read_html(StringIO(html_table))

In [40]:
z[0]

Unnamed: 0,Round,Rank,Opponent,W-L,Score
0,Round of 16,64,Ugo Humbert,L,26 57
1,Round of 32,129,Gianluca Mager,W,46 62 60
2,Round of 64,499,Evan Hoyt,W,60 64


In [None]:
dropdown_xpath = "//ul[contains(@data-value, 'year')]"
year_dropdowns = xpath_parse(q, dropdown_xpath)
year_dropdowns[0]

In [None]:
type(z)

In [None]:
title_content_xpath = "//td[contains(@class, 'title-content')]"
tourney_contents = xpath_parse(q, title_content_xpath)

In [None]:
z = tourney_contents[1]
print(lxml.etree.tostring(z, pretty_print=True).decode())

In [None]:
rel_tourney_xpath = ".//a[contains(@class, 'tourney-title')]/text()"

In [None]:
z.xpath(rel_tourney_xpath)

In [None]:
# `z` is a single tournament cell, so query it with the relative ('.//')
# XPath defined above rather than a name that does not exist yet at this point.
tourney_title = xpath_parse(z, rel_tourney_xpath)

In [None]:
?xpath_parse

In [None]:
tourney_title_xpath = "//a[contains(@class, 'tourney-title')]/text()"
tourney_titles = xpath_parse(q, tourney_title_xpath)
tourney_titles

In [None]:
len(tourney_titles)

In [None]:
tourney_dates_xpath = "//span[contains(@class, 'tourney-dates')]/text()"
tourney_dates = xpath_parse(q, tourney_dates_xpath)
# assert len(tourney_dates) == len(tourney_titles)


#### Some Tourney Months are Missing

In some rare cases in the old data, the tourney month is missing...

In [28]:
from tennis_new.fetch.defs import STORED_DATA_PATH
from tennis_new.fetch.atp_api.defs import TOURNAMENTS_HEADER
from datetime import datetime
import pandas as pd
import os

q = pd.read_csv(
    os.path.join(
        STORED_DATA_PATH,
        'joined.tsv',
    ),
    parse_dates=['date'],
    sep='\t',
    # NOTE(review): the truncated warning saved in this cell's output looks
    # like pandas' mixed-types DtypeWarning. low_memory=False makes pandas
    # infer each column's dtype from the whole file instead of per chunk.
    low_memory=False,
)

  interactivity=interactivity, compiler=compiler, result=result)


In [29]:
q.shape

(198106, 49)

#### Some Dates are in the Future

In [2]:
import pandas as pd
from pathlib import Path 
from tennis_new.fetch.defs import STORED_DATA_PATH

# Load the combined tournaments CSV from the stored-data directory.
tourneys = pd.read_csv(STORED_DATA_PATH / 'tournaments' / 'combined.csv')

In [6]:
weirdos = tourneys[tourneys['tourney_start_date'] > '2019-07-01']

In [14]:
tourneys[tourneys['tourney_name'] == 'Brisbane'].sort_values('tourney_dates')

Unnamed: 0,tourney_year,tourney_order,tourney_name,tourney_id,tourney_slug,tourney_location,tourney_dates,tourney_month,tourney_day,tourney_singles_draw,...,doubles_winner_1_name,doubles_winner_1_url,doubles_winner_1_player_slug,doubles_winner_1_player_id,doubles_winner_2_name,doubles_winner_2_url,doubles_winner_2_player_slug,doubles_winner_2_player_id,tourney_year_id,tourney_start_date
720,1972,81,Brisbane,326.0,singapore,Brisbane,1972.12.03,12,3,64,...,Ross Case,/en/players/ross-case/c020/overview,ross-case,c020,Geoff Masters,/en/players/geoff-masters/m139/overview,geoff-masters,m139,1972-326,1972-12-03
1100,1976,84,Brisbane,326.0,singapore,Brisbane,1976.10.11,10,11,32,...,Syd Ball,/en/players/syd-ball/b106/overview,syd-ball,b106,Kim Warwick,/en/players/kim-warwick/w006/overview,kim-warwick,w006,1976-326,1976-10-11
1198,1977,78,Brisbane,326.0,singapore,Brisbane,1977.10.10,10,10,32,...,Vitas Gerulaitis,/en/players/vitas-gerulaitis/g008/overview,vitas-gerulaitis,g008,Bill Scanlon,/en/players/bill-scanlon/s017/overview,bill-scanlon,s017,1977-326,1977-10-10
1299,1978,75,Brisbane,326.0,singapore,Brisbane,1978.10.09,10,9,32,...,John Alexander,/en/players/john-alexander/a014/overview,john-alexander,a014,Phil Dent,/en/players/phil-dent/d074/overview,phil-dent,d074,1978-326,1978-10-09
1395,1979,74,Brisbane,326.0,singapore,Brisbane,1979.10.09,10,9,32,...,Ross Case,/en/players/ross-case/c020/overview,ross-case,c020,Geoff Masters,/en/players/geoff-masters/m139/overview,geoff-masters,m139,1979-326,1979-10-09
1495,1980,77,Brisbane,326.0,singapore,Brisbane,1980.10.06,10,6,32,...,John McEnroe,/en/players/john-mcenroe/m047/overview,john-mcenroe,m047,Matt Mitchell,/en/players/matt-mitchell/m071/overview,matt-mitchell,m071,1980-326,1980-10-06
1593,1981,72,Brisbane,326.0,singapore,Brisbane,1981.10.05,10,5,32,...,Rod Frawley,/en/players/rod-frawley/f039/overview,rod-frawley,f039,Chris Lewis (NZL),/en/players/chris-lewis-nzl/l024/overview,chris-lewis-nzl,l024,1981-326,1981-10-05
1788,1983,65,Brisbane,326.0,singapore,Brisbane,1983.10.03,10,3,32,...,Pat Cash,/en/players/pat-cash/c023/overview,pat-cash,c023,Paul McNamee,/en/players/paul-mcnamee/m050/overview,paul-mcnamee,m050,1983-326,1983-10-03
1863,1984,55,Brisbane,326.0,singapore,Brisbane,1984.10.01,10,1,32,...,Francisco Gonzalez,/en/players/francisco-gonzalez/g024/overview,francisco-gonzalez,g024,Matt Mitchell,/en/players/matt-mitchell/m071/overview,matt-mitchell,m071,1984-326,1984-10-01
1938,1985,56,Brisbane,326.0,singapore,Brisbane,1985.10.07,10,7,32,...,Marty Davis,/en/players/marty-davis/d005/overview,marty-davis,d005,Brad Drewett,/en/players/brad-drewett/d040/overview,brad-drewett,d040,1985-326,1985-10-07


In [8]:
# Tournaments dated 31 December (tourney_month == 12 and tourney_day == 31).
tourneys[
    (tourneys['tourney_month'] == 12) &
    (tourneys['tourney_day'] == 31)
]

Unnamed: 0,tourney_year,tourney_order,tourney_name,tourney_id,tourney_slug,tourney_location,tourney_dates,tourney_month,tourney_day,tourney_singles_draw,...,doubles_winner_1_name,doubles_winner_1_url,doubles_winner_1_player_slug,doubles_winner_1_player_id,doubles_winner_2_name,doubles_winner_2_url,doubles_winner_2_player_slug,doubles_winner_2_player_id,tourney_year_id,tourney_start_date
69,2018,1,Brisbane,339.0,brisbane,"Brisbane, Australia",2017.12.31,12,31,28,...,Henri Kontinen,/en/players/henri-kontinen/ka80/overview,henri-kontinen,ka80,John Peers,/en/players/john-peers/pc96/overview,john-peers,pc96,2018-339,2018-12-31
138,2019,1,Qatar ExxonMobil Open,451.0,doha,"Doha, Qatar",2018.12.31,12,31,32,...,David Goffin,/en/players/david-goffin/gb88/overview,david-goffin,gb88,Pierre-Hugues Herbert,/en/players/pierre-hugues-herbert/h996/overview,pierre-hugues-herbert,h996,2019-451,2019-12-31
139,2019,2,Brisbane International,339.0,brisbane,"Brisbane, Australia",2018.12.31,12,31,28,...,Marcus Daniell,/en/players/marcus-daniell/d763/overview,marcus-daniell,d763,Wesley Koolhof,/en/players/wesley-koolhof/kc41/overview,wesley-koolhof,kc41,2019-339,2019-12-31
140,2019,3,Tata Open Maharashtra,891.0,pune,"Pune, India",2018.12.31,12,31,28,...,Rohan Bopanna,/en/players/rohan-bopanna/b757/overview,rohan-bopanna,b757,Divij Sharan,/en/players/divij-sharan/sd46/overview,divij-sharan,sd46,2019-891,2019-12-31
479,1969,1,Perth,243.0,perth,"Perth, Australia",1968.12.31,12,31,32,...,,,,,,,,,1969-243,1969-12-31
1018,1976,2,Auckland,301.0,auckland,Auckland,1975.12.31,12,31,32,...,,,,,,,,,1976-301,1976-12-31
1419,1980,1,Hobart,713.0,hobart,Hobart,1979.12.31,12,31,32,...,John James,/en/players/john-james/j029/overview,john-james,j029,Chris Kachel,/en/players/chris-kachel/k044/overview,chris-kachel,k044,1980-713,1980-12-31
2347,1991,1,Adelaide,7308.0,adelaide,Adelaide,1990.12.31,12,31,32,...,Wayne Ferreira,/en/players/wayne-ferreira/f196/overview,wayne-ferreira,f196,Stefan Kruger,/en/players/stefan-kruger/k028/overview,stefan-kruger,k028,1991-7308,1991-12-31
2348,1991,2,Wellington,354.0,wellington,Wellington,1990.12.31,12,31,32,...,Luiz Mattar,/en/players/luiz-mattar/m035/overview,luiz-mattar,m035,Nicolas Pereira,/en/players/nicolas-pereira/p218/overview,nicolas-pereira,p218,1991-354,1991-12-31
3251,2002,1,Adelaide,7308.0,adelaide,Adelaide,2001.12.31,12,31,32,...,Wayne Black,/en/players/wayne-black/b499/overview,wayne-black,b499,Kevin Ullyett,/en/players/kevin-ullyett/u024/overview,kevin-ullyett,u024,2002-7308,2002-12-31


In [7]:
weirdos

Unnamed: 0,tourney_year,tourney_order,tourney_name,tourney_id,tourney_slug,tourney_location,tourney_dates,tourney_month,tourney_day,tourney_singles_draw,...,doubles_winner_1_name,doubles_winner_1_url,doubles_winner_1_player_slug,doubles_winner_1_player_id,doubles_winner_2_name,doubles_winner_2_url,doubles_winner_2_player_slug,doubles_winner_2_player_id,tourney_year_id,tourney_start_date
138,2019,1,Qatar ExxonMobil Open,451.0,doha,"Doha, Qatar",2018.12.31,12,31,32,...,David Goffin,/en/players/david-goffin/gb88/overview,david-goffin,gb88,Pierre-Hugues Herbert,/en/players/pierre-hugues-herbert/h996/overview,pierre-hugues-herbert,h996,2019-451,2019-12-31
139,2019,2,Brisbane International,339.0,brisbane,"Brisbane, Australia",2018.12.31,12,31,28,...,Marcus Daniell,/en/players/marcus-daniell/d763/overview,marcus-daniell,d763,Wesley Koolhof,/en/players/wesley-koolhof/kc41/overview,wesley-koolhof,kc41,2019-339,2019-12-31
140,2019,3,Tata Open Maharashtra,891.0,pune,"Pune, India",2018.12.31,12,31,28,...,Rohan Bopanna,/en/players/rohan-bopanna/b757/overview,rohan-bopanna,b757,Divij Sharan,/en/players/divij-sharan/sd46/overview,divij-sharan,sd46,2019-891,2019-12-31
177,2019,40,Hall of Fame Open,,,"Newport, United States",2019.07.15,7,15,28,...,,,,,,,,,2019-,2019-07-15
178,2019,41,Plava Laguna Croatia Open Umag,,,"Umag, Croatia",2019.07.15,7,15,28,...,,,,,,,,,2019-,2019-07-15
179,2019,42,Swedish Open,,,"Bastad, Sweden",2019.07.15,7,15,28,...,,,,,,,,,2019-,2019-07-15
180,2019,43,Hamburg European Open,,,"Hamburg, Germany",2019.07.22,7,22,32,...,,,,,,,,,2019-,2019-07-22
181,2019,44,BB&T Atlanta Open,,,"Atlanta, United States",2019.07.22,7,22,28,...,,,,,,,,,2019-,2019-07-22
182,2019,45,J.Safra Sarasin Swiss Open Gstaad,,,"Gstaad, Switzerland",2019.07.22,7,22,28,...,,,,,,,,,2019-,2019-07-22
183,2019,46,Citi Open,,,"Washington, United States",2019.07.29,7,29,48,...,,,,,,,,,2019-,2019-07-29


#### Some Tourney Dates are Missing

In [51]:
from pathlib import Path
from tennis_new.fetch.defs import STORED_DATA_PATH
from tennis_new.fetch.atp_api.defs import TOURNAMENTS_HEADER
import pandas as pd

# The file has no header row, so column names are supplied from
# TOURNAMENTS_HEADER.
static_tourneys = pd.read_csv(
    STORED_DATA_PATH / 'tournaments' / 'tournaments_1877-2017_UNINDEXED.csv',
    names=TOURNAMENTS_HEADER,
    header=None,
)

In [60]:
# We have non-unique tourney_year_ids...
static_tourneys['tourney_year_id'].value_counts().head()

1975-       2
1976-414    1
2000-499    1
1982-316    1
2003-341    1
Name: tourney_year_id, dtype: int64

Above, we see we have two tournaments with tourney_year_id == '1975-'.  This means they have NaN tourney_id.  What's up with this!?

In [58]:
static_tourneys[
    static_tourneys['tourney_year_id'] == '1975-'
].iloc[0]

tourney_year                                                     1975
tourney_order                                                      43
tourney_name                                                Tokyo WCT
tourney_id                                                        NaN
tourney_slug                                                      NaN
tourney_location                                                Tokyo
tourney_dates                                              1975.04.14
tourney_month                                                       4
tourney_day                                                        14
tourney_singles_draw                                               32
tourney_doubles_draw                                               32
tourney_conditions                                            Outdoor
tourney_surface                                                  Clay
tourney_fin_commit                                            $60,000
tourney_url_suffix  

It looks like we don't have match results from this tournament, although they are available on the ATP website!!!

In [52]:
static_tourneys[
    static_tourneys['tourney_dates'].isnull()
]

Unnamed: 0,tourney_year,tourney_order,tourney_name,tourney_id,tourney_slug,tourney_location,tourney_dates,tourney_month,tourney_day,tourney_singles_draw,...,singles_winner_player_id,doubles_winner_1_name,doubles_winner_1_url,doubles_winner_1_player_slug,doubles_winner_1_player_id,doubles_winner_2_name,doubles_winner_2_url,doubles_winner_2_player_slug,doubles_winner_2_player_id,tourney_year_id


In [53]:
static_tourneys['tourney_dates']

0       1877.07.09
1       1878.07.08
2       1879.07.07
3       1880.07.05
4       1881.07.02
5       1881.08.31
6       1882.07.08
7       1882.08.30
8       1883.07.07
9       1883.08.21
10      1884.07.05
11      1884.08.27
12      1885.07.04
13      1885.08.18
14      1886.07.03
15      1886.08.23
16      1887.07.02
17      1887.08.22
18      1888.07.10
19      1888.08.20
20      1889.07.01
21      1889.08.21
22      1890.06.30
23      1890.08.27
24      1891.06.29
25      1891.08.18
26      1892.06.27
27      1892.08.23
28      1893.07.10
29      1893.08.22
           ...    
4083    2017.06.25
4084    2017.06.26
4085    2017.07.03
4086    2017.07.17
4087    2017.07.17
4088    2017.07.17
4089    2017.07.24
4090    2017.07.24
4091    2017.07.24
4092    2017.07.31
4093    2017.07.31
4094    2017.07.31
4095    2017.08.07
4096    2017.08.13
4097    2017.08.20
4098    2017.08.28
4099    2017.09.18
4100    2017.09.18
4101    2017.09.25
4102    2017.09.25
4103    2017.10.02
4104    2017

In [39]:
static_tourneys = static_tourneys[static_tourneys['tourney_dates'].notnull()]

In [41]:
# WARNING: this writes to the same path that `static_tourneys` was read from,
# so the file on disk is replaced by the current contents of the frame.
# index=False / header=False keep the header-less layout that the read with
# header=None expects.
static_tourneys.to_csv(
    Path.joinpath(STORED_DATA_PATH, 'tournaments', 'tournaments_1877-2017_UNINDEXED.csv'),
    index=False,
    header=False
)

The above tournament does not have a date... Do we have match info from this tournament?

In [32]:
match_results = pd.read_csv(
    Path.joinpath(STORED_DATA_PATH, 'match_scores', 'combined.csv'),
    # NOTE(review): the truncated warning saved in this cell's output looks
    # like pandas' mixed-types DtypeWarning. low_memory=False makes pandas
    # infer each column's dtype from the whole file instead of per chunk.
    low_memory=False,
)

  interactivity=interactivity, compiler=compiler, result=result)


In [61]:
match_results[['winner_name', 'loser_name']].head()

Unnamed: 0,winner_name,loser_name
0,Novak Djokovic,Rafael Nadal
1,Novak Djokovic,Tomas Berdych
2,Rafael Nadal,Illya Marchenko
3,Novak Djokovic,Leonardo Mayer
4,Rafael Nadal,Andrey Kuznetsov


In [66]:
match_results.iloc[0]

tourney_year_id                                                    2016-451
tourney_order                                                             1
tourney_slug                                                           doha
tourney_url_suffix                 /en/scores/archive/doha/451/2016/results
tourney_round_name                                                   Finals
round_order                                                               1
match_order                                                               1
winner_name                                                  Novak Djokovic
winner_player_id                                                       d643
winner_slug                                                  novak-djokovic
loser_name                                                     Rafael Nadal
loser_player_id                                                        n409
loser_slug                                                     rafael-nadal
winner_seed 

In [74]:
# Matches where the winner's name contains 'Lutz' and the loser's contains
# 'Holecek'. The vectorised string test treats a missing name as "no match",
# whereas `'Lutz' in x` raises TypeError when x is NaN.
match_results[
    match_results['winner_name'].str.contains('Lutz', regex=False, na=False) &
    match_results['loser_name'].str.contains('Holecek', regex=False, na=False)
][['tourney_year_id', 'winner_name', 'loser_name', 'match_score_tiebreaks']]

Unnamed: 0,tourney_year_id,winner_name,loser_name,match_score_tiebreaks
43788,1973-813,Robert Lutz,Milan Holecek,76(6) 64
43890,1973-802,Robert Lutz,Milan Holecek,62 62
44685,1973-254,Robert Lutz,Milan Holecek,63 46 61
48137,1974-729,Robert Lutz,Milan Holecek,76 64


In [57]:
# NOTE(review): '197-' has only three digits before the dash, unlike the other
# tourney_year_id values shown in this notebook (e.g. '1975-'); confirm this
# is the intended id before relying on the empty result.
match_results[match_results['tourney_year_id'] == '197-']

Unnamed: 0,tourney_year_id,tourney_order,tourney_slug,tourney_url_suffix,tourney_round_name,round_order,match_order,winner_name,winner_player_id,winner_slug,...,loser_seed,match_score_tiebreaks,winner_sets_won,loser_sets_won,winner_games_won,loser_games_won,winner_tiebreaks_won,loser_tiebreaks_won,match_id,match_stats_url_suffix


There are no match results here, so we can safely remove this tournament...

In [35]:
match_results.head()

Unnamed: 0,tourney_year_id,tourney_order,tourney_slug,tourney_url_suffix,tourney_round_name,round_order,match_order,winner_name,winner_player_id,winner_slug,...,loser_seed,match_score_tiebreaks,winner_sets_won,loser_sets_won,winner_games_won,loser_games_won,winner_tiebreaks_won,loser_tiebreaks_won,match_id,match_stats_url_suffix
0,2016-451,1,doha,/en/scores/archive/doha/451/2016/results,Finals,1,1,Novak Djokovic,d643,novak-djokovic,...,2.0,61 62,2,0,12,3,0,0,2016-451-d643-n409,/en/scores/2016/451/MS001/match-stats?isLive=F...
1,2016-451,1,doha,/en/scores/archive/doha/451/2016/results,Semi-Finals,2,1,Novak Djokovic,d643,novak-djokovic,...,3.0,63 76(3),2,0,13,9,1,0,2016-451-d643-ba47,/en/scores/2016/451/MS002/match-stats?isLive=F...
2,2016-451,1,doha,/en/scores/archive/doha/451/2016/results,Semi-Finals,2,2,Rafael Nadal,n409,rafael-nadal,...,,63 64,2,0,12,7,0,0,2016-451-n409-me89,/en/scores/2016/451/MS003/match-stats?isLive=F...
3,2016-451,1,doha,/en/scores/archive/doha/451/2016/results,Quarter-Finals,3,1,Novak Djokovic,d643,novak-djokovic,...,8.0,63 75,2,0,13,8,0,0,2016-451-d643-md56,/en/scores/2016/451/MS004/match-stats?isLive=F...
4,2016-451,1,doha,/en/scores/archive/doha/451/2016/results,Quarter-Finals,3,2,Rafael Nadal,n409,rafael-nadal,...,,63 57 64,2,1,17,14,0,0,2016-451-n409-kb54,/en/scores/2016/451/MS007/match-stats?isLive=F...


In [36]:
static_tourneys[static_tourneys['tourney_year_id'] == '2016-451']

Unnamed: 0,tourney_year,tourney_order,tourney_name,tourney_id,tourney_slug,tourney_location,tourney_dates,tourney_month,tourney_day,tourney_singles_draw,...,singles_winner_player_id,doubles_winner_1_name,doubles_winner_1_url,doubles_winner_1_player_slug,doubles_winner_1_player_id,doubles_winner_2_name,doubles_winner_2_url,doubles_winner_2_player_slug,doubles_winner_2_player_id,tourney_year_id
3982,2016,3,Doha,451.0,doha,"Doha, Qatar",2016.01.04,1.0,4.0,32,...,d643,Feliciano Lopez,/en/players/feliciano-lopez/l397/overview,feliciano-lopez,l397,Marc Lopez,/en/players/marc-lopez/l480/overview,marc-lopez,l480,2016-451


In [76]:
static_tourneys[
    static_tourneys['tourney_url_suffix'].isnull()
]

Unnamed: 0,tourney_year,tourney_order,tourney_name,tourney_id,tourney_slug,tourney_location,tourney_dates,tourney_month,tourney_day,tourney_singles_draw,...,singles_winner_player_id,doubles_winner_1_name,doubles_winner_1_url,doubles_winner_1_player_slug,doubles_winner_1_player_id,doubles_winner_2_name,doubles_winner_2_url,doubles_winner_2_player_slug,doubles_winner_2_player_id,tourney_year_id
383,1971,14,Nice,,,Nice,1971.04.01,4.0,1.0,64,...,n008,,,,,,,,,1971-
462,1972,29,Nice,,,Nice,1972.04.23,4.0,23.0,64,...,n008,Jan Kodes,/en/players/jan-kodes/k049/overview,jan-kodes,k049,Stan Smith,/en/players/stan-smith/s060/overview,stan-smith,s060,1972-
552,1973,37,Nice,,,Nice,1973.04.15,4.0,15.0,32,...,o017,,,,,,,,,1973-
650,1974,37,Tokyo WCT,,,Tokyo,1974.04.08,4.0,8.0,32,...,l058,Raymond Moore,/en/players/raymond-moore/m118/overview,raymond-moore,m118,Onny Parun,/en/players/onny-parun/p070/overview,onny-parun,p070,1974-
754,1975,43,Tokyo WCT,,,Tokyo,1975.04.14,4.0,14.0,32,...,l045,Robert Lutz,/en/players/robert-lutz/l045/overview,robert-lutz,l045,Stan Smith,/en/players/stan-smith/s060/overview,stan-smith,s060,1975-
760,1975,49,Nice,,,Nice,1975.04.28,4.0,28.0,64,...,c093,,,,,,,,,1975-
849,1976,39,Nice,,,Nice,1976.04.05,4.0,5.0,64,...,b007,Patrice Dominguez,/en/players/patrice-dominguez/d080/overview,patrice-dominguez,d080,Francois Jauffret,/en/players/francois-jauffret/j031/overview,francois-jauffret,j031,1976-
944,1977,30,Nice,,,Nice,1977.03.28,3.0,28.0,32,...,b058,Ion Tiriac,/en/players/ion-tiriac/t040/overview,ion-tiriac,t040,Guillermo Vilas,/en/players/guillermo-vilas/v028/overview,guillermo-vilas,v028,1977-
1044,1978,26,Nice,,,Nice,1978.04.17,4.0,17.0,32,...,h019,Patrice Dominguez,/en/players/patrice-dominguez/d080/overview,patrice-dominguez,d080,Francois Jauffret,/en/players/francois-jauffret/j031/overview,francois-jauffret,j031,1978-
1138,1979,23,Nice,,,Nice,1979.04.02,4.0,2.0,32,...,p015,Paul McNamee,/en/players/paul-mcnamee/m050/overview,paul-mcnamee,m050,Peter McNamara,/en/players/peter-mcnamara/m051/overview,peter-mcnamara,m051,1979-


In [44]:
from tennis_new.fetch.atp_api.defs import API_RESULTS_DIR

# Load the 2017-2019 tournaments CSV from the API results directory.
q = pd.read_csv(Path(API_RESULTS_DIR) / 'tournaments' / 'tournaments_2017-2019.csv')

In [46]:
q['tourney_dates'].isnull().value_counts()

False    206
Name: tourney_dates, dtype: int64