# Import Libraries

In [1]:
import sys
from lxml import html
import requests
import re
import csv
import time
import numbers
import pandas as pd

# Tournaments Data

## Functions

In [2]:
# For writing final scraped data to csv
def array2csv(array, filename):
    """Write a sequence of rows to ``filename`` as comma-separated values.

    Args:
        array: Iterable of rows; each row is an iterable of cell values.
        filename: Path of the CSV file to create (an existing file is overwritten).
    """
    # newline='' hands line-ending control to the csv module; without it every
    # row is followed by a blank line on Windows. The explicit encoding keeps
    # non-ASCII names writable regardless of the platform default.
    with open(filename, "w", newline="", encoding="utf-8") as my_csv:
        csv.writer(my_csv, delimiter=',').writerows(array)

# To get source code of defined url
def html_parse_tree(url, timeout=30):
    """Download ``url`` and parse the response body into an lxml HTML tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. ``requests``
            waits indefinitely when no timeout is given, which would stall a
            long scraping run on a single unresponsive request.

    Returns:
        The root element produced by ``lxml.html.fromstring``.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    page = requests.get(url, timeout=timeout)
    return html.fromstring(page.content)

# Extracting specific xpath location from the source code
def xpath_parse(tree, xpath):
    """Evaluate the ``xpath`` expression against ``tree`` and return whatever it yields."""
    return tree.xpath(xpath)

# Removing redundant empty spaces in string
def strip_string(string):
    """Return ``string`` with newlines, carriage returns and tabs removed and outer whitespace trimmed."""
    for control_char in ('\n', '\r', '\t'):
        string = re.sub(control_char, '', string).strip()
    return string

# Mapping strip_string method onto an array
def strip_array(array):
    """Clean every entry of ``array`` in place with ``strip_string`` and return the same list."""
    for position, raw_value in enumerate(array):
        array[position] = strip_string(raw_value).strip()
    return array

In [3]:
def tournaments(year):
    """Scrape the ATP results-archive page for one season.

    Args:
        year: Season as a string (e.g. ``'1990'``); it is concatenated into the
            archive URL and into the progress line printed at the end.

    Returns:
        A list with one entry per tournament row, in page order. Each entry is
        ``[tourney_year_id, tourney_order, tourney_type, tourney_name,
        tourney_id, tourney_slug, tourney_location, tourney_date, year,
        tourney_conditions, tourney_surface]``. Fields that are missing on the
        page are stored as empty strings.

    Raises:
        IndexError: If a row lacks the name/location/date spans, or its details
            link has fewer path segments than expected.
    """
    # Badge image path -> tournament category label
    badge_dir = '/assets/atpwt/images/tournament/badges/'
    badge_types = {
        badge_dir + 'categorystamps_grandslam.png': 'Grand Slam',
        badge_dir + 'categorystamps_finals.svg': 'ATP Finals',
        badge_dir + 'categorystamps_1000.png': 'Masters 1000',
        badge_dir + 'categorystamps_500.png': 'ATP 500',
        badge_dir + 'categorystamps_250.png': 'ATP 250',
        badge_dir + 'categorystamps_lvr.png': 'Laver Cup',
        badge_dir + 'categorystamps_nextgen.svg': 'Next Gen Finals',
    }

    # HTML tree of the season's archive page
    year_url = "http://www.atptour.com/en/scores/results-archive?year=" + year
    year_tree = html_parse_tree(year_url)

    # The number of tournament-title spans gives the number of rows to read
    tourney_titles = xpath_parse(year_tree, "//span[contains(@class, 'tourney-title')]/text()")
    tourney_count = len(tourney_titles)

    # Iterate through each row in the tournaments table (XPath positions are 1-based)
    output = []
    for tourney_order in range(1, tourney_count + 1):
        row_xpath = "//tr[contains(@class, 'tourney-result')][" + str(tourney_order) + "]"

        # Tournament type: '' when the row has no badge, 'undefined' for an unknown badge
        badge_src = xpath_parse(year_tree, row_xpath + "/td[2]/img[contains(@alt, 'tournament badge')]/@src")
        if badge_src:
            tourney_type = badge_types.get(badge_src[0], 'undefined')
        else:
            tourney_type = ''

        # Tournament name, location, and start date
        tourney_info = strip_array(xpath_parse(year_tree, row_xpath + "/td[3]/span/text()"))
        tourney_name = tourney_info[0]
        tourney_location = tourney_info[1]
        tourney_date = tourney_info[2]

        # Conditions (text of the Indoor/Outdoor cell) and surface (its <span>)
        court_xpath = row_xpath + "/td[5]/div/div[contains(., 'Outdoor') or contains(., 'Indoor')]"
        conditions = strip_array(xpath_parse(year_tree, court_xpath + "/text()[normalize-space()]"))
        tourney_conditions = conditions[0] if conditions else ''
        surfaces = strip_array(xpath_parse(year_tree, court_xpath + "/span/text()[normalize-space()]"))
        tourney_surface = surfaces[0] if surfaces else ''

        # Slug and id come from path segments 4 and 5 of the results link, when present
        details_href = xpath_parse(year_tree, row_xpath + "/td[8]/a/@href")
        if details_href:
            url_parts = details_href[0].split('/')
            tourney_slug = url_parts[4]
            tourney_id = url_parts[5]
        else:
            tourney_slug = ''
            tourney_id = ''

        # Store data
        tourney_year_id = str(year) + '-' + tourney_id
        output.append([tourney_year_id, tourney_order, tourney_type,
                       tourney_name, tourney_id, tourney_slug, tourney_location,
                       tourney_date, year, tourney_conditions, tourney_surface])

    # Output progress
    print(year + '    ' + str(tourney_count))

    # Output data
    return output

## Output To CSV

### 1968 - 1989

In [37]:
# Progress table header; tournaments() prints one row per season beneath it
print('')
print('Year    Tournaments')
print('----    -----------')

start_year = '1968'
end_year = '1989'

# Gather the rows of every season in the inclusive range into one flat list
tourney_data = []
for season in range(int(start_year), int(end_year) + 1):
    year = str(season)
    tourney_data.extend(tournaments(year))

# Output to CSV
filename = f"tournaments_{start_year}-{end_year}.csv"
array2csv(tourney_data, filename)


Year    Tournaments
----    -----------
1968    15
1969    61
1970    86
1971    81
1972    93
1973    110
1974    105
1975    109
1976    112
1977    109
1978    101
1979    102
1980    104
1981    99
1982    108
1983    84
1984    75
1985    75
1986    71
1987    78
1988    81
1989    78


### 1990 - 2019

In [38]:
# Progress table header; tournaments() prints one row per season beneath it
print('')
print('Year    Tournaments')
print('----    -----------')

start_year = '1990'
end_year = '2019'

# Gather the rows of every season in the inclusive range into one flat list
tourney_data = []
for season in range(int(start_year), int(end_year) + 1):
    year = str(season)
    tourney_data.extend(tournaments(year))

# Output to CSV
filename = f"tournaments_{start_year}-{end_year}.csv"
array2csv(tourney_data, filename)


Year    Tournaments
----    -----------
1990    82
1991    83
1992    87
1993    92
1994    93
1995    88
1996    86
1997    82
1998    81
1999    73
2000    72
2001    71
2002    67
2003    68
2004    69
2005    68
2006    68
2007    67
2008    67
2009    66
2010    66
2011    66
2012    67
2013    65
2014    65
2015    66
2016    67
2017    69
2018    69
2019    68


# Matches Data

## Functions

In [7]:
# For writing final scraped data to csv
def array2csv(array, filename):
    """Write a sequence of rows to ``filename`` as comma-separated values.

    Args:
        array: Iterable of rows; each row is an iterable of cell values.
        filename: Path of the CSV file to create (an existing file is overwritten).
    """
    # newline='' hands line-ending control to the csv module; without it every
    # row is followed by a blank line on Windows. The explicit encoding keeps
    # non-ASCII names writable regardless of the platform default.
    with open(filename, "w", newline="", encoding="utf-8") as my_csv:
        csv.writer(my_csv, delimiter=',').writerows(array)

# To get source code of defined url
def html_parse_tree(url, timeout=30):
    """Download ``url`` and parse the response body into an lxml HTML tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. ``requests``
            waits indefinitely when no timeout is given, which would stall a
            long scraping run on a single unresponsive request.

    Returns:
        The root element produced by ``lxml.html.fromstring``.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    page = requests.get(url, timeout=timeout)
    return html.fromstring(page.content)

# Extracting specific xpath location from the source code
def xpath_parse(tree, xpath):
    """Evaluate the ``xpath`` expression against ``tree`` and return whatever it yields."""
    return tree.xpath(xpath)

# Removing redundant empty spaces in string
def regex_strip_string(string):
    """Return ``string`` with newlines, carriage returns and tabs removed and outer whitespace trimmed."""
    for control_char in ('\n', '\r', '\t'):
        string = re.sub(control_char, '', string).strip()
    return string

# Mapping regex_strip_string onto every element of an array
def regex_strip_array(array):
    """Clean every entry of ``array`` in place with ``regex_strip_string`` and return the same list."""
    for position, raw_value in enumerate(array):
        array[position] = regex_strip_string(raw_value).strip()
    return array

def format_spacing(max_spacing, variable):
    """Return the run of spaces that pads ``variable`` out to ``max_spacing`` characters.

    The result is empty when ``variable`` is already at least that long.
    """
    missing = max_spacing - len(variable)
    return ' ' * missing

def fraction_stats(string):
    """Drop every parenthesis from ``string`` and split what remains on '/'."""
    without_parens = string.replace('(', '').replace(')', '')
    return without_parens.split('/')

In [8]:
# Number of stat columns stored per player in a scraped row
_STATS_PER_PLAYER = 26


def _side_stats(stats, ratings):
    """Convert one player's scraped stat strings to ints, in output-row order."""

    def won_total(index):
        # "(won/total)" cell -> [won, total]
        parts = fraction_stats(stats[index])
        return [int(parts[0]), int(parts[1])]

    return (
        [int(ratings[0]), int(stats[2]), int(stats[3])]  # serve rating, aces, double faults
        + won_total(5)        # first serves in / total
        + won_total(7)        # first-serve points won / total
        + won_total(9)        # second-serve points won / total
        + won_total(11)       # break points saved / faced on serve
        + [int(stats[12])]    # service games played
        + [int(ratings[1])]   # return rating
        + won_total(16)       # first-serve return points won / total
        + won_total(18)       # second-serve return points won / total
        + won_total(20)       # break points converted / chances on return
        + [int(stats[21])]    # return games played
        + won_total(23)       # service points won / total
        + won_total(25)       # return points won / total
        + won_total(27)       # total points won / total
    )


def scrape_match_stats(match_stats_url):
    """Scrape one match-stats page into a flat row.

    NOTE: besides its argument, this function reads module-level names that the
    calling loop must have set for the current match: ``match_stats_url_suffix``,
    ``tourney_year``, ``tourney_slug``, ``round_match_id``, ``winner_player_id``
    and ``loser_player_id``.

    Args:
        match_stats_url: Full URL of the match-stats page.

    Returns:
        A list of 59 values: ``match_id``, ``tourney_slug``, the stats URL
        suffix, match time (string), match duration in minutes, the winner's
        slug followed by 26 winner stats, then the loser's slug followed by 26
        loser stats. Any group that cannot be parsed is filled with empty
        strings; a blank duration (index 4) is what the caller checks to detect
        matches without stats.

    Raises:
        IndexError: If ``match_stats_url_suffix`` has fewer than six
            '/'-separated segments.
    """
    match_tree = html_parse_tree(match_stats_url)

    # Match time: "Time: HH:MM:..." -> time string plus duration in minutes.
    # Best-effort by design: any parsing problem blanks both fields.
    try:
        match_time_parsed = xpath_parse(match_tree, "//td[contains(@class, 'time')]/text()")
        match_time = regex_strip_array(match_time_parsed)[0].replace("Time: ", "")
        match_time_split = match_time.split(":")
        match_duration = 60 * int(match_time_split[0]) + int(match_time_split[1])
    except Exception:
        match_time = ""
        match_duration = ""

    # Match info, taken from the stats URL suffix set by the caller
    suffix_parts = match_stats_url_suffix.split('/')
    tourney_id = suffix_parts[4]
    match_index = suffix_parts[5]

    # The winner's slug is read from the left-hand name block, the loser's from
    # the right-hand one.
    try:
        winner_slug_parsed = xpath_parse(match_tree, "//div[@class='player-left-name']/a/@href")
        winner_slug = winner_slug_parsed[0].split('/')[4]
    except Exception:
        winner_slug = ''

    try:
        loser_slug_parsed = xpath_parse(match_tree, "//div[@class='player-right-name']/a/@href")
        loser_slug = loser_slug_parsed[0].split('/')[4]
    except Exception:
        loser_slug = ''

    match_id = (tourney_year + "-" + tourney_id + "-" + match_index + "-" + round_match_id
                + "-" + winner_player_id + "-" + loser_player_id)

    # Match stats. Because the winner is identified by the left-hand name block,
    # the left-hand stats column holds the winner's figures and the right-hand
    # column the loser's. All-or-nothing: if any single value is missing or
    # non-numeric, both players' stats are left blank.
    try:
        left_stats = regex_strip_array(
            xpath_parse(match_tree, "//td[@class='match-stats-number-left']/span/text()"))
        right_stats = regex_strip_array(
            xpath_parse(match_tree, "//td[@class='match-stats-number-right']/span/text()"))
        left_ratings = xpath_parse(match_tree, "//td[@class='match-stats-number-left']/span/a/text()")
        right_ratings = xpath_parse(match_tree, "//td[@class='match-stats-number-right']/span/a/text()")

        winner_stats = _side_stats(left_stats, left_ratings)
        loser_stats = _side_stats(right_stats, right_ratings)
    except Exception:
        winner_stats = [''] * _STATS_PER_PLAYER
        loser_stats = [''] * _STATS_PER_PLAYER

    # Store data
    output = ([match_id, tourney_slug, match_stats_url_suffix, match_time, match_duration]
              + [winner_slug] + winner_stats
              + [loser_slug] + loser_stats)
    return output

## Output To CSV

### 1990 - 2019

In [1]:
# Command line input
year = '2019'

# Setup
year_url = "https://www.atptour.com/en/scores/results-archive?year=" + year
url_prefix = "https://www.atptour.com"

# Tourney count
year_tree = html_parse_tree(year_url)
tourney_details_url_xpath = "//tr[contains(@class, 'tourney-result')][*]/td[8]/a/@href"
tourney_url_suffixes = xpath_parse(year_tree, tourney_details_url_xpath)
tourney_count = len(tourney_url_suffixes)

try: start_index = str(sys.argv[2])
except Exception: start_index = str(0)

try: end_index = str(int(sys.argv[3]) + 1)
except Exception: end_index = str(tourney_count)

# Command line output
print('')
print('Collecting match stats data for ' + '\x1b[0;32;40m' + str(tourney_count) + '\x1b[0m' + ' tournaments:')
print('')
print('Index    Tourney slug           Matches')
print('-----    ------------           -------')

# Iterate through each tournament
match_stats_data_scrape = []
for i in range(int(start_index), int(end_index)):

    # Parse tourney tree
    tourney_url_suffix = tourney_url_suffixes[i]
    tourney_url = url_prefix + tourney_url_suffix
    tourney_tree = html_parse_tree(tourney_url)

    # Extract tourney details
    url_split = tourney_url.split("/")
    tourney_slug = url_split[6]
    tourney_year = url_split[8]
    tourney_id = url_split[7]
    tourney_index = str(i)

    # Tourney round count
    tourney_round_name_xpath = "//table[contains(@class, 'day-table')]/thead/tr/th/text()"
    tourney_round_name_parsed = xpath_parse(tourney_tree, tourney_round_name_xpath)
    tourney_round_count = len(tourney_round_name_parsed)
    
    # Match stats URL XPath
    match_stats_url_xpath = "//table[contains(@class, 'day-table')]/tbody[*]/tr[*]/td[contains(@class, 'day-table-score')]/a/@href"
    match_stats_url_cleaned = xpath_parse(tourney_tree, match_stats_url_xpath)
    # Filter problematic URL's
    match_stats_url_suffixes = []
    for foo in match_stats_url_cleaned:
        if foo.find('//') == -1:
            match_stats_url_suffixes.append(foo)

    # Total match count
    total_matches = len(match_stats_url_suffixes)

    # Output tournaments with different match structure
    if total_matches == 0:
        spacing1 = format_spacing(5, tourney_index)
        spacing2 = format_spacing(19, tourney_slug)
        sys.stdout.write('\r' + tourney_index + spacing1 + '    ' + tourney_slug + spacing2 + '    ' + '\x1b[1;31m' + 'Match structure/stats URL problem' + '\x1b[0m')        
    else:
        # Iterate through each round
        output = []
        match_counter = 0
        alt_counter = 0
        for j in range(0, tourney_round_count):

            # Round order and match count
            round_order = j + 1
            #tourney_round_name = tourney_round_name_parsed[j]
            round_match_count_xpath = "//table[contains(@class, 'day-table')]/tbody[" + str(j + 1) + "]/tr/td[contains(@class, 'day-table-name')][1]/a/text()"
            round_match_count_parsed = xpath_parse(tourney_tree, round_match_count_xpath)
            round_match_count = len(round_match_count_parsed)

            # Iterate through each match
            for k in range(0, round_match_count):

                # Match order and round match ID
                match_order = k + 1
                round_match_id = str(tourney_round_count - j) + '-' + str(round_match_count - k)

                # Winner info
                winner_name_xpath = "//table[contains(@class, 'day-table')]/tbody[" + str(j + 1) + "]/tr[" + str(k + 1) + "]/td[contains(@class, 'day-table-name')][1]/a/text()"
                winner_name_parsed = xpath_parse(tourney_tree, winner_name_xpath)
                winner_url_xpath = "//table[contains(@class, 'day-table')]/tbody[" + str(j + 1) + "]/tr[" + str(k + 1) + "]/td[contains(@class, 'day-table-name')][1]/a/@href"
                winner_url_parsed = xpath_parse(tourney_tree, winner_url_xpath)
                winner_name = winner_name_parsed[0]
                winner_url = winner_url_parsed[0]
                winner_url_split = winner_url.split('/')
                winner_slug = winner_url_split[3]
                winner_player_id = winner_url_split[4]      

                # Loser info
                loser_name_xpath = "//table[contains(@class, 'day-table')]/tbody[" + str(j + 1) + "]/tr[" + str(k + 1) + "]/td[contains(@class, 'day-table-name')][2]/a/text()"
                loser_name_parsed = xpath_parse(tourney_tree, loser_name_xpath)
                loser_url_xpath = "//table[contains(@class, 'day-table')]/tbody[" + str(j +1) + "]/tr[" + str(k + 1) + "]/td[contains(@class, 'day-table-name')][2]/a/@href"
                loser_url_parsed = xpath_parse(tourney_tree, loser_url_xpath)

                try:                
                    loser_name = loser_name_parsed[0]
                    loser_url = loser_url_parsed[0]
                    loser_url
                    loser_url_split = loser_url.split('/')
                    loser_slug = loser_url_split[3]
                    loser_player_id = loser_url_split[4]
                except Exception:
                    loser_name = ''
                    loser_url = ''
                    loser_slug = ''
                    loser_player_id = ''
                
                # Match stats URL
                match_stats_url_xpath = "//table[contains(@class, 'day-table')]/tbody[" + str(j + 1) + "]/tr[" + str(k + 1) + "]/td[contains(@class, 'day-table-score')]/a/@href"
                match_stats_url_parsed = xpath_parse(tourney_tree, match_stats_url_xpath)
                match_stats_url_cleaned = []
                for element in match_stats_url_parsed:
                    if len(element) > 0:
                        match_stats_url_cleaned.append(regex_strip_string(element))
                    else:
                        match_stats_url_cleaned.append("TIEBREAK")
                
                # Scrape match stats data synchronously
                match_urls = []
                if len(match_stats_url_cleaned) > 0:
                    match_counter += 1
                    alt_counter += 1
                    # Match stats URL
                    match_stats_url_suffix = match_stats_url_cleaned[0]
                    match_stats_url_suffix_split = match_stats_url_suffix.split('/')
                    match_urls.append(match_stats_url_suffix)
                    match_stats_url = url_prefix + match_stats_url_suffix


                    scraped_stats = scrape_match_stats(match_stats_url)
                    
                    # Check for walkovers because it overcounts matches with match stats
                    if scraped_stats[4] == '': match_counter -= 1
            
                    # Store scraped stats
                    match_stats_data_scrape += [scraped_stats]           

                # Command line output for match details
                current_count = str(match_counter)
                spacing1 = format_spacing(5, tourney_index)
                spacing2 = format_spacing(19, tourney_slug)
                percent_completed = '{:.0%}'.format(match_counter/float(total_matches))
                if total_matches != 0:
                    if alt_counter == total_matches and match_counter < total_matches:
                        sys.stdout.write('\r' + '\x1b[1;31m' + tourney_index + spacing1 + '    ' + tourney_slug + spacing2 + '    ' + current_count + "/" + str(total_matches) + " (" + str(percent_completed) + ")" + '\x1b[0m')
                    else:                    
                        sys.stdout.write('\r' + tourney_index + spacing1 + '    ' + tourney_slug + spacing2 + '    ' + current_count + '/' + str(total_matches) + ' (' + str(percent_completed) + ')')
                sys.stdout.flush()

    # Print new line after each tournament
    sys.stdout.write('\n')




# Player Stats

## Web Scraping Preparation

We only plan to scrape players with at least 50 wins on the tour. Now that the match stats data is ready, we use it to find the ATP Tour website player IDs of every player with at least 50 wins under their belt.

In [2]:
# Match-level stats; the following cells read its `winner_slug` column
df = pd.read_csv('match_stats_1990-2019.csv')
# Lookup table; the following cells read its `slug` and `player_id` columns
slug_id_pairings = pd.read_csv('slug_id_pairings.csv')

In [3]:
# Players with at least 50 tour-level wins.
# value_counts() is computed once; the boolean filter keeps its descending
# order, so the slugs come out most-wins-first.
win_counts = df.winner_slug.value_counts()
qualified_win_counts = win_counts[win_counts >= 50]
print(f"Number of players with at least 50 wins on tour: {len(qualified_win_counts)}")

player_slug_to_scrape = list(qualified_win_counts.index)

Number of players with more than 50 wins on tour: 488


In [4]:
# Create a dictionary with slug:id pairs
pairings = {}
for i in range(0,len(slug_id_pairings.slug)):
    pairings[slug_id_pairings.slug[i]] = slug_id_pairings.player_id[i]

## Main Functions

In [5]:
# For writing final scraped data to csv
def array2csv(array, filename):
    """Write a sequence of rows to ``filename`` as comma-separated values.

    Args:
        array: Iterable of rows, each row an iterable of cell values.
            ``None`` cells are written as empty fields by the csv module.
        filename: Path of the file to create; an existing file is overwritten.
    """
    # newline="" hands line-ending control to the csv module. Without it,
    # platforms that translate "\n" on write turn the writer's "\r\n" into
    # "\r\r\n", which shows up as a blank line after every row.
    with open(filename, "w", newline="") as my_csv:
        csvWriter = csv.writer(my_csv, delimiter = ',')
        csvWriter.writerows(array)

# To get source code of defined url
def html_parse_tree(url, timeout=30):
    """Download ``url`` and parse the response body into an lxml HTML tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. requests
            waits indefinitely when no timeout is passed, so a single stalled
            connection would otherwise freeze a long scraping loop.

    Returns:
        The root element returned by ``lxml.html.fromstring`` for the
        response content.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    page = requests.get(url, timeout=timeout)
    tree = html.fromstring(page.content)
    return tree

# Extracting specific xpath location from the source code
def xpath_parse(tree, xpath):
    """Evaluate the ``xpath`` expression against ``tree`` and return the matches."""
    return tree.xpath(xpath)

# Removing redundant empty spaces in string
def strip_string(string):
    """Remove every newline, carriage return and tab from ``string`` and trim its ends."""
    # Same three substitutions as before, applied in the same order
    for pattern in ('\n', '\r', '\t'):
        string = re.sub(pattern, '', string).strip()
    return string

# Mapping strip_string method onto an array
def strip_array(array):
    """Clean each string of ``array`` in place with ``strip_string`` and return the same list."""
    for position, raw_text in enumerate(array):
        array[position] = strip_string(raw_text).strip()
    return array

In [32]:
# Try the extraction on the first player only, before looping over everyone
slug = player_slug_to_scrape[0]
player_id = pairings[slug]
url = "https://www.atptour.com/en/players/" + player_id + "/" + slug + "/player-stats"

# HTML Tree
player_tree = html_parse_tree(url)

# Height: collect the text of the matching span(s) and clean the whitespace
height_cm_xpath = '//span[contains(@class, "table-height-cm-wrapper")]/text()'
height_cm_parsed = player_tree.xpath(height_cm_xpath)
height_cm_cleaned = strip_array(height_cm_parsed)

# Drop the first character and the last three to leave only the digits
# (presumably the text looks like "(185cm)" - confirm against the page)
first_height_text = height_cm_cleaned[0]
height_cm = int(first_height_text[1:-3])

In [33]:
# Show the height parsed in the previous cell as a quick sanity check
height_cm

185

In [34]:
# XPath of the <span> text holding a body measurement; filled with "height-cm" or "weight-kg"
_BODY_MEASURE_XPATH = '//span[contains(@class, "table-{}-wrapper")]/text()'

# XPath of the value cell of one row in a match-facts table; filled with (table, row)
_MATCH_FACT_XPATH = '//*[@id="playerMatchFactsContainer"]/table[{}]/tbody/tr[{}]/td[2]/text()'

# (table, row) of every percentage statistic, listed in output-column order
_MATCH_FACT_CELLS = [
    (1, 3),   # first_serve_perc
    (1, 4),   # first_serve_points_won_perc
    (1, 5),   # second_serve_points_won_perc
    (2, 1),   # first_serve_return_points_won_perc
    (2, 2),   # second_serve_return_points_won_perc
    (1, 7),   # break_points_saved_perc
    (2, 4),   # break_points_converted_perc
    (1, 9),   # service_games_won_perc
    (1, 10),  # service_points_won_perc
    (2, 6),   # return_games_won_perc
    (2, 7),   # return_points_won_perc
]


def _first_text(tree, xpath):
    """Return the first text matched by ``xpath`` with whitespace cleaned, or None if nothing matches."""
    matches = tree.xpath(xpath)
    if not matches:
        return None
    # Same cleaning as strip_string: remove line breaks and tabs, then trim the ends
    return matches[0].replace('\n', '').replace('\r', '').replace('\t', '').strip()


def _body_measure(tree, measure):
    """Return the height or weight as an int, or None when it is absent or not numeric."""
    text = _first_text(tree, _BODY_MEASURE_XPATH.format(measure))
    if text is None:
        return None
    try:
        # Drop the first character and the three-character tail around the digits
        return int(text[1:-3])
    except ValueError:
        return None


def _match_fact_perc(tree, table, row):
    """Return one match-facts value without its trailing '%', or None when the cell is absent."""
    text = _first_text(tree, _MATCH_FACT_XPATH.format(table, row))
    if text is None:
        return None
    return text[:-1] if text.endswith('%') else text


def player_stats(player_slug_to_scrape, pairings, fetch_tree=None):
    """Scrape body measurements and career match-fact percentages for each player.

    One page is fetched per player from
    ``https://www.atptour.com/en/players/<player_id>/<slug>/player-stats``.

    Args:
        player_slug_to_scrape: Iterable of slugs to scrape, in output order.
        pairings: Mapping from slug to the player_id used in the URL.
        fetch_tree: Optional callable that takes a URL and returns an object
            with an ``xpath`` method. Defaults to ``html_parse_tree``; pass a
            replacement to add caching or throttling, or to run offline.

    Returns:
        A list with one row per player:
        ``[slug, player_id, height_cm, weight_kg, first_serve_perc,
        first_serve_points_won_perc, second_serve_points_won_perc,
        first_serve_return_points_won_perc, second_serve_return_points_won_perc,
        break_points_saved_perc, break_points_converted_perc,
        service_games_won_perc, service_points_won_perc,
        return_games_won_perc, return_points_won_perc]``.
        Height and weight are ints and the percentages are strings without the
        ``%`` sign. Any value missing from the page is ``None``, so one
        incomplete profile does not abort the whole run.

    Raises:
        KeyError: If a slug has no entry in ``pairings``.
    """
    if fetch_tree is None:
        fetch_tree = html_parse_tree

    output = []

    for slug in player_slug_to_scrape:
        player_id = pairings[slug]
        url = "https://www.atptour.com/en/players/" + player_id + "/" + slug + "/player-stats"

        # HTML Tree
        player_tree = fetch_tree(url)

        # Height & Weight
        row = [slug, player_id,
               _body_measure(player_tree, 'height-cm'),
               _body_measure(player_tree, 'weight-kg')]

        # Serve / return percentages, already in output-column order
        row.extend(_match_fact_perc(player_tree, table, table_row)
                   for table, table_row in _MATCH_FACT_CELLS)

        # Store Data
        output.append(row)

    # Output
    return output

## Output To CSV

In [35]:
# Iterate over the players
# One page is requested per selected player and one row is collected for each
output = player_stats(player_slug_to_scrape, pairings)

# *Note: the first run may take a while, since one page is fetched per player.*

In [36]:
# Output to CSV
# Rows are written without a header line; the column names are attached in the next cell
filename = 'player_stats_1990-2019.csv'
array2csv(output, filename)

In [37]:
# Define header for the dataframe
# Column names, in the same order as the rows produced by player_stats
header = [
    'slug', 'player_id', 'height_cm', 'weight_kg',
    'first_serve_perc', 'first_serve_points_won_perc', 'second_serve_points_won_perc',
    'first_serve_return_points_won_perc', 'second_serve_return_points_won_perc',
    'break_points_saved_perc', 'break_points_converted_perc',
    'service_games_won_perc', 'service_points_won_perc',
    'return_games_won_perc', 'return_points_won_perc',
]

# The file was written without a header row, so read every line as data and label it afterwards
df = pd.read_csv('player_stats_1990-2019.csv', header = None)
df.columns = header

In [38]:
# Save to csv again
# Overwrites the header-less file with the labelled table; the row index is not written
df.to_csv('player_stats_1990-2019.csv', index = False)

In [39]:
# For inspection
# Reload from disk so the displayed table reflects what was actually saved
df = pd.read_csv('player_stats_1990-2019.csv')
df

Unnamed: 0,slug,player_id,height_cm,weight_kg,first_serve_perc,first_serve_points_won_perc,second_serve_points_won_perc,first_serve_return_points_won_perc,second_serve_return_points_won_perc,break_points_saved_perc,break_points_converted_perc,service_games_won_perc,service_points_won_perc,return_games_won_perc,return_points_won_perc
0,f324,roger-federer,185,85,62,77,57,33,51,67,41,89,70,27,40
1,n409,rafael-nadal,185,85,68,72,57,34,55,67,45,86,67,34,42
2,d643,novak-djokovic,188,77,65,74,56,34,55,65,44,86,67,32,42
3,f401,david-ferrer,175,73,63,69,52,33,55,60,44,78,63,31,42
4,mc10,andy-murray,191,82,58,74,52,33,55,63,44,82,65,32,42
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
483,mn13,maximilian-marterer,191,84,63,70,53,25,44,60,35,79,64,14,32
484,w023,mats-wilander,183,77,65,66,46,30,52,57,43,71,59,25,39
485,p491,kristian-pless,188,84,61,70,51,26,47,60,39,77,62,18,35
486,p488,olivier-patience,180,75,63,66,49,27,48,58,37,72,59,20,36
