In [1]:
### IMPORTS ###

import pandas as pd
import numpy as np
import re
import time

from dictionaries import *

In [2]:
### GLOBAL CONTROL ###

# Number of consecutive games pooled into one output sample (also part of the output file name)
sample_size = 4
# Start timestamp for timing the code
t = time.time()


In [3]:
### READ IN DATA ###
# Note: points table has two empty entries in the Gm column which must be fixed manually

# Per-gender frames are collected in lists and concatenated once at the end.
# (DataFrame.append was removed in pandas 2.0, and appending inside the loop
# copied the accumulated frame on every pass.)
points_frames = []
matches_frames = []

# Read in for both men's and women's games, join the per-gender dataframes at the end
for gender in ['w', 'm']:

    # Points table - every column is read as a string and converted where it is used
    pts_tmp = pd.read_csv(

        'data_raw/charting-'+gender+'-points.csv',
        encoding='latin-1',
        low_memory=False,
        index_col='match_id',
        dtype = str,

        usecols = [
            'match_id',          # match ID
            'Gm#',               # game number then (point number)
            '1st',               # first serve rally outcome
            '2nd',               # second serve rally outcome
            'isRally2nd',        # fault counter
            'isSvrWinner',       # whether the server won
            'PtWinner',          # who won the point
            'GmW',               # whether the point concluded a game
            'SetW',              # whether the point concluded a set
            'Svr'
        ]

    )

    # Matches table
    matches_tmp2 = pd.read_csv(

        'data_raw/charting-'+gender+'-matches.csv',
        encoding='latin-1',
        low_memory=False,
        dtype=str,

        usecols = [
            'match_id',
            'Pl 1 hand',
            'Pl 2 hand',
            'Surface'
        ]

    )

    # Add the gender column to matches - female is 0, other is 1
    matches_tmp2['gender'] = 0 if gender=='w' else 1

    points_frames.append(pts_tmp)
    matches_frames.append(matches_tmp2)

# Join the per-gender tables and clear excess large variables
points = pd.concat(points_frames)
matches_tmp = pd.concat(matches_frames)
del points_frames, matches_frames, pts_tmp, matches_tmp2

# Tidy up, including by dropping duplicates before setting index in matches, which deals with several duplicate rows
points = points.fillna('')
points.to_csv('points.csv')
matches_tmp = matches_tmp.drop_duplicates().set_index('match_id')


In [4]:
### BUILD THE TABLE OF MATCH RESULTS ###
# Note: Matches-m- table has '"' characters which must be removed manually around rows 955, 1180
# Some data entry neglects to write that the final point of a match is also a set win

# Function to read properties of each match
def get_match_params(m_id, matches_tmp):
    """Look up the match-level attributes of one match.

    Parameters
    ----------
    m_id
        Match identifier, looked up in the index of ``matches_tmp``.
    matches_tmp : pandas.DataFrame
        Frame indexed by match id with the columns 'Pl 1 hand', 'Pl 2 hand',
        'Surface' and 'gender'.

    Returns
    -------
    tuple
        ``(p0hand, p1hand, hard, clay, grass, gender)`` where

        * ``p0hand`` / ``p1hand`` are 1 when the hand entry is 'r', 'R', empty
          or missing, and 0 otherwise; both default to 1 for an unknown match;
        * ``hard`` / ``clay`` / ``grass`` are 0/1 flags for the surface, which
          falls back to 'hard' when it is unknown or not in ``match_attributes``;
        * ``gender`` is the value stored in the 'gender' column.

    Raises
    ------
    KeyError
        If ``m_id`` is not in ``matches_tmp.index``.
        NOTE(review): hands and surface have defaults for an unknown match but
        gender does not, so an unknown match still fails here - confirm whether
        a default gender is wanted.
    """
    known = m_id in matches_tmp.index

    def _cell(column):
        # A match id that occurs more than once makes .loc return a Series;
        # use its first entry so the scalar logic below still applies.
        value = matches_tmp.loc[m_id, column]
        if isinstance(value, pd.Series):
            value = value.iloc[0]
        return value

    def _right_handed(column):
        if not known:
            return 1
        value = _cell(column)
        # A missing entry arrives as NaN (a float); `nan in 'rR'` raises
        # TypeError, so treat anything that is not a string as "not recorded".
        if not isinstance(value, str):
            return 1
        return int(value.strip() in ('', 'r', 'R'))

    # Hands: 1 for right-handed (also defaults to 1 when empty or missing)
    p0hand = _right_handed('Pl 1 hand')
    p1hand = _right_handed('Pl 2 hand')

    # Surface: default to hard surface (a missing entry becomes the text 'nan' and falls through)
    surface = str(_cell('Surface')).strip().lower() if known else 'hard'
    if surface not in match_attributes:
        surface = 'hard'
    hard = int(surface == 'hard')
    clay = int(surface == 'clay')
    grass = int(surface == 'grass')

    # Gender: no default, an unknown match raises KeyError
    gender = _cell('gender')

    return p0hand, p1hand, hard, clay, grass, gender
    
# Columns of the match-level table, in output order
match_columns = ['match_id', 'p0hand', 'p1hand', 'hard', 'clay', 'grass', 'gender', 'sets_wr', 'sets_won', 'match_wr', 'match_win']


def _summarise_match(match_points):
    """Tally the points of one match into set- and match-level results.

    Parameters
    ----------
    match_points : pandas.DataFrame
        The rows of ``points`` belonging to a single match, in playing order.
        Reads 'PtWinner' (converted with ``int(...) - 1`` into a player index,
        so it is expected to be '1' or '2') and 'SetW' (non-zero when the point
        concluded a set).

    Returns
    -------
    tuple
        ``(sets_wr, sets_won, match_wr, match_win)``:
        ``sets_wr`` is the share of points won by player 1 in each set,
        ``sets_won`` the winner of the last point of each set,
        ``match_wr`` the share of all points won by player 1, and
        ``match_win`` the winner of the final point of the match.
    """
    sets_wr, sets_won = [], []
    pts_won_match = [0, 0]
    pts_won_set = [0, 0]
    pt_win = None

    for pt_winner, set_end in zip(match_points['PtWinner'], match_points['SetW']):

        # Find the winner of the current point
        pt_win = int(pt_winner) - 1
        pts_won_match[pt_win] += 1
        pts_won_set[pt_win] += 1

        # If a set has ended
        if int(set_end):
            sets_won.append(pt_win)
            sets_wr.append(pts_won_set[1] / sum(pts_won_set))
            pts_won_set = [0, 0]

    # Some data entry neglects to flag the final point of a match as a set win,
    # so any points still pending at the end of the match close the last set.
    if sum(pts_won_set):
        sets_won.append(pt_win)
        sets_wr.append(pts_won_set[1] / sum(pts_won_set))

    # Rate that p1 wins points; the final point win always goes to the match winner
    match_wr = pts_won_match[1] / sum(pts_won_match)
    return sets_wr, sets_won, match_wr, pt_win


# Find winner of each match and write match- and set-level details.
# Rows are collected in a list and turned into a frame once, instead of growing
# the frame one row at a time. Matches are taken in order of first appearance.
# sets_wr, sets_won and pt_win are deliberately left bound at module level
# (holding the values of the last match), as the original loop left them.
match_records = []
for match_id, match_points in points.groupby(level=0, sort=False):

    sets_wr, sets_won, match_wr, pt_win = _summarise_match(match_points)

    # Arrange other match parameters using the function defined above
    p0hand, p1hand, hard, clay, grass, gender = get_match_params(match_id, matches_tmp)

    match_records.append({
        'match_id': match_id,
        'p0hand': p0hand,
        'p1hand': p1hand,
        'hard': hard,
        'clay': clay,
        'grass': grass,
        'gender': gender,
        'sets_wr': sets_wr,
        'sets_won': sets_won,
        'match_wr': match_wr,
        'match_win': pt_win,
    })

matches = pd.DataFrame(match_records, columns=match_columns)      # match-level data

# Clear memory and write useful tables to csv
del matches_tmp
matches = matches.set_index('match_id')
matches.to_csv('matches.csv')


In [5]:
### EXTRACT THE ATTRIBUTES ###
# Walks the points table once. Every `sample_size` consecutive games of a match
# are pooled into one output row holding per-player shot / serve / return rates
# together with the set- and match-level results taken from `matches`.
# A sample that is still incomplete when a new match starts is discarded.

# Initialise data structures
filename = 'ML_input_' + str(sample_size) + 'gm.csv'
records = []                 # one dict per completed sample; framed in one go instead of row-by-row
record_labels = []           # value of index_ctr when each sample was completed (row labels of `data`)
data = pd.DataFrame(columns = all_keys)
t = time.time()
game_number = re.compile(r'\d+')     # raw string: '\d' is an invalid escape in a plain literal

# Debugging parameters for short runs and reporting
index_ctr = 0
stop_index = 0               # stop after this many points; 0 never matches, so the full table is processed
reporting_interval = 25000

# Iterate over every point
prev_game = 1 + 1            # assumes the first game is numbered below 2, so the first point looks like a new match
gms_won = 0
pts_won = 0
pt_win = 0                   # read on the first iteration before any point has been scored
sample_ctr = 1
set_ctr = 0
for index, row in points.iterrows():

    # Debug and admin
    index_ctr += 1
    if stop_index == index_ctr:
        break
    if index_ctr % reporting_interval == 0:
        # Checkpoint the samples collected so far
        data = pd.DataFrame(records, columns=all_keys, index=record_labels)
        data.to_csv(filename, index=False)
        print(f'Running: {int(index_ctr / len(points) * 100)}% complete. Written: {len(data)}. Time: {round((time.time() - t) / 60, 2)} min')

    # Manage sampling
    game_match = game_number.search(str(row['Gm#']))
    if game_match is None:
        raise ValueError(f"Point {index_ctr} of match {index} has no game number in 'Gm#': {row['Gm#']!r}")
    game = int(game_match.group())
    if (game != prev_game):

        # A game has just finished: pt_win is 1 when player 1 took its last point
        gms_won += pt_win

        # Either the sample is full, or the game number dropped, which marks a new match
        if (sample_ctr == sample_size) or (game < prev_game) :

            # Write to the dataframe if the sample is at capacity
            if (sample_ctr == sample_size) and (index_ctr > 1):

                # Turn the raw counts into rates, guarding against empty denominators
                for player in [0, 1]:

                    for shot in rally_shots + ['+']:
                        row_out[shot + str(player)] = row_out[shot + str(player)] / shot_ctr[player] if shot_ctr[player] else float(0)
                    for shot in shot_dir:
                        row_out[shot + str(player)] = row_out[shot + str(player)] / shot_dir_ctr[player] if shot_dir_ctr[player] else float(0)
                    for shot in sv_dir:
                        row_out[shot + str(player)] = row_out[shot + str(player)] / sv_ctr[player] if sv_ctr[player] else float(0)
                    for shot in ret_dir:
                        row_out[shot + str(player)] = row_out[shot + str(player)] / ret_dir_ctr[player] if ret_dir_ctr[player] else float(0)

                    row_out['fr' + str(player)] = fault_ctr[player] / sv_ctr[player] if sv_ctr[player] else float(0)
                    row_out['fsr' + str(player)] = 1 - (fault_ctr[player] / sv_ctr[player]) if sv_ctr[player] else float(0)
                    row_out['swr' + str(player)] = sv_win_ctr[player] / sv_pt_ctr[player] if sv_pt_ctr[player] else float(0)
                    row_out['ssr' + str(player)] = ret_2_win_ctr[player] / ret_2_ctr[player] if ret_2_ctr[player] else float(0)

                row_out['svr'] = sv_pt_ctr[1] / pt_ctr
                row_out['gm_wr'] = gms_won / sample_size
                row_out['pt_wr'] = pts_won / pt_ctr

                # Attribute the sample to the current set. When the final point of a
                # match is flagged as a set win, set_ctr has already moved one past the
                # last recorded set, so clamp it. (The earlier `and row['SetW']` test
                # compared a string, so '0' counted as true and the counter was also
                # decremented when the final point had not been flagged.)
                if sets_wr:
                    set_idx = min(set_ctr, len(sets_wr) - 1)
                    row_out['set_wr'] = sets_wr[set_idx]
                    row_out['set_win'] = sets_won[set_idx]
                else:
                    row_out['set_wr'] = row_out['match_wr']
                    row_out['set_win'] = row_out['match_win']

                records.append(row_out)
                record_labels.append(index_ctr)

            if (game < prev_game):

                # Set up the set data for a new match
                set_ctr = 0
                sets_won = matches.loc[index, 'sets_won']
                sets_wr = matches.loc[index, 'sets_wr']

            # Reset the sample size
            sample_ctr = 1
            pt_ctr = 0

            # Create a new sample data placeholder (a fresh dict, so stored samples are never mutated)
            row_out = {k:0 for k in all_keys}
            row_out['p0hand'] = matches.loc[index, 'p0hand']
            row_out['p1hand'] = matches.loc[index, 'p1hand']
            row_out['hard'] = matches.loc[index, 'hard']
            row_out['clay'] = matches.loc[index, 'clay']
            row_out['grass'] = matches.loc[index, 'grass']
            row_out['gender'] = matches.loc[index, 'gender']
            row_out['match_wr'] = matches.loc[index, 'match_wr']
            row_out['match_win'] = matches.loc[index, 'match_win']

            # Set/Reset the counting variables used for averaging
            gms_won = 0
            pts_won = 0
            shot_ctr = [0, 0]
            shot_dir_ctr = [0, 0]
            sv_ctr = [0, 0]
            fault_ctr = [0, 0]
            ret_dir_ctr = [0, 0]
            ret_2_ctr = [0, 0]
            ret_2_win_ctr = [0, 0]
            sv_pt_ctr = [0, 0]
            sv_win_ctr = [0, 0]

        else:

            # Continue counting up samples
            sample_ctr += 1

    # Record the game number for reference in the next iteration and increment the point
    prev_game = game
    pt_win = int(row['PtWinner']) - 1
    pts_won += pt_win
    pt_ctr += 1
    if int(row['SetW']):
        set_ctr += 1

    # Returner Data
    player = 2 - int(row['Svr'])
    ret_2_ctr[player] += int(row['isRally2nd'])
    ret_2_win_ctr[player] += int(row['isRally2nd']) * (1 - int(row['isSvrWinner']) )

    # Server Data
    player = 1 - player
    sv_pt_ctr[player] += 1
    sv_win_ctr[player] += int(row['isSvrWinner'])

    # Read the Rally: first-serve and second-serve strings are scanned as one sequence,
    # with `player` tracking whose shot the current character belongs to
    serving = True
    for char in (row['1st'] + row['2nd']):

        # A serve was attempted
        if char in sv_dir:

            serving = True
            sv_ctr[player] += 1
            row_out[char + str(player)] += 1

        elif serving and char in failure:

            fault_ctr[player] += 1

        # Check for faults, lets, or serve-and-volleys
        # ('c' keeps the serve state; presumably the let marker - confirm against the charting key)
        elif serving and char != 'c':

            serving = False
            if char == '+':
                row_out[char + str(player)] += 1
            player = 1 - player

        # Record regular shots
        if not serving:

            if char in rally_shots:
                row_out[char + str(player)] += 1
                shot_ctr[player] += 1
                player = 1 - player

            elif char in shot_dir:
                row_out[char + str(player)] += 1
                shot_dir_ctr[player] += 1

            elif char in ret_dir:
                row_out[char + str(player)] += 1
                ret_dir_ctr[player] += 1

# NOTE(review): samples are only written when the next game starts, so a sample
# that reaches capacity on the very last game of the table is never written.

# Final output
data = pd.DataFrame(records, columns=all_keys, index=record_labels)
data.to_csv(filename, index=False)
print(f'Finished Running. Final length: {len(data)}. Runtime: {round((time.time() - t) / 60, 2)} min.')


Running: 3% complete. Written: 890. Time: 0.1 min
Running: 7% complete. Written: 1774. Time: 0.21666666666666667 min
Running: 11% complete. Written: 2660. Time: 0.36666666666666664 min
Running: 15% complete. Written: 3545. Time: 0.5333333333333333 min
Running: 19% complete. Written: 4433. Time: 0.7 min
Running: 23% complete. Written: 5322. Time: 0.9 min
Running: 27% complete. Written: 6196. Time: 1.1 min
Running: 31% complete. Written: 7113. Time: 1.3333333333333333 min
Running: 35% complete. Written: 8059. Time: 1.6 min
Running: 39% complete. Written: 9009. Time: 1.8666666666666667 min
Running: 43% complete. Written: 9930. Time: 2.15 min
Running: 47% complete. Written: 10869. Time: 2.466666666666667 min
Running: 51% complete. Written: 11826. Time: 2.783333333333333 min
Running: 55% complete. Written: 12770. Time: 3.1333333333333333 min
Running: 59% complete. Written: 13718. Time: 3.5166666666666666 min
Running: 63% complete. Written: 14656. Time: 3.9 min
Running: 67% complete. Written