# Tennis Serve/Return Data Processing

In [1]:
import pandas as pd # for data processing
import re # for string matching
import time # measure length of time for data processing

### Helper Functions to Process Data By Each Row
Input for each of these functions is a single row of data

In [2]:
# returns the name of the player serving the point
# match_id ends with "...-<first server>-<second server>", so the Svr field (1 or 2) selects which name to take
# a single trailing underscore on the name, when present, is removed
def get_server(row):
    # Svr == 1 -> second-to-last segment of match_id, anything else -> last segment
    position = -2 if row['Svr'] == 1 else -1
    name = row['match_id'].split('-')[position]
    return name[:-1] if name[-1] == '_' else name

In [3]:
# returns the name of the player returning the point
# mirror image of get_server(): when Svr is 2 the first name in match_id is the returner, otherwise the second name is
# a single trailing underscore on the name, when present, is removed
def get_returner(row):
    # Svr == 2 -> second-to-last segment of match_id, anything else -> last segment
    position = -2 if row['Svr'] == 2 else -1
    name = row['match_id'].split('-')[position]
    return name[:-1] if name[-1] == '_' else name

In [4]:
# returns "Yes" when the point is played on the deuce court and "No" otherwise
# the deuce court is used when both players have the same score or when the scores are exactly two points apart
# (30-0, 0-30, 40-15, 15-40); the 0-30 case was previously missing and is now included
# NOTE(review): scores that are not level and not in the list below (e.g. a tiebreak '2-0') still yield "No" -
# confirm how the dataset encodes tiebreak points before relying on this field for them
def get_deuce(row):
    two_points_apart = ('30-0', '0-30', '40-15', '15-40')
    score = row['Pts']
    sides = score.split('-')
    if sides[0] == sides[1] or score in two_points_apart:
        return "Yes"
    return "No"

In [5]:
# returns 'Yes' when the point is a high-stakes point and 'No' otherwise
# a point counts as pressure when either
#   - the set is close and late: games are within one of each other and at least one side has 4 or more games, or
#   - the second score in Pts is 'AD' or at least 30
def get_pressure_point(row):
    # close-game, late-set situation; checked first so Pts is only parsed when needed
    if abs(row['Gm1'] - row['Gm2']) <= 1 and (row['Gm1'] >= 4 or row['Gm2'] >= 4):
        return 'Yes'

    # second score in Pts: advantage, or 30 and above
    second_score = row['Pts'].split('-')[1]
    if second_score == 'AD' or int(second_score) >= 30:
        return 'Yes'

    return 'No'

In [6]:
# the date of the point is everything in match_id that precedes the first '-'
def get_date(row):
    date_part, _, _ = row['match_id'].partition('-')
    return date_part

### Using Regex in Helper Functions
The dataset contains encoding of shot sequences for each point. The characters and their interpretations are as follows:

| Character | Interpretation |
| :--: | :--: |
| 4 | wide serve |
| 5 | body serve |
| 6 | T serve |
| + | approach |
| d | deep miss |
| w | wide miss |
| x | deep and wide miss |
| n | net miss |
| f | forehand |
| b | backhand |
| r | forehand slice |
| s | backhand slice |
| * | serve ace/rally winner |
| # | serve unreturned/rally forced error |
| @ | unforced error |

In [7]:
# Regex patterns used to classify a serve/rally encoding.
# They are applied with re.match, which anchors at the start of the string; each pattern begins with ".*",
# so the characters of interest may be preceded by any other characters.

# serve direction (4, 5 or 6), an optional approach marker (+), then an error code (d, w, x or n)
serve_miss = r".*[456]\+?[dwxn]"
# serve direction (4, 5 or 6), an optional approach marker (+), then an ace (*) or unreturned (#) code
ace_unreturned = r".*[456]\+?[\*#]"
# a rally shot (f, b, r or s) followed somewhere later by a forced error code (#)
forcError = r".*[fbrs].*#"
# a rally shot (f, b, r or s) followed somewhere later by an unforced error code (@)
unforcError = r".*[fbrs].*@"
# a rally shot (f, b, r or s) followed somewhere later by a winner code (*)
winner = r".*[fbrs].*\*"

In [8]:
# returns where in the service box the 1st serve was aimed, based on the first character of the encoding
# 4 -> "Wide" (towards the alley), 5 -> "Body" (middle of the box), 6 -> "T" (towards the center of the court)
# nothing is returned when the encoding does not start with one of those three characters
def get_1SrvDir(row):
    labels = {"4": "Wide", "5": "Body", "6": "T"}
    leading = re.match(r"[456]", row['1st'])
    return labels[leading.group()] if leading else None

In [9]:
# direction of the 2nd serve, decoded the same way as the 1st serve direction
# a value of '$' means no 2nd serve was hit (the 1st serve was made), which is reported as "Not Served"
# nothing is returned when the encoding starts with neither 4, 5 nor 6
def get_2SrvDir(row):
    if row['2nd'] == '$':
        return "Not Served"

    labels = {"4": "Wide", "5": "Body", "6": "T"}
    leading = re.match(r"[456]", row['2nd'])
    return labels[leading.group()] if leading else None

In [10]:
# classifies the outcome of the 1st serve by testing its encoding against the regex patterns defined above
# patterns are tried in a fixed order and the label of the first one that matches is returned
# nothing is returned when none of the patterns match
def get_1SrvOutcome(row):
    ordered_checks = (
        (serve_miss, "Miss"),
        (ace_unreturned, "Ace/Unreturned"),
        (forcError, "Forced Err"),
        (unforcError, "Unforced Err"),
        (winner, "Winner"),
    )
    for pattern, label in ordered_checks:
        if re.match(pattern, row['1st']):
            return label

In [11]:
# classifies the outcome of the 2nd serve by testing its encoding against the regex patterns defined above
# a value of '$' means no 2nd serve was hit (the 1st serve was made), which is reported as "Not Served"
# otherwise patterns are tried in a fixed order and the label of the first match is returned (nothing if none match)
def get_2SrvOutcome(row):
    if row['2nd'] == '$':
        return "Not Served"

    ordered_checks = (
        (serve_miss, "Miss"),
        (ace_unreturned, "Ace/Unreturned"),
        (forcError, "Forced Err"),
        (unforcError, "Unforced Err"),
        (winner, "Winner"),
    )
    for pattern, label in ordered_checks:
        if re.match(pattern, row['2nd']):
            return label

### Combine Helper Functions Into One Data Processor

In [12]:
# applies every row-level helper function to the dataset and stores each result in a new column
# the new columns are added to the dataframe that is passed in; the returned dataframe additionally has
# 'isSvrWinner' renamed to 'PointWon' and 'rallyCount' renamed to 'RallyLength'
def process_data(df):
    # blank records in the 2nd field become '$', the marker the 2nd-serve helpers treat as 'Not Served'
    df['2nd'] = df['2nd'].fillna('$')

    # (new column, helper that computes it from one row), in the order the columns are created
    derived_columns = [
        ('Server', get_server),
        ('Returner', get_returner),
        ('Deuce_Ad', get_deuce),
        ('Pressure', get_pressure_point),
        ('1stDir', get_1SrvDir),
        ('2ndDir', get_2SrvDir),
        ('1stOutcome', get_1SrvOutcome),
        ('2ndOutcome', get_2SrvOutcome),
        ('Date', get_date),
    ]
    for column, helper in derived_columns:
        df[column] = df.apply(helper, axis = 1)

    # these columns need no alteration, only friendlier names for the analysis
    return df.rename(columns = {'isSvrWinner':'PointWon', 'rallyCount':'RallyLength'})

### Define Function to Read File and Process Data

In [13]:
# reads a points file piece by piece, combines the pieces and processes them together
# filename:   path of the CSV file; a name containing '-m-' is tagged as ATP, one containing '-w-' as WTA
# chunk_size: number of records read from the file at a time
# returns the dataframe produced by process_data(), with rows in the same order as in the file
def read_file(filename, chunk_size):
    # read file "chunk_size" records at a time, keeping only the columns needed downstream
    chunks = pd.read_csv(filename,
                         header = 0,
                         chunksize = chunk_size,
                         encoding = "windows-1252",
                         usecols = ['match_id', 'Gm1', 'Gm2', 'Pts', 'Svr',
                                    '1st', '2nd', '1stIn', '2ndIn', 'isSvrWinner', 'rallyCount'])

    # combine all chunks with a single concat call: concatenating inside the loop re-copies the accumulated
    # rows for every chunk, and prepending each chunk left the rows in reverse chunk order
    pieces = list(chunks)
    df = pd.concat(pieces, axis = 0) if pieces else pd.DataFrame()

    # add Tour field to indicate whether point was played on ATP or WTA tours
    # (no Tour column is added when the filename contains neither marker)
    if '-m-' in filename:
        df['Tour'] = 'ATP'
    elif '-w-' in filename:
        df['Tour'] = 'WTA'

    # apply the helper functions to create the new fields
    return process_data(df)
    

### Call Function to Read in File and Process Data

In [14]:
# load and process the four source files, printing the cumulative elapsed time after each one
chunk_size = 10000 # records read from a file at a time
start_time = time.time()

men_since_2017 = read_file("charting-m-points-from-2017.csv", chunk_size)
print(f"{round(time.time() - start_time, 2)} seconds")

men_through_2016 = read_file("charting-m-points-to-2016.csv", chunk_size)
print(f"{round(time.time() - start_time, 2)} seconds")

women_since_2017 = read_file("charting-w-points-from-2017.csv", chunk_size)
print(f"{round(time.time() - start_time, 2)} seconds")

women_through_2016 = read_file("charting-w-points-to-2016.csv", chunk_size)
print(f"{round(time.time() - start_time, 2)} seconds")

33.82 seconds
81.73 seconds
105.52 seconds
121.0 seconds


### Combine Dataframes and Select/Order Relevant Columns

In [15]:
# stack the four processed dataframes, then keep only the analysis columns in their final order
data = pd.concat(
    [men_since_2017, men_through_2016, women_since_2017, women_through_2016],
    axis = 0,
)[[
    'Tour', 'Date', 'Server', 'Returner', 'Deuce_Ad', 'Pressure',
    '1stDir', '1stOutcome', '2ndDir', '2ndOutcome', 'RallyLength', 'PointWon',
]]
data.shape

(985419, 12)

### Drop Records With Empty Fields and Export to CSV

In [16]:
# discard every record that has a missing value in any column, report the remaining size, then export
data = data.dropna(how = 'any')
print(data.shape)
data.to_csv("PointByPointServingData.csv", index = False) # the dataframe index is not written to the file

(932665, 12)
