In [None]:
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 500)
pd.set_option('precision', 3)

In [2]:
import os
os.chdir('/Users/serkankd/Documents/Master_Thesis/atp_data')
data = pd.read_excel('merged.xlsx')

(53188, 51)

### Introduction to data set

In [None]:
data.shape

In [3]:
# check the variable info of the dataset
# there are 51 columns with maximum 53,188 entries.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53188 entries, 0 to 53187
Data columns (total 51 columns):
id                    53188 non-null int64
tourney_id            53188 non-null object
tourney_name          53188 non-null object
surface               53070 non-null object
draw_size             53188 non-null int64
tourney_level         53188 non-null object
month                 53188 non-null int64
year                  53188 non-null int64
match_num             53188 non-null int64
winner_id             53188 non-null int64
winner_seed           22149 non-null float64
winner_entry          6201 non-null object
winner_name           53188 non-null object
winner_hand           53174 non-null object
winner_ht             49691 non-null float64
winner_ioc            53188 non-null object
winner_age            53170 non-null float64
winner_rank           52112 non-null float64
winner_rank_points    52112 non-null float64
loser_id              53188 non-null int64
loser_seed    

The dataset is consist of 3 different groups:
    Basic information factors of the matches (tourney id, name, level and so on)
    Descriptive parameters for each player (the player’s name, rank, height and so on)
    Basic performance statistics for each player (number of aces, number of double fouls and so on).

### Data Cleaning

In the data set the columns winner/loser entry, winner/loser height, winner/loser seed and minutes have many missing values. Since they are not highly relevant for match outcome prediction and to prevent unnecessary data loss, they are excluded from the data set. Moreover, the matches from Davis Cup and Olympic games are also excluded.

In [4]:
data = data.drop(['winner_seed', 'winner_entry', 'winner_ht', 'loser_seed', 'loser_entry', 'loser_ht', 'minutes'], axis = 1)
data = data[(data['tourney_level'] != 'D')]
data = data[data['tourney_level'] != 'C']

In [5]:
#Then, let's check whether all data points for each season are available or not
data.groupby('year').count()

Unnamed: 0_level_0,id,tourney_id,tourney_name,surface,draw_size,tourney_level,month,match_num,winner_id,winner_name,...,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000,3300,3300,3300,3300,3300,3300,3300,3300,3300,3300,...,2941,2941,2941,2941,2941,2941,2941,2941,2941,2941
2001,3400,3400,3400,3400,3400,3400,3400,3400,3400,3400,...,3060,3060,3060,3060,3060,3060,3060,3060,3060,3060
2002,3236,3236,3236,3236,3236,3236,3236,3236,3236,3236,...,2840,2840,2840,2840,2840,2840,2840,2840,2840,2840
2003,3121,3121,3121,3121,3121,3121,3121,3121,3121,3121,...,2716,2716,2716,2716,2716,2716,2716,2716,2716,2716
2004,3213,3213,3213,3213,3213,3213,3213,3213,3213,3213,...,2880,2880,2880,2880,2880,2880,2880,2880,2880,2880
2005,3257,3257,3257,3257,3257,3257,3257,3257,3257,3257,...,2912,2912,2912,2912,2912,2912,2912,2912,2912,2912
2006,3257,3257,3257,3257,3257,3257,3257,3257,3257,3257,...,2908,2908,2908,2908,2908,2908,2908,2908,2908,2908
2007,3245,3245,3245,3245,3245,3245,3245,3245,3245,3245,...,2899,2899,2899,2899,2899,2899,2899,2899,2899,2899
2008,2953,2953,2953,2953,2953,2953,2953,2953,2953,2953,...,2610,2610,2610,2610,2610,2610,2610,2610,2610,2610
2009,3074,3074,3074,3074,3074,3074,3074,3074,3074,3074,...,2726,2726,2726,2726,2726,2726,2726,2726,2726,2726


In [6]:
# drop the data form year 2017 since there are only 388 entries in 2017 although
# the full season has on average three thousand entries
data = data[data['year'] != 2017]

In [7]:
# The variable score can be used to create the number of sets, which would allow us to eliminate incomplete matches.
# - in scores can be used to count number of sets
data['sets'] = data['score'].str.count('-')
# 0 sets means the match is not completed due to retired or walkover etc.
# exclude incomplete matches from data because they cannot be used for prediction
data = data[data['sets'] != 0]

0    3-6 7-6(6) 7-6(4)
1              6-2 7-5
2              6-1 6-3
3          4-6 6-2 7-5
4              6-1 6-4
Name: score, dtype: object

In [11]:
data.describe()

Unnamed: 0,id,draw_size,month,year,match_num,winner_id,winner_age,winner_rank,winner_rank_points,loser_id,...,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,sets
count,45543.0,45543.0,45543.0,45543.0,45543.0,45543.0,45543.0,44953.0,44953.0,45543.0,...,45398.0,45398.0,45398.0,45398.0,45398.0,45398.0,45398.0,45398.0,45398.0,45542.0
mean,26162.609,58.77,5.585,2007.753,41.889,103919.599,26.155,58.34,1610.002,103936.796,...,5.063,3.411,81.324,48.349,32.208,15.185,12.354,4.808,8.713,2.593
std,15274.794,37.102,2.96,4.9,59.708,1251.279,3.601,72.993,2031.065,1645.837,...,4.785,2.528,28.401,18.721,14.032,7.182,4.095,3.248,4.093,0.751
min,1.0,8.0,1.0,2000.0,1.0,100644.0,15.825,1.0,1.0,100644.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,13115.5,32.0,3.0,2004.0,12.0,103206.0,23.537,16.0,590.0,103153.0,...,2.0,2.0,60.0,35.0,22.0,10.0,9.0,2.0,6.0,2.0
50%,26188.0,32.0,6.0,2008.0,23.0,103900.0,26.062,41.0,940.0,103857.0,...,4.0,3.0,76.0,45.0,30.0,14.0,11.0,4.0,8.0,2.0
75%,39401.5,64.0,8.0,2012.0,43.0,104607.0,28.632,76.0,1700.0,104607.0,...,7.0,5.0,97.0,58.0,40.0,19.0,15.0,7.0,11.0,3.0
max,52549.0,128.0,12.0,2016.0,319.0,144923.0,38.313,1890.0,16950.0,202359.0,...,103.0,23.0,489.0,328.0,284.0,101.0,91.0,25.0,34.0,5.0


In [12]:
#data shape before dropping missing values
print(data.shape)
df = data.dropna(axis = 0) # drop rows with any column having NA/null data
#data shape after dropping missing values
print(df.shape)

(45543, 45)
(44705, 45)


### Data Representation

Since a supervised machine learning algorithm requires a set of labeled examples for training, the target value, corresponding to the outcome of a match, has to be defined. In the dataset, the statistics were labeled for the winner and loser of the match. We randomly assigned "Player 1" to be either the winner or loser and "Player 2" to be the other player. 

The target value, without loss of generality, could be defined as follows:
\begin{equation}
    Y = 
    \begin{cases}
    1, & \text{if Player 1 won}\\
    0, & \text{if Player 2 won}\
    \end{cases}
\end{equation}

No other outcomes are possible since incomplete matches are excluded from the dataset. Note that we created a balanced data set, i.e the frequency of $Y = 1$ is almost same as of $Y = 0$.

In [13]:
df = df.reset_index(drop=True)
df_shuffle = pd.DataFrame(df.id)
# Rename winner or loser names with player 1 and player 2, respectively.
# It'll be used to shuffle them 
df_shuffle_1 = df.iloc[:, [9,10,11,12,13,14,15,26,27,28,29,30,31,32,33,34]]
df_shuffle_1.columns

Index(['winner_id', 'winner_name', 'winner_hand', 'winner_ioc', 'winner_age',
       'winner_rank', 'winner_rank_points', 'w_ace', 'w_df', 'w_svpt',
       'w_1stIn', 'w_1stWon', 'w_2ndWon', 'w_SvGms', 'w_bpSaved', 'w_bpFaced'],
      dtype='object')

In [14]:
df_shuffle['winner_data'] = df_shuffle_1.apply(lambda x: '-'.join(x.astype(str)), axis=1)
# Rename winner or loser names with player 1 and player 2, respectively.
# It'll be used to shuffle them 
df_shuffle_2 = df.iloc[:, [16,17,18,19,20,21,22,35,36,37,38,39,40,41,42,43]]
df_shuffle_2.columns

Index(['loser_id', 'loser_name', 'loser_hand', 'loser_ioc', 'loser_age',
       'loser_rank', 'loser_rank_points', 'l_ace', 'l_df', 'l_svpt', 'l_1stIn',
       'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced'],
      dtype='object')

In [15]:
df_shuffle['loser_data'] = df_shuffle_2.apply(lambda x: '-'.join(x.astype(str)), axis=1)

In [16]:
N = len(df_shuffle)
swap_boolean = np.random.choice([0, 1], size=N, p=[.5, .5])
df_swap_boolean = pd.DataFrame(swap_boolean)
df_shuffles = pd.concat([df_shuffle, df_swap_boolean], axis= 1)
df_shuffles.columns = [*df_shuffles.columns[:-1], 'swap_boolean']
df_shuffles.head()

Unnamed: 0,id,winner_data,loser_data,swap_boolean
0,1,102179-Antony Dupuis-R-FRA-27.1813826146-113.0...,102776-Andrew Ilie-R-AUS-24.0355920602-50.0-76...,1
1,2,103602-Fernando Gonzalez-R-CHI-19.7563312799-3...,102821-Cecil Mamiit-R-PHI-23.8439425051-139.0-...,0
2,3,103387-Paradorn Srichaphan-R-THA-20.8815879535...,102205-Sebastien Lareau-R-CAN-27.0116358658-13...,0
3,4,101733-Jan Siemerink-L-NED-30.0479123888-107.0...,102925-Justin Gimelstob-R-USA-23.2607802875-95...,0
4,5,101727-Jason Stoltenberg-R-AUS-30.0752908966-7...,101826-Alex Lopez Moron-R-ESP-29.4236824093-11...,0


In [17]:
df_shuffles['p1_data'] = np.nan
df_shuffles['p2_data'] = np.nan
df_shuffles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44705 entries, 0 to 44704
Data columns (total 6 columns):
id              44705 non-null int64
winner_data     44705 non-null object
loser_data      44705 non-null object
swap_boolean    44705 non-null int64
p1_data         0 non-null float64
p2_data         0 non-null float64
dtypes: float64(2), int64(2), object(2)
memory usage: 2.0+ MB


In [18]:
def custom_swap(df):
    df_copy = df
    for i in range(N):
        if df_copy.iloc[i, 3] == 1:
            df_copy.iloc[i, 4] = df_copy.iloc[i, 1]
            df_copy.iloc[i, 5] = df_copy.iloc[i, 2]
        else:
            df_copy.iloc[i, 4] = df_copy.iloc[i, 2]
            df_copy.iloc[i, 5] = df_copy.iloc[i, 1]
    return df_copy

In [19]:
df_shuffled = custom_swap(df_shuffles)
df_shuffled[['p1_id', 'p1_name', 'p1_hand', 'p1_ioc', 'p1_age',
       'p1_rank', 'p1_rank_points', 'p1_ace', 'p1_df', 'p1_svpt',
       'p1_1stIn', 'p1_1stWon', 'p1_2ndWon', 'p1_SvGms', 'p1_bpSaved', 'p1_bpFaced']] = df_shuffled['p1_data'].str.split('-',expand=True)
df_shuffled[['p2_id', 'p2_name', 'p2_hand', 'p2_ioc', 'p2_age',
       'p2_rank', 'p2_rank_points', 'p2_ace', 'p2_df', 'p2_svpt',
       'p2_1stIn', 'p2_1stWon', 'p2_2ndWon', 'p2_SvGms', 'p2_bpSaved', 'p2_bpFaced']] = df_shuffled['p2_data'].str.split('-',expand=True)
df_shuffled.head()

Unnamed: 0,id,winner_data,loser_data,swap_boolean,p1_data,p2_data,p1_id,p1_name,p1_hand,p1_ioc,...,p2_rank_points,p2_ace,p2_df,p2_svpt,p2_1stIn,p2_1stWon,p2_2ndWon,p2_SvGms,p2_bpSaved,p2_bpFaced
0,1,102179-Antony Dupuis-R-FRA-27.1813826146-113.0...,102776-Andrew Ilie-R-AUS-24.0355920602-50.0-76...,1,102179-Antony Dupuis-R-FRA-27.1813826146-113.0...,102776-Andrew Ilie-R-AUS-24.0355920602-50.0-76...,102179,Antony Dupuis,R,FRA,...,762.0,13.0,4.0,110.0,59.0,49.0,31.0,17.0,4.0,4.0
1,2,103602-Fernando Gonzalez-R-CHI-19.7563312799-3...,102821-Cecil Mamiit-R-PHI-23.8439425051-139.0-...,0,102821-Cecil Mamiit-R-PHI-23.8439425051-139.0-...,103602-Fernando Gonzalez-R-CHI-19.7563312799-3...,102821,Cecil Mamiit,R,PHI,...,76.0,4.0,2.0,67.0,35.0,25.0,16.0,10.0,4.0,6.0
2,3,103387-Paradorn Srichaphan-R-THA-20.8815879535...,102205-Sebastien Lareau-R-CAN-27.0116358658-13...,0,102205-Sebastien Lareau-R-CAN-27.0116358658-13...,103387-Paradorn Srichaphan-R-THA-20.8815879535...,102205,Sebastien Lareau,R,CAN,...,380.0,4.0,1.0,46.0,29.0,23.0,11.0,8.0,0.0,0.0
3,4,101733-Jan Siemerink-L-NED-30.0479123888-107.0...,102925-Justin Gimelstob-R-USA-23.2607802875-95...,0,102925-Justin Gimelstob-R-USA-23.2607802875-95...,101733-Jan Siemerink-L-NED-30.0479123888-107.0...,102925,Justin Gimelstob,R,USA,...,371.0,8.0,6.0,109.0,56.0,43.0,21.0,15.0,9.0,12.0
4,5,101727-Jason Stoltenberg-R-AUS-30.0752908966-7...,101826-Alex Lopez Moron-R-ESP-29.4236824093-11...,0,101826-Alex Lopez Moron-R-ESP-29.4236824093-11...,101727-Jason Stoltenberg-R-AUS-30.0752908966-7...,101826,Alex Lopez Moron,R,ESP,...,543.0,3.0,0.0,50.0,27.0,22.0,16.0,9.0,1.0,1.0


In [20]:
columns_numeric = ['p1_id', 'p1_rank',
       'p1_rank_points', 'p1_ace', 'p1_df', 'p1_svpt', 'p1_1stIn', 'p1_1stWon',
       'p1_2ndWon', 'p1_SvGms', 'p1_bpSaved', 'p1_bpFaced', 'p2_id', 'p2_age', 
       'p2_rank', 'p2_rank_points', 'p2_ace', 'p2_df', 'p2_svpt', 'p2_1stIn', 
       'p2_1stWon', 'p2_2ndWon', 'p2_SvGms','p2_bpSaved', 'p2_bpFaced']
columns_str = ['p1_name', 'p1_hand', 'p1_ioc', 'p2_name', 'p2_hand', 'p2_ioc']

In [21]:
df_shuffled['outcome'] = np.nan
df_shuffled[columns_numeric] = df_shuffled[columns_numeric].apply(pd.to_numeric)

In [22]:
# get location of winner/loser id from the original dataset and p1/p2_id from shuffled
# and location of outcome
loc_w = df.columns.get_loc("winner_id")
loc_p1 = df_shuffled.columns.get_loc("p1_id")
loc_outcome = df_shuffled.columns.get_loc("outcome")
print('Location of column winner_id: %i, p1_id: %i, outcome: %i' % (loc_w, loc_p1, loc_outcome))
#print(loc_p1)
#print(loc_outcome)

Location of column winner_id: 9, p1_id: 6, outcome: 38


In [23]:
#create targeted label: outcome. 
#1 if the p1 was the winner, 0 otherwise
def create_target(df1, df2):
    for i in range(N):
        if df1.iloc[i, 9] == df2.iloc[i, 6]:
            df2.iloc[i, 38] = 1
        else:
            df2.iloc[i, 38] = 0
    return df2

In [24]:
df_shuffled = create_target(df, df_shuffled)
df = df[['id', 'tourney_id', 'tourney_name', 'surface', 'draw_size',
       'tourney_level', 'month', 'year', 'match_num', 'best_of', 'round', 'sets']]
df_merged = pd.merge(df, df_shuffled, how='inner', on='id')
df_merged = df_merged.drop(columns=['winner_data', 'loser_data', 'swap_boolean', 'p1_data','p2_data'])


### Feature Engineering

#### Transformed Rankings

Direct use of ATP rankings would give a biased result since the quality in tennis is a pyramid: the difference in ranking points increasingly higher for top rankings in comparison to low rankings. In other words, a win of the 120th player against the 101th is much more common than a victory of the 20th against the 1st, although the difference in positions is the same. The pyramid is based on "round in which we expect the player to lose". For example, 2 for a player who is expected to lose in round 2. However, there is a problem with "expected round", which does not distinguish players from the same bucket. For example, it does not differentiate 9th and 16th players since both of them are expected to lose in round 4 in a grand slam tournament. Therefore, Klaassen and Magnus (2003) suggested a smoother measure, which could be called as transformed rankings. They transformed the ranking of each player by 

\begin{equation}
    R_{player 1} = 8 - \log_2 (Rank_{player 1})
\end{equation}

where $Rank_{player 1}$ is the official ATP ranking position of player 1. They used 8 as a constant variable since they only evaluated matches from the Wimbledon Grand Slam tournament, which consists of 7 rounds. So, 8 for the player who is expected to win the final.  However, we derived "expected round" for each tournament, denoted by $ER$, from the tournament's draw size to make the formula consistent. Therefore, I created transformed rankings as follows: 

\begin{equation}
    R_{player 1} = ER - \log_2 (Rank_{player 1})
\end{equation}

In [25]:
# Expected round based on draw size is hardcoded by using the knowledge of the tennis tournament design. 
#It will be used to calculate transformed rankings
df_merged['ER'] = np.nan
df_merged.loc[df_merged.draw_size == 128, 'ER'] = 8
df_merged.loc[df_merged.draw_size == 96, 'ER'] = 7
df_merged.loc[df_merged.draw_size == 64, 'ER'] = 7
df_merged.loc[df_merged.draw_size == 56, 'ER'] = 6
df_merged.loc[df_merged.draw_size == 48, 'ER'] = 6
df_merged.loc[df_merged.draw_size == 32, 'ER'] = 6
df_merged.loc[df_merged.draw_size == 28, 'ER'] = 5
df_merged.loc[df_merged.draw_size == 16, 'ER'] = 5
df_merged.loc[df_merged.draw_size == 8, 'ER'] = 4

In [26]:
#Since rank differences are like pyramide, we used a transformation. From Magnus and Kaarlsen
#we can take difference of transformed ranks.
df_merged['rank_p1'] = df_merged['ER'] - np.log2(df_merged['p1_rank'])
df_merged['rank_p2'] = df_merged['ER'] - np.log2(df_merged['p2_rank'])

#### Performance Statistics

Basic performance statistics such as ace, double foul, service points and so on could be easily accessible because they are available in the data set. However, more valuable metrics can be created by combining basic performance statistics.

In [27]:
# creating new metrics using performance statistics
data = df_merged
data['p2_2ndIn'] = data['p2_svpt'] - data['p2_1stIn'] # needed to calculate p1_2nd return

data['p1_1stServe_perc'] = data['p1_1stIn']/data['p1_svpt'] # the percentage of successful first serves for the winner
data['p1_2ndIn'] = data['p1_svpt'] - data['p1_1stIn'] # the number of successful second serves for the winner
data['p1_1st_svpt_won_perc'] =data['p1_1stWon']/data['p1_1stIn'] # the percentage of first serve points won by the winner
data['p1_2nd_svpt_won_perc'] =data['p1_2ndWon']/data['p1_2ndIn'] # the percentage of second serve points won by the winner
data['p1_1st_return_won'] = data['p2_1stIn'] - data['p2_1stWon'] # the number of first return points won by the winner
data['p1_2nd_return_won'] = data['p2_2ndIn'] - data['p2_2ndWon'] # the number of second return points won by the winner
data['p1_1st_return_won_perc'] = data['p1_1st_return_won']/data['p2_1stIn'] # the percentage of first return points won by the winner
data['p1_2nd_return_won_perc'] = data['p1_2nd_return_won']/data['p2_2ndIn'] # the percentage of second return points won by the winner
data['p1_bp_won_perc'] = data['p1_bpSaved']/data['p1_bpFaced'] # the percentage of breakpoints won by the winner
data['p1_bp_won'] = data['p2_bpFaced'] - data['p2_bpSaved'] # the number of breakpoints won by the winner
data['p1_bp_converted_perc'] = data['p1_bp_won'] /data['p2_bpFaced'] # the percentage of breakpoints converted for the winner

# same statistics for the loser
data['p2_1stServe_perc'] = data['p2_1stIn']/data['p2_svpt']
data['p2_1st_svpt_won_perc'] =data['p2_1stWon']/data['p2_1stIn']
data['p2_2nd_svpt_won_perc'] =data['p2_2ndWon']/data['p2_2ndIn']
data['p2_1st_return_won'] = data['p1_1stIn'] - data['p1_1stWon']
data['p2_2nd_return_won'] = data['p1_2ndIn'] - data['p1_2ndWon']
data['p2_1st_return_won_perc'] = data['p2_1st_return_won']/data['p1_1stIn']
data['p2_2nd_return_won_perc'] = data['p2_2nd_return_won']/data['p1_2ndIn']
data['p2_bp_won_perc'] = data['p2_bpSaved']/data['p2_bpFaced']
data['p2_bp_won'] = data['p1_bpFaced'] - data['p1_bpSaved']
data['p2_bp_converted_perc'] = data['p2_bp_won'] /data['p1_bpFaced']

When the receiver wins the game, it is called that the receiver has broken the serve, which gives the receiver an advantage to win the match because most games end in favor of the server. Although there might be many reasons for it, intuitively, the most important one is that a server has a chance to serve again if the first serve is a foul. This rule enables the server to try a risky but more effective service at first. If it is a foul, he/she can serve more cautiously again.

Since we calculated the first serve accuracy, the winning percentage on first and second serves and returns for each player, we can combine these statistics to create new features, which are called overall winning on serve and return percentage for each player by 

In [28]:
#Using these variables create other more comprehensive. Overall winning on serve percentage, completeness etc.
data['overall_p1_serve_perc'] = data['p1_1st_svpt_won_perc']*data['p1_1stServe_perc'] + data['p1_2nd_svpt_won_perc']*(1-data['p1_1stServe_perc'])
data['overall_p2_serve_perc'] = data['p2_1st_svpt_won_perc']*data['p2_1stServe_perc'] + data['p2_2nd_svpt_won_perc']*(1-data['p2_1stServe_perc'])

# we can create overall winning on return percentage for each player    
data['overall_p1_return_perc'] = data['p1_1st_return_won_perc']*data['p2_1stServe_perc'] + data['p1_2nd_return_won_perc']*(1-data['p2_1stServe_perc'])
data['overall_p2_return_perc'] = data['p2_1st_return_won_perc']*data['p1_1stServe_perc'] + data['p2_2nd_return_won_perc']*(1-data['p1_1stServe_perc'])

Another important reason why most games end in favor of the server might be that the receiver is in a disadvantageous position. The receiver has to choose the right spot to wait for the service and has less than a second to return the ball into the server's field. Therefore, a good returning skill is highly valuable for a player.

As explained above, both serving and returning skills are important for a tennis player. Therefore, it would make sense to measure the completeness of a player by combining their serve and return winning percentages by

In [None]:
# Completeness
data['comlete_p1'] = data['overall_p1_serve_perc'] * data['overall_p1_return_perc']
data['comlete_p2'] = data['overall_p2_serve_perc'] * data['overall_p2_return_perc']

# The multiplicative relationship ensures that a player has high completeness if 
# they are strong in both offensive and defensive aspects of the game.

Tennis is a two player game as explained in the above parts many times. Therefore, we should also consider the effect of the receiver's skills when we calculate the server's skills and vice versa. For example, Although overall winning on serve percentage could be an important factor, a player's success on serve might also depend on the opponent's return skills. So, we can calculate both players' advantage on serve against the other by 

In [29]:
# serve advantages
data['serveadv_p1'] = data['overall_p1_serve_perc'] - data['overall_p1_return_perc']
data['serveadv_p2'] = data['overall_p2_serve_perc'] - data['overall_p2_return_perc']

We expect these metrics to be more informative and consistent to show a player's performance, which could be an important factor to use in the model.

There could be many more ways to create features by combining performance statistics. However, it would require a deep understanding of tennis and player characteristics.

#### Historical averaging by surface

Performance statistics are crucial to derive the overall strength of a player, which would help the model to decide which player would win the match. Although some variables, such as the surface type, ranks of the players, tourney level, etc. are easily accessible before a match, all performance statistics must be estimated based on the performance of the players in their previous matches. For example, the number of service points acquired by a player varies from match to match. Therefore, we would use the past matches of the players to find their average number of service points.

Moreover, players' performence is affected by surface type. For example, Rafael Nadal has won Rolland Garros grand slam tournament,in which surface type is clay, 12 times in the last 15 years, however, he only has won 2 Wimbledon grand slam tournaments,  which has a grass court.  An intuitive hypothesis could be that most players play only in one surface type until they become a professional tennis  player.   Players’  playing  style  is  shaped  mostly  by  that  surface  type  since  each surface type has a different impact on the bounce of the ball, fastest in the grass, slowest in clay and somewhere in between in hard surface.  (Barnett and Pollard, 2007) [2] showthat player’ performances are, indeed, affected by the court surface.  Therefore, a player’s past matches on the same surface should be more informative than those on the other surfaces when predicting a tennis match

In [31]:
# calculating historical averages for each player by surface requires some workaround.
# We'll apply historical averages only on performance metrics, therefore we create a subset of the dataset with id included
# We'll do that step by step
p1_perf_data = data[['id','p1_id','p1_ace', 'p1_df', 'p1_svpt', 'p1_1stIn', 'p1_1stWon',
       'p1_2ndWon', 'p1_SvGms', 'p1_bpSaved', 'p1_bpFaced', 'p1_2ndIn', 'p1_1stServe_perc', 
       'p1_1st_svpt_won_perc','p1_2nd_svpt_won_perc', 'p1_1st_return_won',
       'p1_2nd_return_won', 'p1_1st_return_won_perc', 'p1_2nd_return_won_perc',
       'p1_bp_won_perc', 'p1_bp_won', 'p1_bp_converted_perc','overall_p1_serve_perc','overall_p1_return_perc',
       'comlete_p1', 'serveadv_p1']]
p2_perf_data = data[['id','p2_id','p2_ace', 'p2_df', 'p2_svpt', 'p2_1stIn', 'p2_1stWon',
       'p2_2ndWon', 'p2_SvGms', 'p2_bpSaved', 'p2_bpFaced', 'p2_2ndIn', 'p2_1stServe_perc', 
       'p2_1st_svpt_won_perc','p2_2nd_svpt_won_perc', 'p2_1st_return_won', 
       'p2_2nd_return_won', 'p2_1st_return_won_perc', 'p2_2nd_return_won_perc',
       'p2_bp_won_perc', 'p2_bp_won', 'p2_bp_converted_perc','overall_p2_serve_perc','overall_p2_return_perc',
       'comlete_p2', 'serveadv_p2']]
print(p1_perf_data.shape)
print(p2_perf_data.shape)

(44705, 26)
(44705, 26)


In [32]:
p1_perf_data.columns = ['id','player_id','ace', 'df', 'svpt', '1stIn', '1stWon',
       '2ndWon', 'SvGms', 'bpSaved', 'bpFaced', '2ndIn', '1stServe_perc', 
       '1st_svpt_won_perc','2nd_svpt_won_perc', '1st_return_won',
       '2nd_return_won', '1st_return_won_perc', '2nd_return_won_perc',
       'bp_won_perc', 'bp_won', 'bp_converted_perc','overall_serve_perc','overall_return_perc',
       'comlete', 'serveadv']
p2_perf_data.columns = ['id','player_id','ace', 'df', 'svpt', '1stIn', '1stWon',
       '2ndWon', 'SvGms', 'bpSaved', 'bpFaced', '2ndIn', '1stServe_perc', 
       '1st_svpt_won_perc','2nd_svpt_won_perc', '1st_return_won',
       '2nd_return_won', '1st_return_won_perc', '2nd_return_won_perc',
       'bp_won_perc', 'bp_won', 'bp_converted_perc','overall_serve_perc','overall_return_perc',
       'comlete', 'serveadv']


In [33]:
perf_data = p1_perf_data.append(p2_perf_data)
perf_data.shape

(89410, 26)

In [34]:
# replace all missing values with 0 since having no info for a performance metric can be thought as having 0.
# Fro example, if one player don't have any double faul, it can be written as 0.
perf_data = perf_data.fillna(0)

In [35]:
# add surface info to data to group datasets by surface.
# Historical averaging will be done by each surface separately.
surface_data = data[['id', 'surface']]

Unnamed: 0,id,surface
0,1,Clay
1,2,Clay
2,3,Clay
3,4,Clay
4,5,Clay


In [36]:
#merge surface info into performance data
perf_data_surface = pd.merge(perf_data, surface_data, how = 'inner', on = 'id')
perf_data_surface.head()

Unnamed: 0,id,player_id,ace,df,svpt,1stIn,1stWon,2ndWon,SvGms,bpSaved,...,1st_return_won_perc,2nd_return_won_perc,bp_won_perc,bp_won,bp_converted_perc,overall_serve_perc,overall_return_perc,comlete,serveadv,surface
0,1,102179,8.0,1.0,126.0,76.0,56.0,29.0,16.0,14.0,...,0.169,0.392,0.933,0.0,0.0,0.675,0.273,0.184,0.402,Clay
1,1,102776,13.0,4.0,110.0,59.0,49.0,31.0,17.0,4.0,...,0.263,0.42,1.0,1.0,0.067,0.727,0.325,0.237,0.402,Clay
2,2,102821,0.0,0.0,57.0,24.0,13.0,17.0,10.0,4.0,...,0.286,0.5,0.444,2.0,0.333,0.526,0.388,0.204,0.138,Clay
3,2,103602,4.0,2.0,67.0,35.0,25.0,16.0,10.0,4.0,...,0.458,0.485,0.667,5.0,0.556,0.612,0.474,0.29,0.138,Clay
4,3,102205,2.0,2.0,65.0,39.0,22.0,10.0,8.0,6.0,...,0.207,0.353,0.6,0.0,0.0,0.492,0.261,0.128,0.231,Clay


In [37]:
perf_data_carpet = perf_data_surface.loc[perf_data_surface.surface == 'Carpet']
perf_data_hard = perf_data_surface.loc[perf_data_surface.surface == 'Hard']
perf_data_clay = perf_data_surface.loc[perf_data_surface.surface == 'Clay']
perf_data_grass = perf_data_surface.loc[perf_data_surface.surface == 'Grass']
print(perf_data_carpet.shape)
print(perf_data_hard.shape)
print(perf_data_clay.shape)
print(perf_data_grass.shape)

(2754, 27)
(47028, 27)
(29698, 27)
(9930, 27)


In [38]:
# drop surface column
carpet = perf_data_carpet.drop(columns = ['surface'])
hard = perf_data_hard.drop(columns = ['surface'])
clay = perf_data_clay.drop(columns = ['surface'])
grass = perf_data_grass.drop(columns = ['surface'])

In [39]:
#create copies
carpet_copy = perf_data_carpet.drop(columns = ['surface'])
hard_copy = perf_data_hard.drop(columns = ['surface'])
clay_copy = perf_data_clay.drop(columns = ['surface'])
grass_copy = perf_data_grass.drop(columns = ['surface'])

In [40]:
ids_carpet = list(carpet['player_id'].unique())
ids_hard = list(hard['player_id'].unique())
ids_clay = list(clay['player_id'].unique())
ids_grass = list(grass['player_id'].unique())

In [41]:
def historical_averaging_carpet(dataframe):
    for i in range(len(ids_carpet)):
        dataframe.loc[dataframe.player_id == ids_carpet[i]] = dataframe.loc[dataframe.player_id == ids_carpet[i]].expanding(min_periods=1).mean()
    return dataframe

def historical_averaging_hard(dataframe):
    for i in range(len(ids_hard)):
        dataframe.loc[dataframe.player_id == ids_hard[i]] = dataframe.loc[dataframe.player_id == ids_hard[i]].expanding(min_periods=1).mean()
    return dataframe

def historical_averaging_clay(dataframe):
    for i in range(len(ids_clay)):
        dataframe.loc[dataframe.player_id == ids_clay[i]] = dataframe.loc[dataframe.player_id == ids_clay[i]].expanding(min_periods=1).mean()
    return dataframe

def historical_averaging_grass(dataframe):
    for i in range(len(ids_grass)):
        dataframe.loc[dataframe.player_id == ids_grass[i]] = dataframe.loc[dataframe.player_id == ids_grass[i]].expanding(min_periods=1).mean()
    return dataframe

In [42]:
historical_averaging_carpet(carpet)
historical_averaging_hard(hard)
historical_averaging_clay(clay)
historical_averaging_grass(grass)

Unnamed: 0,id,player_id,ace,df,svpt,1stIn,1stWon,2ndWon,SvGms,bpSaved,...,2nd_return_won,1st_return_won_perc,2nd_return_won_perc,bp_won_perc,bp_won,bp_converted_perc,overall_serve_perc,overall_return_perc,comlete,serveadv
1250,644.000,101320.0,2.000,7.000,75.000,37.000,31.000,21.000,13.000,2.000,...,22.000,0.261,0.440,0.500,2.000,0.286,0.693,0.354,0.246,0.339
1251,644.000,103061.0,9.000,13.000,96.000,46.000,34.000,28.000,14.000,5.000,...,17.000,0.162,0.447,0.714,2.000,0.500,0.646,0.307,0.198,0.339
1252,645.000,102563.0,7.000,10.000,83.000,34.000,27.000,24.000,13.000,7.000,...,24.000,0.289,0.436,0.700,3.000,0.429,0.614,0.376,0.231,0.238
1253,645.000,101733.0,7.000,11.000,93.000,38.000,27.000,31.000,14.000,4.000,...,25.000,0.206,0.510,0.571,3.000,0.300,0.624,0.386,0.240,0.238
1254,646.000,102093.0,6.000,1.000,43.000,32.000,27.000,9.000,9.000,0.000,...,15.000,0.372,0.577,0.000,3.000,0.375,0.837,0.449,0.376,0.388
1255,646.000,101969.0,9.000,5.000,69.000,43.000,27.000,11.000,9.000,5.000,...,2.000,0.156,0.182,0.625,0.000,0.000,0.551,0.163,0.090,0.388
1256,647.000,101990.0,19.000,9.000,94.000,51.000,47.000,21.000,15.000,0.000,...,19.000,0.333,0.452,0.000,4.000,0.364,0.723,0.387,0.280,0.336
1257,647.000,103171.0,5.000,10.000,93.000,51.000,34.000,23.000,14.000,7.000,...,22.000,0.078,0.512,0.636,1.000,1.000,0.613,0.277,0.170,0.336
1258,648.000,102158.0,12.000,2.000,68.000,50.000,39.000,7.000,10.000,4.000,...,20.000,0.207,0.667,0.800,3.000,0.429,0.676,0.441,0.298,0.236
1259,648.000,102615.0,2.000,5.000,59.000,29.000,23.000,10.000,9.000,4.000,...,11.000,0.220,0.611,0.571,1.000,0.200,0.559,0.324,0.181,0.236


In [43]:
carpet[['id', 'player_id']] = carpet_copy[['id' ,'player_id']]
hard[['id', 'player_id']] = hard_copy[['id', 'player_id']]
clay[['id' ,'player_id']] = clay_copy[['id' ,'player_id']]
grass[['id' ,'player_id']] = grass_copy[['id', 'player_id']]

In [44]:
perf_data_all = carpet.append([hard, clay, grass])
perf_data_all.shape

(89410, 26)

In [45]:
data_p1 = data[['id', 'tourney_id', 'tourney_name', 'surface', 'draw_size',
       'tourney_level', 'month', 'year', 'match_num', 'best_of','round' ,'sets','ER',
        'p1_id', 'p1_name', 'p1_hand', 'p1_ioc', 'p1_age', 'p1_rank_points','rank_p1','outcome']]
data_p2 = data[['id', 'p2_id', 'p2_name', 'p2_hand', 'p2_ioc', 'p2_age', 'p2_rank_points','rank_p2']]

In [46]:
data_p1 = data_p1.rename(columns={'p1_id':'player_id'})
data_p2 = data_p2.rename(columns={'p2_id':'player_id'})

In [47]:
data_p1_merged = pd.merge(data_p1, perf_data_all, how = 'left', on = ['id', 'player_id'])
data_p2_merged = pd.merge(data_p2, perf_data_all, how = 'left', on = ['id', 'player_id'])

In [48]:
data_p1_merged.columns = ['id', 'tourney_id', 'tourney_name', 'surface', 'draw_size',
       'tourney_level', 'month', 'year', 'match_num', 'best_of','round', 'sets', 'ER','p1_id', 'p1_name', 'p1_hand', 'p1_ioc', 'p1_age', 'p1_rank_points',
       'rank_p1', 'outcome', 'p1_ace', 'p1_df', 'p1_svpt', 'p1_1stIn', 'p1_1stWon',
       'p1_2ndWon', 'p1_SvGms', 'p1_bpSaved', 'p1_bpFaced', 'p1_2ndIn', 'p1_1stServe_perc', 
       'p1_1st_svpt_won_perc','p1_2nd_svpt_won_perc', 'p1_1st_return_won',
       'p1_2nd_return_won', 'p1_1st_return_won_perc', 'p1_2nd_return_won_perc',
       'p1_bp_won_perc', 'p1_bp_won', 'p1_bp_converted_perc','p1_overall_serve_perc','p1_overall_return_perc',
       'p1_complete', 'p1_serveadv']

data_p2_merged.columns = ['id', 'p2_id', 'p2_name', 'p2_hand', 'p2_ioc', 'p2_age',
       'p2_rank_points', 'rank_p2', 'p2_ace', 'p2_df', 'p2_svpt', 'p2_1stIn', 'p2_1stWon',
       'p2_2ndWon', 'p2_SvGms', 'p2_bpSaved', 'p2_bpFaced', 'p2_2ndIn', 'p2_1stServe_perc',
       'p2_1st_svpt_won_perc', 'p2_2nd_svpt_won_perc', 'p2_1st_return_won',
       'p2_2nd_return_won', 'p2_1st_return_won_perc', 'p2_2nd_return_won_perc',
       'p2_bp_won_perc', 'p2_bp_won', 'p2_bp_converted_perc', 'p2_overall_serve_perc',
       'p2_overall_return_perc', 'p2_complete', 'p2_serveadv']

In [49]:
data_final = pd.merge(data_p1_merged, data_p2_merged, how = 'inner', on = 'id')
data_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44705 entries, 0 to 44704
Data columns (total 76 columns):
id                        44705 non-null int64
tourney_id                44705 non-null object
tourney_name              44705 non-null object
surface                   44705 non-null object
draw_size                 44705 non-null int64
tourney_level             44705 non-null object
month                     44705 non-null int64
year                      44705 non-null int64
match_num                 44705 non-null int64
best_of                   44705 non-null int64
round                     44705 non-null object
sets                      44705 non-null float64
ER                        44705 non-null float64
p1_id                     44705 non-null int64
p1_name                   44705 non-null object
p1_hand                   44705 non-null object
p1_ioc                    44705 non-null object
p1_age                    44705 non-null object
p1_rank_points            44705 

### Feature Encoding

The dataset contains various categorical variables such as surface, tourney level, round and player's strong hand. We transform them to numeric variables using dummy variable encoding technique.

In [50]:
categorical_features = ['surface','tourney_level', 'round', 'p1_hand', 'p2_hand']
dummy_categorical_features = pd.get_dummies(data[categorical_features] , drop_first=True)
dummy_categorical_features.head()

Unnamed: 0,surface_Clay,surface_Grass,surface_Hard,tourney_level_ATP500,tourney_level_F,tourney_level_G,tourney_level_M,round_QF,round_R128,round_R16,round_R32,round_R64,round_RR,round_SF,p1_hand_R,p1_hand_U,p2_hand_R,p2_hand_U
0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0
1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0
2,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0
3,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0
4,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0


In [51]:
frames = [data_final, dummy_categorical_features]
data = pd.concat(frames, axis = 1)
data = data.drop(columns=categorical_features)

### Symmetric Feature Representation

In the dataset, we have the characteristics of both players, winner and loser, for each match. As explained above, "Player 1" and "Player 2" are randomly assigned to the winner or loser. We construct new features by taking the difference between player 1's and player 2's characteristics to achieve symmetry, which would help us to avoid any inherent bias due to randomly labeled players. Furthermore, the number of features is halved by this approach, which reduces the variance of the model. 

All features, except match characteristics, are of the following form:

\begin{equation}
    STAT_i = STAT_{i, player 1} - STAT_{i, player 2} 
\end{equation}

In [52]:
data['rank_diff'] = data['rank_p1'] - data['rank_p2']
data['rank_pts_diff'] = data['p1_rank_points'] - data['p2_rank_points']
data['aces'] = data['p1_ace'] - data['p2_ace']
data['dfs'] = data['p1_df'] - data['p2_df']
data['svpts'] = data['p1_svpt'] - data['p2_svpt']
data['firstIns'] = data['p1_1stIn'] - data['p2_1stIn']
data['firstWons'] = data['p1_1stWon'] - data['p2_1stWon']
data['secondWons'] = data['p1_2ndWon'] - data['p2_2ndWon']
data['SvGms'] = data['p1_SvGms'] - data['p2_SvGms']
data['bpSaveds'] = data['p1_bpSaved'] - data['p2_bpSaved']
data['bpFaceds'] = data['p1_bpFaced'] - data['p2_bpFaced']
data['secondIns'] = data['p1_2ndIn'] - data['p2_2ndIn']
data['firstServe_percs'] = data['p1_1stServe_perc'] - data['p2_1stServe_perc']
data['firstSvptWons_perc'] = data['p1_1st_svpt_won_perc'] - data['p2_1st_svpt_won_perc']
data['secondSvptWons_perc'] = data['p1_2nd_svpt_won_perc'] - data['p2_2nd_svpt_won_perc']
data['firstReturnWons'] = data['p1_1st_return_won'] - data['p2_1st_return_won']
data['secondReturnWons'] = data['p1_2nd_return_won'] - data['p2_2nd_return_won']
data['firstReturnWons_perc'] = data['p1_1st_return_won_perc'] - data['p2_1st_return_won_perc']
data['secondReturnWons_perc'] = data['p1_2nd_return_won_perc'] - data['p2_2nd_return_won_perc']
data['bpWons_perc'] = data['p1_bp_won_perc'] - data['p2_bp_won_perc']
data['bpWons'] = data['p1_bp_won'] - data['p2_bp_won']
data['bpConverteds_perc'] = data['p1_bp_converted_perc'] - data['p2_bp_converted_perc']
data['overallServes_perc'] = data['p1_overall_serve_perc'] - data['p2_overall_serve_perc']
data['overallReturnes_perc'] = data['p1_overall_return_perc'] - data['p2_overall_return_perc']
data['completeness'] = data['p1_complete'] - data['p2_complete']
data['serveAdv'] = data['p1_serveadv'] - data['p2_serveadv']

In [53]:
# From now on, data preprocessing steps such as data splitting, cleaning, outlier detection 
#can be applied to the dataset.
#Therefore, we store this dataset as a csv file.
data.to_csv('atp_data_prepared.csv', index = False)