# Prelims

In [1]:
# -----------------MAGIC COMMANDS---------------
# magic commands to enable autoreload for imported packages
%load_ext autoreload
%autoreload 2
%matplotlib inline

# -------------------IMPORTS---------------------
# basic project related stuff
import os
import sys
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..')) # setting parent_dir in sys.path
sys.path.insert(0, parent_dir) # so that tennis_main can be found as a module

# importing classes and methods
from tennis_main.data import Tennis
    
# basic EDA
import ydata_profiling # basic EDA package
import pandas as pd
import numpy as np

# viz
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.gridspec import GridSpec
import plotly.express as px

# Data

In [2]:
# Load the four raw datasets through the project's Tennis data class.
# A fresh Tennis() is constructed for each getter, in the same order as before.
# `singles` is indexed by year further down (e.g. singles[2010]).
singles, doubles, players, rankings = (
    Tennis().get_singles(),
    Tennis().get_doubles(),
    Tennis().get_players(),
    Tennis().get_rankings(),
)

# Basic EDA

## Profiling one random dataframe from singles

In [3]:
# Work on an independent copy of the 2010 singles frame: the cleaning steps
# further down modify `singles_2010`, and without the copy they would also
# modify the object stored in `singles`. With the copy, re-running this cell
# always restores the untouched 2010 data.
singles_2010 = singles[2010].copy()

In [4]:
# Summary statistics of the numeric columns, transposed so that each
# column of the frame becomes one row of the displayed table.
singles_2010.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
draw_size,3030.0,52.67987,39.552964,4.0,28.0,32.0,56.0,128.0
tourney_date,3030.0,20100570.0,297.971804,20100103.0,20100305.0,20100524.0,20100815.0,20101203.0
match_num,3030.0,26.70132,28.374126,1.0,7.0,19.0,32.0,291.0
winner_id,3030.0,104367.3,646.745901,101962.0,103843.0,104417.0,104792.0,108961.0
winner_seed,1282.0,7.599844,7.06122,1.0,3.0,5.0,10.0,33.0
winner_ht,2967.0,186.122,6.578788,168.0,183.0,185.0,190.0,208.0
winner_age,3030.0,26.00373,3.062247,16.1,23.7,25.8,28.6,38.3
loser_id,3030.0,104378.4,814.443513,101404.0,103812.0,104338.0,104871.0,108993.0
loser_seed,671.0,9.177347,7.66168,1.0,4.0,7.0,12.0,33.0
loser_ht,2898.0,185.45,6.619686,168.0,180.0,185.0,190.0,208.0


In [5]:
# Column names, non-null counts and dtypes of the 2010 singles frame.
singles_2010.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3030 entries, 0 to 3029
Data columns (total 49 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   tourney_id          3030 non-null   object 
 1   tourney_name        3030 non-null   object 
 2   surface             3030 non-null   object 
 3   draw_size           3030 non-null   int64  
 4   tourney_level       3030 non-null   object 
 5   tourney_date        3030 non-null   int64  
 6   match_num           3030 non-null   int64  
 7   winner_id           3030 non-null   int64  
 8   winner_seed         1282 non-null   float64
 9   winner_entry        321 non-null    object 
 10  winner_name         3030 non-null   object 
 11  winner_hand         3030 non-null   object 
 12  winner_ht           2967 non-null   float64
 13  winner_ioc          3030 non-null   object 
 14  winner_age          3030 non-null   float64
 15  loser_id            3030 non-null   int64  
 16  loser_

In [6]:
# NOTE(review): the ydata_profiling report is left disabled — presumably
# because it is slow to build; confirm, then re-enable it or delete this cell.
# singles_profile = ydata_profiling.ProfileReport(singles_2010, title = "Singles 2010 Profile")
# singles_profile

In [7]:
# Peek at ten matches of a single tournament (tourney_id "2010-339").
# A fixed random_state makes the sample reproducible, so the rows shown here
# stay the same on every Restart & Run All.
singles_2010[singles_2010["tourney_id"] == "2010-339"].sample(10, random_state=42)

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points
2,2010-339,Brisbane,Hard,32,A,20100103,3,104755,,,...,58.0,38.0,14.0,14.0,7.0,11.0,52.0,850.0,88.0,568.0
18,2010-339,Brisbane,Hard,32,A,20100103,19,104607,4.0,,...,13.0,7.0,6.0,6.0,2.0,7.0,20.0,1655.0,42.0,945.0
13,2010-339,Brisbane,Hard,32,A,20100103,14,104639,,,...,42.0,32.0,11.0,11.0,4.0,7.0,105.0,521.0,63.0,667.0
0,2010-339,Brisbane,Hard,32,A,20100103,1,104053,1.0,,...,34.0,29.0,11.0,10.0,3.0,5.0,7.0,4410.0,77.0,598.0
17,2010-339,Brisbane,Hard,32,A,20100103,18,104755,,,...,36.0,22.0,9.0,9.0,3.0,6.0,52.0,850.0,285.0,151.0
22,2010-339,Brisbane,Hard,32,A,20100103,23,104639,,,...,44.0,33.0,14.0,13.0,4.0,9.0,105.0,521.0,81.0,587.0
9,2010-339,Brisbane,Hard,32,A,20100103,10,102967,,,...,21.0,15.0,10.0,8.0,1.0,5.0,58.0,744.0,80.0,587.0
5,2010-339,Brisbane,Hard,32,A,20100103,6,104571,,,...,36.0,28.0,20.0,12.0,3.0,6.0,42.0,945.0,55.0,790.0
8,2010-339,Brisbane,Hard,32,A,20100103,9,103484,,,...,52.0,41.0,16.0,14.0,3.0,5.0,44.0,935.0,25.0,1320.0
19,2010-339,Brisbane,Hard,32,A,20100103,20,105064,8.0,,...,42.0,39.0,18.0,15.0,3.0,5.0,36.0,1021.0,119.0,461.0


For now we will continue with the 2010 dataframe only, to create a blueprint

# Data Cleaning

## Checking and handling duplicates

In [8]:
# Count rows that are exact duplicates (all columns equal) of an earlier row.
singles_2010.duplicated().sum()

np.int64(0)

In [9]:
# Remove exact duplicate rows. Rebinding the name (instead of inplace=True)
# leaves the previously referenced frame untouched and keeps the step
# chainable; the resulting rows and index are the same as with the in-place call.
singles_2010 = singles_2010.drop_duplicates()

## Dealing with missing data

In [10]:
# Fraction of missing values per column: the mean of the boolean NaN mask
# equals (number of NaNs) / (number of rows).
singles_2010.isna().mean()

tourney_id            0.000000
tourney_name          0.000000
surface               0.000000
draw_size             0.000000
tourney_level         0.000000
tourney_date          0.000000
match_num             0.000000
winner_id             0.000000
winner_seed           0.576898
winner_entry          0.894059
winner_name           0.000000
winner_hand           0.000000
winner_ht             0.020792
winner_ioc            0.000000
winner_age            0.000000
loser_id              0.000000
loser_seed            0.778548
loser_entry           0.799670
loser_name            0.000000
loser_hand            0.000000
loser_ht              0.043564
loser_ioc             0.000000
loser_age             0.000000
score                 0.000000
best_of               0.000000
round                 0.000000
minutes               0.113531
w_ace                 0.113531
w_df                  0.113531
w_svpt                0.113531
w_1stIn               0.113531
w_1stWon              0.113531
w_2ndWon

**A few thoughts on how to deal with missing data**

- <u>Winner and Loser seed</u>

Refers to the player's seeding in the tournament. Players are "seeded" based on their ranking to ensure that the highest-ranked players do not face each other in the early rounds. There is a clear pattern in this dataframe that the loser is unseeded more often. This indicates that the NaN values arise from the rankings. When a player is ranked above the tournament's cutoff, they qualify for it automatically. A protected ranking also has its own identifier (PR).
*How should we deal with these missing values (58% and 78% respectively)? It is obvious that they hold predictive power for the outcome of a match.*

- <u>Winner and Loser entry</u>

Refers to the way the player entered the tournament, for example through qualifiers or through a wild card. Again, there is a pattern here: the winner has more NaN values, which may be because winners more often entered the tournament directly through their ranking. This should be investigated, as the column potentially holds predictive power. We will assume that all NaN values are due to entry through the rankings and fill them with "R".

- <u>Winner and Loser ht</u>

Refers to the height of the player, which obviously for some people is unknown. Possibly not too important, and has less than 5% data missing, so it could be filled with the mean.

- <u>Minutes, aces, double faults, etc.</u>

All these are match statistics that are missing. They hold important value, however, they would obviously not be available beforehand if one is to predict the outcome of a match. Hence, as a first step they can be dropped, and possibly reconsidered down the line, when player specific stats up to that point in time can be engineered. Still, as NaN values they cannot be filled with anything of real value.

- <u>Winner and Loser rank and rank points</u>

The missing values probably belong to players who were not ranked before the tournament. Again there is a pattern that losers are missing their rank more often, which supports this hypothesis. The missing ranks could all be filled with a number larger than the rank of the current last-ranked player, to preserve order (for rank points, the fill value would instead need to be lower than every ranked player's points).

In [11]:
# Distinct values of the loser's entry code, NaN included.
singles_2010["loser_entry"].unique()

array([nan, 'Q', 'WC', 'LL'], dtype=object)

### Dropping stats columns

In [12]:
# List all column names so the match-statistics columns can be identified.
singles_2010.columns

Index(['tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level',
       'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry',
       'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
       'loser_id', 'loser_seed', 'loser_entry', 'loser_name', 'loser_hand',
       'loser_ht', 'loser_ioc', 'loser_age', 'score', 'best_of', 'round',
       'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon',
       'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df', 'l_svpt',
       'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced',
       'winner_rank', 'winner_rank_points', 'loser_rank', 'loser_rank_points'],
      dtype='object')

In [13]:
# Match statistics (duration plus the w_*/l_* serve and break-point columns).
# They are only known after a match has been played, so they are removed here.
stats_columns = ['minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon',
       'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df', 'l_svpt',
       'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced']

# Drop all statistics columns in one call instead of one in-place drop per
# column: a single pass over the frame, and a missing column raises KeyError
# before anything is removed rather than after a partial drop.
singles_2010 = singles_2010.drop(columns=stats_columns)

### Filling player_ht with mean

In [18]:
# Impute missing player heights with the mean of the respective column.
# fillna with a column -> value mapping is the dedicated API for this; only
# the two height columns are listed, so NaNs in all other columns are kept.
height_means = singles_2010[["winner_ht", "loser_ht"]].mean()
singles_2010 = singles_2010.fillna(value=height_means)

# sanity check: share of missing values per column after the imputation
singles_2010.isnull().sum()/len(singles_2010)

tourney_id            0.000000
tourney_name          0.000000
surface               0.000000
draw_size             0.000000
tourney_level         0.000000
tourney_date          0.000000
match_num             0.000000
winner_id             0.000000
winner_seed           0.576898
winner_entry          0.894059
winner_name           0.000000
winner_hand           0.000000
winner_ht             0.000000
winner_ioc            0.000000
winner_age            0.000000
loser_id              0.000000
loser_seed            0.778548
loser_entry           0.799670
loser_name            0.000000
loser_hand            0.000000
loser_ht              0.000000
loser_ioc             0.000000
loser_age             0.000000
score                 0.000000
best_of               0.000000
round                 0.000000
winner_rank           0.006931
winner_rank_points    0.006931
loser_rank            0.016172
loser_rank_points     0.016172
dtype: float64