# Tennis analysis: data understanding and preparation

In [1]:
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
pd.options.plotting.backend = 'plotly'
pio.templates.default = "seaborn"

# Read datasets
df_tennis = pd.read_csv("../datasets/tennis_matches.csv", index_col=0) # index_col takes the index from the csv rather than creating it automatically (i.e. unnamed col is removed)
df_male = pd.read_csv("../datasets/male_players.csv")
df_female = pd.read_csv("../datasets/female_players.csv")

df_tennis.drop_duplicates(inplace=True)
df_male.drop_duplicates(inplace=True)
df_female.drop_duplicates(inplace=True)

In [None]:
# UTILS
def get_stats(attribute):
    stats = df_tennis[attribute].describe(datetime_is_numeric=True)
    stats["missing"] = df_tennis[attribute].isna().sum()
    return stats

## Initial overview

In [None]:
df_tennis.head()

In [None]:
df_tennis.info()

In [None]:
df_tennis.describe(include="all", datetime_is_numeric=True)

Fix the problem with date type and apply normalization on strings
-   tourney_date is converted from float64 to a date object
-   apply lowercase everywhere
-   subsitute double spaces ('  ') with a single space 
-   remove leading and trailing spaces

In [None]:
def preprocess_strings(df):
    df = df.applymap(lambda x:x.lower().strip() if type(x) == str else x)
    return df.replace(r"\s{2,}", " ", regex=True)

# Change type for date
df_tennis.tourney_date = pd.to_datetime(df_tennis.tourney_date, format='%Y%m%d')

# Normalize strings
df_tennis = preprocess_strings(df_tennis)
df_male = preprocess_strings(df_male)
df_female = preprocess_strings(df_female)

More visual overview about null count for each attribute

In [None]:
x = pd.DataFrame(df_tennis.isna().sum())
x.rename(columns={0: 'Missing values'}, inplace=True)
x.index.name = 'Feature'
x.reset_index(inplace=True)
#x.set_index('Feature', inplace=True)

fig = px.bar(x, x="Feature", y="Missing values", log_y=True)

fig.update_layout(
    xaxis_dtick = 1,
    xaxis_tickangle = 45
)

fig.show()

## Tennis matches: Attributes analysis
Let's analyize and understand each attribute of the tennis matches dataset, to ensure high data quality by fixing wrong data, managing outliers and missing values

### tourney_id
It's the ID of the tourney. By analyzing the length several groups have been found and each of them provides different information:
- Length 8-9 contains 83024 observations
    - year
- Length 23-24 contains 100676 observations
    - year, only woman tourney, country of the tourney, available on the itftennis.com website
- Length 30-to-40 contains 2373 observations
    - year, woman/man, country of the first and second player

Every tourney has on average 38 matches.

In [None]:
matches_per_tourney = df_tennis.groupby("tourney_id").tourney_id.count() # 38

ids_grouped_by_length = df_tennis.tourney_id.str.len().fillna(-1)
ids_grouped_by_length = ids_grouped_by_length.groupby(ids_grouped_by_length).count()
df = pd.DataFrame({"tourney_id length": ids_grouped_by_length.index.astype(int), "count": ids_grouped_by_length.values})

# GROUPS
tourney_id_8_9 = df_tennis[(df_tennis.tourney_id.str.len() >= 8) & (df_tennis.tourney_id.str.len() <= 9)]
tourney_id_8_9_years = (tourney_id_8_9["tourney_id"].str.split('-', expand=True)[[0]])

tourney_id_23_24 = df_tennis[(df_tennis.tourney_id.str.len() >= 23) & (df_tennis.tourney_id.str.len() <= 24)]
tourney_id_country_23_24 = (tourney_id_23_24["tourney_id"].str.split('-', expand=True)[[3]]).stack().unique()

tourney_id_30_to_40 = df_tennis[(df_tennis.tourney_id.str.len() >= 30) & (df_tennis.tourney_id.str.len() <= 40)]
tourney_id_country30_to_40 = (tourney_id_23_24["tourney_id"].str.split('-', expand=True)[[3]]).stack().unique()

# Countries information
tennis_23 = df_tennis[df_tennis['tourney_id'].str.len() == 23]
countries_in_tennis_23 = (tennis_23["tourney_id"].str.split('-', expand=True)[[3]]).stack().unique()
countries_in_tennis_23

px.bar(df, x='tourney_id length', y='count').show()
print(get_stats("tourney_id"))

### tourney_name
This is the name of the tourney and there are several representations that offers different informations such as city name, prize, nationalities of players, category (from which prize, number of events can be obtained).  
 
Example of the different groups found:
- Fed Cup WG F: USA vs BLR, Davis Cup Finals RR: FRA vs JPN
- Biella CH
- W25 Rome
- Arad $10K

In [None]:
tourney_name_8_9 = df_tennis[(df_tennis.tourney_name.str.len() >= 8) & (df_tennis.tourney_name.str.len() <= 9)]
print(tourney_name_8_9.tourney_name.unique())

print(get_stats("tourney_name"))

### surface

In [None]:
# Fix missing values
df_tennis.loc[(df_tennis.surface.isna()) & (~df_tennis.tourney_id.isna()) & (df_tennis.tourney_id != "2017-1059"), "surface"] = df_tennis.surface.mode()[0]

df = df_tennis.groupby("surface").surface.count().reset_index(name = "matches")
df.plot.bar(x="surface", y="matches", color="surface").show()
print(get_stats("surface"))

### draw_size

- 5% of tourneys have a larger draw size than the players in the tour

In [None]:
# The number of players in a draw, usually rounded to the next power of 2
df = df_tennis.groupby(["tourney_id", "draw_size"]).tourney_id.agg(["nunique"]).reset_index()
df = df.groupby(["draw_size"]).draw_size.agg(tourney="count").reset_index()
px.histogram(df, x="draw_size", y="tourney").show()

# Estimate of unique players per tourney
df_unique_winner_players = df_tennis.groupby(["tourney_id", "winner_name"]).tourney_id.agg(unique="nunique").reset_index().rename({"winner_name":"player"}, axis='columns')
df_unique_loser_players = df_tennis.groupby(["tourney_id", "loser_name"]).tourney_id.agg(unique="nunique").reset_index().rename({"loser_name":"player"}, axis='columns')
df_unique_players = pd.merge(df_unique_winner_players, df_unique_loser_players, how="inner", on=["tourney_id", "player", "unique"])
df_unique_players_per_tourney = df_unique_players.groupby("tourney_id").unique.agg(unique_players_per_tourney="sum").reset_index()
df_unique_players_per_tourney

result = pd.merge(df_tennis, df_unique_players_per_tourney, how="inner", on=["tourney_id"])
equals = (result.draw_size == result.unique_players_per_tourney).sum() / result.shape[0] * 100
tourneis_with_draw_size_larger_than_players = result.loc[(result.draw_size <= result.unique_players_per_tourney)].groupby("tourney_id").tourney_id.count().size
print(f"There are {tourneis_with_draw_size_larger_than_players / 4853 * 100}% tourneys with a draw size larger than the number of players partecipating")

# df_tennis[df_tennis.tourney_id=="2016-0083"]
# df_draw_size_estimate[df_draw_size_estimate.tourney_id=="2016-0083"]

print(get_stats("draw_size"))

### tourney_level
It's the level of the tourney
- Male: A C D (davis cup) F G M S
- Female: D (fed billie jean king cup, wightman cup, bonnie bell cup) I P PM
- MIXED: E (not sanctionated by tour) J (juniors) T (team tennis) - NOT IN THE DATASET RIGHT NOW
- O olimpics 128 both male and female but not mixed
- W - not in the documentation, it should be woman
- ITF[10, 15, 25, 50, 60, 75, 80, 100] are the prizes

In [None]:
# Tourney level 186099, 19 unique values, 29 missing
df = df_tennis.groupby(df_tennis.tourney_level).tourney_id.count().reset_index(name="matches").sort_values(by="matches")
df.plot.bar(x="tourney_level", y="matches").show()

print(get_stats("tourney_level"))

### tourney_date
It's the date of the tourney, usually it's the monday of the week.
- The years between 2016-2019 are the most represented while 2020 and 2021 have less matches
- November and December are the least represented months

In [None]:
# Grouped by day of the week
df = df_tennis.groupby([df_tennis.tourney_date.dt.day_name().rename("day")]).tourney_date.agg(matches="count").reset_index()
px.bar(df, x="day", y="matches", color="day", barmode = 'group').show()

# Grouped by month and year
df = df_tennis.groupby([df_tennis.tourney_date.dt.month.rename("dmonth"), df_tennis.tourney_date.dt.strftime('%b').rename("month"), df_tennis.tourney_date.dt.year.rename("year")]).tourney_date.agg(matches="count").reset_index()
px.bar(df, x="month", y="matches", color="year", barmode = 'group').show()

# Grouped by year
df = df_tennis.tourney_date.value_counts().reset_index(name="count").rename(columns={"index":"date"})
df = df_tennis.groupby(df_tennis.tourney_date.dt.year.rename("year")).tourney_date.agg(matches="count").reset_index()
px.bar(df, x="year", y="matches", color="year", barmode = 'group').show()

print(get_stats("tourney_date"))

In [None]:
df_tennis.draw_size = df_tennis.draw_size.fillna(df_tennis.draw_size.median())


### score

We progressively applied masks based on regex in order to identify all the erroneus values and all the possible formats.

Every couple n1-n2 (e.g., 6-4) represents the score of a single set, where n1 are the games won by the winner and n2 those won by the loser of the match. When after the couple of numbers representing a set there is a number n3 between brackets (e.g., 7-6(4)), it means that the set ended at the tie-break and n3 represents the points scored during it by the loser of the set.<br><br>
When we find a couple between square brackets (e.g. \[10-7\]), it represents the result of the super tie-break, which is played in some tourneys, on the 6-6 of the last set (on the 12-12 in Wimbledon). In these cases, the score of the final set is omitted.<br><br>
We can also find some abbreviations, which indicate particular conditions:<br>
<ul>
<li>"ret", "ret.", "re", "ret+64": placed at the end of the score, to indicate the retirement of a player during the match.
<li>"w/o", "walkover": it's the retirement of a player before the match starts. 
<li>"def", "def.": it's a default, i.e. the disqualification of a player.
<li>"bye": It's the automatic advancement of a player to the next round of a tournament without facing an opponent.
</ul>

In [None]:
df_tennis['score'].value_counts()

In [None]:
#"ordinary score": no super tie-breaks; no walkover, retirements etc.
ordscr_regex = "^((\d-\d)(\(\d{1,2}\))? ){1,4}(\d{1,2}-\d{1,2})(\(\d{1,2}\))?$"
nonos_mask = df_tennis['score'].str.match(ordscr_regex) == False
df_tennis[nonos_mask]['score'].value_counts().head(10)

In [None]:
#matches with retirement score or walkover
retscr_regex = "(^((\d-\d)(\(\d{1,2}\))? ){0,4}(\d{1,2}-\d{1,2})(\(\d{1,2}\))? ret$)|(^w/o$)"
nonrs_mask = df_tennis['score'].str.match(retscr_regex) == False

#points with super tie-break
stbscr_regex = "^((\d-\d)(\(\d{1,2}\))? ){2,4}\[\d{1,2}-\d{1,2}\]( ret)?$"
nonss_mask = df_tennis['score'].str.match(stbscr_regex) == False

df_tennis[nonos_mask & nonss_mask & nonrs_mask]['score'].value_counts()

In [None]:
df_tennis['score'] = df_tennis['score'].str.replace('walkover', 'w/o')
df_tennis['score'] = df_tennis['score'].str.replace('def.', 'def')
df_tennis['score'] = df_tennis['score'].str.replace(r're$', 'ret', regex=True)
df_tennis['score'] = df_tennis['score'].str.replace(r'ret\+h64$', 'ret', regex=True)

In [None]:
#recomputing masks
nonss_mask = df_tennis['score'].str.match(stbscr_regex) == False
nonrs_mask = df_tennis['score'].str.match(retscr_regex) == False
df_tennis[nonos_mask & nonss_mask & nonrs_mask]['score'].value_counts()

In [None]:
#matches ended with a with default
defscr_regex = "(^def$)|(^((\d-\d)(\(\d{1,2}\))? ){0,4}(\d{1,2}-\d{1,2})(\(\d{1,2}\))? def$)|(^(((\d-\d)(\(\d{1,2}\))? ){2,4}\[\d{1,2}-\d{1,2}\] def)$)"
nondef_mask = df_tennis['score'].str.match(defscr_regex) == False

nonbye_mask = df_tennis['score'].str.match("bye") == False
df_tennis[nonos_mask & nonss_mask & nonrs_mask & nondef_mask & nonbye_mask]
df_tennis[nonos_mask & nonss_mask & nonrs_mask & nondef_mask & nonbye_mask]['score'].value_counts()

The values above are the erroneus ones. Therefore we substituded them with NaN.

In [None]:
score_errors_indexes = df_tennis.index[nonos_mask & nonss_mask & nonrs_mask & nondef_mask & nonbye_mask].tolist()
df_tennis.loc[score_errors_indexes, "score"] = np.nan

#### Computation of games won by the winner and by the loser

In [None]:
#tie breaks
df_tennis['score_norm'] = df_tennis['score'].str.replace(r'\(\d{1,2}\)', '', regex=True)
df_tennis[['score', 'score_norm']].head()

In [None]:
#tie breaks
df_tennis['score_norm'] = df_tennis['score'].str.replace(r'\(\d{1,2}\)', '', regex=True)
df_tennis[['score', 'score_norm']].head()

In [None]:
#super tie breaks
ret_def_mask = (df_tennis['score'].str.match('.*(ret|def).*') == True) & (df_tennis['tourney_name'] != "wimbledon")
wimbledon_mask = (df_tennis['tourney_name'] == "wimbledon") & (df_tennis['score'].str.match('.*(ret|def).*') == False)
wimbledon_rd_mask = (df_tennis['tourney_name'] == "wimbledon") & (df_tennis['score'].str.match('.*(ret|def).*') == True)
other_sit_mask = (df_tennis['tourney_name'] != "wimbledon") & (df_tennis['score'].str.match('.*(ret|def).*') == False)

ret_def_indexes = df_tennis.index[ret_def_mask].tolist()
wimbledon_indexes = df_tennis.index[wimbledon_mask].tolist()
wimbledon_rd_indexes = df_tennis.index[wimbledon_rd_mask].tolist()
other_sit_indexes = df_tennis.index[other_sit_mask].tolist()

In [None]:
df_tennis.loc[other_sit_indexes, 'score_norm'] = df_tennis['score_norm'].str.replace(r'\[\d{1,2}-\d{1,2}\]', '7-6', regex=True)
df_tennis.loc[wimbledon_indexes, 'score_norm'] = df_tennis['score_norm'].str.replace(r'\[\d{1,2}-\d{1,2}\]', '13-12', regex=True)
df_tennis.loc[ret_def_indexes, 'score_norm'] = df_tennis['score_norm'].str.replace(r'\[\d{1,2}-\d{1,2}\]', '6-6', regex=True)
df_tennis.loc[wimbledon_rd_indexes, 'score_norm'] = df_tennis['score_norm'].str.replace(r'\[\d{1,2}-\d{1,2}\]', '12-12', regex=True)

In [None]:
selection_mask = df_tennis['score'].str.match('.*\[\d{1,2}-\d{1,2}\].*') == True
df_tennis[selection_mask][['score', 'score_norm']].head()

In [None]:
df_tennis['score_norm'] = df_tennis['score_norm'].fillna('')

df_tennis['games_list'] = df_tennis['score_norm'].str.findall('\d{1,2}')

In [None]:
def w_games_won(games_list):
    return sum(map(int, games_list[0::2]))

def l_games_won(games_list):
    return sum(map(int, games_list[1::2]))

In [None]:
df_tennis['w_gmsWon'] = df_tennis['games_list'].apply(w_games_won)
df_tennis['l_gmsWon'] = df_tennis['games_list'].apply(l_games_won)
df_tennis[['score', 'score_norm', 'games_list', 'w_gmsWon', 'l_gmsWon']].head()

### w_ace, l_ace

In [None]:
px.box(df_tennis, y=["w_ace", "l_ace"]).show()

#### Analysis of bigger outliers - w_ace

In [None]:
df_tennis[["w_ace", "score"]].sort_values(by='w_ace', ascending=False).head(10)

Looking at the scores, the second highest value looks suspicious.

#### Analysis of bigger outliers - l_ace

In [None]:
df_tennis[["l_ace", "score"]].sort_values(by='l_ace', ascending=False).head(10)

Looking at the scores, there are not implausible values for this attribute bigger outliers.

### w_df, l_df

In [None]:
px.box(df_tennis, y=["w_df", "l_df"]).show()

#### Analysis of the biggest  outliers - w_df

In [None]:
df_tennis[["w_df", "score"]].sort_values(by='w_df', ascending=False).head(10)

Looking at the scores, the three highest values look suspicious.

#### Analysis of the biggest  outliers - l_df

In [None]:
df_tennis[["l_df", "score"]].sort_values(by='l_df', ascending=False).head(10)

Looking at the scores, the two highest values look suspicious.

### w_svpt, l_svpt

In [None]:
px.box(df_tennis, y=["w_svpt", "l_svpt"]).show()

#### Analysis of the biggest  outliers - w_svpt

In [None]:
df_tennis[["w_svpt", "score"]].sort_values(by='w_svpt', ascending=False).head(10)

Looking at the scores, the five highest values are totally implausible.

#### Analysis of the biggest  outliers - l_svpt

In [None]:
df_tennis[["l_svpt", "score"]].sort_values(by='l_svpt', ascending=False).head(10)

Looking at the scores, the five highest values are totally implausible.

### w_1stIn, w_1stWon, w_2ndWon and loser's ones

In [None]:
px.box(df_tennis, y=["w_1stIn", "w_1stWon", "w_2ndWon"]).show()

In [None]:
px.box(df_tennis, y=["l_1stIn", "l_1stWon", "l_2ndWon"]).show()

#### Analysis of the biggest  outliers - w_1stIn

In [None]:
df_tennis[["w_1stIn", "score"]].sort_values(by='w_1stIn', ascending=False).head(10)

Looking at the scores, the five highest values are totally implausible.

#### Analysis of the biggest outliers - l_1stIn

In [None]:
df_tennis[["l_1stIn", "score"]].sort_values(by='l_1stIn', ascending=False).head(10)

Looking at the scores, the five highest values are totally implausible.

#### Analysis of the biggest outliers - w_1stWon

In [None]:
df_tennis[["w_1stWon", "score"]].sort_values(by='w_1stWon', ascending=False).head(10)

Looking at the scores, the five highest values are totally implausible.

#### Analysis of the biggest outliers - l_1stWon

In [None]:
df_tennis[["l_1stWon", "score"]].sort_values(by='l_1stWon', ascending=False).head(10)

Looking at the scores, the five highest values are totally implausible.

#### Analysis of the biggest outliers - w_2ndWon

In [None]:
df_tennis[["w_2ndWon" ,"score"]].sort_values(by='w_2ndWon', ascending=False).head(10)

Looking at the scores, the four highest values are totally implausible.

#### Analysis of the biggest outliers - l_2ndWon

In [None]:
df_tennis[["l_2ndWon", "score"]].sort_values(by='l_2ndWon', ascending=False).head(10)

Looking at the scores, the four highest values are totally implausible.

### w_bpFaced, w_bpSaved and loser's ones

In [None]:
px.box(df_tennis, y=["w_bpFaced", "w_bpSaved"]).show()

In [None]:
px.box(df_tennis, y=["l_bpFaced", "l_bpSaved"]).show()

#### Analysis of the biggest outliers - w_bpFaced

In [None]:
df_tennis[["w_bpFaced", "score"]].sort_values(by='w_bpFaced', ascending=False).head(10)

Looking at the scores, the three highest values are totally implausible.

#### Analysis of the biggest outliers - l_bpFaced

In [None]:
df_tennis[["l_bpFaced", "score"]].sort_values(by='l_bpFaced', ascending=False).head(10)

Looking at the scores, the five highest values are totally implausible.

#### Analysis of the biggest outliers - w_bpSaved

In [None]:
df_tennis[["w_bpSaved", "score"]].sort_values(by='w_bpSaved', ascending=False).head(10)

Looking at the scores, the two highest values are totally implausible.

#### Analysis of the biggest outliers - l_bpSaved

In [None]:
df_tennis[["l_bpSaved", "score"]].sort_values(by='l_bpSaved', ascending=False).head(10)

Looking at the scores, the three highest values are totally implausible.

### w_SvGms, l_SvGms

In [None]:
px.box(df_tennis, y=["w_SvGms", "l_SvGms"]).show()

#### Analysis of the biggest outliers - w_SvGms

In [None]:
df_tennis.columns

In [None]:
df_tennis[["w_SvGms", "w_gmsWon", "l_gmsWon"]].sort_values(by='w_SvGms', ascending=False).head(20)

Looking at the number of games played, the outliers have plausible values.

#### Analysis of the smallest outliers - w_SvGms

In [None]:
df_tennis[["w_SvGms", "w_gmsWon", "l_gmsWon"]].sort_values(by='w_SvGms', ascending=True).head(30)

Most of the ouliers with a value equal to 0 are erroneus (the number of games played is much greater than 0). But we don't apply any correction because it's a  not significant attribute, that we won't use later.

#### Analysis of the biggest outliers - l_SvGms

In [None]:
df_tennis[["l_SvGms",  "w_gmsWon", "l_gmsWon"]].sort_values(by='l_SvGms', ascending=False).head(30)

Looking at the number of games played, the outliers have plausible values.

#### Analysis of the smallest outliers - l_SvGms

In [None]:
df_tennis[["l_SvGms", "w_gmsWon", "l_gmsWon"]].sort_values(by='l_SvGms', ascending=True).head(30)

Most of the ouliers with a value equal to 0 are erroneus (the number of games played is much greater than 0). But we don't apply any correction because it's a not significant attribute, that we won't use later.

### Preprocessing based on in-match statistics outliers
For the attributes in the table below, during the outliers analysis we noticed that there were 5 recurrent instances (such as these that have the five highest values for l_svpt) in which the values were usually extremely high and sometimes totally implausible considering the score of the match. Therefore we dropped the corresponding rows.

In [None]:
outliers_indexes = df_tennis[["l_svpt"]].sort_values(by='l_svpt', ascending=False).head(5).index.tolist()
df_outliers = df_tennis.loc[outliers_indexes]
df_outliers[['w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon', 'w_bpSaved', 'w_bpFaced', 'l_df', 'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_bpSaved', 'l_bpFaced']]

In [None]:
df_tennis = df_tennis.drop(outliers_indexes)

### winner_rank, loser_rank

In [None]:
px.box(df_tennis, y=["winner_rank", "loser_rank"]).show()

The values seem to be plausible, in relation to the total number of players

In [None]:
len(df_tennis["winner_name"].unique())

### winner_rank_points, loser_rank_points

In [None]:
px.box(df_tennis, y=["winner_rank_points", "loser_rank_points"]).show()

In [None]:
selection_mask = df_tennis['winner_rank_points'] > df_tennis['winner_rank_points'].quantile(0.999)
df_outliers = df_tennis[selection_mask]
df_outliers['winner_name'].unique()

In [None]:
selection_mask = df_tennis['loser_rank_points'] > df_tennis['loser_rank_points'].quantile(0.999)
df_outliers = df_tennis[selection_mask]
df_outliers['loser_name'].unique()

The players with the biggest ranking points are as expected among the strongest ones.

### minutes

In [None]:
px.box(df_tennis, y="minutes").show()

In [None]:
df_tennis[["minutes", "score"]].sort_values(by='minutes', ascending=False).head(20)

Looking at the score, we noticed that are the outliers with a value greater than 396 are implausible, therefore we set to NaN the corresponding values.

In [None]:
outliers_mask = (df_tennis['minutes'] > 396.0)
outliers_indexes = df_tennis.index[outliers_mask].tolist()
df_tennis.loc[outliers_indexes, 'minutes'] = np.nan

In [None]:
px.histogram(df_tennis, x="minutes").show()

### tourney_revenue

In [None]:
px.histogram(df_tennis, x="tourney_revenue").show()

In [None]:
px.box(df_tennis, y="tourney_revenue").show()

#### Analysis of the biggest and the smallest outliers

In [None]:
selection_mask = df_tennis['tourney_revenue'] > df_tennis['tourney_revenue'].quantile(0.99)
df_outliers = df_tennis[selection_mask]
df_outliers['tourney_name'].unique()

The tourneys with the biggest revenues are as expected two of the most important ones.

In [None]:
selection_mask = df_tennis['tourney_revenue'] < df_tennis['tourney_revenue'].quantile(0.01)
df_outliers = df_tennis[selection_mask]
df_outliers['tourney_name'].unique()

### tourney_spectators

In [None]:
px.histogram(df_tennis, x="tourney_spectators").show()

In [None]:
px.box(df_tennis, y="tourney_spectators").show()

#### Analysis of the biggest and the smallest outliers

In [None]:
selection_mask = df_tennis['tourney_spectators'] > df_tennis['tourney_spectators'].quantile(0.99)
df_outliers = df_tennis[selection_mask]
df_outliers['tourney_name'].unique()

The tourneys with the biggest revenues are as expected among the most important ones.

In [None]:
selection_mask = df_tennis['tourney_revenue'] < df_tennis['tourney_revenue'].quantile(0.01)
df_outliers = df_tennis[selection_mask]
df_outliers['tourney_name'].unique()

### round

An acronym which identifies the stage of the match inside the tournament.

https://en.wikipedia.org/wiki/Tennis_performance_timeline_comparison_(men)

The values can be:



*   'F': which identifies the final match
*   'SF': which identifies the semifinal match
*   'QF': which identifies the quarter finals match
*   'R16': which identifies the match at the stage of the last 16 partecipants
*   'R32': which identifies the match at the stage of the last 32 partecipants
*   'R64': which identifies the match at the stage of the last 64 partecipants
*   'R128': which identifies the match at the stage of the last 128 partecipants
*   'RR': which identifies the match in 'Round Robbin' case, that is when each partecipant meets all other contestants in turn
*   'BR': it isn't listed on wiki, there are just 5 results for 'BR' on the dataset, checking results of those games online it seems to refer to the bronze medal. So te game for third and fourth position.

Number of missing values = 42

In [None]:
df_tennis['round'].unique()

### winner_id

It identifies the players inside a given tournament.

Number of missing values = 82.

In [None]:
(df_tennis.groupby(['winner_name', 'tourney_id']).winner_id.agg('nunique') > 1).sum()

### winner_entry

It's an acronym and it can have one of the following values:

* '**PR**' = (**Protected Ranking**) Players injured for a minimum of six months can ask for a protected ranking, which is based on their average ranking during the first three months of their injury. The player can use their protected ranking to enter tournaments' main draws or qualifying competitions when coming back from injury.[9] It is also used in the WTA for players returning from pregnancy leave.
* '**Q**' = (**Qualifier**) Player who reaches the tournament's main draw by competing in a pre-tournament qualifying competition instead of automatically qualified by virtue of their world ranking, being a wild card, or other exemption
* '**WC**' = (**Wild Card**) Player allowed to play in a tournament, even if their rank is not adequate or they do not register in time. Typically a few places in the draw are reserved for wild cards, which may be for local players who do not gain direct acceptance or for players who are just outside the ranking required to gain direct acceptance. Wild cards may also be given to players whose ranking has dropped due to a long-term injury.
* '**LL**' = (**Lucky Loser**) Player or team that gains acceptance into the main draw of a tournament when a main draw player or team withdraws
* '**SE**' = (**Special Exempt**) Players who are unable to appear in a tournament's qualifying draw because they are still competing in the final rounds of a previous tournament can be awarded a spot in the main draw by special exempt.
* '**Alt**' = (I think **Alternate**) Player or team that gains acceptance into the main draw of a tournament when a main draw player or team withdraws. Such a player may be a lucky loser.
* '**ALT**' = I think the same as before
* '**SR**' = (**Special Ranking**) Same as Protected Ranking (I think)
* '**JE**' = (**Junior exempt**) High-ranking junior players can be awarded a spot in the draw of a tournament.
* '**A**' = ?
* '**ITF**' = High-ranking ITF players can be awarded a spot into ATP Challenger and ITF women's tournaments main draws based on their ITF ranking.
* '**P**' = ?
* '**I**' = ?
* '**IR**' = ?
* '**JR**' = (**Main Draw Junior Reserved**) These are players who have a Top 100 ITF Junior Ranking (also known as a Combined Ranking) and who were unable to be accepted into the Main Draw as a Direct Acceptance. Players that withdraw before the Freeze Deadline will be replaced by players on the Junior Reserved Alternates list.

Number of missing values = 82.

In [None]:
df_tennis.winner_entry.unique()

In [None]:
# Number of missing values
df_tennis.winner_id.isna().sum()

In [None]:
df_tennis.winner_entry.hist(bins = len(df_tennis.winner_entry.unique()))

### winner_hand

It represents:

*   R = right-handed
*   L = left-handed
*   U = unknown

For ambidextrous players, this is their serving hand.

Number of missing values = 60.

In [None]:
df_tennis.winner_hand.hist()

In [None]:
# Number of missing values
df_tennis.winner_hand.isna().sum()

### winner_ht

It represents height in cm of the winner.

Number of missing values = 136581

In [None]:
df_tennis.winner_ht.hist()

In [None]:
#number of missing values
df_tennis.winner_ht.isna().sum()

In [None]:
df_tennis.winner_ht.plot.box()

**Spotting outliers**

Several outliers with winner_ht = 2.0 and also other outliers with height < 146.

In [None]:
df_tennis[df_tennis.winner_ht < df_tennis.winner_ht.quantile(0.0023) ].sort_values('winner_ht')

In [None]:
# adjusting wrong values
df_tennis.loc[df_tennis.winner_name == 'kamilla rakhimova', 'winner_ht'] = 174
df_tennis.loc[df_tennis.winner_name == 'ilija vucic', 'winner_ht'] = 188

In [None]:
df_tennis[df_tennis.winner_ht > df_tennis.winner_ht.quantile(0.995) ].sort_values('winner_ht')

### winner_ioc

It's a 3 character code representing the winner's country.

Number of missing values = 44

In [None]:
df_tennis.winner_ioc.unique()

In [None]:
# Number of missing values
df_tennis.winner_ioc.isna().sum()

In [None]:
filter = 600

winner_ioc_count = df_tennis.winner_ioc.value_counts()
winner_ioc_small_count = winner_ioc_count[winner_ioc_count.values < filter]
winner_ioc_count = winner_ioc_count.drop(winner_ioc_count[winner_ioc_count.values<filter ].index)

winner_ioc_count = winner_ioc_count.append(pd.Series({'others': winner_ioc_small_count.sum()}))

# winner_ioc_count.plot(kind='pie', figsize = (20,20)) #hist(color='orange', ec='black', bins=124, figsize=(120, 50), rwidth = 0.5, xlabelsize=25)
px.pie(winner_ioc_count).show()

### winner_age


The age of the player, in years, depending on the date of the
tournament.

There are outliers.

Number of missing values = 2868.

In [None]:
df_tennis.winner_age.hist()

In [None]:
df_tennis.winner_age.plot.box()

In [None]:
# Number of missing values
df_tennis.winner_age.isna().sum()

**Spotting outliers**

2 outliers which show players whith 95 years old.

In [None]:
df_tennis[df_tennis.winner_age < df_tennis.winner_age.quantile(0.00001)].sort_values('winner_age')

In [None]:
df_tennis[df_tennis.winner_age > df_tennis.winner_age.quantile(0.99997)].sort_values('winner_age')

In [None]:
# adjusting wrong values
df_tennis.loc[0,'winner_age'] = df_tennis.loc[df_tennis.winner_name == 'kei nishikori'].winner_age.mean()
df_tennis.loc[322,'winner_age'] = df_tennis.loc[df_tennis.winner_name == 'daniil medvedev'].winner_age.mean()

### loser_id

Number of missing values = 41

In [None]:
# Number of missing values
df_tennis.loser_id.isna().sum()

### loser_entry

Number of missing values = 141785

In [None]:
df_tennis.loser_entry.unique()

In [None]:
# Number of missing values
df_tennis.loser_entry.isna().sum()

### loser_hand

It represents:
* R = right-handed
* L = left-handed
* U = unknown

For ambidextrous players, this is their serving hand.

Number of missing values = 111.

In [None]:
df_tennis.loser_hand.hist()

In [None]:
# Number of missing values
df_tennis.loser_hand.isna().sum()

### loser_ht

It represents height in cm of the loser.

There are outliers.

Number of missing values = 147555

In [None]:
df_tennis.loser_ht.hist()

In [None]:
#number of missing values
df_tennis.loser_ht.isna().sum()

In [None]:
df_tennis.loser_ht.plot.box()

**Spotting outliers**

Several outliers with winner_ht = 2.0.

In [None]:
df_tennis[df_tennis.loser_ht < df_tennis.loser_ht.quantile(0.0012)].sort_values('loser_ht')

In [None]:
# adjusting wrong values
df_tennis.loc[df_tennis.loser_name == 'kamilla rakhimova', 'loser_ht'] = 174
df_tennis.loc[df_tennis.loser_name == 'ilija vucic', 'loser_ht'] = 188

In [None]:
df_tennis[df_tennis.loser_ht > df_tennis.loser_ht.quantile(0.99)].sort_values('loser_ht')

### loser_ioc

It's a 3 character code representing the loser's country.

Number of missing values = 41

In [None]:
df_tennis.loser_ioc.unique()

In [None]:
# Number of missing values
df_tennis.loser_ioc.isna().sum()

### loser_age

The age of the player, in years, depending on the date of the
tournament.

There are no outliers.

Number of missing values = 6551

In [None]:
# df_tennis.loser_age.hist()

In [None]:
df_tennis.loser_age.plot.box()

In [None]:
# Number of missing values
df_tennis.loser_age.isna().sum()

**Spotting outliers**

No outliers, there are data regarding Gail Falkenberg is a real player who is 74 years old.

In [None]:
df_tennis[df_tennis.loser_age < df_tennis.loser_age.quantile(0.00002)].sort_values('winner_age')

In [None]:
df_tennis[df_tennis.loser_age > df_tennis.loser_age.quantile(0.99997)].sort_values('loser_age')

## Correlation analysis

In [None]:
def get_redundant_pairs(df):
    '''Get diagonal and lower triangular pairs of correlation matrix'''
    pairs_to_drop = set()
    cols = df.columns
    for i in range(0, df.shape[1]):
        for j in range(0, i+1):
            pairs_to_drop.add((cols[i], cols[j]))
    return pairs_to_drop

corr_threshold = 0.7
correlation = df_tennis.corr(method="pearson") #.abs()
correlation = correlation.where(np.tril(np.ones(correlation.shape)).astype(bool)) # remove upper triangle

px.imshow(correlation[(correlation>=corr_threshold) | (correlation<=-corr_threshold)], labels=dict(color="Correlation"), color_continuous_scale=px.colors.diverging.RdBu, zmin=-1, zmax=1, width=1000, height=1000).show()

correlation = correlation.unstack().drop(labels=get_redundant_pairs(correlation)).sort_values(ascending=False).drop_duplicates()
print(f"These are {len(correlation[(correlation>=corr_threshold) | (correlation<=-corr_threshold)])} pairs whose correlation is bigger/small than ±{corr_threshold*100}%:")
correlation[(correlation>=corr_threshold) | (correlation<=-corr_threshold)]

Save dataset

In [None]:
df_tennis.to_csv("../datasets/tennis_matches_cleaned.csv")