# Players dataset preparation

In [None]:
import pandas as pd
import numpy as np

# Visualization
import plotly.express as px
import plotly.io as pio
# Route DataFrame.plot()/Series.plot() through plotly
pd.options.plotting.backend = 'plotly'
# Default plotly template applied to every figure created below
pio.templates.default = "seaborn"

## Dataset preparation

In [None]:
# Load the match log and the two player rosters.
# The first CSV column of the match log is its index; tourney_date is parsed while reading.
df_tennis = pd.read_csv(
    "../datasets/tennis_matches_cleaned.csv",
    parse_dates=["tourney_date"],
    index_col=0,
)

# Rosters: fully duplicated rows are removed right after loading.
df_male = pd.read_csv("../datasets/male_players.csv").drop_duplicates()
df_female = pd.read_csv("../datasets/female_players.csv").drop_duplicates()

def preprocess_strings(df):
    """Return a copy of ``df`` in which every string cell is normalised.

    Each ``str`` value is lower-cased and stripped of leading/trailing
    whitespace; afterwards every run of two or more whitespace characters is
    collapsed into a single space. Non-string values are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose string cells must be normalised.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same shape and labels as ``df``.
    """
    def _normalize(value):
        # isinstance also covers str subclasses (e.g. numpy.str_)
        return value.lower().strip() if isinstance(value, str) else value

    # Column-wise Series.map replaces DataFrame.applymap, which is deprecated
    # since pandas 2.1; this form works on old and new pandas alike.
    normalized = df.apply(lambda column: column.map(_normalize))
    return normalized.replace(r"\s{2,}", " ", regex=True)

# Convert tourney_date to datetime using the explicit YYYYMMDD format
df_tennis["tourney_date"] = pd.to_datetime(df_tennis["tourney_date"], format='%Y%m%d')

# Normalize strings in both rosters
df_male, df_female = (preprocess_strings(roster) for roster in (df_male, df_female))

In [None]:
# Build a single (name, gender) roster from the male and female files.
# "name" is the concatenation "<name> <surname>".
# Series.to_frame + assign replaces pd.concat(..., keys=["name", "gender"]) on a
# single Series (keys longer than the objects is deprecated) and pd.concat
# replaces DataFrame.append (removed in pandas 2.0).
male_players = (df_male["name"] + " " + df_male["surname"]).to_frame("name").assign(gender="m")
female_players = (df_female["name"] + " " + df_female["surname"]).to_frame("name").assign(gender="f")
df_players = pd.concat([male_players, female_players])

# A full name present in both rosters is kept only once (its first occurrence);
# the roster is then sorted alphabetically and given a fresh 0..n-1 index.
df_players = (
    df_players
    .drop_duplicates(subset=["name"])
    .sort_values(by=["name"])
    .reset_index(drop=True)
)

# Attach a gender to both sides of every match.
# Left joins: a player missing from the roster gets NaN as gender.
winner_lookup = df_players.rename(columns={'name': 'winner_name', 'gender': 'winner_gender'})
loser_lookup = df_players.rename(columns={'name': 'loser_name', 'gender': 'loser_gender'})
df = (
    df_tennis
    .merge(winner_lookup, on="winner_name", how="left")
    .merge(loser_lookup, on="loser_name", how="left")
)

# Players without a gender get the most common gender among their opponents.

# Unknown-gender winners: count the genders of the players they beat
unknown_winners = df[df.winner_gender.isnull()]
winners = (
    unknown_winners.groupby(["winner_name", "loser_gender"]).loser_gender.count()
    .reset_index(name="occurrences")
    .rename(columns={"winner_name": "name", "loser_gender": "gender"})
)

# Unknown-gender losers: count the genders of the players they lost to
unknown_losers = df[df.loser_gender.isnull()]
losers = (
    unknown_losers.groupby(["loser_name", "winner_gender"]).winner_gender.count()
    .reset_index(name="occurrences")
    .rename(columns={"loser_name": "name", "winner_gender": "gender"})
)

# Total occurrences per (player, opponent gender), then keep the most frequent gender per player
estimate = (
    pd.concat([winners, losers])
    .groupby(["name", "gender"]).occurrences.sum()
    .reset_index(name="occurrences")
)
estimate = (
    estimate
    .sort_values(by=["name", "occurrences"], ascending=[True, False])
    .drop_duplicates("name", keep="first")
    .drop(columns=["occurrences"])
)

# Create the players dataset: every winner and loser with the gender found in the
# rosters, plus the estimated gender for the players missing from them.
# rename + pd.concat replace set_axis(..., inplace=False) and DataFrame.append,
# both removed in pandas 2.0.
df_players = pd.concat(
    [
        df[['winner_name', 'winner_gender']].rename(columns={'winner_name': 'name', 'winner_gender': 'gender'}),
        df[['loser_name', 'loser_gender']].rename(columns={'loser_name': 'name', 'loser_gender': 'gender'}),
        estimate,
    ]
)
# Rows still lacking a name or a gender are discarded; one row per (name, gender) is kept
df_players = df_players.dropna().drop_duplicates()
df_players = df_players.sort_values(by=["name"]).reset_index(drop=True)
df_players.head()

## Feature engineering

### Tourneys played

In [None]:
# Stack winner and loser sides: one row per (player, match) with its tourney id
appearances = pd.concat(
    [
        df_tennis[['winner_name', 'tourney_id']].rename(columns={'winner_name': 'name'}),
        df_tennis[['loser_name', 'tourney_id']].rename(columns={'loser_name': 'name'}),
    ]
)

# Number of distinct tourneys each player took part in
df = appearances.groupby('name')['tourney_id'].nunique().reset_index(name="total_tourneys_played")

# Add the feature to the player profiles
df_players = df_players.merge(df, on="name")

### Matches played

In [None]:
# One row per (player, match), winners first and losers after
match_rows = pd.concat(
    [
        df_tennis[['winner_name', 'tourney_id']].rename(columns={'winner_name': 'name'}),
        df_tennis[['loser_name', 'tourney_id']].rename(columns={'loser_name': 'name'}),
    ]
)

# Number of matches per player (rows with a non-null tourney_id)
df = match_rows.groupby('name')['tourney_id'].count().reset_index(name="total_matches_played")

# Add the feature to the player profiles
df_players = df_players.merge(df, on="name")

In [None]:
# Distribution of the number of matches played per player
fig = px.histogram(df_players, x="total_matches_played")
fig.show()

### Matches won

In [None]:
# One row per match, keyed by the winner
wins = df_tennis.loc[:, ['winner_name', 'tourney_id']].rename(columns={'winner_name': 'name'})

# Number of matches won per player
df = wins.groupby('name').tourney_id.count().reset_index(name="total_matches_won")

# Left join: players who never won a match are absent from `df` and get NaN ...
df_players = df_players.merge(df, on="name", how="left")
# ... which is turned into 0. The column is assigned back explicitly:
# `df_players.total_matches_won.fillna(0, inplace=True)` mutates an intermediate
# Series (chained assignment) and stops updating the frame under copy-on-write.
df_players["total_matches_won"] = df_players["total_matches_won"].fillna(0)

### Matches won ratio

In [None]:
# Fraction of played matches that were won
df_players["matches_won_ratio"] = df_players["total_matches_won"].div(df_players["total_matches_played"])

### Mean, max, min, entropy of performance_index

The performance index of a player in a tourney is the number of matches the player won in that tourney divided by the number of matches that were supposed to be played in order to win the tourney.

In [None]:
# One row per (player, match), tagged with the tourney and its draw size
df_w = df_tennis.loc[:, ['winner_name', 'tourney_id', "draw_size"]].rename(columns={'winner_name': 'name'})
df_l = df_tennis.loc[:, ['loser_name', 'tourney_id', "draw_size"]].rename(columns={'loser_name': 'name'})

# Matches won by each player in each tourney (players with no win in a tourney are filled with 0 below)
matches_won_per_tourney = df_w.groupby(['name', 'tourney_id']).tourney_id.count().reset_index(name="total_matches_won_per_tourney")
df = pd.concat([df_w, df_l])
df = df.merge(matches_won_per_tourney, on=['name', 'tourney_id'], how="left")
# No NaN is left after fillna(0), so a plain int64 is enough. The nullable 'Int32'
# dtype would make the derived columns masked Float64, where the NaN produced by
# 0 * log(0) may not be skipped by sum() and would turn the whole entropy into NaN.
df["total_matches_won_per_tourney"] = df["total_matches_won_per_tourney"].fillna(0).astype("int64")

# NOTE(review): a single-elimination draw of size 2**k needs k wins; the "+ 1" makes
# the denominator k + 1 — confirm this is intended.
df["matches_to_play_per_tourney"] = np.log2(df["draw_size"]) + 1
# If the ratio is bigger than 1 (more matches won than the matches supposed to be played)
# it is set to 1: that is the case of extra qualification rounds.
# clip() is assigned back instead of the chained `df[col].clip(..., inplace=True)`.
df["performance_index"] = (
    (df["total_matches_won_per_tourney"] / df["matches_to_play_per_tourney"])
    .astype("float64")
    .clip(lower=0, upper=1)
)

# Stats about performance_index
# NOTE(review): `df` has one row per match, not per tourney, so tourneys in which the
# player played more matches weigh more in the mean and in the entropy — confirm intended.
df_stats = df.groupby('name')['performance_index'].agg(mean_performance_index='mean', max_performance_index='max', min_performance_index='min')

# Entropy of performance_index: -sum(p * log(p)) with p = value / per-player total
g_sum = df.groupby('name')['performance_index'].transform('sum')
values = df['performance_index'] / g_sum
with np.errstate(divide="ignore", invalid="ignore"):
    entropy_terms = -(values * np.log(values))
# p == 0 gives 0 * log(0) = NaN and a zero total gives 0 / 0 = NaN: both contribute 0
df['performance_index_entropy'] = entropy_terms.fillna(0.0)
df_entropy = df.groupby('name')['performance_index_entropy'].sum().reset_index()

# Merge stats and entropy with players
df_players = df_players.merge(df_stats, on="name")
df_players = df_players.merge(df_entropy, on="name")

### Height

This feature is probably unusable: the height is available for only 541 players.

In [None]:
df_w = df_tennis.loc[:, ['winner_name', 'winner_ht', 'tourney_date']].rename(columns={'winner_name': 'name', 'winner_ht': 'ht'})
df_l = df_tennis.loc[:, ['loser_name', 'loser_ht', 'tourney_date']].rename(columns={'loser_name': 'name', 'loser_ht': 'ht'})

# Players and their height: most recent non-null value.
# `ascending` must be the boolean False: the string 'False' is truthy, so it either
# sorted oldest-first or raised, depending on the pandas version.
df = (
    pd.concat([df_w, df_l])
    .sort_values('tourney_date', ascending=False)
    .groupby('name').ht.first()  # first non-null value in newest-first order
    .reset_index()
)

# Add the feature to the player profiles
df_players = df_players.merge(df, on="name")

### Age

In [None]:
df_w = df_tennis.loc[:, ['winner_name', 'winner_age', 'tourney_date']].rename(columns={'winner_name': 'name', 'winner_age': 'age'})
df_l = df_tennis.loc[:, ['loser_name', 'loser_age', 'tourney_date']].rename(columns={'loser_name': 'name', 'loser_age': 'age'})

# Players and their age at their most recent match with a known age.
# `ascending` must be the boolean False: the string 'False' is truthy, so it either
# sorted oldest-first (earliest age) or raised, depending on the pandas version.
df = (
    pd.concat([df_w, df_l])
    .sort_values('tourney_date', ascending=False)
    .groupby('name').age.first()  # first non-null value in newest-first order
    .reset_index()
)

# Add the feature to the player profiles
df_players = df_players.merge(df, on="name")


In [None]:
# Distribution of the players' age
fig = px.histogram(df_players, x="age")
fig.show()

### Hand

In [None]:
df_w = df_tennis.loc[:, ['winner_name', 'winner_hand']].rename(columns={'winner_name': 'name', 'winner_hand': 'hand'})
df_l = df_tennis.loc[:, ['loser_name', 'loser_hand']].rename(columns={'loser_name': 'name', 'loser_hand': 'hand'})


def _most_common_hand(hands):
    """Return the most frequent non-null value of ``hands`` (NaN if there is none).

    On a tie the first value returned by ``Series.mode`` is taken, so the cell
    always holds one scalar instead of an array of tied values.
    """
    modes = hands.mode()
    return modes.iat[0] if not modes.empty else np.nan


# Players and their favorite hand.
# Passing pd.Series.mode straight to agg puts arrays (ties) or empty results
# (no known hand) in the column, which cannot be used as a feature or stored in a CSV.
df = pd.concat([df_w, df_l]).groupby('name').hand.agg(_most_common_hand).reset_index()

# Add the feature to the player profiles
df_players = df_players.merge(df, on="name")

### Mean, max, entropy of minutes

In [None]:
# One row per (player, match) with the match duration, winners first
df_w = df_tennis[['winner_name', 'minutes']].rename(columns={'winner_name': 'name'})
df_l = df_tennis[['loser_name', 'minutes']].rename(columns={'loser_name': 'name'})
durations = pd.concat([df_w, df_l])

# Mean and max minutes per player (the result is indexed by name)
minutes_stats = durations.groupby('name')['minutes'].agg(mean_minutes='mean', max_minutes='max')
df_players = df_players.merge(minutes_stats, on="name")

# Entropy: -sum(p * log(p)) with p = minutes of the match / total minutes of the player
df = durations
share = df['minutes'] / df.groupby('name')['minutes'].transform('sum')
df['minutes_entropy'] = -(share * np.log(share))
df_entropy = df.groupby('name')['minutes_entropy'].sum().reset_index()

df_players = df_players.merge(df_entropy, on="name")

### Ranking

In [None]:
df_w = df_tennis[['winner_name', 'winner_rank_points', 'tourney_date']].rename(columns={'winner_name': 'name', 'winner_rank_points': 'rank_points'})
df_l = df_tennis[['loser_name', 'loser_rank_points', 'tourney_date']].rename(columns={'loser_name': 'name', 'loser_rank_points': 'rank_points'})

# Newest matches first, so that 'first' picks the most recent non-null rank points
ranking_rows = pd.concat([df_l, df_w]).sort_values('tourney_date', ascending = False)

# Per-player measures on rank points: mean, max, last and population variance
df = (
    ranking_rows.groupby('name').rank_points
    .agg(
        mean_rank_points='mean',
        max_rank_points='max',
        last_rank_points='first',
        variance_rank_points=lambda x: np.var(x,ddof=0),
    )
    .reset_index()
)

# Add the features to the player profiles
df_players = df_players.merge(df, on="name")

last_points = df_players['last_rank_points']
# Ratio between the last rank points and the mean rank points
df_players['lrpOnAvgrp'] = last_points/df_players['mean_rank_points']
# Ratio between the last rank points and the max rank points
df_players['lrpOnMxrp'] = last_points/df_players['max_rank_points']

In [None]:
# Distribution of the mean rank points
fig = px.histogram(df_players, x="mean_rank_points")
fig.show()

In [None]:
# Distribution of the ratio last rank points / mean rank points
fig = px.histogram(df_players, x="lrpOnAvgrp")
fig.show()

### Spectator

In [None]:
# One row per (player, match) with the spectators of the tourney, losers first
spectator_rows = pd.concat(
    [
        df_tennis[['loser_name', 'tourney_spectators']].rename(columns={'loser_name': 'name'}),
        df_tennis[['winner_name', 'tourney_spectators']].rename(columns={'winner_name': 'name'}),
    ]
)

# Per-player measures on spectators (mean and max)
df = (
    spectator_rows.groupby('name')['tourney_spectators']
    .agg(mean_tourney_spectators='mean', max_tourney_spectators='max')
    .reset_index()
)

# Add the features to the player profiles
df_players = df_players.merge(df, on="name")

### Revenue

In [None]:
# One row per (player, match) with the revenue of the tourney, losers first
revenue_rows = pd.concat(
    [
        df_tennis[['loser_name', 'tourney_revenue']].rename(columns={'loser_name': 'name'}),
        df_tennis[['winner_name', 'tourney_revenue']].rename(columns={'winner_name': 'name'}),
    ]
)

# Per-player measures on revenue (mean and max)
df = (
    revenue_rows.groupby('name')['tourney_revenue']
    .agg(mean_tourney_revenue='mean', max_tourney_revenue='max')
    .reset_index()
)

# Add the features to the player profiles
df_players = df_players.merge(df, on="name")

### Player in-match features

In [None]:
# These features were also used to look for possible outliers, but nothing particularly significant was found.

sides = ("w", "l")  # column prefixes of the winner and loser statistics

# Statistics divided by the same side's `svpt` column
for stat in ("ace", "df", "1stIn", "1stWon", "2ndWon"):
    for side in sides:
        df_tennis[f"rel_{side}_{stat}"] = df_tennis[f"{side}_{stat}"]/df_tennis[f"{side}_svpt"]

# 1stWon over 1stIn
for side in sides:
    df_tennis[f"{side}_1WonOn1In"] = df_tennis[f"{side}_1stWon"]/df_tennis[f"{side}_1stIn"]

# 1stWon over (2ndWon + 1stWon)
for side in sides:
    df_tennis[f"{side}_1WonOnTotWon"] = df_tennis[f"{side}_1stWon"]/(df_tennis[f"{side}_2ndWon"] + df_tennis[f"{side}_1stWon"])

# Each side's (1stWon + 2ndWon) over the same sum taken on both sides
total_won = df_tennis['w_1stWon'] + df_tennis['w_2ndWon'] + df_tennis['l_1stWon'] + df_tennis['l_2ndWon']
for side in sides:
    df_tennis[f"rel_{side}_ptsWon"] = (df_tennis[f"{side}_1stWon"] + df_tennis[f"{side}_2ndWon"])/total_won

# bpFaced over the svpt of both sides
total_svpt = df_tennis['w_svpt'] + df_tennis['l_svpt']
for side in sides:
    df_tennis[f"rel_{side}_bpFaced"] = df_tennis[f"{side}_bpFaced"]/total_svpt

# bpSaved over bpFaced
for side in sides:
    df_tennis[f"rel_{side}_bpSaved"] = df_tennis[f"{side}_bpSaved"]/df_tennis[f"{side}_bpFaced"]

# gmsWon over the gmsWon of both sides
total_games = df_tennis['w_gmsWon'] + df_tennis['l_gmsWon']
for side in sides:
    df_tennis[f"rel_{side}_gmsWon"] = df_tennis[f"{side}_gmsWon"]/total_games

In [None]:
# Winner-side and loser-side columns, listed in the same order as the common names below
winner_cols = ['winner_name', 'rel_w_ace', 'rel_w_df', 'rel_w_1stIn', 'rel_w_1stWon', 'rel_w_2ndWon', 'w_1WonOn1In', 'w_1WonOnTotWon', 'rel_w_ptsWon', 'rel_w_bpFaced', 'rel_w_bpSaved', 'rel_w_gmsWon']
loser_cols = ['loser_name', 'rel_l_ace', 'rel_l_df', 'rel_l_1stIn', 'rel_l_1stWon', 'rel_l_2ndWon', 'l_1WonOn1In', 'l_1WonOnTotWon', 'rel_l_ptsWon', 'rel_l_bpFaced', 'rel_l_bpSaved', 'rel_l_gmsWon']
ren_attr_list = ['name', 'rel_ace', 'rel_df', 'rel_1stIn', 'rel_1stWon', 'rel_2ndWon', '1WonOn1In', '1WonOnTotWon', 'rel_ptsWon', 'rel_bpFaced', 'rel_bpSaved', 'rel_gmsWon']

# set_axis returns a relabelled frame; its `inplace` argument was removed in
# pandas 2.0 and, before that, mutated a slice of df_tennis in place.
df_winner = df_tennis.loc[:, winner_cols].set_axis(ren_attr_list, axis=1)
df_loser = df_tennis.loc[:, loser_cols].set_axis(ren_attr_list, axis=1)

# Per-player mean of every in-match ratio over all the matches played (won or lost)
df = pd.concat([df_winner, df_loser]).groupby('name').mean().reset_index()

df_players = df_players.merge(df, on="name")

## Dropping records

#### Filter players that played at least 15 matches

In [None]:
# Number of matches (won or lost) of every player found in the match log, indexed by name
match_counts = pd.concat([df_tennis["winner_name"], df_tennis["loser_name"]]).value_counts()

# Align the counts with df_players BY NAME. A mask built with
# groupby(...).reset_index() is indexed 0..k-1 over all the names of df_tennis,
# so it only matches the rows of df_players when no player was dropped upstream;
# otherwise the filter is applied to the wrong players (or raises).
matches_per_player = df_players["name"].map(match_counts).fillna(0)

# This will be used for classification in order to have as much data as possible
df_players_classification = df_players[matches_per_player >= 4].copy()
# This will be used for clustering
df_players = df_players[matches_per_player >= 15].copy()

df_players.iloc[:,:20].info()

In [None]:
# Drop all records with NaN in lrpOnMxrp (the same records for which mean_rank_points is null)
df_players = df_players.dropna(subset=['lrpOnMxrp'])
df_players_classification = df_players_classification.dropna(subset=['lrpOnMxrp'])
df_players.iloc[:,20:].info()

## Feature selection

In [None]:
# Column dtypes and non-null counts of the player profiles
df_players.info()

In [None]:
# Preview of the first rows of the player profiles
df_players.head()

### Filling age missing values

In [None]:
# Fill the 4 missing values of the attribute age with the mean age
mean_age = df_players['age'].mean()
df_players = df_players.assign(age=df_players['age'].fillna(mean_age))
df_players.iloc[:,:20].info()

### Correlation analysis

Here we look at the correlation between all the defined features

In [None]:
def get_redundant_pairs(df):
    """Return the label pairs of the diagonal and of one triangle of a square matrix.

    For every column position ``i`` and every position ``j <= i`` the pair
    ``(columns[i], columns[j])`` is produced, so each unordered pair of
    labels (and each label with itself) appears exactly once.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame (typically a correlation matrix) whose column labels are paired.

    Returns
    -------
    set of tuple
        The ``(label_i, label_j)`` pairs described above.
    """
    labels = df.columns
    return {
        (labels[later], labels[earlier])
        for later in range(df.shape[1])
        for earlier in range(later + 1)
    }

corr_threshold = 0.8
# Only numeric columns can be correlated: name, gender and hand are strings and make
# DataFrame.corr raise on pandas >= 2.0 (older versions silently skipped them).
correlation = df_players.select_dtypes(include="number").corr(method="pearson") #.abs()
correlation = correlation.where(np.tril(np.ones(correlation.shape)).astype(bool)) # remove upper triangle

px.imshow(correlation, labels=dict(color="Correlation"), color_continuous_scale=px.colors.diverging.RdBu, zmin=-1, zmax=1, width=1000, height=1000).show()

# Flatten to one value per pair of features: the diagonal and the masked triangle are
# dropped by label, undefined correlations by dropna(). No drop_duplicates() here:
# applied to the values, it merges distinct pairs that happen to have the same
# coefficient and makes the count below too small.
correlation = (
    correlation.unstack()
    .drop(labels=get_redundant_pairs(correlation))
    .dropna()
    .sort_values(ascending=False)
)
strong_pairs = correlation[(correlation>=corr_threshold) | (correlation<=-corr_threshold)]
print(f"These are {len(strong_pairs)} pairs whose correlation is bigger/small than ±{corr_threshold*100}%:")
strong_pairs

### Selection

The following features may provide an interesting picture of the players' performance.

In [None]:
# Candidate features describing the performance of the players
feautures = ['lrpOnAvgrp', 'lrpOnMxrp', 'mean_performance_index', 'matches_won_ratio', 'mean_rank_points', 'max_rank_points', 'last_rank_points', 'variance_rank_points', 'mean_tourney_spectators', 'max_tourney_spectators', 'mean_tourney_revenue', 'max_tourney_revenue', 'rel_ptsWon', 'rel_gmsWon']
df_selected = df_players[feautures].reset_index(drop=True)

corr_threshold = 0.7
correlation = df_selected.corr(method="pearson") #.abs()
correlation = correlation.where(np.tril(np.ones(correlation.shape)).astype(bool)) # remove upper triangle

px.imshow(correlation, labels=dict(color="Correlation"), color_continuous_scale=px.colors.diverging.RdBu, zmin=-1, zmax=1, width=1000, height=1000).show()

# One value per pair of features. dropna() removes undefined correlations; there is no
# drop_duplicates(), which would merge distinct pairs sharing the same coefficient
# and make the count below too small.
correlation = (
    correlation.unstack()
    .drop(labels=get_redundant_pairs(correlation))
    .dropna()
    .sort_values(ascending=False)
)
strong_pairs = correlation[(correlation>=corr_threshold) | (correlation<=-corr_threshold)]
print(f"These are {len(strong_pairs)} pairs whose correlation is bigger/small than ±{corr_threshold*100}%:")
strong_pairs

The idea is to drop features until no remaining pair of features has a correlation higher than `70%` (in absolute value).

In [None]:
# Features kept after removing the highly correlated ones
feautures = ['lrpOnMxrp', 'matches_won_ratio', 'mean_rank_points', 'max_tourney_revenue', 'rel_ptsWon']
df_selected = df_players[feautures].reset_index(drop=True)

corr_threshold = 0.7
correlation = df_selected.corr(method="pearson") #.abs()
correlation = correlation.where(np.tril(np.ones(correlation.shape)).astype(bool)) # remove upper triangle

px.imshow(correlation, labels=dict(color="Correlation"), color_continuous_scale=px.colors.diverging.RdBu, zmin=-1, zmax=1, width=1000, height=1000).show()

# One value per pair of features. dropna() removes undefined correlations; there is no
# drop_duplicates(), which would merge distinct pairs sharing the same coefficient
# and make the count below too small.
correlation = (
    correlation.unstack()
    .drop(labels=get_redundant_pairs(correlation))
    .dropna()
    .sort_values(ascending=False)
)
strong_pairs = correlation[(correlation>=corr_threshold) | (correlation<=-corr_threshold)]
print(f"These are {len(strong_pairs)} pairs whose correlation is bigger/small than ±{corr_threshold*100}%:")
strong_pairs

In [None]:
# Dtypes and non-null counts of the currently selected features
df_players[feautures].info()

The idea is to drop `rel_ptsWon` because it has only 1946 non-null values. Hence, the remaining features are the final selection.

In [None]:
# Final selection of features, with their summary statistics
feautures = ['lrpOnMxrp', 'matches_won_ratio', 'mean_rank_points', 'max_tourney_revenue']
df_selected = df_players.loc[:, feautures].reset_index(drop=True)
df_selected.describe()

In [None]:
df_selected = df_players[feautures].reset_index(drop=True)

corr_threshold = 0.7
correlation = df_selected.corr(method="pearson") #.abs()
correlation = correlation.where(np.tril(np.ones(correlation.shape)).astype(bool)) # remove upper triangle

px.imshow(correlation, labels=dict(color="Correlation"), color_continuous_scale=px.colors.diverging.RdBu, zmin=-1, zmax=1, width=1000, height=1000).show()

# One value per pair of features. dropna() removes undefined correlations; there is no
# drop_duplicates(), which would merge distinct pairs sharing the same coefficient
# and make the count below too small.
correlation = (
    correlation.unstack()
    .drop(labels=get_redundant_pairs(correlation))
    .dropna()
    .sort_values(ascending=False)
)
strong_pairs = correlation[(correlation>=corr_threshold) | (correlation<=-corr_threshold)]
print(f"These are {len(strong_pairs)} pairs whose correlation is bigger/small than ±{corr_threshold*100}%:")
strong_pairs

### Analysis

By looking at the graphs, it's fair to say that during the clustering phase the features need to be standardized and outliers could be managed

In [None]:
# One histogram (with a marginal box plot) for each selected feature
for feature in feautures:
    fig = px.histogram(df_players, x=feature, marginal="box")
    fig.show()

## Output

In [None]:
# Write the clustering and the classification datasets (the index is written as the first column)
outputs = {
    "../datasets/players.csv": df_players,
    "../datasets/players_classification.csv": df_players_classification,
}
for path, frame in outputs.items():
    frame.to_csv(path)