# Players dataset preparation

In [1]:
import pandas as pd
import numpy as np

# Visualization
import plotly.express as px
import plotly.io as pio
pd.options.plotting.backend = 'plotly'
pio.templates.default = "seaborn"

## Dataset preparation

In [2]:
# Read datasets
df_tennis = pd.read_csv("./datasets/tennis_matches_cleaned.csv", parse_dates=["tourney_date"], index_col=0)
df_male = pd.read_csv("./datasets/male_players.csv")
df_female = pd.read_csv("./datasets/female_players.csv")

df_male.drop_duplicates(inplace=True)
df_female.drop_duplicates(inplace=True)

def preprocess_strings(df):
    df = df.applymap(lambda x:x.lower().strip() if type(x) == str else x)
    return df.replace(r"\s{2,}", " ", regex=True)

df_tennis.tourney_date = pd.to_datetime(df_tennis.tourney_date, format='%Y%m%d')

# Normalize strings
df_male = preprocess_strings(df_male)
df_female = preprocess_strings(df_female)

In [3]:
# Create a new players dataset whose name is the concatenation of the name and the surname for the male and female players
df = pd.concat([df_male.name + " " + df_male.surname], axis=1, keys=["name", "gender"])
df["gender"] = "m"
df_players = df
df = pd.concat([df_female.name + " " + df_female.surname], axis=1, keys=["name", "gender"])
df["gender"] = "f"
df_players = df_players.append(df)

# Remove duplicates arising from the intersection
df_players = df_players.drop_duplicates(subset=["name"])
df_players = df_players.sort_values(by=["name"])
df_players.reset_index(drop=True, inplace=True)

# Merge the players dataset with the tennis dataset
df = pd.merge(df_tennis, df_players.rename(columns = {'name': 'winner_name', 'gender':'winner_gender'}), on="winner_name", how="left")
df = pd.merge(df, df_players.rename(columns = {'name': 'loser_name', 'gender':'loser_gender'}), on="loser_name", how="left")

# Find the players who do not have a gender and assign them the most common among the genders of the players they played with. 
winners = df[df.winner_gender.isnull()].groupby(["winner_name", "loser_gender"]).loser_gender.count().reset_index(name="occurrences").rename(columns={"winner_name":"name", "loser_gender":"gender"})

losers = df[df.loser_gender.isnull()].groupby(["loser_name", "winner_gender"]).winner_gender.count().reset_index(name="occurrences").rename(columns={"loser_name":"name", "winner_gender":"gender"})

estimate = pd.concat([winners, losers]).groupby(["name", "gender"]).occurrences.sum().reset_index(name="occurrences")
estimate = estimate.sort_values(by=["name", "occurrences"], ascending=[1, 0]).drop_duplicates("name", keep="first").drop(columns=["occurrences"])

# Create the players dataset
df_players = df.loc[:, ['winner_name', 'winner_gender']].set_axis(['name', 'gender'], axis=1, inplace=False)
df_players = df_players.append(df.loc[:, ['loser_name', 'loser_gender']].set_axis(['name', 'gender'], axis=1, inplace=False))
df_players = df_players.append(estimate)
df_players = df_players.dropna().drop_duplicates()
df_players = df_players.sort_values(by=["name"]).reset_index(drop=True)
df_players.head()

Unnamed: 0,name,gender
0,aada inna,f
1,aalisha alexis,f
2,aaliya ebrahim,f
3,aaliyah hohmann,f
4,aalyka ebrahim,f


## Feature engineering

### Tourneys played

In [4]:
df_w = df_tennis.loc[:, ['winner_name', 'tourney_id']].rename(columns={'winner_name': 'name'})
df_l = df_tennis.loc[:, ['loser_name', 'tourney_id']].rename(columns={'loser_name': 'name'})

#  giocatori e relativo numero di partecipazioni ai tornei
df = pd.concat([df_w, df_l]).groupby('name').tourney_id.nunique().reset_index(name="total_tourneys_played")

# aggiungere feature al data_frame dei profili dei player
df_players = df_players.merge(df, on="name")

# stampare dataframe
print(len(df))
print(len(df_players))

10103
10103


### Matches played

In [5]:
df_w = df_tennis.loc[:, ['winner_name', 'tourney_id']].rename(columns={'winner_name': 'name'})
df_l = df_tennis.loc[:, ['loser_name', 'tourney_id']].rename(columns={'loser_name': 'name'})

#  giocatori e relativo numero di match giocati
df = pd.concat([df_w, df_l]).groupby('name').tourney_id.count().reset_index(name="total_matches_played")

# aggiungere feature al data_frame dei profili dei player
df_players = df_players.merge(df, on="name")

# stampare dataframe
print(len(df))
print(len(df_players))

10103
10103


### Matches won

In [6]:
df = df_tennis.loc[:, ['winner_name', 'tourney_id']].rename(columns={'winner_name': 'name'})

#  giocatori e relativo numero di match vinti
df = df.groupby('name').tourney_id.count().reset_index(name="total_matches_won")

# aggiungere feature al data_frame dei profili dei player
df_players = df_players.merge(df, on="name", how="left")
df_players.total_matches_won.fillna(0, inplace=True)

# stampare dataframe
print(len(df))
print(len(df_players))

6179
10103


### Matches won ratio

In [7]:
df_players["matches_won_ratio"] = df_players["total_matches_won"] / df_players["total_matches_played"]

### Mean, max, min, entropy of performance_index

In [8]:
df_w = df_tennis.loc[:, ['winner_name', 'tourney_id', "draw_size"]].rename(columns={'winner_name': 'name'})
df_l = df_tennis.loc[:, ['loser_name', 'tourney_id', "draw_size"]].rename(columns={'loser_name': 'name'})

matches_won_per_tourney = df_w.groupby(['name', 'tourney_id']).tourney_id.count().reset_index(name="total_matches_won_per_tourney")
df = pd.concat([df_w, df_l])
df = df.merge(matches_won_per_tourney, on=['name', 'tourney_id'], how="left")
df["total_matches_won_per_tourney"] = df["total_matches_won_per_tourney"].fillna(0).astype('Int32')

df["matches_to_play_per_tourney"] = np.log2(df["draw_size"]) + 1
df["performance_index"] = df["total_matches_won_per_tourney"] / df["matches_to_play_per_tourney"]
# If performance_index is bigger than 1 (won more matches than the number of matches I am supposed to play), then set it to 1. That's the case of extra qualifications
df["performance_index"].clip(lower=0, upper=1, inplace=True)

# Stats about performance_index
df_stats = df.groupby('name')['performance_index'].agg(mean_performance_index='mean', max_performance_index='max', min_performance_index='min')

# Entropy of performance_index
# g_sum = df.groupby('name')['performance_index'].transform('sum')
# values = df['performance_index']/g_sum
# df['performance_index_entropy'] = -values*np.log(values)
# df_entropy = df.groupby('name')['performance_index_entropy'].sum().reset_index()

# Merge stats and entropy with players
df_players = df_players.merge(df_stats, on="name")
# df_players = df_players.merge(df_entropy, on="name")
#del df_entropy, df_stats

### Height

Feature probabilmente inutilizzabile, l'abbiamo solo per 541 giocatori

In [9]:
df_w = df_tennis.loc[:, ['winner_name', 'winner_ht', 'tourney_date']].rename(columns={'winner_name': 'name', 'winner_ht': 'ht'})
df_l = df_tennis.loc[:, ['loser_name', 'loser_ht', 'tourney_date']].rename(columns={'loser_name': 'name', 'loser_ht': 'ht'})

# giocatori e relativa altezza
df = pd.concat([df_w, df_l]).sort_values('tourney_date', ascending='False').groupby('name').ht.first().reset_index()

# aggiungere feature al data_frame dei profili dei player
df_players = df_players.merge(df, on="name")

# stampare numero di valori nulli e dataframe
print('Numero giocatori per cui abbiamo altezza ' + str(len(df[df.ht.notnull()])))
print('Numero giocatori totali ' + str(len(df)))
df

print(len(df))
print(len(df_players))

Numero giocatori per cui abbiamo altezza 541
Numero giocatori totali 10103
10103
10103


### Age

In [10]:
df_w = df_tennis.loc[:, ['winner_name', 'winner_age', 'tourney_date']].rename(columns={'winner_name': 'name', 'winner_age': 'age'})
df_l = (df_tennis.loc[:, ['loser_name', 'loser_age', 'tourney_date']]).rename(columns={'loser_name': 'name', 'loser_age': 'age'})

# giocatori e relativa età
df = pd.concat([df_w, df_l]).sort_values('tourney_date', ascending='False').groupby('name').age.first().reset_index()

# aggiungere feature al data_frame dei profili dei player
df_players = df_players.merge(df, on="name")

# stampare numero di valori nulli e dataframe
print('Numero giocatori per cui abbiamo l\'età ' + str(len(df[df.age.notnull()])))
print('Numero giocatori totali ' + str(len(df)))
df

print(len(df))
print(len(df_players))


Numero giocatori per cui abbiamo l'età 8012
Numero giocatori totali 10103
10103
10103


### Hand

In [11]:
df_w = df_tennis.loc[:, ['winner_name', 'winner_hand']].rename(columns={'winner_name': 'name', 'winner_hand': 'hand'})
df_l = (df_tennis.loc[:, ['loser_name', 'loser_hand']]).rename(columns={'loser_name': 'name', 'loser_hand': 'hand'})

# giocatori e relativa mano preferita
df = pd.concat([df_w, df_l]).groupby('name').hand.agg(pd.Series.mode).reset_index()

# aggiungere feature al data_frame dei profili dei player
df_players = df_players.merge(df, on="name")

# stampare numero di valori nulli e dataframe
print('Numero giocatori per cui abbiamo la mano ' + str(len(df[df.hand.notnull()])))
print('Numero giocatori totali ' + str(len(df)))
df

print(len(df))
print(len(df_players))

Numero giocatori per cui abbiamo la mano 10103
Numero giocatori totali 10103
10103
10103


### Mean, max, entropy of minutes

In [12]:
df_w = df_tennis.loc[:, ['winner_name', 'minutes']].rename(columns={'winner_name': 'name'})
df_l = df_tennis.loc[:, ['loser_name', 'minutes']].rename(columns={'loser_name': 'name'})

# Mean and max minutes
df = pd.concat([df_w, df_l]).groupby('name')['minutes'].agg(mean_minutes='mean', max_minutes='max')
df_players = df_players.merge(df, on="name")

# Entropy
df = pd.concat([df_w, df_l])
g_sum = df.groupby('name')['minutes'].transform('sum')
values = df['minutes']/g_sum
df['minutes_entropy'] = -(values*np.log(values))
df_entropy = df.groupby('name')['minutes_entropy'].sum().reset_index()

df_players = df_players.merge(df_entropy, on="name")

  result = getattr(ufunc, method)(*inputs, **kwargs)


### Ranking

In [13]:
df_w = df_tennis.loc[:, ['winner_name', 'winner_rank_points', 'tourney_date']].rename(columns={'winner_name': 'name', 'winner_rank_points': 'rank_points'})
df_l = (df_tennis.loc[:, ['loser_name', 'loser_rank_points', 'tourney_date']]).rename(columns={'loser_name': 'name', 'loser_rank_points': 'rank_points'})

# giocatori e relative misure su ranking (varianza, media, max)
df = pd.concat([df_l, df_w]).sort_values('tourney_date', ascending = False).groupby('name').rank_points.agg(mean_rank_points='mean', max_rank_points='max', last_rank_points='first', variance_rank_points=lambda x: np.var(x,ddof=0)).reset_index()

# aggiungere feature al data_frame dei profili dei player
df_players = df_players.merge(df, on="name")

# stampare numero di valori nulli e dataframe
print('numero di valori null mean_rank_points ' + str(df.mean_rank_points.isna().sum()) + ' su ' + str(len(df.mean_rank_points)))
print('numero di valori null max_rank_points ' + str(df.max_rank_points.isna().sum()) + ' su ' + str(len(df.mean_rank_points)))
print('numero di valori null last_rank_points ' + str(df.last_rank_points.isna().sum()) + ' su ' + str(len(df.mean_rank_points)))
print('numero di valori null variance_rank_points ' + str(df.variance_rank_points.isna().sum()) + ' su ' + str(len(df.variance_rank_points)))
df

print(len(df))
print(len(df_players))

numero di valori null mean_rank_points 5656 su 10103
numero di valori null max_rank_points 5656 su 10103
numero di valori null last_rank_points 5656 su 10103
numero di valori null variance_rank_points 5656 su 10103
10103
10103


### Spectator

In [14]:
df_w = df_tennis.loc[:, ['winner_name', 'tourney_spectators']].rename(columns = {'winner_name': 'name'})
df_l = (df_tennis.loc[:, ['loser_name', 'tourney_spectators']]).rename(columns = {'loser_name': 'name'})

# giocatori e relative misure sugli spectator (media e max)
df = pd.concat([df_l, df_w]).groupby('name').tourney_spectators.agg(mean_tourney_spectators='mean', max_tourney_spectators='max').reset_index()

# aggiungere feature al data_frame dei profili dei player
df_players = df_players.merge(df, on="name")

# stampare numero di valori nulli e dataframe
print('Numero di nulli in mean_tourney_spectators ' + str(df.mean_tourney_spectators.isna().sum()))
print('Numero di nulli in max_tourney_spectators ' + str(df.max_tourney_spectators.isna().sum()))
df


print(len(df))
print(len(df_players))

Numero di nulli in mean_tourney_spectators 0
Numero di nulli in max_tourney_spectators 0
10103
10103


### Revenue

In [15]:
df_w = df_tennis.loc[:, ['winner_name', 'tourney_revenue']].rename(columns = {'winner_name': 'name'})
df_l = (df_tennis.loc[:, ['loser_name', 'tourney_revenue']]).rename(columns = {'loser_name': 'name'})

# giocatori e relative misure su revenue (media e max)
df = pd.concat([df_l, df_w]).groupby('name').tourney_revenue.agg(mean_tourney_revenue='mean', max_tourney_revenue='max').reset_index()

# aggiungere feature al data_frame dei profili dei player
df_players = df_players.merge(df, on="name")

# stampare numero di valori nulli e dataframe
print('Numero di nulli in mean_tourney_revenue ' + str(df.mean_tourney_revenue.isna().sum()))
print('Numero di nulli in max_tourney_revenue ' + str(df.max_tourney_revenue.isna().sum()))
df

print(len(df))
print(len(df_players))

Numero di nulli in mean_tourney_revenue 0
Numero di nulli in max_tourney_revenue 0
10103
10103


### Player in-match features

In [16]:
# Those feature where also used to check possible outliers, but nothing particullarry significant was found

df_tennis['rel_w_ace'] = df_tennis['w_ace']/df_tennis['w_svpt']
df_tennis['rel_l_ace'] = df_tennis['l_ace']/df_tennis['l_svpt']

df_tennis['rel_w_df'] = df_tennis['w_df']/df_tennis['w_svpt']
df_tennis['rel_l_df'] = df_tennis['l_df']/df_tennis['l_svpt']

df_tennis['rel_w_1stIn'] = df_tennis['w_1stIn']/df_tennis['w_svpt']
df_tennis['rel_l_1stIn'] = df_tennis['l_1stIn']/df_tennis['l_svpt']

df_tennis['rel_w_1stWon'] = df_tennis['w_1stWon']/df_tennis['w_svpt']
df_tennis['rel_l_1stWon'] = df_tennis['l_1stWon']/df_tennis['l_svpt']

df_tennis['rel_w_2ndWon'] = df_tennis['w_2ndWon']/df_tennis['w_svpt']
df_tennis['rel_l_2ndWon'] = df_tennis['l_2ndWon']/df_tennis['l_svpt']

df_tennis['w_1WonOn1In'] = df_tennis['w_1stWon']/df_tennis['w_1stIn']
df_tennis['l_1WonOn1In'] = df_tennis['l_1stWon']/df_tennis['l_1stIn']

df_tennis['w_1WonOnTotWon'] = df_tennis['w_1stWon']/(df_tennis['w_2ndWon'] + df_tennis['w_1stWon'])
df_tennis['l_1WonOnTotWon'] = df_tennis['l_1stWon']/(df_tennis['l_2ndWon'] + df_tennis['l_1stWon'])

df_tennis['rel_w_ptsWon'] = (df_tennis['w_1stWon'] + df_tennis['w_2ndWon'])/(df_tennis['w_1stWon'] + df_tennis['w_2ndWon'] + df_tennis['l_1stWon'] + df_tennis['l_2ndWon'])
df_tennis['rel_l_ptsWon'] = (df_tennis['l_1stWon'] + df_tennis['l_2ndWon'])/(df_tennis['w_1stWon'] + df_tennis['w_2ndWon'] + df_tennis['l_1stWon'] + df_tennis['l_2ndWon'])

df_tennis['rel_w_bpFaced'] = df_tennis['w_bpFaced']/(df_tennis['w_svpt'] + df_tennis['l_svpt'])
df_tennis['rel_l_bpFaced'] = df_tennis['l_bpFaced']/(df_tennis['w_svpt'] + df_tennis['l_svpt'])

df_tennis['rel_w_bpSaved'] = df_tennis['w_bpSaved']/df_tennis['w_bpFaced']
df_tennis['rel_l_bpSaved'] = df_tennis['l_bpSaved']/df_tennis['l_bpFaced']

df_tennis['rel_w_gmsWon'] = df_tennis['w_gmsWon']/(df_tennis['w_gmsWon'] + df_tennis['l_gmsWon'])
df_tennis['rel_l_gmsWon'] = df_tennis['l_gmsWon']/(df_tennis['w_gmsWon'] + df_tennis['l_gmsWon'])

In [17]:
df_winner = df_tennis.loc[:, ['winner_name', 'rel_w_ace', 'rel_w_df', 'rel_w_1stIn', 'rel_w_1stWon', 'rel_w_2ndWon', 'w_1WonOn1In', 'w_1WonOnTotWon', 'rel_w_ptsWon', 'rel_w_bpFaced', 'rel_w_bpSaved', 'rel_w_gmsWon']]
df_loser = df_tennis.loc[:, ['loser_name', 'rel_l_ace', 'rel_l_df', 'rel_l_1stIn', 'rel_l_1stWon', 'rel_l_2ndWon', 'l_1WonOn1In', 'l_1WonOnTotWon', 'rel_l_ptsWon', 'rel_l_bpFaced', 'rel_l_bpSaved', 'rel_l_gmsWon']]
ren_attr_list = ['name', 'rel_ace', 'rel_df', 'rel_1stIn', 'rel_1stWon', 'rel_2ndWon', '1WonOn1In', '1WonOnTotWon', 'rel_ptsWon', 'rel_bpFaced', 'rel_bpSaved', 'rel_gmsWon']

df_winner.set_axis(ren_attr_list, axis=1, inplace=True)
df_loser.set_axis(ren_attr_list, axis=1, inplace=True)
df = pd.concat([df_winner, df_loser]).groupby('name').mean().reset_index()

df_players = df_players.merge(df, on="name")

## Feature selection

In [28]:
df_players.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2299 entries, 10 to 10099
Data columns (total 34 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   name                     2299 non-null   object 
 1   gender                   2299 non-null   object 
 2   total_tourneys_played    2299 non-null   int64  
 3   total_matches_played     2299 non-null   int64  
 4   total_matches_won        2299 non-null   float64
 5   matches_won_ratio        2299 non-null   float64
 6   mean_performance_index   2299 non-null   Float64
 7   max_performance_index    2299 non-null   Float64
 8   min_performance_index    2299 non-null   Float64
 9   ht                       483 non-null    float64
 10  age                      2299 non-null   float64
 11  hand                     2299 non-null   object 
 12  mean_minutes             2299 non-null   float64
 13  max_minutes              2299 non-null   float64
 14  minutes_entropy       

In [27]:
df_players.head()

Unnamed: 0,name,gender,total_tourneys_played,total_matches_played,total_matches_won,matches_won_ratio,mean_performance_index,max_performance_index,min_performance_index,ht,...,rel_df,rel_1stIn,rel_1stWon,rel_2ndWon,1WonOn1In,1WonOnTotWon,rel_ptsWon,rel_bpFaced,rel_bpSaved,rel_gmsWon
10,aaron addison,m,9,11,2.0,0.181818,0.090909,0.333333,0.0,,...,0.060695,0.540678,0.361864,0.214644,0.673461,0.629105,0.476897,0.056528,0.546117,0.405194
14,abbie myers,f,81,157,75.0,0.477707,0.290166,1.0,0.0,,...,0.070857,0.578607,0.341164,0.179801,0.59018,0.654515,0.46166,0.064469,0.396978,0.506549
22,abhinav sanjeev shanmugam,m,8,16,8.0,0.5,0.303571,0.666667,0.0,,...,0.053235,0.470209,0.341783,0.260739,0.728696,0.569641,0.508137,0.05485,0.58526,0.479098
28,abigail spears,f,9,9,0.0,0.0,0.0,0.0,0.0,,...,0.044376,0.58957,0.389238,0.142034,0.661836,0.732042,0.451985,0.062359,0.481327,0.364778
29,abigail tere apisah,f,61,126,74.0,0.587302,0.35582,0.833333,0.0,,...,0.070367,0.581215,0.368487,0.191751,0.635412,0.663208,0.544618,0.064519,0.576609,0.566284


### Filling missing values

In [19]:
attr_list = ['age', 'mean_rank_points', 'max_rank_points', 'last_rank_points', 'variance_rank_points']
means = df_players[attr_list].mean()
df_players[attr_list] = df_players[attr_list].fillna(means)
df_players.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10103 entries, 0 to 10102
Data columns (total 34 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   name                     10103 non-null  object 
 1   gender                   10103 non-null  object 
 2   total_tourneys_played    10103 non-null  int64  
 3   total_matches_played     10103 non-null  int64  
 4   total_matches_won        10103 non-null  float64
 5   matches_won_ratio        10103 non-null  float64
 6   mean_performance_index   10103 non-null  Float64
 7   max_performance_index    10103 non-null  Float64
 8   min_performance_index    10103 non-null  Float64
 9   ht                       541 non-null    float64
 10  age                      10103 non-null  float64
 11  hand                     10103 non-null  object 
 12  mean_minutes             4237 non-null   float64
 13  max_minutes              4237 non-null   float64
 14  minutes_entropy       

### Correlation analysis

In [20]:
def get_redundant_pairs(df):
    '''Get diagonal and lower triangular pairs of correlation matrix'''
    pairs_to_drop = set()
    cols = df.columns
    for i in range(0, df.shape[1]):
        for j in range(0, i+1):
            pairs_to_drop.add((cols[i], cols[j]))
    return pairs_to_drop

corr_threshold = 0.8
correlation = df_players.corr(method="pearson") #.abs()
correlation = correlation.where(np.tril(np.ones(correlation.shape)).astype(bool)) # remove upper triangle

px.imshow(correlation, labels=dict(color="Correlation"), color_continuous_scale=px.colors.diverging.RdBu, zmin=-1, zmax=1, width=1000, height=1000).show()

correlation = correlation.unstack().drop(labels=get_redundant_pairs(correlation)).sort_values(ascending=False).drop_duplicates()
print(f"These are {len(correlation[(correlation>=corr_threshold) | (correlation<=-corr_threshold)])} pairs whose correlation is bigger/small than ±{corr_threshold*100}%:")
correlation[(correlation>=corr_threshold) | (correlation<=-corr_threshold)]

These are 18 pairs whose correlation is bigger/small than ±80.0%:


total_matches_played     total_matches_won         0.988506
total_tourneys_played    total_matches_played      0.987069
max_tourney_spectators   max_tourney_revenue       0.967686
mean_rank_points         max_rank_points           0.962771
total_tourneys_played    total_matches_won         0.952108
matches_won_ratio        mean_performance_index    0.942411
mean_tourney_spectators  mean_tourney_revenue      0.928038
mean_performance_index   max_performance_index     0.919653
max_rank_points          last_rank_points          0.880507
matches_won_ratio        max_performance_index     0.840612
max_minutes              minutes_entropy           0.836004
minutes_entropy          max_tourney_spectators    0.835005
                         max_tourney_revenue       0.827683
rel_1stWon               1WonOn1In                 0.822422
matches_won_ratio        rel_gmsWon                0.816157
total_matches_played     max_tourney_revenue       0.810742
mean_rank_points         last_rank_point

### Selection

The following feature may provide an interesting picture about the in-match performance and the game style of the players. The rationale behind is to pick the ones that are less correlated with the rank points.

In [21]:
feautures = ["rel_ace", "rel_df", "rel_1stIn", "rel_2ndWon", "1WonOn1In", "1WonOnTotWon", "rel_bpSaved"]
df_selected = df_players[feautures].reset_index(drop=True)

corr_threshold = 0.7
correlation = df_selected.corr(method="pearson") #.abs()
correlation = correlation.where(np.tril(np.ones(correlation.shape)).astype(bool)) # remove upper triangle

px.imshow(correlation, labels=dict(color="Correlation"), color_continuous_scale=px.colors.diverging.RdBu, zmin=-1, zmax=1, width=1000, height=1000).show()

correlation = correlation.unstack().drop(labels=get_redundant_pairs(correlation)).sort_values(ascending=False).drop_duplicates()
print(f"These are {len(correlation[(correlation>=corr_threshold) | (correlation<=-corr_threshold)])} pairs whose correlation is bigger/small than ±{corr_threshold*100}%:")
correlation[(correlation>=corr_threshold) | (correlation<=-corr_threshold)]

These are 2 pairs whose correlation is bigger/small than ±70.0%:


rel_1stIn   1WonOnTotWon    0.738713
rel_2ndWon  1WonOnTotWon   -0.716264
dtype: float64

The idea is to drop `rel_ace` and `1WonOnTotWon`.
- The former because 1WonOn1In it's an event that it's more informative about a player's performance in a match
- The latter because it's correlated with two features

In [22]:
feautures = ["rel_df", "rel_1stIn", "rel_2ndWon", "1WonOn1In", "rel_bpSaved"]
df_selected = df_players[feautures].reset_index(drop=True)
df_selected.describe()

Unnamed: 0,rel_df,rel_1stIn,rel_2ndWon,1WonOn1In,rel_bpSaved
count,4253.0,4253.0,4253.0,4253.0,4246.0
mean,0.062289,0.59315,0.171683,0.607704,0.487399
std,0.039454,0.072643,0.044545,0.104814,0.134292
min,0.0,0.209302,0.0,0.133333,0.0
25%,0.037226,0.554576,0.146341,0.5551,0.426076
50%,0.052271,0.597922,0.176021,0.624391,0.508099
75%,0.07558,0.637021,0.19957,0.68217,0.571429
max,0.5,0.901639,0.533333,1.0,1.0


### Dropping records

In [23]:
df_players.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10103 entries, 0 to 10102
Data columns (total 34 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   name                     10103 non-null  object 
 1   gender                   10103 non-null  object 
 2   total_tourneys_played    10103 non-null  int64  
 3   total_matches_played     10103 non-null  int64  
 4   total_matches_won        10103 non-null  float64
 5   matches_won_ratio        10103 non-null  float64
 6   mean_performance_index   10103 non-null  Float64
 7   max_performance_index    10103 non-null  Float64
 8   min_performance_index    10103 non-null  Float64
 9   ht                       541 non-null    float64
 10  age                      10103 non-null  float64
 11  hand                     10103 non-null  object 
 12  mean_minutes             4237 non-null   float64
 13  max_minutes              4237 non-null   float64
 14  minutes_entropy       

In [24]:
df_winner = df_tennis.loc[:, ['winner_name', 'rel_w_df', 'rel_w_1stIn', 'rel_w_2ndWon', 'w_1WonOn1In', 'rel_w_bpSaved']]
df_loser = df_tennis.loc[:, ['loser_name', 'rel_l_df', 'rel_l_1stIn', 'rel_l_2ndWon', 'l_1WonOn1In', 'rel_l_bpSaved']]
ren_attr_list = ['name', 'rel_df', 'rel_1stIn', 'rel_2ndWon', '1WonOn1In', 'rel_bpSaved']
df_winner.set_axis(ren_attr_list, axis=1, inplace=True)
df_loser.set_axis(ren_attr_list, axis=1, inplace=True)

# dataframe con occorenze degli attributi relativi alle features selezionate
attr_occurrences = pd.concat([df_winner, df_loser]).groupby('name').count()

# mask the dataframe where those attributes compare at least 4 times
sm = attr_occurrences.min(axis=1) >= 4
sm = sm.reset_index(drop = True)

# # drop records from dataframe
df_players = df_players[sm]
df_players.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2299 entries, 10 to 10099
Data columns (total 34 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   name                     2299 non-null   object 
 1   gender                   2299 non-null   object 
 2   total_tourneys_played    2299 non-null   int64  
 3   total_matches_played     2299 non-null   int64  
 4   total_matches_won        2299 non-null   float64
 5   matches_won_ratio        2299 non-null   float64
 6   mean_performance_index   2299 non-null   Float64
 7   max_performance_index    2299 non-null   Float64
 8   min_performance_index    2299 non-null   Float64
 9   ht                       483 non-null    float64
 10  age                      2299 non-null   float64
 11  hand                     2299 non-null   object 
 12  mean_minutes             2299 non-null   float64
 13  max_minutes              2299 non-null   float64
 14  minutes_entropy       

### Analysis

By looking at the graphs, it's fair to say that during the clustering phase the features need to be standardized with and deprived of their outliers

In [38]:
for feature in feautures:
  px.histogram(df_players, x=feature, marginal="box").show()

## Output

In [25]:
df_players.to_csv("./datasets/players.csv")

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=56da3ab5-e195-41aa-a609-f5fefeb3379d' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>