# Players dataset preparation

In [1]:
import pandas as pd
import numpy as np

# Visualization
import plotly.express as px
import plotly.io as pio
pd.options.plotting.backend = 'plotly'
pio.templates.default = "seaborn"

## Reading and initial processing

In [2]:
# Read datasets
df_tennis = pd.read_csv("./datasets/tennis_matches_cleaned.csv", parse_dates=["tourney_date"])
df_male = pd.read_csv("./datasets/male_players.csv")
df_female = pd.read_csv("./datasets/female_players.csv")

df_male.drop_duplicates(inplace=True)
df_female.drop_duplicates(inplace=True)

def preprocess_strings(df):
    df = df.applymap(lambda x:x.lower().strip() if type(x) == str else x)
    return df.replace(r"\s{2,}", " ", regex=True)

df_tennis.tourney_date = pd.to_datetime(df_tennis.tourney_date, format='%Y%m%d')

# Normalize strings
df_male = preprocess_strings(df_male)
df_female = preprocess_strings(df_female)

In [3]:
# Create a new players dataset whose name is the concatenation of the name and the surname for the male and female players
df = pd.concat([df_male.name + " " + df_male.surname], axis=1, keys=["name", "gender"])
df["gender"] = "m"
df_players = df
df = pd.concat([df_female.name + " " + df_female.surname], axis=1, keys=["name", "gender"])
df["gender"] = "f"
df_players = df_players.append(df)

# Remove duplicates arising from the intersection
df_players = df_players.drop_duplicates(subset=["name"])
df_players = df_players.sort_values(by=["name"])
df_players.reset_index(drop=True, inplace=True)

# Merge the players dataset with the tennis dataset
df = pd.merge(df_tennis, df_players.rename(columns = {'name': 'winner_name', 'gender':'winner_gender'}), on="winner_name", how="left")
df = pd.merge(df, df_players.rename(columns = {'name': 'loser_name', 'gender':'loser_gender'}), on="loser_name", how="left")

# Find the players who do not have a gender and assign them the most common among the genders of the players they played with. 
winners = df[df.winner_gender.isnull()].groupby(["winner_name", "loser_gender"]).loser_gender.count().reset_index(name="occurrences").rename(columns={"winner_name":"name", "loser_gender":"gender"})

losers = df[df.loser_gender.isnull()].groupby(["loser_name", "winner_gender"]).winner_gender.count().reset_index(name="occurrences").rename(columns={"loser_name":"name", "winner_gender":"gender"})

estimate = pd.concat([winners, losers]).groupby(["name", "gender"]).occurrences.sum().reset_index(name="occurrences")
estimate = estimate.sort_values(by=["name", "occurrences"], ascending=[1, 0]).drop_duplicates("name", keep="first").drop(columns=["occurrences"])

# Create the players dataset
df_players = df.loc[:, ['winner_name', 'winner_gender']].set_axis(['name', 'gender'], axis=1, inplace=False)
df_players = df_players.append(df.loc[:, ['loser_name', 'loser_gender']].set_axis(['name', 'gender'], axis=1, inplace=False))
df_players = df_players.append(estimate)
df_players = df_players.dropna().drop_duplicates()
df_players = df_players.sort_values(by=["name"]).reset_index(drop=True)
df_players.head()

Unnamed: 0,name,gender
0,aada inna,f
1,aalisha alexis,f
2,aaliya ebrahim,f
3,aaliyah hohmann,f
4,aalyka ebrahim,f


In [4]:
df_tennis.head()

Unnamed: 0.1,Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_entry,...,winner_rank,winner_rank_points,loser_rank,loser_rank_points,tourney_spectators,tourney_revenue,score_norm,games_list,w_gmsWon,l_gmsWon
0,0,2019-m020,brisbane,hard,32.0,a,2018-12-31,300.0,105453.0,,...,9.0,3590.0,16.0,1977.0,3928.0,742618.69,6-4 3-6 6-2,"['6', '4', '3', '6', '6', '2']",15,12
1,1,2019-m020,brisbane,hard,32.0,a,2018-12-31,299.0,106421.0,,...,16.0,1977.0,239.0,200.0,3928.0,742618.69,7-6 6-2,"['7', '6', '6', '2']",13,8
2,2,2019-m020,brisbane,hard,32.0,a,2018-12-31,298.0,105453.0,,...,9.0,3590.0,40.0,1050.0,3928.0,742618.69,6-2 6-2,"['6', '2', '6', '2']",12,4
3,3,2019-m020,brisbane,hard,32.0,a,2018-12-31,297.0,104542.0,pr,...,239.0,200.0,31.0,1298.0,3928.0,742618.69,6-4 7-6,"['6', '4', '7', '6']",13,10
4,4,2019-m020,brisbane,hard,32.0,a,2018-12-31,296.0,106421.0,,...,16.0,1977.0,18.0,1855.0,3928.0,742618.69,6-7 6-3 6-4,"['6', '7', '6', '3', '6', '4']",18,14


## Feature engineering

### Mean, max, entropy of minutes

In [5]:
df_w = df_tennis.loc[:, ['winner_name', 'minutes']].rename(columns={'winner_name': 'name'})
df_l = df_tennis.loc[:, ['loser_name', 'minutes']].rename(columns={'loser_name': 'name'})

df = pd.concat([df_w, df_l]).groupby('name').mean().minutes.reset_index(name = "minutes_mean")

df_players = df_players.merge(df, on="name")

print(len(df))
print(len(df_players))


10103
10103


In [6]:
df_w = df_tennis.loc[:, ['winner_name', 'minutes']].rename(columns={'winner_name': 'name'})
df_l = df_tennis.loc[:, ['loser_name', 'minutes']].rename(columns={'loser_name': 'name'})

df = pd.concat([df_w, df_l]).groupby('name').max().minutes.reset_index(name = "minutes_max")

df_players = df_players.merge(df, on="name")


print(len(df))
print(len(df_players))

10103
10103


In [7]:
df_w = df_tennis.loc[:, ['winner_name', 'minutes']].rename(columns={'winner_name': 'name'})
df_l = df_tennis.loc[:, ['loser_name', 'minutes']].rename(columns={'loser_name': 'name'})

df = pd.concat([df_w, df_l])

g_sum = df.groupby('name')['minutes'].transform('sum')
values = df['minutes']/g_sum
df['minutes_entropy'] = -(values*np.log(values))

df1 = df.groupby('name',as_index=False,sort=False)['minutes_entropy'].sum()

# TODO join con df_players

print(len(df))
print(len(df_players))

371628
10103


  result = getattr(ufunc, method)(*inputs, **kwargs)


### Tourneys played

In [8]:
df_w = df_tennis.loc[:, ['winner_name', 'tourney_id']].rename(columns={'winner_name': 'name'})
df_l = df_tennis.loc[:, ['loser_name', 'tourney_id']].rename(columns={'loser_name': 'name'})

#  giocatori e relativo numero di partecipazioni ai tornei
df = pd.concat([df_w, df_l]).groupby('name').tourney_id.nunique().reset_index(name="total_tourneys_played")

# aggiungere feature al data_frame dei profili dei player
df_players = df_players.merge(df, on="name")

# stampare dataframe
print(len(df))
print(len(df_players))

10103
10103


### Matches played

In [9]:
df_w = df_tennis.loc[:, ['winner_name', 'tourney_id']].rename(columns={'winner_name': 'name'})
df_l = df_tennis.loc[:, ['loser_name', 'tourney_id']].rename(columns={'loser_name': 'name'})

#  giocatori e relativo numero di match giocati
df = pd.concat([df_w, df_l]).groupby('name').tourney_id.count().reset_index(name="total_matches_played")

# aggiungere feature al data_frame dei profili dei player
df_players = df_players.merge(df, on="name")

# stampare dataframe
print(len(df))
print(len(df_players))

10103
10103


### Matches won

In [10]:
df = df_tennis.loc[:, ['winner_name', 'tourney_id']].rename(columns={'winner_name': 'name'})

#  giocatori e relativo numero di match vinti
df = df.groupby('name').tourney_id.count().reset_index(name="total_matches_won")

# aggiungere feature al data_frame dei profili dei player
df_players = df_players.merge(df, on="name", how="left")
df_players.total_matches_won.fillna(0, inplace=True)

# stampare dataframe
print(len(df))
print(len(df_players))

6179
10103


### Matches won ratio

In [11]:
df_players["matches_won_ratio"] = df_players["total_matches_won"] / df_players["total_matches_played"]

### Mean, max, min, entropy of performance_index

In [12]:
df_w = df_tennis.loc[:, ['winner_name', 'tourney_id', "draw_size"]].rename(columns={'winner_name': 'name'})
df_l = df_tennis.loc[:, ['loser_name', 'tourney_id', "draw_size"]].rename(columns={'loser_name': 'name'})

matches_won_per_tourney = df_w.groupby(['name', 'tourney_id']).tourney_id.count().reset_index(name="total_matches_won_per_tourney")
df = pd.concat([df_w, df_l])
df = df.merge(matches_won_per_tourney, on=['name', 'tourney_id'], how="left")
df["total_matches_won_per_tourney"] = df["total_matches_won_per_tourney"].fillna(0).astype('Int32')

df["matches_to_play_per_tourney"] = np.log2(df["draw_size"]) + 1
df["performance_index"] = df["total_matches_won_per_tourney"] / df["matches_to_play_per_tourney"]
# If performance_index is bigger than 1 (won more matches than the number of matches I am supposed to play), then set it to 1. That's the case of extra qualifications
df["performance_index"].clip(lower=0, upper=1, inplace=True)

# Stats about performance_index
df_stats = df.groupby('name')['performance_index'].agg(mean_performance_index='mean', max_performance_index='max', min_performance_index='min')

# Entropy of performance_index
g_sum = df.groupby('name')['performance_index'].transform('sum')
values = df['performance_index']/g_sum
df['performance_index_entropy'] = -(values*np.log(values))
df_entropy = df.groupby('name')['performance_index_entropy'].sum().reset_index()

# Merge stats and entropy with players
df_players = df_players.merge(df_stats, on="name")
df_players = df_players.merge(df_entropy, on="name")

  result = getattr(ufunc, method)(*inputs2, **kwargs)


### Height

Feature probabilmente inutilizzabile, l'abbiamo solo per 541 giocatori

In [13]:
df_w = df_tennis.loc[:, ['winner_name', 'winner_ht', 'tourney_date']].rename(columns={'winner_name': 'name', 'winner_ht': 'ht'})
df_l = df_tennis.loc[:, ['loser_name', 'loser_ht', 'tourney_date']].rename(columns={'loser_name': 'name', 'loser_ht': 'ht'})

# giocatori e relativa altezza
df = pd.concat([df_w, df_l]).sort_values('tourney_date', ascending='False').groupby('name').ht.first().reset_index()

# aggiungere feature al data_frame dei profili dei player
df_players = df_players.merge(df, on="name")

# stampare numero di valori nulli e dataframe
print('Numero giocatori per cui abbiamo altezza ' + str(len(df[df.ht.notnull()])))
print('Numero giocatori totali ' + str(len(df)))
df

print(len(df))
print(len(df_players))

Numero giocatori per cui abbiamo altezza 541
Numero giocatori totali 10103
10103
10103


### Age

In [14]:
df_w = df_tennis.loc[:, ['winner_name', 'winner_age', 'tourney_date']].rename(columns={'winner_name': 'name', 'winner_age': 'age'})
df_l = (df_tennis.loc[:, ['loser_name', 'loser_age', 'tourney_date']]).rename(columns={'loser_name': 'name', 'loser_age': 'age'})

# giocatori e relativa età
df = pd.concat([df_w, df_l]).sort_values('tourney_date', ascending='False').groupby('name').age.first().reset_index()

# aggiungere feature al data_frame dei profili dei player
df_players = df_players.merge(df, on="name")

# stampare numero di valori nulli e dataframe
print('Numero giocatori per cui abbiamo l\'età ' + str(len(df[df.age.notnull()])))
print('Numero giocatori totali ' + str(len(df)))
df

print(len(df))
print(len(df_players))


Numero giocatori per cui abbiamo l'età 8012
Numero giocatori totali 10103
10103
10103


### Hand

In [15]:
df_w = df_tennis.loc[:, ['winner_name', 'winner_hand']].rename(columns={'winner_name': 'name', 'winner_hand': 'hand'})
df_l = (df_tennis.loc[:, ['loser_name', 'loser_hand']]).rename(columns={'loser_name': 'name', 'loser_hand': 'hand'})

# giocatori e relativa mano preferita
df = pd.concat([df_w, df_l]).groupby('name').hand.agg(pd.Series.mode).reset_index()

# aggiungere feature al data_frame dei profili dei player
df_players = df_players.merge(df, on="name")

# stampare numero di valori nulli e dataframe
print('Numero giocatori per cui abbiamo la mano ' + str(len(df[df.hand.notnull()])))
print('Numero giocatori totali ' + str(len(df)))
df

print(len(df))
print(len(df_players))

Numero giocatori per cui abbiamo la mano 10103
Numero giocatori totali 10103
10103
10103


### Ranking

In [16]:
df_w = df_tennis.loc[:, ['winner_name', 'winner_rank_points', 'tourney_date']].rename(columns={'winner_name': 'name', 'winner_rank_points': 'rank_points'})
df_l = (df_tennis.loc[:, ['loser_name', 'loser_rank_points', 'tourney_date']]).rename(columns={'loser_name': 'name', 'loser_rank_points': 'rank_points'})

# giocatori e relative misure su ranking (varianza, media, max)
df = pd.concat([df_l, df_w]).sort_values('tourney_date', ascending = False).groupby('name').rank_points.agg(mean_rank_points='mean', max_rank_points='max', last_rank_points='first', variance_rank_points=lambda x: np.var(x,ddof=0)).reset_index()

# aggiungere feature al data_frame dei profili dei player
df_players = df_players.merge(df, on="name")

# stampare numero di valori nulli e dataframe
print('numero di valori null mean_rank_points ' + str(df.mean_rank_points.isna().sum()) + ' su ' + str(len(df.mean_rank_points)))
print('numero di valori null max_rank_points ' + str(df.max_rank_points.isna().sum()) + ' su ' + str(len(df.mean_rank_points)))
print('numero di valori null last_rank_points ' + str(df.last_rank_points.isna().sum()) + ' su ' + str(len(df.mean_rank_points)))
print('numero di valori null variance_rank_points ' + str(df.variance_rank_points.isna().sum()) + ' su ' + str(len(df.variance_rank_points)))
df

print(len(df))
print(len(df_players))

numero di valori null mean_rank_points 5656 su 10103
numero di valori null max_rank_points 5656 su 10103
numero di valori null last_rank_points 5656 su 10103
numero di valori null variance_rank_points 5656 su 10103
10103
10103


### Spectator

In [17]:
df_w = df_tennis.loc[:, ['winner_name', 'tourney_spectators']].rename(columns = {'winner_name': 'name'})
df_l = (df_tennis.loc[:, ['loser_name', 'tourney_spectators']]).rename(columns = {'loser_name': 'name'})

# giocatori e relative misure sugli spectator (media e max)
df = pd.concat([df_l, df_w]).groupby('name').tourney_spectators.agg(mean_tourney_spectators='mean', max_tourney_spectators='max').reset_index()

# aggiungere feature al data_frame dei profili dei player
df_players = df_players.merge(df, on="name")

# stampare numero di valori nulli e dataframe
print('Numero di nulli in mean_tourney_spectators ' + str(df.mean_tourney_spectators.isna().sum()))
print('Numero di nulli in max_tourney_spectators ' + str(df.max_tourney_spectators.isna().sum()))
df


print(len(df))
print(len(df_players))

Numero di nulli in mean_tourney_spectators 0
Numero di nulli in max_tourney_spectators 0
10103
10103


### Revenue

In [18]:
df_w = df_tennis.loc[:, ['winner_name', 'tourney_revenue']].rename(columns = {'winner_name': 'name'})
df_l = (df_tennis.loc[:, ['loser_name', 'tourney_revenue']]).rename(columns = {'loser_name': 'name'})

# giocatori e relative misure su revenue (media e max)
df = pd.concat([df_l, df_w]).groupby('name').tourney_revenue.agg(mean_tourney_revenue='mean', max_tourney_revenue='max').reset_index()

# aggiungere feature al data_frame dei profili dei player
df_players = df_players.merge(df, on="name")

# stampare numero di valori nulli e dataframe
print('Numero di nulli in mean_tourney_revenue ' + str(df.mean_tourney_revenue.isna().sum()))
print('Numero di nulli in max_tourney_revenue ' + str(df.max_tourney_revenue.isna().sum()))
df

print(len(df))
print(len(df_players))

Numero di nulli in mean_tourney_revenue 0
Numero di nulli in max_tourney_revenue 0
10103
10103


### Player in-match features

In [19]:
# Those feature where also used to check possible outliers, but nothing particullarry significant was found

df_tennis['rel_w_ace'] = df_tennis['w_ace']/df_tennis['w_svpt']
df_tennis['rel_l_ace'] = df_tennis['l_ace']/df_tennis['l_svpt']

df_tennis['rel_w_df'] = df_tennis['w_df']/df_tennis['w_svpt']
df_tennis['rel_l_df'] = df_tennis['l_df']/df_tennis['l_svpt']

df_tennis['rel_w_1stIn'] = df_tennis['w_1stIn']/df_tennis['w_svpt']
df_tennis['rel_l_1stIn'] = df_tennis['l_1stIn']/df_tennis['l_svpt']

df_tennis['rel_w_1stWon'] = df_tennis['w_1stWon']/df_tennis['w_svpt']
df_tennis['rel_l_1stWon'] = df_tennis['l_1stWon']/df_tennis['l_svpt']

df_tennis['rel_w_2ndWon'] = df_tennis['w_2ndWon']/df_tennis['w_svpt']
df_tennis['rel_l_2ndWon'] = df_tennis['l_2ndWon']/df_tennis['l_svpt']

df_tennis['w_1WonOn1In'] = df_tennis['w_1stWon']/df_tennis['w_1stIn']
df_tennis['l_1WonOn1In'] = df_tennis['l_1stWon']/df_tennis['l_1stIn']

df_tennis['w_1WonOn2Won'] = df_tennis['w_1stWon']/df_tennis['w_2ndWon']
df_tennis['l_1WonOn2Won'] = df_tennis['l_1stWon']/df_tennis['l_2ndWon']

df_tennis['rel_w_ptsWon'] = (df_tennis['w_1stWon'] + df_tennis['w_2ndWon'])/(df_tennis['w_1stWon'] + df_tennis['w_2ndWon'] + df_tennis['l_1stWon'] + df_tennis['l_2ndWon'])
df_tennis['rel_l_ptsWon'] = (df_tennis['l_1stWon'] + df_tennis['l_2ndWon'])/(df_tennis['w_1stWon'] + df_tennis['w_2ndWon'] + df_tennis['l_1stWon'] + df_tennis['l_2ndWon'])

df_tennis['rel_w_bpFaced'] = df_tennis['w_bpFaced']/(df_tennis['w_svpt'] + df_tennis['l_svpt'])
df_tennis['rel_l_bpFaced'] = df_tennis['l_bpFaced']/(df_tennis['w_svpt'] + df_tennis['l_svpt'])

df_tennis['rel_w_bpSaved'] = df_tennis['w_bpSaved']/df_tennis['w_bpFaced']
df_tennis['rel_l_bpSaved'] = df_tennis['l_bpSaved']/df_tennis['l_bpFaced']

df_tennis['rel_w_gmsWon'] = df_tennis['w_gmsWon']/(df_tennis['w_gmsWon'] + df_tennis['l_gmsWon'])
df_tennis['rel_l_gmsWon'] = df_tennis['l_gmsWon']/(df_tennis['w_gmsWon'] + df_tennis['l_gmsWon'])


In [20]:
df_winner = df_tennis.loc[:, ['winner_name', 'rel_w_ace', 'rel_w_df', 'rel_w_1stIn', 'rel_w_1stWon', 'rel_w_2ndWon', 'w_1WonOn1In', 'w_1WonOn2Won', 'rel_w_ptsWon', 'rel_w_bpFaced', 'rel_w_bpSaved', 'rel_w_gmsWon']]
df_loser = df_tennis.loc[:, ['loser_name', 'rel_l_ace', 'rel_l_df', 'rel_l_1stIn', 'rel_l_1stWon', 'rel_l_2ndWon', 'l_1WonOn1In', 'l_1WonOn2Won', 'rel_l_ptsWon', 'rel_l_bpFaced', 'rel_l_bpSaved', 'rel_l_gmsWon']]

ren_attr_list = ['name', 'rel_ace', 'rel_df', 'rel_1stIn', 'rel_1stWon', 'rel_2ndWon', '1WonOn1In', '1WonOn2Won', 'rel_ptsWon', 'rel_bpFaced', 'rel_bpSaved', 'rel_gmsWon']

df_winner.set_axis(ren_attr_list, axis=1, inplace=True)
df_loser.set_axis(ren_attr_list, axis=1, inplace=True)

df = pd.concat([df_winner, df_loser]).groupby('name').mean().reset_index()

df_players = df_players.merge(df, on="name")
df_players

Unnamed: 0,name,gender,minutes_mean,minutes_max,total_tourneys_played,total_matches_played,total_matches_won,matches_won_ratio,mean_performance_index,max_performance_index,...,rel_df,rel_1stIn,rel_1stWon,rel_2ndWon,1WonOn1In,1WonOn2Won,rel_ptsWon,rel_bpFaced,rel_bpSaved,rel_gmsWon
0,aada inna,f,,,1,1,0.0,0.000000,0.0,0.0,...,,,,,,,,,,0.000000
1,aalisha alexis,f,,,2,2,0.0,0.000000,0.0,0.0,...,,,,,,,,,,0.071429
2,aaliya ebrahim,f,,,7,9,2.0,0.222222,0.074074,0.166667,...,,,,,,,,,,0.346288
3,aaliyah hohmann,f,,,2,3,1.0,0.333333,0.111111,0.166667,...,,,,,,,,,,0.340224
4,aalyka ebrahim,f,,,4,4,0.0,0.000000,0.0,0.0,...,,,,,,,,,,0.124389
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10098,zuzana zalabska,f,,,1,3,2.0,0.666667,0.333333,0.333333,...,,,,,,,,,,0.522222
10099,zuzana zlochova,f,90.571429,138.0,88,179,93.0,0.519553,0.312954,1.0,...,0.073615,0.669885,0.350763,0.102982,0.526750,4.326304,0.462562,0.089222,0.411585,0.501116
10100,zuzanna bednarz,f,,,2,2,0.0,0.000000,0.0,0.0,...,,,,,,,,,,0.225000
10101,zuzanna szczepanska,f,,,3,3,0.0,0.000000,0.0,0.0,...,,,,,,,,,,0.139927


## Dropping records

In [21]:
# dataframe con occorenze degli attributi relativi alle match feature (ace, ...)
attr_occurrences = pd.concat([df_winner, df_loser]).groupby('name').count()

# mask the dataframe where those attributes compare at least 4 times
sm = attr_occurrences.drop(columns=['rel_gmsWon']).min(axis=1) >= 4
sm = sm.reset_index(drop = True)
sm

# drop records from dataframe
df_players = df_players[sm]

In [22]:
df_players.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2299 entries, 10 to 10099
Data columns (total 34 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   name                       2299 non-null   object 
 1   gender                     2299 non-null   object 
 2   minutes_mean               2299 non-null   float64
 3   minutes_max                2299 non-null   float64
 4   total_tourneys_played      2299 non-null   int64  
 5   total_matches_played       2299 non-null   int64  
 6   total_matches_won          2299 non-null   float64
 7   matches_won_ratio          2299 non-null   float64
 8   mean_performance_index     2299 non-null   Float64
 9   max_performance_index      2299 non-null   Float64
 10  min_performance_index      2299 non-null   Float64
 11  performance_index_entropy  2299 non-null   Float64
 12  ht                         483 non-null    float64
 13  age                        2295 non-null   flo

### Filling NaNs

In [23]:
attr_list = ['age', 'mean_rank_points', 'max_rank_points', 'last_rank_points', 'variance_rank_points']
means = df_players[attr_list].mean()
df_players[attr_list] = df_players[attr_list].fillna(means)
df_players.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2299 entries, 10 to 10099
Data columns (total 34 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   name                       2299 non-null   object 
 1   gender                     2299 non-null   object 
 2   minutes_mean               2299 non-null   float64
 3   minutes_max                2299 non-null   float64
 4   total_tourneys_played      2299 non-null   int64  
 5   total_matches_played       2299 non-null   int64  
 6   total_matches_won          2299 non-null   float64
 7   matches_won_ratio          2299 non-null   float64
 8   mean_performance_index     2299 non-null   Float64
 9   max_performance_index      2299 non-null   Float64
 10  min_performance_index      2299 non-null   Float64
 11  performance_index_entropy  2299 non-null   Float64
 12  ht                         483 non-null    float64
 13  age                        2299 non-null   flo

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


## Correlation analysis

In [24]:
def get_redundant_pairs(df):
    '''Get diagonal and lower triangular pairs of correlation matrix'''
    pairs_to_drop = set()
    cols = df.columns
    for i in range(0, df.shape[1]):
        for j in range(0, i+1):
            pairs_to_drop.add((cols[i], cols[j]))
    return pairs_to_drop

corr_threshold = 0.7
correlation = df_players.corr(method="pearson") #.abs()
correlation = correlation.where(np.tril(np.ones(correlation.shape)).astype(bool)) # remove upper triangle

px.imshow(correlation[(correlation>=corr_threshold) | (correlation<=-corr_threshold)], labels=dict(color="Correlation"), color_continuous_scale=px.colors.diverging.RdBu, zmin=-1, zmax=1, width=1000, height=1000).show()

correlation = correlation.unstack().drop(labels=get_redundant_pairs(correlation)).sort_values(ascending=False).drop_duplicates()
print(f"These are {len(correlation[(correlation>=corr_threshold) | (correlation<=-corr_threshold)])} pairs whose correlation is bigger/small than ±{corr_threshold*100}%:")
correlation[(correlation>=corr_threshold) | (correlation<=-corr_threshold)]

These are 44 pairs whose correlation is bigger/small than ±70.0%:


total_matches_played       total_matches_won            0.986467
total_tourneys_played      total_matches_played         0.983970
mean_tourney_spectators    mean_tourney_revenue         0.970750
mean_rank_points           max_rank_points              0.959518
max_tourney_spectators     max_tourney_revenue          0.954754
matches_won_ratio          mean_performance_index       0.945587
total_tourneys_played      total_matches_won            0.942372
max_performance_index      performance_index_entropy    0.898209
matches_won_ratio          rel_gmsWon                   0.893137
total_matches_played       performance_index_entropy    0.876659
mean_performance_index     max_performance_index        0.874870
max_rank_points            last_rank_points             0.871592
total_tourneys_played      performance_index_entropy    0.869798
total_matches_won          performance_index_entropy    0.857711
mean_performance_index     rel_gmsWon                   0.841916
matches_won_ratio        

## Feature analysis

In [25]:
df_players.loc[:,'total_matches_played'].hist()

In [26]:
df_players.describe()

Unnamed: 0,minutes_mean,minutes_max,total_tourneys_played,total_matches_played,total_matches_won,matches_won_ratio,mean_performance_index,max_performance_index,min_performance_index,performance_index_entropy,...,rel_df,rel_1stIn,rel_1stWon,rel_2ndWon,1WonOn1In,1WonOn2Won,rel_ptsWon,rel_bpFaced,rel_bpSaved,rel_gmsWon
count,2299.0,2299.0,2299.0,2299.0,2299.0,2299.0,2299.0,2299.0,2299.0,2299.0,...,2299.0,2299.0,2299.0,2299.0,2299.0,2183.0,2299.0,2299.0,2299.0,2299.0
mean,91.399894,165.331013,50.618095,105.319704,56.07786,0.431163,0.233639,0.588843,0.003283,3.300087,...,0.052463,0.605577,0.391381,0.18041,0.647457,inf,0.486619,0.057552,0.524137,0.474291
std,13.01716,48.605663,47.2657,104.584744,60.637515,0.166794,0.121569,0.309056,0.033018,1.697273,...,0.023154,0.049175,0.045975,0.029654,0.066711,,0.030397,0.014355,0.07797,0.075078
min,34.625,43.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.008889,0.404926,0.132932,0.058269,0.274994,1.090117,0.216729,0.019887,0.120536,0.009615
25%,84.3875,134.0,8.0,11.0,4.0,0.339568,0.137624,0.333333,0.0,1.829968,...,0.036616,0.574274,0.363629,0.162414,0.603413,2.11553,0.474245,0.04739,0.482991,0.444189
50%,92.580189,162.0,33.0,61.0,27.0,0.472222,0.248743,0.666667,0.0,3.638728,...,0.04778,0.605777,0.395287,0.183692,0.653247,2.428122,0.493979,0.05569,0.533191,0.493336
75%,99.604545,189.0,90.0,192.0,105.0,0.547516,0.322151,0.833333,0.0,4.887025,...,0.063221,0.636074,0.422492,0.200551,0.696093,2.784476,0.504592,0.065547,0.575272,0.522738
max,164.0,396.0,175.0,387.0,289.0,0.852507,0.833333,1.0,0.833333,5.75425,...,0.241385,0.827553,0.563703,0.30612,0.830508,inf,0.615238,0.128559,0.809524,0.654667


### Tourneys played

In [27]:
df_players.total_tourneys_played.hist()

In [28]:
np.log2(df_players.total_tourneys_played).hist()

### Matches played

In [29]:
df_players.total_matches_played.hist()

In [30]:
np.log(df_players.total_matches_played).hist()

### Matches won

In [31]:
df_players.total_matches_won.hist()

In [32]:
np.log10(df_players.total_matches_won).hist()


divide by zero encountered in log10



### Age

In [33]:
df_players.age.hist()

In [34]:
np.log(df_players.age).hist()

### Hand

### Ranking

In [35]:
df_players.variance_rank_points.hist()

In [36]:
np.log1p(df_players.variance_rank_points).hist()

In [37]:
df_players.last_rank_points.hist()

In [38]:
np.sqrt(np.log(df_players.last_rank_points)).hist()

In [39]:
df_players.to_csv("./datasets/players.csv")

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=56da3ab5-e195-41aa-a609-f5fefeb3379d' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>