In [123]:
import pandas as ps
import matplotlib.pyplot as mt
import seaborn as sb
import numpy as np
import plotly.graph_objects as gg


### reading csv data

In [124]:
atp = ps.read_csv('atp_matches_2023.csv')
atp['tourney_date'] = ps.to_datetime(atp['tourney_date'] , format='%Y%m%d')
atp.count()

tourney_id            2986
tourney_name          2986
surface               2933
draw_size             2986
tourney_level         2986
tourney_date          2986
match_num             2986
winner_id             2986
winner_seed           1250
winner_entry           473
winner_name           2986
winner_hand           2986
winner_ht             2782
winner_ioc            2986
winner_age            2985
loser_id              2986
loser_seed             769
loser_entry            695
loser_name            2986
loser_hand            2986
loser_ht              2650
loser_ioc             2986
loser_age             2979
score                 2986
best_of               2986
round                 2986
minutes               2754
w_ace                 2815
w_df                  2815
w_svpt                2815
w_1stIn               2815
w_1stWon              2815
w_2ndWon              2815
w_SvGms               2815
w_bpSaved             2815
w_bpFaced             2815
l_ace                 2815
l

In [182]:
atp.columns

Index(['index', 'tourney_id', 'tourney_name', 'surface', 'draw_size',
       'tourney_level', 'tourney_date', 'match_num', 'winner_id',
       'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
       'loser_id', 'loser_name', 'loser_hand', 'loser_ht', 'loser_ioc',
       'loser_age', 'score', 'best_of', 'round', 'minutes', 'w_ace', 'w_df',
       'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon', 'w_SvGms', 'w_bpSaved',
       'w_bpFaced', 'l_ace', 'l_df', 'l_svpt', 'l_1stIn', 'l_1stWon',
       'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced', 'winner_rank',
       'winner_rank_points', 'loser_rank', 'loser_rank_points', 'match_id',
       'set1', 'set2', 'set3', 'set4', 'set5'],
      dtype='object')

#### creating match ID and splitting match set details to lowest granuality

In [125]:
atp['match_id'] = atp['tourney_id'].astype(str).str.cat(atp['match_num'].astype(str), sep = '')
atp[['set1' ,'set2' , 'set3' , 'set4' , 'set5']] = atp['score'].str.split(' ', expand = True)

cols = ['set1' , 'set2', 'set3','set4','set5']

for column in atp[['set1','set2', 'set3','set4','set5']]:
    atp.loc[atp[column].str.contains('-') == False , column] = '0-0'
    atp[column].fillna(value = '0-0', inplace=True)
atp.drop(['winner_seed','winner_entry','loser_seed','loser_entry'] , axis = 1 , inplace=True)
atp = atp.dropna(axis = 0)


#### set point difference creation 

In [126]:

atp.reset_index(inplace = True)

In [127]:
winner_stats = atp[['match_id','surface','tourney_level','set1' , 'set2', 'set3','winner_rank' , 'winner_id' , 'winner_hand' , 'winner_ht' , 'winner_age' , 'score' , 'minutes', 'w_ace','w_df','w_svpt','w_1stIn','w_1stWon','w_2ndWon','w_SvGms', 'w_bpSaved','w_bpFaced','winner_rank_points']]
loser_stats = atp[['match_id','surface','tourney_level','set1' , 'set2', 'set3','loser_rank' , 'loser_id' , 'loser_hand' , 'loser_ht' , 'loser_age' , 'score' , 'minutes', 'l_ace','l_df','l_svpt','l_1stIn','l_1stWon','l_2ndWon','l_SvGms', 'l_bpSaved','l_bpFaced','loser_rank_points']]
#rename columns of both df into generic form by removing 'winner_', 'w_',l+','loser_' suffixes and adding ''ply_' for later concatenation 
winner_stats.columns  = [col.replace('winner_', 'ply_') for col in winner_stats.columns]
winner_stats.columns  = [col.replace('w_', 'ply_') for col in winner_stats.columns]
loser_stats.columns  = [col.replace('loser_', 'ply_') for col in loser_stats.columns]
loser_stats.columns  = [col.replace('l_', 'ply_') for col in loser_stats.columns]
winner_stats = winner_stats.reset_index(drop=True)
loser_stats = loser_stats.reset_index(drop=True)
winner_stats = winner_stats.replace('ND', '0.0' )
loser_stats = loser_stats.replace('ND', '0.0' )

In [128]:
def winnerset_pnt_diff(scrcs):
    for i in range(scrcs.shape[0]):
        scores1 = scrcs['set1'][i]
        scores1 = scores1.replace('-',' ').replace('(',' ').replace(')','').split()
        float_scores1 = [float(string) for string in scores1]
        diff1 = float_scores1[0] - float_scores1[1]
        scrcs.loc[[i],'set1_pnt_diff'] = diff1

        scores2 = scrcs['set2'][i]
        scores2 = scores2.replace('-',' ').replace('(',' ').replace(')','').split()
        float_scores2 = [float(string) for string in scores2]
        diff2 = float_scores2[0] - float_scores2[1]
        scrcs.loc[[i],'set2_pnt_diff'] = diff2
        
        scores = scrcs['set3'][i]
        scores = scores.replace('-',' ').replace('(',' ').replace(')','').replace('[',' ').replace(']','').split()
        float_scores = [float(string) for string in scores]
        diff3 = float_scores[0] - float_scores[1]
        scrcs.loc[[i],'set3_pnt_diff'] = diff3



winnerset_pnt_diff(winner_stats)



In [129]:
def loserset_pnt_diff(scrcs):
    for i in range(scrcs.shape[0]):
        scores1 = scrcs['set1'][i]
        scores1 = scores1.replace('-',' ').replace('(',' ').replace(')','').split()
        float_scores1 = [float(string) for string in scores1]
        diff1 = - float_scores1[0] + float_scores1[1]
        scrcs.loc[[i],'set1_pnt_diff'] = diff1

        scores2 = scrcs['set2'][i]
        scores2 = scores2.replace('-',' ').replace('(',' ').replace(')','').split()
        float_scores2 = [float(string) for string in scores2]
        diff2 = - float_scores2[0] + float_scores2[1]
        scrcs.loc[[i],'set2_pnt_diff'] = diff2
        
        scores = scrcs['set3'][i]
        scores = scores.replace('-',' ').replace('(',' ').replace(')','').replace('[',' ').replace(']','').split()
        float_scores = [float(string) for string in scores]
        diff3 = - float_scores[0] + float_scores[1]
        scrcs.loc[[i],'set3_pnt_diff'] = diff3

loserset_pnt_diff(loser_stats)

In [130]:
loser_stats

Unnamed: 0,match_id,surface,tourney_level,set1,set2,set3,ply_rank,ply_id,ply_hand,ply_ht,...,ply_1stIn,ply_1stWon,ply_2ndWon,ply_SvGms,ply_bpSaved,ply_bpFaced,ply_rank_points,set1_pnt_diff,set2_pnt_diff,set3_pnt_diff
0,2023-9900300,Hard,A,7-6(4),7-6(6),0-0,16.0,126610,R,196.0,...,62.0,47.0,15.0,12.0,9.0,9.0,2375.0,-1.0,-1.0,0.0
1,2023-9900299,Hard,A,6-2,0-0,0-0,23.0,207518,R,185.0,...,12.0,8.0,3.0,4.0,1.0,3.0,1865.0,-4.0,0.0,0.0
2,2023-9900296,Hard,A,7-6(5),7-6(5),0-0,10.0,128034,R,196.0,...,62.0,51.0,7.0,12.0,2.0,2.0,2905.0,-1.0,-1.0,0.0
3,2023-9900295,Hard,A,6-3,6-3,0-0,245.0,200390,R,183.0,...,41.0,26.0,12.0,9.0,6.0,9.0,220.0,-3.0,-3.0,0.0
4,2023-9900292,Hard,A,4-6,7-6(2),6-4,16.0,126610,R,196.0,...,58.0,48.0,18.0,16.0,1.0,2.0,2375.0,2.0,-1.0,-2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2353,2023-7696300,Hard,F,3-4(6),4-1,4-2,36.0,209950,R,185.0,...,57.0,46.0,12.0,13.0,1.0,4.0,1158.0,1.0,-3.0,-2.0
2354,2023-7696298,Hard,F,4-3(5),2-1,0-0,94.0,208502,L,180.0,...,18.0,15.0,8.0,5.0,0.0,1.0,673.0,-1.0,-1.0,0.0
2355,2023-7696297,Hard,F,4-2,3-4(3),4-2,94.0,208502,L,180.0,...,56.0,39.0,12.0,12.0,8.0,11.0,673.0,-2.0,1.0,-2.0
2356,2023-7696295,Hard,F,2-4,4-3(6),4-2,115.0,208134,R,185.0,...,65.0,49.0,11.0,15.0,3.0,5.0,549.0,2.0,-1.0,-2.0


### OT meta data
#### creating columns that denotes the OT sets

In [131]:
def winnerset_pnt_ot_diff(scrcs):
    for i in range(scrcs.shape[0]):
        scores1 = scrcs['set1'][i]
        scores1 = scores1.replace('-',' ').replace('(',' ').replace(')','').split()
        float_scores1 = [float(string) for string in scores1]
        if len(float_scores1) == 3:
            if float_scores1[2] <6:
                if float_scores1[0] > float_scores1[1]:
                     diff1 = 7 - float_scores1[2]
                else:
                    diff1 = float_scores1[2] - 7
                scrcs.loc[[i],'set1_ot_diff'] = diff1
            elif (float_scores1[2] > 5) :
                if float_scores1[0] > float_scores1[1]:
                    diff1 = 10 - float_scores1[2]
                    scrcs.loc[[i],'set1_ot_diff'] = diff1
                else:
                    diff1 =  float_scores1[2] - 10
                    scrcs.loc[[i],'set1_ot_diff'] = diff1
        elif len(float_scores2) == 2:
                scrcs.loc[[i],'set1_ot_diff'] = 0

        scores2 = scrcs['set2'][i]
        scores2 = scores2.replace('-',' ').replace('(',' ').replace(')','').split()
        float_scores2 = [float(string) for string in scores2]
        if len(float_scores2) == 3:
            if float_scores2[2] <6:
                if float_scores2[0] >float_scores2[1]:
                    diff2 = 7 - float_scores2[2]
                else:
                     diff2 =  float_scores2[2] -7
                scrcs.loc[[i],'set2_ot_diff'] = diff2
            elif (float_scores2[2] > 5):
                if float_scores2[0] >float_scores2[1]:
                    diff2 = 10 - float_scores2[2]
                    scrcs.loc[[i],'set2_ot_diff'] = diff2
                else:
                    diff2 = float_scores2[2] - 10
                    scrcs.loc[[i],'set2_ot_diff'] = diff2 
        elif len(float_scores2) == 2:
                scrcs.loc[[i],'set2_ot_diff'] = 0


        scores3 = scrcs['set3'][i]


winnerset_pnt_ot_diff(winner_stats)

In [132]:
winner_stats[['set1','set1_ot_diff','set2','set2_ot_diff']]

Unnamed: 0,set1,set1_ot_diff,set2,set2_ot_diff
0,7-6(4),3.0,7-6(6),4.0
1,6-2,,0-0,0.0
2,7-6(5),2.0,7-6(5),2.0
3,6-3,,6-3,0.0
4,4-6,0.0,7-6(2),5.0
...,...,...,...,...
2353,3-4(6),-4.0,4-1,0.0
2354,4-3(5),2.0,2-1,0.0
2355,4-2,0.0,3-4(3),-4.0
2356,2-4,,4-3(6),4.0


In [133]:
def loserset_pnt_ot_diff(scrcs):
    for i in range(scrcs.shape[0]):
        scores1 = scrcs['set1'][i]
        scores1 = scores1.replace('-',' ').replace('(',' ').replace(')','').split()
        float_scores1 = [float(string) for string in scores1]
        if len(float_scores1) == 3:
            if float_scores1[2] <6:
                if float_scores1[0] > float_scores1[1]:
                     diff1 = -7 + float_scores1[2]
                else:
                    diff1 = - float_scores1[2] + 7
                scrcs.loc[[i],'set1_ot_diff'] = diff1
            elif (float_scores1[2] > 5) :
                if float_scores1[0] > float_scores1[1]:
                    diff1 = -10 + float_scores1[2]
                    scrcs.loc[[i],'set1_ot_diff'] = diff1
                else:
                    diff1 = - float_scores1[2] +10

                    scrcs.loc[[i],'set1_ot_diff'] = diff1
        elif len(float_scores2) == 2:
                scrcs.loc[[i],'set1_ot_diff'] = 0

        scores2 = scrcs['set2'][i]
        scores2 = scores2.replace('-',' ').replace('(',' ').replace(')','').split()
        float_scores2 = [float(string) for string in scores2]
        if len(float_scores2) == 3:
            if float_scores2[2] <6:
                if float_scores2[0] >float_scores2[1]:
                    diff2 = -7 + float_scores2[2]
                else:
                     diff2 =  -float_scores2[2] +7
                scrcs.loc[[i],'set2_ot_diff'] = diff2
            elif (float_scores2[2] > 5):
                if float_scores2[0] >float_scores2[1]:
                    diff2 = -10 + float_scores2[2]
                    scrcs.loc[[i],'set2_ot_diff'] = diff2
                else:
                    diff2 =  -float_scores2[2] + 10
                    scrcs.loc[[i],'set2_ot_diff'] = diff2 
        elif len(float_scores2) == 2:
                scrcs.loc[[i],'set2_ot_diff'] = 0



loserset_pnt_ot_diff(loser_stats)

In [134]:
loser_stats[['set1','set1_ot_diff','set2','set2_ot_diff']]

Unnamed: 0,set1,set1_ot_diff,set2,set2_ot_diff
0,7-6(4),-3.0,7-6(6),-4.0
1,6-2,,0-0,0.0
2,7-6(5),-2.0,7-6(5),-2.0
3,6-3,,6-3,0.0
4,4-6,0.0,7-6(2),-5.0
...,...,...,...,...
2353,3-4(6),4.0,4-1,0.0
2354,4-3(5),-2.0,2-1,0.0
2355,4-2,0.0,3-4(3),4.0
2356,2-4,,4-3(6),-4.0


### creating a column that denotes who won each set

In [135]:
def set_winner(atp):
    for i in range(atp.shape[0]):
        scores1 = atp['set1'][i]
        scores1 = scores1.replace('-',' ').replace('(',' ').replace(')','').split()
        float_scores1 = [float(string) for string in scores1]
        if float_scores1[0] > float_scores1[1]:
            atp.loc[[i],'set1_won'] = 1
        elif float_scores1[0] == float_scores1[1]:
            atp.loc[[i],'set1_won'] = 1
        elif float_scores1[0] < float_scores1[1]:
            atp.loc[[i],'set1_won'] = 0

        scores2 = atp['set2'][i]
        scores2 = scores2.replace('-',' ').replace('(',' ').replace(')','').split()
        float_scores2 = [float(string) for string in scores2]
        if float_scores2[0] > float_scores2[1]:
            atp.loc[[i],'set2_won'] = 1
        elif float_scores2[0] < float_scores2[1]:
            atp.loc[[i],'set2_won'] = 0

        scores3 = atp['set3'][i]
        scores3 = scores3.replace('-',' ').replace('(',' ').replace(')','').split()
        float_scores3 = [float(string) for string in scores3]
        if float_scores3[0] > float_scores3[1]:
            atp.loc[[i],'set3_won'] = 1
        elif float_scores3[0] < float_scores3[1]:
            atp.loc[[i],'set3_won'] = 0

set_winner(winner_stats)


In [136]:
set_winner(loser_stats)

In [137]:
winner_stats.columns

Index(['match_id', 'surface', 'tourney_level', 'set1', 'set2', 'set3',
       'ply_rank', 'ply_id', 'ply_hand', 'ply_ht', 'ply_age', 'score',
       'minutes', 'ply_ace', 'ply_df', 'ply_svpt', 'ply_1stIn', 'ply_1stWon',
       'ply_2ndWon', 'ply_SvGms', 'ply_bpSaved', 'ply_bpFaced',
       'ply_rank_points', 'set1_pnt_diff', 'set2_pnt_diff', 'set3_pnt_diff',
       'set1_ot_diff', 'set2_ot_diff', 'set1_won', 'set2_won', 'set3_won'],
      dtype='object')

In [138]:
ply_stats = ps.concat([winner_stats,loser_stats], axis = 0 , ignore_index= True)
ply_stats

Unnamed: 0,match_id,surface,tourney_level,set1,set2,set3,ply_rank,ply_id,ply_hand,ply_ht,...,ply_bpFaced,ply_rank_points,set1_pnt_diff,set2_pnt_diff,set3_pnt_diff,set1_ot_diff,set2_ot_diff,set1_won,set2_won,set3_won
0,2023-9900300,Hard,A,7-6(4),7-6(6),0-0,9.0,126203,R,193.0,...,0.0,3355.0,1.0,1.0,0.0,3.0,4.0,1.0,1.0,
1,2023-9900299,Hard,A,6-2,0-0,0-0,19.0,126207,R,188.0,...,3.0,2000.0,4.0,0.0,0.0,,0.0,1.0,,
2,2023-9900296,Hard,A,7-6(5),7-6(5),0-0,9.0,126203,R,193.0,...,4.0,3355.0,1.0,1.0,0.0,2.0,2.0,1.0,1.0,
3,2023-9900295,Hard,A,6-3,6-3,0-0,19.0,126207,R,188.0,...,5.0,2000.0,3.0,3.0,0.0,,0.0,1.0,1.0,
4,2023-9900292,Hard,A,4-6,7-6(2),6-4,4.0,126774,R,193.0,...,2.0,5550.0,-2.0,1.0,2.0,0.0,5.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4711,2023-7696300,Hard,F,3-4(6),4-1,4-2,36.0,209950,R,185.0,...,4.0,1158.0,1.0,-3.0,-2.0,4.0,0.0,0.0,1.0,1.0
4712,2023-7696298,Hard,F,4-3(5),2-1,0-0,94.0,208502,L,180.0,...,1.0,673.0,-1.0,-1.0,0.0,-2.0,0.0,1.0,1.0,
4713,2023-7696297,Hard,F,4-2,3-4(3),4-2,94.0,208502,L,180.0,...,11.0,673.0,-2.0,1.0,-2.0,0.0,4.0,1.0,0.0,1.0
4714,2023-7696295,Hard,F,2-4,4-3(6),4-2,115.0,208134,R,185.0,...,5.0,549.0,2.0,-1.0,-2.0,,-4.0,0.0,1.0,1.0


In [139]:
ply_stats.columns

Index(['match_id', 'surface', 'tourney_level', 'set1', 'set2', 'set3',
       'ply_rank', 'ply_id', 'ply_hand', 'ply_ht', 'ply_age', 'score',
       'minutes', 'ply_ace', 'ply_df', 'ply_svpt', 'ply_1stIn', 'ply_1stWon',
       'ply_2ndWon', 'ply_SvGms', 'ply_bpSaved', 'ply_bpFaced',
       'ply_rank_points', 'set1_pnt_diff', 'set2_pnt_diff', 'set3_pnt_diff',
       'set1_ot_diff', 'set2_ot_diff', 'set1_won', 'set2_won', 'set3_won'],
      dtype='object')

In [140]:
atp

Unnamed: 0,index,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_name,...,winner_rank,winner_rank_points,loser_rank,loser_rank_points,match_id,set1,set2,set3,set4,set5
0,0,2023-9900,United Cup,Hard,18,A,2023-01-02,300,126203,Taylor Fritz,...,9.0,3355.0,16.0,2375.0,2023-9900300,7-6(4),7-6(6),0-0,0-0,0-0
1,1,2023-9900,United Cup,Hard,18,A,2023-01-02,299,126207,Frances Tiafoe,...,19.0,2000.0,23.0,1865.0,2023-9900299,6-2,0-0,0-0,0-0,0-0
2,2,2023-9900,United Cup,Hard,18,A,2023-01-02,296,126203,Taylor Fritz,...,9.0,3355.0,10.0,2905.0,2023-9900296,7-6(5),7-6(5),0-0,0-0,0-0
3,3,2023-9900,United Cup,Hard,18,A,2023-01-02,295,126207,Frances Tiafoe,...,19.0,2000.0,245.0,220.0,2023-9900295,6-3,6-3,0-0,0-0,0-0
4,4,2023-9900,United Cup,Hard,18,A,2023-01-02,292,126774,Stefanos Tsitsipas,...,4.0,5550.0,16.0,2375.0,2023-9900292,4-6,7-6(2),6-4,0-0,0-0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2353,2751,2023-7696,NextGen Finals,Hard,8,F,2023-11-27,300,209098,Hamad Medjedovic,...,110.0,582.0,36.0,1158.0,2023-7696300,3-4(6),4-1,4-2,3-4(9),4-1
2354,2753,2023-7696,NextGen Finals,Hard,8,F,2023-11-27,298,209098,Hamad Medjedovic,...,110.0,582.0,94.0,673.0,2023-7696298,4-3(5),2-1,0-0,0-0,0-0
2355,2754,2023-7696,NextGen Finals,Hard,8,F,2023-11-27,297,209950,Arthur Fils,...,36.0,1158.0,94.0,673.0,2023-7696297,4-2,3-4(3),4-2,4-3(5),0-0
2356,2756,2023-7696,NextGen Finals,Hard,8,F,2023-11-27,295,209950,Arthur Fils,...,36.0,1158.0,115.0,549.0,2023-7696295,2-4,4-3(6),4-2,1-4,4-2


In [141]:
ply_stats = ply_stats[ply_stats['set3'] != '0-0']

In [142]:
ply_stats

Unnamed: 0,match_id,surface,tourney_level,set1,set2,set3,ply_rank,ply_id,ply_hand,ply_ht,...,ply_bpFaced,ply_rank_points,set1_pnt_diff,set2_pnt_diff,set3_pnt_diff,set1_ot_diff,set2_ot_diff,set1_won,set2_won,set3_won
4,2023-9900292,Hard,A,4-6,7-6(2),6-4,4.0,126774,R,193.0,...,2.0,5550.0,-2.0,1.0,2.0,0.0,5.0,0.0,1.0,1.0
5,2023-9900288,Hard,A,6-0,6-7(4),7-5,4.0,126774,R,193.0,...,6.0,5550.0,6.0,-1.0,2.0,,-3.0,1.0,0.0,1.0
6,2023-9900284,Hard,A,6-4,3-6,6-3,16.0,126610,R,196.0,...,4.0,2375.0,2.0,-3.0,3.0,,0.0,1.0,0.0,1.0
8,2023-9900280,Hard,A,6-4,5-7,6-4,14.0,111815,L,188.0,...,8.0,2445.0,2.0,-2.0,2.0,0.0,0.0,1.0,0.0,1.0
9,2023-9900279,Hard,A,3-6,7-5,6-3,19.0,126207,R,188.0,...,10.0,2000.0,-3.0,2.0,3.0,0.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4707,2023-0605289,Hard,A,6-7(3),6-3,6-4,2.0,207989,R,185.0,...,8.0,8455.0,1.0,-3.0,-2.0,4.0,0.0,0.0,1.0,1.0
4711,2023-7696300,Hard,F,3-4(6),4-1,4-2,36.0,209950,R,185.0,...,4.0,1158.0,1.0,-3.0,-2.0,4.0,0.0,0.0,1.0,1.0
4713,2023-7696297,Hard,F,4-2,3-4(3),4-2,94.0,208502,L,180.0,...,11.0,673.0,-2.0,1.0,-2.0,0.0,4.0,1.0,0.0,1.0
4714,2023-7696295,Hard,F,2-4,4-3(6),4-2,115.0,208134,R,185.0,...,5.0,549.0,2.0,-1.0,-2.0,,-4.0,0.0,1.0,1.0


In [143]:
ply_stats = ply_stats[['match_id', 'surface', 'tourney_level', 'set1', 'set2', 'set3',
       'ply_rank', 'ply_id', 'ply_hand', 'ply_ht', 'ply_age', 'score',
       'minutes', 'ply_ace', 'ply_df', 'ply_svpt', 'ply_1stIn', 'ply_1stWon',
       'ply_2ndWon', 'ply_SvGms', 'ply_bpSaved', 'ply_bpFaced',
       'ply_rank_points', 'set1_pnt_diff', 'set2_pnt_diff', 'set3_pnt_diff',
       'set1_ot_diff', 'set2_ot_diff', 'set1_won', 'set2_won', 'set3_won']]
features = ['match_id', 'surface', 'tourney_level', 'set1', 'set2', 'set3',
       'ply_rank', 'ply_id', 'ply_hand', 'ply_ht', 'ply_age', 'score',
       'minutes', 'ply_ace', 'ply_df', 'ply_svpt', 'ply_1stIn', 'ply_1stWon',
       'ply_2ndWon', 'ply_SvGms', 'ply_bpSaved', 'ply_bpFaced',
       'ply_rank_points', 'set1_pnt_diff', 'set2_pnt_diff', 'set3_pnt_diff',
       'set1_ot_diff', 'set2_ot_diff', 'set1_won', 'set2_won']

target = ['set3_won']

In [None]:
ply_stats_enc = ps.get_dummies(ply_stats , columns = ['surface','ply_hand'])


In [150]:
ply_stats_enc['tourney_level'].unique()

array(['A', 'G', 'M', 'F'], dtype=object)

In [151]:
tourney_level_order = {'A': 0, 'M': 1, 'G': 2, 'F': 3}
ply_stats_enc['tourney_level'] = ply_stats_enc['tourney_level'].map(tourney_level_order)


In [153]:
ply_stats_enc.columns

Index(['match_id', 'tourney_level', 'set1', 'set2', 'set3', 'ply_rank',
       'ply_id', 'ply_ht', 'ply_age', 'score', 'minutes', 'ply_ace', 'ply_df',
       'ply_svpt', 'ply_1stIn', 'ply_1stWon', 'ply_2ndWon', 'ply_SvGms',
       'ply_bpSaved', 'ply_bpFaced', 'ply_rank_points', 'set1_pnt_diff',
       'set2_pnt_diff', 'set3_pnt_diff', 'set1_ot_diff', 'set2_ot_diff',
       'set1_won', 'set2_won', 'set3_won', 'surface_Clay', 'surface_Grass',
       'surface_Hard', 'ply_hand_L', 'ply_hand_R'],
      dtype='object')

In [170]:
features = [ 'tourney_level',  'ply_rank',
       'ply_id', 'ply_ht', 'ply_age', 'minutes', 'ply_ace', 'ply_df',
       'ply_svpt', 'ply_1stIn', 'ply_1stWon', 'ply_2ndWon', 'ply_SvGms',
       'ply_bpSaved', 'ply_bpFaced', 'ply_rank_points', 'set1_pnt_diff',
       'set2_pnt_diff', 'set3_pnt_diff', 'set1_ot_diff', 'set2_ot_diff',
       'set1_won', 'set2_won',  'surface_Clay', 'surface_Grass',
       'surface_Hard', 'ply_hand_L', 'ply_hand_R']
target =['set3_won']
X = ply_stats_enc[features]
y = ply_stats_enc[target]

In [177]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

In [172]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [173]:
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)

In [174]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))

Accuracy: 0.9705014749262537
              precision    recall  f1-score   support

         0.0       0.92      0.68      0.78        53
         1.0       0.97      1.00      0.98       625

    accuracy                           0.97       678
   macro avg       0.95      0.84      0.88       678
weighted avg       0.97      0.97      0.97       678



In [176]:
print(y_test, y_pred)

      set3_won
3413       1.0
1571       1.0
2509       1.0
2356       1.0
147        1.0
...        ...
4280       1.0
1756       1.0
117        1.0
1105       1.0
1514       1.0

[678 rows x 1 columns] [1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1
 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0
 1 1 1 0 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1