In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import pandas as pd

# Set display option to show all columns
pd.set_option('display.max_columns', None)


In [2]:
df = pd.read_csv("df_atp_clean.csv")
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44559 entries, 0 to 44558
Data columns (total 23 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ATP                 44559 non-null  int64  
 1   Location            44559 non-null  object 
 2   Tournament          44559 non-null  object 
 3   Date                44559 non-null  object 
 4   Series              44559 non-null  object 
 5   Court               44559 non-null  object 
 6   Surface             44559 non-null  object 
 7   Round               44559 non-null  object 
 8   Best of             44559 non-null  float64
 9   Winner              44559 non-null  object 
 10  Loser               44559 non-null  object 
 11  Comment             44559 non-null  object 
 12  B365W               44559 non-null  float64
 13  B365L               44559 non-null  float64
 14  PSW                 44559 non-null  float64
 15  PSL                 44559 non-null  float64
 16  WPts

In [3]:
player_id_mapping = {}  # Initialize player-ID mapping

# Assign IDs to Winners
winner_id = 1  # Start ID numbering from 1
for winner in df['Winner'].unique():
    if winner not in player_id_mapping:
        player_id_mapping[winner] = winner_id
        winner_id += 1

# Assign IDs to Losers
for loser in df['Loser'].unique():
    if loser not in player_id_mapping:
        player_id_mapping[loser] = winner_id
        winner_id += 1

# Replace Player Names with IDs
df['Winner_ID'] = df['Winner'].map(player_id_mapping)
df['Loser_ID'] = df['Loser'].map(player_id_mapping)


In [4]:
df

Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,Loser,Comment,B365W,B365L,PSW,PSL,WPts,LPts,match_count_winner,match_count_loser,elo_winner,elo_loser,proba_elo,Winner_ID,Loser_ID
0,1,Adelaide,Next Generation Adelaide International,2006-01-02,International,Outdoor,Hard,1st Round,3.0,Berdych T.,Calleri A.,Completed,1.39,2.75,1.446,3.04,1200.0,692.0,1,2,1482.654703,1504.995140,0.467894,1,279
1,1,Adelaide,Next Generation Adelaide International,2006-01-02,International,Outdoor,Hard,1st Round,3.0,Kohlschreiber P.,Guccione C.,Completed,1.53,2.37,1.719,2.29,470.0,265.0,1,0,1484.000000,1500.000000,0.476990,2,19
2,1,Adelaide,Next Generation Adelaide International,2006-01-02,International,Outdoor,Hard,1st Round,3.0,Malisse X.,Luczak P.,Completed,1.28,3.25,1.391,3.36,731.0,286.0,4,0,1502.708326,1500.000000,0.503898,3,17
3,1,Adelaide,Next Generation Adelaide International,2006-01-02,International,Outdoor,Hard,1st Round,3.0,Robredo T.,Melzer J.,Completed,1.53,2.37,1.606,2.55,1490.0,690.0,0,5,1500.000000,1490.593267,0.513534,4,70
4,1,Adelaide,Next Generation Adelaide International,2006-01-02,International,Outdoor,Hard,1st Round,3.0,Ancic M.,Clement A.,Completed,1.44,2.62,1.549,2.72,1360.0,558.0,1,6,1487.142017,1475.819981,0.516288,5,26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44554,67,Paris,BNP Paribas Masters,2022-11-04,Masters 1000,Indoor,Hard,Quarterfinals,3.0,Djokovic N.,Musetti L.,Completed,1.11,6.50,1.140,6.95,4320.0,1746.0,1235,163,1999.225470,1666.726069,0.871470,21,264
44555,67,Paris,BNP Paribas Masters,2022-11-04,Masters 1000,Indoor,Hard,Quarterfinals,3.0,Tsitsipas S.,Paul T.,Completed,1.40,3.00,1.450,2.97,5035.0,1330.0,415,231,1862.431885,1762.264077,0.640288,224,135
44556,67,Paris,BNP Paribas Masters,2022-11-05,Masters 1000,Indoor,Hard,Semifinals,3.0,Rune H.,Auger-Aliassime F.,Completed,2.75,1.44,2.810,1.50,1991.0,3725.0,153,284,1854.208821,1864.746873,0.484839,150,236
44557,67,Paris,BNP Paribas Masters,2022-11-05,Masters 1000,Indoor,Hard,Semifinals,3.0,Djokovic N.,Tsitsipas S.,Completed,1.20,4.50,1.240,4.62,4320.0,5035.0,1236,416,2003.338426,1873.942685,0.678058,21,224


In [5]:
# One-hot encode the 'Surface' column
df = pd.get_dummies(df, columns=['Surface'], prefix='Surface')

# Display the first few rows of the DataFrame to verify the encoding
df.head()

Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Round,Best of,Winner,Loser,Comment,B365W,B365L,PSW,PSL,WPts,LPts,match_count_winner,match_count_loser,elo_winner,elo_loser,proba_elo,Winner_ID,Loser_ID,Surface_Carpet,Surface_Clay,Surface_Grass,Surface_Hard
0,1,Adelaide,Next Generation Adelaide International,2006-01-02,International,Outdoor,1st Round,3.0,Berdych T.,Calleri A.,Completed,1.39,2.75,1.446,3.04,1200.0,692.0,1,2,1482.654703,1504.99514,0.467894,1,279,False,False,False,True
1,1,Adelaide,Next Generation Adelaide International,2006-01-02,International,Outdoor,1st Round,3.0,Kohlschreiber P.,Guccione C.,Completed,1.53,2.37,1.719,2.29,470.0,265.0,1,0,1484.0,1500.0,0.47699,2,19,False,False,False,True
2,1,Adelaide,Next Generation Adelaide International,2006-01-02,International,Outdoor,1st Round,3.0,Malisse X.,Luczak P.,Completed,1.28,3.25,1.391,3.36,731.0,286.0,4,0,1502.708326,1500.0,0.503898,3,17,False,False,False,True
3,1,Adelaide,Next Generation Adelaide International,2006-01-02,International,Outdoor,1st Round,3.0,Robredo T.,Melzer J.,Completed,1.53,2.37,1.606,2.55,1490.0,690.0,0,5,1500.0,1490.593267,0.513534,4,70,False,False,False,True
4,1,Adelaide,Next Generation Adelaide International,2006-01-02,International,Outdoor,1st Round,3.0,Ancic M.,Clement A.,Completed,1.44,2.62,1.549,2.72,1360.0,558.0,1,6,1487.142017,1475.819981,0.516288,5,26,False,False,False,True


In [6]:
df2= pd.read_csv("features.csv")
df2

Unnamed: 0,P1==Winner,P1_wins_proba_elo,P1_match_count,P2_match_count,P1_pts,P2_pts,field_type==indoor_hard,P1_match_count_indoor_hard,P2_match_count_indoor_hard,P1_wins_proba_elo_indoor_hard,field_type==indoor_carpet,P1_match_count_indoor_carpet,P2_match_count_indoor_carpet,P1_wins_proba_elo_indoor_carpet,field_type==indoor_clay,P1_match_count_indoor_clay,P2_match_count_indoor_clay,P1_wins_proba_elo_indoor_clay,field_type==outdoor_hard,P1_match_count_outdoor_hard,P2_match_count_outdoor_hard,P1_wins_proba_elo_outdoor_hard,field_type==outdoor_clay,P1_match_count_outdoor_clay,P2_match_count_outdoor_clay,P1_wins_proba_elo_outdoor_clay,field_type==outdoor_grass,P1_match_count_outdoor_grass,P2_match_count_outdoor_grass,P1_wins_proba_elo_outdoor_grass
0,False,0.532106,2,1,692,1200,0,0,0,0.500000,0,0,0,0.500000,0,0,0,0.500000,1,2,1,0.532106,0,0,0,0.500000,0,0,0,0.500000
1,False,0.523010,0,1,265,470,0,0,0,0.500000,0,0,0,0.500000,0,0,0,0.500000,1,0,1,0.523010,0,0,0,0.500000,0,0,0,0.500000
2,False,0.496102,0,4,286,731,0,0,0,0.500000,0,0,0,0.500000,0,0,0,0.500000,1,0,4,0.496102,0,0,0,0.500000,0,0,0,0.500000
3,False,0.486466,5,0,690,1490,0,0,0,0.500000,0,0,0,0.500000,0,0,0,0.500000,1,5,0,0.486466,0,0,0,0.500000,0,0,0,0.500000
4,True,0.516288,1,6,1360,558,0,0,0,0.500000,0,0,0,0.500000,0,0,0,0.500000,1,1,6,0.516288,0,0,0,0.500000,0,0,0,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44569,False,0.252786,444,1240,4065,3320,1,108,190,0.202169,0,0,13,0.491389,0,0,0,0.500000,0,222,582,0.183191,0,54,327,0.092101,0,60,128,0.110676
44570,True,0.301900,425,419,3530,5350,1,100,98,0.300160,0,0,0,0.500000,0,0,0,0.500000,0,198,163,0.469776,0,100,123,0.337953,0,27,35,0.540097
44571,True,0.863992,1241,379,3320,2955,1,191,51,0.890563,0,13,0,0.508611,0,0,0,0.500000,0,582,220,0.937717,0,327,64,0.897572,0,128,44,0.934018
44572,False,0.632670,426,322,3530,5020,1,101,29,0.623226,0,0,1,0.525163,0,0,10,0.409078,0,198,95,0.658726,0,100,175,0.388429,0,27,12,0.726947


In [7]:
df.head()

Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Round,Best of,Winner,Loser,Comment,B365W,B365L,PSW,PSL,WPts,LPts,match_count_winner,match_count_loser,elo_winner,elo_loser,proba_elo,Winner_ID,Loser_ID,Surface_Carpet,Surface_Clay,Surface_Grass,Surface_Hard
0,1,Adelaide,Next Generation Adelaide International,2006-01-02,International,Outdoor,1st Round,3.0,Berdych T.,Calleri A.,Completed,1.39,2.75,1.446,3.04,1200.0,692.0,1,2,1482.654703,1504.99514,0.467894,1,279,False,False,False,True
1,1,Adelaide,Next Generation Adelaide International,2006-01-02,International,Outdoor,1st Round,3.0,Kohlschreiber P.,Guccione C.,Completed,1.53,2.37,1.719,2.29,470.0,265.0,1,0,1484.0,1500.0,0.47699,2,19,False,False,False,True
2,1,Adelaide,Next Generation Adelaide International,2006-01-02,International,Outdoor,1st Round,3.0,Malisse X.,Luczak P.,Completed,1.28,3.25,1.391,3.36,731.0,286.0,4,0,1502.708326,1500.0,0.503898,3,17,False,False,False,True
3,1,Adelaide,Next Generation Adelaide International,2006-01-02,International,Outdoor,1st Round,3.0,Robredo T.,Melzer J.,Completed,1.53,2.37,1.606,2.55,1490.0,690.0,0,5,1500.0,1490.593267,0.513534,4,70,False,False,False,True
4,1,Adelaide,Next Generation Adelaide International,2006-01-02,International,Outdoor,1st Round,3.0,Ancic M.,Clement A.,Completed,1.44,2.62,1.549,2.72,1360.0,558.0,1,6,1487.142017,1475.819981,0.516288,5,26,False,False,False,True


In [8]:
tournament_id_mapping = {}  # Initialize tournament-ID mapping

# Assign IDs to Tournaments
tournament_id = 1  # Start ID numbering from 1
for tournament in df['Tournament'].unique():
    if tournament not in tournament_id_mapping:
        tournament_id_mapping[tournament] = tournament_id
        tournament_id += 1

# Replace Tournament Names with IDs
df['Tournament_ID'] = df['Tournament'].map(tournament_id_mapping)


In [9]:
df

Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Round,Best of,Winner,Loser,Comment,B365W,B365L,PSW,PSL,WPts,LPts,match_count_winner,match_count_loser,elo_winner,elo_loser,proba_elo,Winner_ID,Loser_ID,Surface_Carpet,Surface_Clay,Surface_Grass,Surface_Hard,Tournament_ID
0,1,Adelaide,Next Generation Adelaide International,2006-01-02,International,Outdoor,1st Round,3.0,Berdych T.,Calleri A.,Completed,1.39,2.75,1.446,3.04,1200.0,692.0,1,2,1482.654703,1504.995140,0.467894,1,279,False,False,False,True,1
1,1,Adelaide,Next Generation Adelaide International,2006-01-02,International,Outdoor,1st Round,3.0,Kohlschreiber P.,Guccione C.,Completed,1.53,2.37,1.719,2.29,470.0,265.0,1,0,1484.000000,1500.000000,0.476990,2,19,False,False,False,True,1
2,1,Adelaide,Next Generation Adelaide International,2006-01-02,International,Outdoor,1st Round,3.0,Malisse X.,Luczak P.,Completed,1.28,3.25,1.391,3.36,731.0,286.0,4,0,1502.708326,1500.000000,0.503898,3,17,False,False,False,True,1
3,1,Adelaide,Next Generation Adelaide International,2006-01-02,International,Outdoor,1st Round,3.0,Robredo T.,Melzer J.,Completed,1.53,2.37,1.606,2.55,1490.0,690.0,0,5,1500.000000,1490.593267,0.513534,4,70,False,False,False,True,1
4,1,Adelaide,Next Generation Adelaide International,2006-01-02,International,Outdoor,1st Round,3.0,Ancic M.,Clement A.,Completed,1.44,2.62,1.549,2.72,1360.0,558.0,1,6,1487.142017,1475.819981,0.516288,5,26,False,False,False,True,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44554,67,Paris,BNP Paribas Masters,2022-11-04,Masters 1000,Indoor,Quarterfinals,3.0,Djokovic N.,Musetti L.,Completed,1.11,6.50,1.140,6.95,4320.0,1746.0,1235,163,1999.225470,1666.726069,0.871470,21,264,False,False,False,True,104
44555,67,Paris,BNP Paribas Masters,2022-11-04,Masters 1000,Indoor,Quarterfinals,3.0,Tsitsipas S.,Paul T.,Completed,1.40,3.00,1.450,2.97,5035.0,1330.0,415,231,1862.431885,1762.264077,0.640288,224,135,False,False,False,True,104
44556,67,Paris,BNP Paribas Masters,2022-11-05,Masters 1000,Indoor,Semifinals,3.0,Rune H.,Auger-Aliassime F.,Completed,2.75,1.44,2.810,1.50,1991.0,3725.0,153,284,1854.208821,1864.746873,0.484839,150,236,False,False,False,True,104
44557,67,Paris,BNP Paribas Masters,2022-11-05,Masters 1000,Indoor,Semifinals,3.0,Djokovic N.,Tsitsipas S.,Completed,1.20,4.50,1.240,4.62,4320.0,5035.0,1236,416,2003.338426,1873.942685,0.678058,21,224,False,False,False,True,104


In [10]:
df = pd.get_dummies(df, columns=['Series'], prefix='Series')

In [11]:
df = pd.get_dummies(df, columns=['Court'], prefix='Court')

In [12]:
df = pd.get_dummies(df, columns=['Round'], prefix='Round')

In [13]:
df = pd.get_dummies(df, columns=['Best of'], prefix='Best_of')

In [14]:
# final dataframe for comparing
df_final = df

In [15]:
df_final

Unnamed: 0,ATP,Location,Tournament,Date,Winner,Loser,Comment,B365W,B365L,PSW,PSL,WPts,LPts,match_count_winner,match_count_loser,elo_winner,elo_loser,proba_elo,Winner_ID,Loser_ID,Surface_Carpet,Surface_Clay,Surface_Grass,Surface_Hard,Tournament_ID,Series_ATP250,Series_ATP500,Series_Grand Slam,Series_International,Series_International Gold,Series_Masters,Series_Masters 1000,Series_Masters Cup,Court_Indoor,Court_Outdoor,Round_1st Round,Round_2nd Round,Round_3rd Round,Round_4th Round,Round_Quarterfinals,Round_Round Robin,Round_Semifinals,Round_The Final,Best_of_3.0,Best_of_5.0
0,1,Adelaide,Next Generation Adelaide International,2006-01-02,Berdych T.,Calleri A.,Completed,1.39,2.75,1.446,3.04,1200.0,692.0,1,2,1482.654703,1504.995140,0.467894,1,279,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False
1,1,Adelaide,Next Generation Adelaide International,2006-01-02,Kohlschreiber P.,Guccione C.,Completed,1.53,2.37,1.719,2.29,470.0,265.0,1,0,1484.000000,1500.000000,0.476990,2,19,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False
2,1,Adelaide,Next Generation Adelaide International,2006-01-02,Malisse X.,Luczak P.,Completed,1.28,3.25,1.391,3.36,731.0,286.0,4,0,1502.708326,1500.000000,0.503898,3,17,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False
3,1,Adelaide,Next Generation Adelaide International,2006-01-02,Robredo T.,Melzer J.,Completed,1.53,2.37,1.606,2.55,1490.0,690.0,0,5,1500.000000,1490.593267,0.513534,4,70,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False
4,1,Adelaide,Next Generation Adelaide International,2006-01-02,Ancic M.,Clement A.,Completed,1.44,2.62,1.549,2.72,1360.0,558.0,1,6,1487.142017,1475.819981,0.516288,5,26,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44554,67,Paris,BNP Paribas Masters,2022-11-04,Djokovic N.,Musetti L.,Completed,1.11,6.50,1.140,6.95,4320.0,1746.0,1235,163,1999.225470,1666.726069,0.871470,21,264,False,False,False,True,104,False,False,False,False,False,False,True,False,True,False,False,False,False,False,True,False,False,False,True,False
44555,67,Paris,BNP Paribas Masters,2022-11-04,Tsitsipas S.,Paul T.,Completed,1.40,3.00,1.450,2.97,5035.0,1330.0,415,231,1862.431885,1762.264077,0.640288,224,135,False,False,False,True,104,False,False,False,False,False,False,True,False,True,False,False,False,False,False,True,False,False,False,True,False
44556,67,Paris,BNP Paribas Masters,2022-11-05,Rune H.,Auger-Aliassime F.,Completed,2.75,1.44,2.810,1.50,1991.0,3725.0,153,284,1854.208821,1864.746873,0.484839,150,236,False,False,False,True,104,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,True,False,True,False
44557,67,Paris,BNP Paribas Masters,2022-11-05,Djokovic N.,Tsitsipas S.,Completed,1.20,4.50,1.240,4.62,4320.0,5035.0,1236,416,2003.338426,1873.942685,0.678058,21,224,False,False,False,True,104,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,True,False,True,False


In [16]:
# Drop specified columns
columns_to_drop = ['Winner', 'Loser', 'Location', 'Tournament', 'Comment']
df = df.drop(columns_to_drop, axis=1)

# Display the first few rows of the DataFrame to verify the columns are dropped
print(df.head())
df

   ATP        Date  B365W  B365L    PSW   PSL    WPts   LPts  \
0    1  2006-01-02   1.39   2.75  1.446  3.04  1200.0  692.0   
1    1  2006-01-02   1.53   2.37  1.719  2.29   470.0  265.0   
2    1  2006-01-02   1.28   3.25  1.391  3.36   731.0  286.0   
3    1  2006-01-02   1.53   2.37  1.606  2.55  1490.0  690.0   
4    1  2006-01-02   1.44   2.62  1.549  2.72  1360.0  558.0   

   match_count_winner  match_count_loser   elo_winner    elo_loser  proba_elo  \
0                   1                  2  1482.654703  1504.995140   0.467894   
1                   1                  0  1484.000000  1500.000000   0.476990   
2                   4                  0  1502.708326  1500.000000   0.503898   
3                   0                  5  1500.000000  1490.593267   0.513534   
4                   1                  6  1487.142017  1475.819981   0.516288   

   Winner_ID  Loser_ID  Surface_Carpet  Surface_Clay  Surface_Grass  \
0          1       279           False         False     

Unnamed: 0,ATP,Date,B365W,B365L,PSW,PSL,WPts,LPts,match_count_winner,match_count_loser,elo_winner,elo_loser,proba_elo,Winner_ID,Loser_ID,Surface_Carpet,Surface_Clay,Surface_Grass,Surface_Hard,Tournament_ID,Series_ATP250,Series_ATP500,Series_Grand Slam,Series_International,Series_International Gold,Series_Masters,Series_Masters 1000,Series_Masters Cup,Court_Indoor,Court_Outdoor,Round_1st Round,Round_2nd Round,Round_3rd Round,Round_4th Round,Round_Quarterfinals,Round_Round Robin,Round_Semifinals,Round_The Final,Best_of_3.0,Best_of_5.0
0,1,2006-01-02,1.39,2.75,1.446,3.04,1200.0,692.0,1,2,1482.654703,1504.995140,0.467894,1,279,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False
1,1,2006-01-02,1.53,2.37,1.719,2.29,470.0,265.0,1,0,1484.000000,1500.000000,0.476990,2,19,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False
2,1,2006-01-02,1.28,3.25,1.391,3.36,731.0,286.0,4,0,1502.708326,1500.000000,0.503898,3,17,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False
3,1,2006-01-02,1.53,2.37,1.606,2.55,1490.0,690.0,0,5,1500.000000,1490.593267,0.513534,4,70,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False
4,1,2006-01-02,1.44,2.62,1.549,2.72,1360.0,558.0,1,6,1487.142017,1475.819981,0.516288,5,26,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44554,67,2022-11-04,1.11,6.50,1.140,6.95,4320.0,1746.0,1235,163,1999.225470,1666.726069,0.871470,21,264,False,False,False,True,104,False,False,False,False,False,False,True,False,True,False,False,False,False,False,True,False,False,False,True,False
44555,67,2022-11-04,1.40,3.00,1.450,2.97,5035.0,1330.0,415,231,1862.431885,1762.264077,0.640288,224,135,False,False,False,True,104,False,False,False,False,False,False,True,False,True,False,False,False,False,False,True,False,False,False,True,False
44556,67,2022-11-05,2.75,1.44,2.810,1.50,1991.0,3725.0,153,284,1854.208821,1864.746873,0.484839,150,236,False,False,False,True,104,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,True,False,True,False
44557,67,2022-11-05,1.20,4.50,1.240,4.62,4320.0,5035.0,1236,416,2003.338426,1873.942685,0.678058,21,224,False,False,False,True,104,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,True,False,True,False


In [64]:
df

Unnamed: 0,ATP,Date,B365W,B365L,PSW,PSL,WPts,LPts,match_count_winner,match_count_loser,elo_winner,elo_loser,proba_elo,Winner_ID,Loser_ID,Surface_Carpet,Surface_Clay,Surface_Grass,Surface_Hard,Tournament_ID,Series_ATP250,Series_ATP500,Series_Grand Slam,Series_International,Series_International Gold,Series_Masters,Series_Masters 1000,Series_Masters Cup,Court_Indoor,Court_Outdoor,Round_1st Round,Round_2nd Round,Round_3rd Round,Round_4th Round,Round_Quarterfinals,Round_Round Robin,Round_Semifinals,Round_The Final,Best_of_3.0,Best_of_5.0
0,1,2006-01-02,1.39,2.75,1.446,3.04,1200.0,692.0,1,2,1482.654703,1504.995140,0.467894,1,279,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False
1,1,2006-01-02,1.53,2.37,1.719,2.29,470.0,265.0,1,0,1484.000000,1500.000000,0.476990,2,19,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False
2,1,2006-01-02,1.28,3.25,1.391,3.36,731.0,286.0,4,0,1502.708326,1500.000000,0.503898,3,17,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False
3,1,2006-01-02,1.53,2.37,1.606,2.55,1490.0,690.0,0,5,1500.000000,1490.593267,0.513534,4,70,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False
4,1,2006-01-02,1.44,2.62,1.549,2.72,1360.0,558.0,1,6,1487.142017,1475.819981,0.516288,5,26,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44554,67,2022-11-04,1.11,6.50,1.140,6.95,4320.0,1746.0,1235,163,1999.225470,1666.726069,0.871470,21,264,False,False,False,True,104,False,False,False,False,False,False,True,False,True,False,False,False,False,False,True,False,False,False,True,False
44555,67,2022-11-04,1.40,3.00,1.450,2.97,5035.0,1330.0,415,231,1862.431885,1762.264077,0.640288,224,135,False,False,False,True,104,False,False,False,False,False,False,True,False,True,False,False,False,False,False,True,False,False,False,True,False
44556,67,2022-11-05,2.75,1.44,2.810,1.50,1991.0,3725.0,153,284,1854.208821,1864.746873,0.484839,150,236,False,False,False,True,104,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,True,False,True,False
44557,67,2022-11-05,1.20,4.50,1.240,4.62,4320.0,5035.0,1236,416,2003.338426,1873.942685,0.678058,21,224,False,False,False,True,104,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,True,False,True,False


In [49]:
df

Unnamed: 0,ATP,B365W,B365L,PSW,PSL,WPts,LPts,match_count_PlayerA,match_count_PPlayerBayerB,elo_PlayerA,ePlayerBo_PPlayerBayerB,proba_ePlayerBo,Winner_ID,Loser_ID,Surface_Carpet,Surface_CPlayerBay,Surface_Grass,Surface_Hard,Tournament_ID,Series_ATP250,Series_ATP500,Series_Grand SPlayerBam,Series_InternationaPlayerB,Series_InternationaPlayerB GoPlayerBd,Series_Masters,Series_Masters 1000,Series_Masters Cup,Court_Indoor,Court_Outdoor,Round_1st Round,Round_2nd Round,Round_3rd Round,Round_4th Round,Round_QuarterfinaPlayerBs,Round_Round Robin,Round_SemifinaPlayerBs,Round_The FinaPlayerB,Best_of_3.0,Best_of_5.0,Winner,Year,Month,Day,DayOfWeek,PPlayerBayerA,PPlayerBayerB
0,1,-0.393020,-0.228098,-0.371979,-0.196574,-0.283650,-0.320004,-1.038175,-0.947645,-1.037820,-0.623217,-0.548569,1,279,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,1,2006,1,2,0,1,279
1,1,-0.271119,-0.335928,-0.162978,-0.341755,-0.615132,-0.671612,-1.038175,-0.957748,-1.029089,-0.663532,-0.500169,2,19,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,0,2006,1,2,0,19,2
2,1,-0.488800,-0.086215,-0.414085,-0.134630,-0.496616,-0.654320,-1.025714,-0.957748,-0.907664,-0.663532,-0.357006,3,17,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,1,2006,1,2,0,3,17
3,1,-0.271119,-0.335928,-0.249487,-0.291425,-0.151965,-0.321650,-1.042328,-0.932490,-0.925242,-0.739451,-0.305733,4,70,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,1,2006,1,2,0,4,70
4,1,-0.349484,-0.264987,-0.293125,-0.258517,-0.210996,-0.430344,-1.038175,-0.927439,-1.008696,-0.858681,-0.291081,5,26,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,1,2006,1,2,0,5,26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44554,67,-0.636823,0.836022,-0.606244,0.560304,1.133096,0.547902,4.087241,-0.134355,2.314946,0.682064,1.598716,21,264,False,False,False,True,104,False,False,False,False,False,False,True,False,True,False,False,False,False,False,True,False,False,False,True,False,0,2022,11,4,4,264,21
44555,67,-0.384313,-0.157156,-0.368917,-0.210124,1.457768,0.205351,0.681373,0.209147,1.427097,1.453122,0.368676,224,135,False,False,False,True,104,False,False,False,False,False,False,True,False,True,False,False,False,False,False,True,False,False,False,True,False,0,2022,11,4,4,135,224
44556,67,0.791161,-0.599830,0.672261,-0.494679,0.075532,2.177488,-0.406843,0.476876,1.373726,2.280230,-0.458409,150,236,False,False,False,True,104,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,True,False,True,False,0,2022,11,5,5,236,150
44557,67,-0.558458,0.268492,-0.529687,0.109275,1.133096,3.256194,4.091395,1.143672,2.341641,2.354446,0.569640,21,224,False,False,False,True,104,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,True,False,True,False,1,2022,11,5,5,21,224


In [17]:
df['Date'] = pd.to_datetime(df['Date'])

# Extract relevant features
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day
df['DayOfWeek'] = df['Date'].dt.dayofweek
# Add more feature extraction as needed

# Drop the original 'Date' column if it's no longer needed
df = df.drop('Date', axis=1)


In [18]:
df

Unnamed: 0,ATP,B365W,B365L,PSW,PSL,WPts,LPts,match_count_winner,match_count_loser,elo_winner,elo_loser,proba_elo,Winner_ID,Loser_ID,Surface_Carpet,Surface_Clay,Surface_Grass,Surface_Hard,Tournament_ID,Series_ATP250,Series_ATP500,Series_Grand Slam,Series_International,Series_International Gold,Series_Masters,Series_Masters 1000,Series_Masters Cup,Court_Indoor,Court_Outdoor,Round_1st Round,Round_2nd Round,Round_3rd Round,Round_4th Round,Round_Quarterfinals,Round_Round Robin,Round_Semifinals,Round_The Final,Best_of_3.0,Best_of_5.0,Year,Month,Day,DayOfWeek
0,1,1.39,2.75,1.446,3.04,1200.0,692.0,1,2,1482.654703,1504.995140,0.467894,1,279,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2006,1,2,0
1,1,1.53,2.37,1.719,2.29,470.0,265.0,1,0,1484.000000,1500.000000,0.476990,2,19,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2006,1,2,0
2,1,1.28,3.25,1.391,3.36,731.0,286.0,4,0,1502.708326,1500.000000,0.503898,3,17,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2006,1,2,0
3,1,1.53,2.37,1.606,2.55,1490.0,690.0,0,5,1500.000000,1490.593267,0.513534,4,70,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2006,1,2,0
4,1,1.44,2.62,1.549,2.72,1360.0,558.0,1,6,1487.142017,1475.819981,0.516288,5,26,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2006,1,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44554,67,1.11,6.50,1.140,6.95,4320.0,1746.0,1235,163,1999.225470,1666.726069,0.871470,21,264,False,False,False,True,104,False,False,False,False,False,False,True,False,True,False,False,False,False,False,True,False,False,False,True,False,2022,11,4,4
44555,67,1.40,3.00,1.450,2.97,5035.0,1330.0,415,231,1862.431885,1762.264077,0.640288,224,135,False,False,False,True,104,False,False,False,False,False,False,True,False,True,False,False,False,False,False,True,False,False,False,True,False,2022,11,4,4
44556,67,2.75,1.44,2.810,1.50,1991.0,3725.0,153,284,1854.208821,1864.746873,0.484839,150,236,False,False,False,True,104,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,True,False,True,False,2022,11,5,5
44557,67,1.20,4.50,1.240,4.62,4320.0,5035.0,1236,416,2003.338426,1873.942685,0.678058,21,224,False,False,False,True,104,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,True,False,True,False,2022,11,5,5


In [19]:
df.head(20)
df_final = df

In [20]:
df_final

Unnamed: 0,ATP,B365W,B365L,PSW,PSL,WPts,LPts,match_count_winner,match_count_loser,elo_winner,elo_loser,proba_elo,Winner_ID,Loser_ID,Surface_Carpet,Surface_Clay,Surface_Grass,Surface_Hard,Tournament_ID,Series_ATP250,Series_ATP500,Series_Grand Slam,Series_International,Series_International Gold,Series_Masters,Series_Masters 1000,Series_Masters Cup,Court_Indoor,Court_Outdoor,Round_1st Round,Round_2nd Round,Round_3rd Round,Round_4th Round,Round_Quarterfinals,Round_Round Robin,Round_Semifinals,Round_The Final,Best_of_3.0,Best_of_5.0,Year,Month,Day,DayOfWeek
0,1,1.39,2.75,1.446,3.04,1200.0,692.0,1,2,1482.654703,1504.995140,0.467894,1,279,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2006,1,2,0
1,1,1.53,2.37,1.719,2.29,470.0,265.0,1,0,1484.000000,1500.000000,0.476990,2,19,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2006,1,2,0
2,1,1.28,3.25,1.391,3.36,731.0,286.0,4,0,1502.708326,1500.000000,0.503898,3,17,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2006,1,2,0
3,1,1.53,2.37,1.606,2.55,1490.0,690.0,0,5,1500.000000,1490.593267,0.513534,4,70,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2006,1,2,0
4,1,1.44,2.62,1.549,2.72,1360.0,558.0,1,6,1487.142017,1475.819981,0.516288,5,26,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2006,1,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44554,67,1.11,6.50,1.140,6.95,4320.0,1746.0,1235,163,1999.225470,1666.726069,0.871470,21,264,False,False,False,True,104,False,False,False,False,False,False,True,False,True,False,False,False,False,False,True,False,False,False,True,False,2022,11,4,4
44555,67,1.40,3.00,1.450,2.97,5035.0,1330.0,415,231,1862.431885,1762.264077,0.640288,224,135,False,False,False,True,104,False,False,False,False,False,False,True,False,True,False,False,False,False,False,True,False,False,False,True,False,2022,11,4,4
44556,67,2.75,1.44,2.810,1.50,1991.0,3725.0,153,284,1854.208821,1864.746873,0.484839,150,236,False,False,False,True,104,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,True,False,True,False,2022,11,5,5
44557,67,1.20,4.50,1.240,4.62,4320.0,5035.0,1236,416,2003.338426,1873.942685,0.678058,21,224,False,False,False,True,104,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,True,False,True,False,2022,11,5,5


In [21]:
df.to_csv("df_final")

In [22]:


# Define the new column names according to the provided schema
new_names = {
    'ATP': 'ATP',
    'B365W': 'B365_PlayerA',
    'B365L': 'B365_PlayerB',
    'PSW': 'PS_PlayerA',
    'PSL': 'PS_PlayerB',
    'WPts': 'PlayerA_Pts',
    'LPts': 'PlayerB_Pts',
    'match_count_winner': 'match_count_PlayerA',
    'match_count_loser': 'match_count_PlayerB',
    'elo_winner': 'elo_PlayerA',
    'elo_loser': 'elo_PlayerB',
    'proba_elo': 'proba_elo',
    'Winner_ID': 'PlayerA_ID',
    'Loser_ID': 'PlayerB_ID',
    'Surface_Carpet': 'Surface_Carpet',
    'Surface_Clay': 'Surface_Clay',
    'Surface_Grass': 'Surface_Grass',
    'Surface_Hard': 'Surface_Hard',
    'Tournament_ID': 'Tournament_ID',
    'Series_ATP250': 'Series_ATP250',
    'Series_ATP500': 'Series_ATP500',
    'Series_Grand Slam': 'Series_Grand Slam',
    'Series_International': 'Series_International',
    'Series_International Gold': 'Series_International Gold',
    'Series_Masters': 'Series_Masters',
    'Series_Masters 1000': 'Series_Masters 1000',
    'Series_Masters Cup': 'Series_Masters Cup',
    'Court_Indoor': 'Court_Indoor',
    'Court_Outdoor': 'Court_Outdoor',
    'Round_1st Round': 'Round_1st Round',
    'Round_2nd Round': 'Round_2nd Round',
    'Round_3rd Round': 'Round_3rd Round',
    'Round_4th Round': 'Round_4th Round',
    'Round_Quarterfinals': 'Round_Quarterfinals',
    'Round_Round Robin': 'Round_Round Robin',
    'Round_Semifinals': 'Round_Semifinals',
    'Round_The Final': 'Round_The Final',
    'Best_of_3.0': 'Best_of_3.0',
    'Best_of_5.0': 'Best_of_5.0',
    'Year': 'Year',
    'Month': 'Month',
    'Day': 'Day',
    'DayOfWeek': 'DayOfWeek'
}

# Rename columns based on the defined mapping
df_final.rename(columns=new_names, inplace=True)

# Display the modified DataFrame
df_final

Unnamed: 0,ATP,B365_PlayerA,B365_PlayerB,PS_PlayerA,PS_PlayerB,PlayerA_Pts,PlayerB_Pts,match_count_PlayerA,match_count_PlayerB,elo_PlayerA,elo_PlayerB,proba_elo,PlayerA_ID,PlayerB_ID,Surface_Carpet,Surface_Clay,Surface_Grass,Surface_Hard,Tournament_ID,Series_ATP250,Series_ATP500,Series_Grand Slam,Series_International,Series_International Gold,Series_Masters,Series_Masters 1000,Series_Masters Cup,Court_Indoor,Court_Outdoor,Round_1st Round,Round_2nd Round,Round_3rd Round,Round_4th Round,Round_Quarterfinals,Round_Round Robin,Round_Semifinals,Round_The Final,Best_of_3.0,Best_of_5.0,Year,Month,Day,DayOfWeek
0,1,1.39,2.75,1.446,3.04,1200.0,692.0,1,2,1482.654703,1504.995140,0.467894,1,279,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2006,1,2,0
1,1,1.53,2.37,1.719,2.29,470.0,265.0,1,0,1484.000000,1500.000000,0.476990,2,19,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2006,1,2,0
2,1,1.28,3.25,1.391,3.36,731.0,286.0,4,0,1502.708326,1500.000000,0.503898,3,17,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2006,1,2,0
3,1,1.53,2.37,1.606,2.55,1490.0,690.0,0,5,1500.000000,1490.593267,0.513534,4,70,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2006,1,2,0
4,1,1.44,2.62,1.549,2.72,1360.0,558.0,1,6,1487.142017,1475.819981,0.516288,5,26,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2006,1,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44554,67,1.11,6.50,1.140,6.95,4320.0,1746.0,1235,163,1999.225470,1666.726069,0.871470,21,264,False,False,False,True,104,False,False,False,False,False,False,True,False,True,False,False,False,False,False,True,False,False,False,True,False,2022,11,4,4
44555,67,1.40,3.00,1.450,2.97,5035.0,1330.0,415,231,1862.431885,1762.264077,0.640288,224,135,False,False,False,True,104,False,False,False,False,False,False,True,False,True,False,False,False,False,False,True,False,False,False,True,False,2022,11,4,4
44556,67,2.75,1.44,2.810,1.50,1991.0,3725.0,153,284,1854.208821,1864.746873,0.484839,150,236,False,False,False,True,104,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,True,False,True,False,2022,11,5,5
44557,67,1.20,4.50,1.240,4.62,4320.0,5035.0,1236,416,2003.338426,1873.942685,0.678058,21,224,False,False,False,True,104,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,True,False,True,False,2022,11,5,5


In [23]:
df_final.columns
df_final.to_csv('thatsit.csv')


In [25]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44559 entries, 0 to 44558
Data columns (total 43 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ATP                        44559 non-null  int64  
 1   B365_PlayerA               44559 non-null  float64
 2   B365_PlayerB               44559 non-null  float64
 3   PS_PlayerA                 44559 non-null  float64
 4   PS_PlayerB                 44559 non-null  float64
 5   PlayerA_Pts                44559 non-null  float64
 6   PlayerB_Pts                44559 non-null  float64
 7   match_count_PlayerA        44559 non-null  int64  
 8   match_count_PlayerB        44559 non-null  int64  
 9   elo_PlayerA                44559 non-null  float64
 10  elo_PlayerB                44559 non-null  float64
 11  proba_elo                  44559 non-null  float64
 12  PlayerA_ID                 44559 non-null  int64  
 13  PlayerB_ID                 44559 non-null  int

In [26]:
import pandas as pd

# Assuming df_final is your DataFrame
df_final['PlayerA_Wins'] = True

# Display the modified DataFrame
print(df_final)


       ATP  B365_PlayerA  B365_PlayerB  PS_PlayerA  PS_PlayerB  PlayerA_Pts  \
0        1          1.39          2.75       1.446        3.04       1200.0   
1        1          1.53          2.37       1.719        2.29        470.0   
2        1          1.28          3.25       1.391        3.36        731.0   
3        1          1.53          2.37       1.606        2.55       1490.0   
4        1          1.44          2.62       1.549        2.72       1360.0   
...    ...           ...           ...         ...         ...          ...   
44554   67          1.11          6.50       1.140        6.95       4320.0   
44555   67          1.40          3.00       1.450        2.97       5035.0   
44556   67          2.75          1.44       2.810        1.50       1991.0   
44557   67          1.20          4.50       1.240        4.62       4320.0   
44558   67          4.00          1.25       4.300        1.26       1991.0   

       PlayerB_Pts  match_count_PlayerA  match_coun

In [27]:
df_final
import pandas as pd

# Assuming df_final is your DataFrame
df_final.drop(columns=['Winner'], inplace=True)

# Display the modified DataFrame
print(df_final)


KeyError: "['Winner'] not found in axis"

In [28]:
df_final

Unnamed: 0,ATP,B365_PlayerA,B365_PlayerB,PS_PlayerA,PS_PlayerB,PlayerA_Pts,PlayerB_Pts,match_count_PlayerA,match_count_PlayerB,elo_PlayerA,elo_PlayerB,proba_elo,PlayerA_ID,PlayerB_ID,Surface_Carpet,Surface_Clay,Surface_Grass,Surface_Hard,Tournament_ID,Series_ATP250,Series_ATP500,Series_Grand Slam,Series_International,Series_International Gold,Series_Masters,Series_Masters 1000,Series_Masters Cup,Court_Indoor,Court_Outdoor,Round_1st Round,Round_2nd Round,Round_3rd Round,Round_4th Round,Round_Quarterfinals,Round_Round Robin,Round_Semifinals,Round_The Final,Best_of_3.0,Best_of_5.0,Year,Month,Day,DayOfWeek,PlayerA_Wins
0,1,1.39,2.75,1.446,3.04,1200.0,692.0,1,2,1482.654703,1504.995140,0.467894,1,279,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2006,1,2,0,True
1,1,1.53,2.37,1.719,2.29,470.0,265.0,1,0,1484.000000,1500.000000,0.476990,2,19,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2006,1,2,0,True
2,1,1.28,3.25,1.391,3.36,731.0,286.0,4,0,1502.708326,1500.000000,0.503898,3,17,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2006,1,2,0,True
3,1,1.53,2.37,1.606,2.55,1490.0,690.0,0,5,1500.000000,1490.593267,0.513534,4,70,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2006,1,2,0,True
4,1,1.44,2.62,1.549,2.72,1360.0,558.0,1,6,1487.142017,1475.819981,0.516288,5,26,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2006,1,2,0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44554,67,1.11,6.50,1.140,6.95,4320.0,1746.0,1235,163,1999.225470,1666.726069,0.871470,21,264,False,False,False,True,104,False,False,False,False,False,False,True,False,True,False,False,False,False,False,True,False,False,False,True,False,2022,11,4,4,True
44555,67,1.40,3.00,1.450,2.97,5035.0,1330.0,415,231,1862.431885,1762.264077,0.640288,224,135,False,False,False,True,104,False,False,False,False,False,False,True,False,True,False,False,False,False,False,True,False,False,False,True,False,2022,11,4,4,True
44556,67,2.75,1.44,2.810,1.50,1991.0,3725.0,153,284,1854.208821,1864.746873,0.484839,150,236,False,False,False,True,104,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,True,False,True,False,2022,11,5,5,True
44557,67,1.20,4.50,1.240,4.62,4320.0,5035.0,1236,416,2003.338426,1873.942685,0.678058,21,224,False,False,False,True,104,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,True,False,True,False,2022,11,5,5,True


In [29]:
df_final.to_csv('thatreallyit.csv', index=False)


In [34]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44559 entries, 0 to 44558
Data columns (total 44 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ATP                        44559 non-null  int64  
 1   B365_PlayerA               44559 non-null  float64
 2   B365_PlayerB               44559 non-null  float64
 3   PS_PlayerA                 44559 non-null  float64
 4   PS_PlayerB                 44559 non-null  float64
 5   PlayerA_Pts                44559 non-null  float64
 6   PlayerB_Pts                44559 non-null  float64
 7   match_count_PlayerA        44559 non-null  int64  
 8   match_count_PlayerB        44559 non-null  int64  
 9   elo_PlayerA                44559 non-null  float64
 10  elo_PlayerB                44559 non-null  float64
 11  proba_elo                  44559 non-null  float64
 12  PlayerA_ID                 44559 non-null  int64  
 13  PlayerB_ID                 44559 non-null  int

In [38]:
df_old = df_final

In [40]:
df_old.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44559 entries, 0 to 44558
Data columns (total 44 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ATP                        44559 non-null  int64  
 1   B365_PlayerA               44559 non-null  float64
 2   B365_PlayerB               44559 non-null  float64
 3   PS_PlayerA                 44559 non-null  float64
 4   PS_PlayerB                 44559 non-null  float64
 5   PlayerA_Pts                44559 non-null  float64
 6   PlayerB_Pts                44559 non-null  float64
 7   match_count_PlayerA        44559 non-null  int64  
 8   match_count_PlayerB        44559 non-null  int64  
 9   elo_PlayerA                44559 non-null  float64
 10  elo_PlayerB                44559 non-null  float64
 11  proba_elo                  44559 non-null  float64
 12  PlayerA_ID                 44559 non-null  int64  
 13  PlayerB_ID                 44559 non-null  int

In [53]:
import pandas as pd
import numpy as np

# Assuming df_old is your original DataFrame

# Create a copy of the original DataFrame
df_random = df_old.copy()

# Selecting relevant columns for swapping
match_columns = ['B365_PlayerA', 'B365_PlayerB', 'PS_PlayerA', 'PS_PlayerB', 
                 'PlayerA_Pts', 'PlayerB_Pts', 'elo_PlayerA', 'elo_PlayerB',
                 'match_count_PlayerA', 'match_count_PlayerB', 'proba_elo', 
                 'PlayerA_ID', 'PlayerB_ID']

# Create a mask to randomly determine which rows to swap
swap_mask = np.random.choice([True, False], size=len(df_random), p=[0.5, 0.5])

# Swap values between Player A and Player B randomly in about 50% of the cases while keeping the row order unchanged
for col in match_columns:
    temp_col = col.replace('_PlayerA', '_temp')
    # Store values from Player A temporarily
    df_random.loc[swap_mask, temp_col] = df_random.loc[swap_mask, col]
    # Swap values between Player A and Player B
    df_random.loc[swap_mask, col] = df_random.loc[swap_mask, col.replace('_PlayerA', '_PlayerB')]
    df_random.loc[swap_mask, col.replace('_PlayerA', '_PlayerB')] = df_random.loc[swap_mask, temp_col]

# Drop the temporary column
df_random.drop(columns=[col for col in df_random.columns if '_temp' in col], inplace=True)

# Swap PlayerA_ID and PlayerB_ID columns
df_random.loc[swap_mask, ['PlayerA_ID', 'PlayerB_ID']] = df_random.loc[swap_mask, ['PlayerB_ID', 'PlayerA_ID']].values

# Update 'PlayerA_Wins' column to reflect the new match outcomes based on swapped values
df_random['PlayerA_Wins'] = ~swap_mask

# Display the updated DataFrame
print(df_random)



       ATP  B365_PlayerA  B365_PlayerB  PS_PlayerA  PS_PlayerB  PlayerA_Pts  \
0        1          2.75          1.39       3.040       1.446       1200.0   
1        1          1.53          2.37       1.719       2.290        470.0   
2        1          1.28          3.25       1.391       3.360        731.0   
3        1          2.37          1.53       2.550       1.606       1490.0   
4        1          2.62          1.44       2.720       1.549       1360.0   
...    ...           ...           ...         ...         ...          ...   
44554   67          1.11          6.50       1.140       6.950       4320.0   
44555   67          1.40          3.00       1.450       2.970       5035.0   
44556   67          2.75          1.44       2.810       1.500       1991.0   
44557   67          4.50          1.20       4.620       1.240       4320.0   
44558   67          1.25          4.00       1.260       4.300       1991.0   

       PlayerB_Pts  match_count_PlayerA  match_coun

In [55]:
df_random

Unnamed: 0,ATP,B365_PlayerA,B365_PlayerB,PS_PlayerA,PS_PlayerB,PlayerA_Pts,PlayerB_Pts,match_count_PlayerA,match_count_PlayerB,elo_PlayerA,elo_PlayerB,proba_elo,PlayerA_ID,PlayerB_ID,Surface_Carpet,Surface_Clay,Surface_Grass,Surface_Hard,Tournament_ID,Series_ATP250,Series_ATP500,Series_Grand Slam,Series_International,Series_International Gold,Series_Masters,Series_Masters 1000,Series_Masters Cup,Court_Indoor,Court_Outdoor,Round_1st Round,Round_2nd Round,Round_3rd Round,Round_4th Round,Round_Quarterfinals,Round_Round Robin,Round_Semifinals,Round_The Final,Best_of_3.0,Best_of_5.0,Year,Month,Day,DayOfWeek,PlayerA_Wins
0,1,2.75,1.39,3.040,1.446,1200.0,692.0,2,1,1504.995140,1482.654703,0.467894,279,1,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2006,1,2,0,False
1,1,1.53,2.37,1.719,2.290,470.0,265.0,1,0,1484.000000,1500.000000,0.476990,2,19,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2006,1,2,0,True
2,1,1.28,3.25,1.391,3.360,731.0,286.0,4,0,1502.708326,1500.000000,0.503898,3,17,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2006,1,2,0,True
3,1,2.37,1.53,2.550,1.606,1490.0,690.0,5,0,1490.593267,1500.000000,0.513534,70,4,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2006,1,2,0,False
4,1,2.62,1.44,2.720,1.549,1360.0,558.0,6,1,1475.819981,1487.142017,0.516288,26,5,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2006,1,2,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44554,67,1.11,6.50,1.140,6.950,4320.0,1746.0,1235,163,1999.225470,1666.726069,0.871470,21,264,False,False,False,True,104,False,False,False,False,False,False,True,False,True,False,False,False,False,False,True,False,False,False,True,False,2022,11,4,4,True
44555,67,1.40,3.00,1.450,2.970,5035.0,1330.0,415,231,1862.431885,1762.264077,0.640288,224,135,False,False,False,True,104,False,False,False,False,False,False,True,False,True,False,False,False,False,False,True,False,False,False,True,False,2022,11,4,4,True
44556,67,2.75,1.44,2.810,1.500,1991.0,3725.0,153,284,1854.208821,1864.746873,0.484839,150,236,False,False,False,True,104,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,True,False,True,False,2022,11,5,5,True
44557,67,4.50,1.20,4.620,1.240,4320.0,5035.0,416,1236,1873.942685,2003.338426,0.678058,224,21,False,False,False,True,104,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,True,False,True,False,2022,11,5,5,False


In [56]:
df_random.to_csv('df_random.csv') #### FINAL DATAFRAME