In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import pandas as pd

# Show every column when a frame is displayed instead of truncating wide tables
pd.options.display.max_columns = None

# Load the reduced ATP match data; the first CSV column is read as the index
df = pd.read_csv("archive/vahid_reduced.csv", index_col=[0])
df.info()

# Move that index back into an ordinary column and fall back to a default integer index
df = df.reset_index()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 48824 entries, 1 to 68
Data columns (total 52 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Location                          48824 non-null  object 
 1   Tournament                        48824 non-null  object 
 2   Date                              48824 non-null  object 
 3   Series                            48824 non-null  object 
 4   Court                             48824 non-null  object 
 5   Surface                           48824 non-null  object 
 6   Round                             48824 non-null  object 
 7   Best of                           48809 non-null  float64
 8   Winner                            48824 non-null  object 
 9   Loser                             48824 non-null  object 
 10  Comment                           48824 non-null  object 
 11  B365W                             48824 non-null  float64
 12  B365L       

In [None]:
# This notebook loads the fetched ATP data (the atp_reduced data frame from Vahid) and one-hot encodes certain categorical variables.
# It also randomly swaps the winner/loser columns into Player A / Player B, so that "Player A wins" is true in only about half of the rows.

In [3]:
# Build one shared name -> integer ID table covering every player.
# Winners are listed first and losers appended, so IDs follow order of first
# appearance in 'Winner', then any names that only ever appear in 'Loser'.
all_player_names = pd.concat([df['Winner'], df['Loser']], ignore_index=True).unique()
player_id_mapping = {
    player_name: player_id
    for player_id, player_name in enumerate(all_player_names, start=1)  # IDs start at 1
}

# Add the numeric IDs next to the name columns
for role in ('Winner', 'Loser'):
    df[f'{role}_ID'] = df[role].map(player_id_mapping)

df.head()

Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,Loser,Comment,B365W,B365L,PSW,PSL,WPts,LPts,match_count_winner,match_count_loser,elo_winner,elo_loser,proba_elo,match_count_indoor_hard_winner,match_count_indoor_hard_loser,elo_indoor_hard_winner,elo_indoor_hard_loser,proba_elo_indoor_hard,match_count_indoor_carpet_winner,match_count_indoor_carpet_loser,elo_indoor_carpet_winner,elo_indoor_carpet_loser,proba_elo_indoor_carpet,match_count_indoor_clay_winner,match_count_indoor_clay_loser,elo_indoor_clay_winner,elo_indoor_clay_loser,proba_elo_indoor_clay,match_count_outdoor_hard_winner,match_count_outdoor_hard_loser,elo_outdoor_hard_winner,elo_outdoor_hard_loser,proba_elo_outdoor_hard,match_count_outdoor_clay_winner,match_count_outdoor_clay_loser,elo_outdoor_clay_winner,elo_outdoor_clay_loser,proba_elo_outdoor_clay,match_count_outdoor_grass_winner,match_count_outdoor_grass_loser,elo_outdoor_grass_winner,elo_outdoor_grass_loser,proba_elo_outdoor_grass,Winner_ID,Loser_ID
0,1,Adelaide,AAPT Championships,2004-01-05,International,Outdoor,Hard,1st Round,3.0,Dent T.,Horna L.,Completed,1.16,4.5,1.241,4.55,0.0,0.0,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,1,465
1,1,Adelaide,AAPT Championships,2004-01-05,International,Outdoor,Hard,1st Round,3.0,Martin A.,Ancic M.,Completed,2.0,1.72,2.17,1.752,0.0,0.0,13,0,1569.12595,1500.0,0.598188,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,13,0,1569.12595,1500.0,0.598188,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,2,33
2,1,Adelaide,AAPT Championships,2004-01-05,International,Outdoor,Hard,1st Round,3.0,Beck K.,Vahaly B.,Completed,1.83,1.83,1.73,2.21,0.0,0.0,1,3,1484.0,1515.297526,0.455081,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,1,3,1484.0,1515.297526,0.455081,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,3,510
3,1,Adelaide,AAPT Championships,2004-01-05,International,Outdoor,Hard,1st Round,3.0,Nieminen J.,Moodie W.,Completed,1.4,2.75,1.459,2.93,0.0,0.0,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,4,303
4,1,Adelaide,AAPT Championships,2004-01-05,International,Outdoor,Hard,1st Round,3.0,Arthurs W.,Karlovic I.,Completed,1.8,1.909,1.862,2.0,0.0,0.0,3,0,1457.584184,1500.0,0.43926,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,3,0,1457.584184,1500.0,0.43926,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,5,36


In [4]:
# Replace 'Surface' with one indicator column per surface value (Surface_<value>)
df = pd.get_dummies(data=df, prefix={'Surface': 'Surface'}, columns=['Surface'])

# Quick visual check that the indicator columns were appended
df.head()

Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Round,Best of,Winner,Loser,Comment,B365W,B365L,PSW,PSL,WPts,LPts,match_count_winner,match_count_loser,elo_winner,elo_loser,proba_elo,match_count_indoor_hard_winner,match_count_indoor_hard_loser,elo_indoor_hard_winner,elo_indoor_hard_loser,proba_elo_indoor_hard,match_count_indoor_carpet_winner,match_count_indoor_carpet_loser,elo_indoor_carpet_winner,elo_indoor_carpet_loser,proba_elo_indoor_carpet,match_count_indoor_clay_winner,match_count_indoor_clay_loser,elo_indoor_clay_winner,elo_indoor_clay_loser,proba_elo_indoor_clay,match_count_outdoor_hard_winner,match_count_outdoor_hard_loser,elo_outdoor_hard_winner,elo_outdoor_hard_loser,proba_elo_outdoor_hard,match_count_outdoor_clay_winner,match_count_outdoor_clay_loser,elo_outdoor_clay_winner,elo_outdoor_clay_loser,proba_elo_outdoor_clay,match_count_outdoor_grass_winner,match_count_outdoor_grass_loser,elo_outdoor_grass_winner,elo_outdoor_grass_loser,proba_elo_outdoor_grass,Winner_ID,Loser_ID,Surface_Carpet,Surface_Clay,Surface_Grass,Surface_Hard
0,1,Adelaide,AAPT Championships,2004-01-05,International,Outdoor,1st Round,3.0,Dent T.,Horna L.,Completed,1.16,4.5,1.241,4.55,0.0,0.0,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,1,465,False,False,False,True
1,1,Adelaide,AAPT Championships,2004-01-05,International,Outdoor,1st Round,3.0,Martin A.,Ancic M.,Completed,2.0,1.72,2.17,1.752,0.0,0.0,13,0,1569.12595,1500.0,0.598188,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,13,0,1569.12595,1500.0,0.598188,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,2,33,False,False,False,True
2,1,Adelaide,AAPT Championships,2004-01-05,International,Outdoor,1st Round,3.0,Beck K.,Vahaly B.,Completed,1.83,1.83,1.73,2.21,0.0,0.0,1,3,1484.0,1515.297526,0.455081,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,1,3,1484.0,1515.297526,0.455081,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,3,510,False,False,False,True
3,1,Adelaide,AAPT Championships,2004-01-05,International,Outdoor,1st Round,3.0,Nieminen J.,Moodie W.,Completed,1.4,2.75,1.459,2.93,0.0,0.0,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,4,303,False,False,False,True
4,1,Adelaide,AAPT Championships,2004-01-05,International,Outdoor,1st Round,3.0,Arthurs W.,Karlovic I.,Completed,1.8,1.909,1.862,2.0,0.0,0.0,3,0,1457.584184,1500.0,0.43926,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,3,0,1457.584184,1500.0,0.43926,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,5,36,False,False,False,True


In [5]:
# Number the tournaments 1, 2, 3, ... in order of first appearance
tournament_id_mapping = {
    tournament_name: position
    for position, tournament_name in enumerate(df['Tournament'].unique(), start=1)
}

# Attach the numeric ID alongside the tournament name
df['Tournament_ID'] = df['Tournament'].map(tournament_id_mapping)

df.head()

Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Round,Best of,Winner,Loser,Comment,B365W,B365L,PSW,PSL,WPts,LPts,match_count_winner,match_count_loser,elo_winner,elo_loser,proba_elo,match_count_indoor_hard_winner,match_count_indoor_hard_loser,elo_indoor_hard_winner,elo_indoor_hard_loser,proba_elo_indoor_hard,match_count_indoor_carpet_winner,match_count_indoor_carpet_loser,elo_indoor_carpet_winner,elo_indoor_carpet_loser,proba_elo_indoor_carpet,match_count_indoor_clay_winner,match_count_indoor_clay_loser,elo_indoor_clay_winner,elo_indoor_clay_loser,proba_elo_indoor_clay,match_count_outdoor_hard_winner,match_count_outdoor_hard_loser,elo_outdoor_hard_winner,elo_outdoor_hard_loser,proba_elo_outdoor_hard,match_count_outdoor_clay_winner,match_count_outdoor_clay_loser,elo_outdoor_clay_winner,elo_outdoor_clay_loser,proba_elo_outdoor_clay,match_count_outdoor_grass_winner,match_count_outdoor_grass_loser,elo_outdoor_grass_winner,elo_outdoor_grass_loser,proba_elo_outdoor_grass,Winner_ID,Loser_ID,Surface_Carpet,Surface_Clay,Surface_Grass,Surface_Hard,Tournament_ID
0,1,Adelaide,AAPT Championships,2004-01-05,International,Outdoor,1st Round,3.0,Dent T.,Horna L.,Completed,1.16,4.5,1.241,4.55,0.0,0.0,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,1,465,False,False,False,True,1
1,1,Adelaide,AAPT Championships,2004-01-05,International,Outdoor,1st Round,3.0,Martin A.,Ancic M.,Completed,2.0,1.72,2.17,1.752,0.0,0.0,13,0,1569.12595,1500.0,0.598188,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,13,0,1569.12595,1500.0,0.598188,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,2,33,False,False,False,True,1
2,1,Adelaide,AAPT Championships,2004-01-05,International,Outdoor,1st Round,3.0,Beck K.,Vahaly B.,Completed,1.83,1.83,1.73,2.21,0.0,0.0,1,3,1484.0,1515.297526,0.455081,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,1,3,1484.0,1515.297526,0.455081,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,3,510,False,False,False,True,1
3,1,Adelaide,AAPT Championships,2004-01-05,International,Outdoor,1st Round,3.0,Nieminen J.,Moodie W.,Completed,1.4,2.75,1.459,2.93,0.0,0.0,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,4,303,False,False,False,True,1
4,1,Adelaide,AAPT Championships,2004-01-05,International,Outdoor,1st Round,3.0,Arthurs W.,Karlovic I.,Completed,1.8,1.909,1.862,2.0,0.0,0.0,3,0,1457.584184,1500.0,0.43926,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,3,0,1457.584184,1500.0,0.43926,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,5,36,False,False,False,True,1


In [6]:
# One-hot encode the remaining categorical match descriptors in a single pass.
# Keys are the source columns, values the prefix used for the generated
# indicator columns; the indicator groups are appended in this order.
# NOTE(review): 'Best of' has a few missing values in the loaded data; with the
# default settings those rows presumably get False in every Best_of_* column - confirm
# that this is acceptable downstream.
categorical_prefixes = {
    'Series': 'Series',
    'Court': 'Court',
    'Round': 'Round',
    'Best of': 'Best_of',
}
df = pd.get_dummies(df, columns=list(categorical_prefixes), prefix=categorical_prefixes)

df.head()

Unnamed: 0,ATP,Location,Tournament,Date,Winner,Loser,Comment,B365W,B365L,PSW,PSL,WPts,LPts,match_count_winner,match_count_loser,elo_winner,elo_loser,proba_elo,match_count_indoor_hard_winner,match_count_indoor_hard_loser,elo_indoor_hard_winner,elo_indoor_hard_loser,proba_elo_indoor_hard,match_count_indoor_carpet_winner,match_count_indoor_carpet_loser,elo_indoor_carpet_winner,elo_indoor_carpet_loser,proba_elo_indoor_carpet,match_count_indoor_clay_winner,match_count_indoor_clay_loser,elo_indoor_clay_winner,elo_indoor_clay_loser,proba_elo_indoor_clay,match_count_outdoor_hard_winner,match_count_outdoor_hard_loser,elo_outdoor_hard_winner,elo_outdoor_hard_loser,proba_elo_outdoor_hard,match_count_outdoor_clay_winner,match_count_outdoor_clay_loser,elo_outdoor_clay_winner,elo_outdoor_clay_loser,proba_elo_outdoor_clay,match_count_outdoor_grass_winner,match_count_outdoor_grass_loser,elo_outdoor_grass_winner,elo_outdoor_grass_loser,proba_elo_outdoor_grass,Winner_ID,Loser_ID,Surface_Carpet,Surface_Clay,Surface_Grass,Surface_Hard,Tournament_ID,Series_ATP250,Series_ATP500,Series_Grand Slam,Series_International,Series_International Gold,Series_Masters,Series_Masters 1000,Series_Masters Cup,Court_Indoor,Court_Outdoor,Round_1st Round,Round_2nd Round,Round_3rd Round,Round_4th Round,Round_Quarterfinals,Round_Round Robin,Round_Semifinals,Round_The Final,Best_of_3.0,Best_of_5.0
0,1,Adelaide,AAPT Championships,2004-01-05,Dent T.,Horna L.,Completed,1.16,4.5,1.241,4.55,0.0,0.0,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,1,465,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False
1,1,Adelaide,AAPT Championships,2004-01-05,Martin A.,Ancic M.,Completed,2.0,1.72,2.17,1.752,0.0,0.0,13,0,1569.12595,1500.0,0.598188,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,13,0,1569.12595,1500.0,0.598188,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,2,33,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False
2,1,Adelaide,AAPT Championships,2004-01-05,Beck K.,Vahaly B.,Completed,1.83,1.83,1.73,2.21,0.0,0.0,1,3,1484.0,1515.297526,0.455081,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,1,3,1484.0,1515.297526,0.455081,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,3,510,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False
3,1,Adelaide,AAPT Championships,2004-01-05,Nieminen J.,Moodie W.,Completed,1.4,2.75,1.459,2.93,0.0,0.0,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,4,303,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False
4,1,Adelaide,AAPT Championships,2004-01-05,Arthurs W.,Karlovic I.,Completed,1.8,1.909,1.862,2.0,0.0,0.0,3,0,1457.584184,1500.0,0.43926,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,3,0,1457.584184,1500.0,0.43926,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,5,36,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False


In [7]:
# Text columns to remove; Winner, Loser and Tournament already have numeric
# *_ID counterparts created in earlier cells
columns_to_drop = ['Winner', 'Loser', 'Location', 'Tournament', 'Comment']
df = df.drop(columns=columns_to_drop)

# Confirm the listed columns are gone
df.head()

Unnamed: 0,ATP,Date,B365W,B365L,PSW,PSL,WPts,LPts,match_count_winner,match_count_loser,elo_winner,elo_loser,proba_elo,match_count_indoor_hard_winner,match_count_indoor_hard_loser,elo_indoor_hard_winner,elo_indoor_hard_loser,proba_elo_indoor_hard,match_count_indoor_carpet_winner,match_count_indoor_carpet_loser,elo_indoor_carpet_winner,elo_indoor_carpet_loser,proba_elo_indoor_carpet,match_count_indoor_clay_winner,match_count_indoor_clay_loser,elo_indoor_clay_winner,elo_indoor_clay_loser,proba_elo_indoor_clay,match_count_outdoor_hard_winner,match_count_outdoor_hard_loser,elo_outdoor_hard_winner,elo_outdoor_hard_loser,proba_elo_outdoor_hard,match_count_outdoor_clay_winner,match_count_outdoor_clay_loser,elo_outdoor_clay_winner,elo_outdoor_clay_loser,proba_elo_outdoor_clay,match_count_outdoor_grass_winner,match_count_outdoor_grass_loser,elo_outdoor_grass_winner,elo_outdoor_grass_loser,proba_elo_outdoor_grass,Winner_ID,Loser_ID,Surface_Carpet,Surface_Clay,Surface_Grass,Surface_Hard,Tournament_ID,Series_ATP250,Series_ATP500,Series_Grand Slam,Series_International,Series_International Gold,Series_Masters,Series_Masters 1000,Series_Masters Cup,Court_Indoor,Court_Outdoor,Round_1st Round,Round_2nd Round,Round_3rd Round,Round_4th Round,Round_Quarterfinals,Round_Round Robin,Round_Semifinals,Round_The Final,Best_of_3.0,Best_of_5.0
0,1,2004-01-05,1.16,4.5,1.241,4.55,0.0,0.0,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,1,465,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False
1,1,2004-01-05,2.0,1.72,2.17,1.752,0.0,0.0,13,0,1569.12595,1500.0,0.598188,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,13,0,1569.12595,1500.0,0.598188,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,2,33,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False
2,1,2004-01-05,1.83,1.83,1.73,2.21,0.0,0.0,1,3,1484.0,1515.297526,0.455081,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,1,3,1484.0,1515.297526,0.455081,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,3,510,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False
3,1,2004-01-05,1.4,2.75,1.459,2.93,0.0,0.0,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,4,303,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False
4,1,2004-01-05,1.8,1.909,1.862,2.0,0.0,0.0,3,0,1457.584184,1500.0,0.43926,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,3,0,1457.584184,1500.0,0.43926,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,5,36,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False


In [8]:
# Parse the date strings once, derive calendar features from them, then
# discard the raw 'Date' column
match_dates = pd.to_datetime(df['Date'])
df = df.assign(
    Year=match_dates.dt.year,
    Month=match_dates.dt.month,
    Day=match_dates.dt.day,
    DayOfWeek=match_dates.dt.dayofweek,  # Monday=0 ... Sunday=6
).drop(columns='Date')

df.head()

Unnamed: 0,ATP,B365W,B365L,PSW,PSL,WPts,LPts,match_count_winner,match_count_loser,elo_winner,elo_loser,proba_elo,match_count_indoor_hard_winner,match_count_indoor_hard_loser,elo_indoor_hard_winner,elo_indoor_hard_loser,proba_elo_indoor_hard,match_count_indoor_carpet_winner,match_count_indoor_carpet_loser,elo_indoor_carpet_winner,elo_indoor_carpet_loser,proba_elo_indoor_carpet,match_count_indoor_clay_winner,match_count_indoor_clay_loser,elo_indoor_clay_winner,elo_indoor_clay_loser,proba_elo_indoor_clay,match_count_outdoor_hard_winner,match_count_outdoor_hard_loser,elo_outdoor_hard_winner,elo_outdoor_hard_loser,proba_elo_outdoor_hard,match_count_outdoor_clay_winner,match_count_outdoor_clay_loser,elo_outdoor_clay_winner,elo_outdoor_clay_loser,proba_elo_outdoor_clay,match_count_outdoor_grass_winner,match_count_outdoor_grass_loser,elo_outdoor_grass_winner,elo_outdoor_grass_loser,proba_elo_outdoor_grass,Winner_ID,Loser_ID,Surface_Carpet,Surface_Clay,Surface_Grass,Surface_Hard,Tournament_ID,Series_ATP250,Series_ATP500,Series_Grand Slam,Series_International,Series_International Gold,Series_Masters,Series_Masters 1000,Series_Masters Cup,Court_Indoor,Court_Outdoor,Round_1st Round,Round_2nd Round,Round_3rd Round,Round_4th Round,Round_Quarterfinals,Round_Round Robin,Round_Semifinals,Round_The Final,Best_of_3.0,Best_of_5.0,Year,Month,Day,DayOfWeek
0,1,1.16,4.5,1.241,4.55,0.0,0.0,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,1,465,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2004,1,5,0
1,1,2.0,1.72,2.17,1.752,0.0,0.0,13,0,1569.12595,1500.0,0.598188,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,13,0,1569.12595,1500.0,0.598188,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,2,33,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2004,1,5,0
2,1,1.83,1.83,1.73,2.21,0.0,0.0,1,3,1484.0,1515.297526,0.455081,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,1,3,1484.0,1515.297526,0.455081,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,3,510,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2004,1,5,0
3,1,1.4,2.75,1.459,2.93,0.0,0.0,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,4,303,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2004,1,5,0
4,1,1.8,1.909,1.862,2.0,0.0,0.0,3,0,1457.584184,1500.0,0.43926,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,3,0,1457.584184,1500.0,0.43926,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,5,36,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2004,1,5,0


In [9]:
# Name under which the fully encoded frame is kept for later comparison.
# NOTE(review): this binds a second name to the same object - it is not a copy,
# so later in-place changes made through `df_old` are also visible through `df`.
df_old = df

# Persist the encoded frame to disk (the index is written as the first CSV column)
df_old.to_csv(path_or_buf="df_old.csv")

df_old.head()

Unnamed: 0,ATP,B365W,B365L,PSW,PSL,WPts,LPts,match_count_winner,match_count_loser,elo_winner,elo_loser,proba_elo,match_count_indoor_hard_winner,match_count_indoor_hard_loser,elo_indoor_hard_winner,elo_indoor_hard_loser,proba_elo_indoor_hard,match_count_indoor_carpet_winner,match_count_indoor_carpet_loser,elo_indoor_carpet_winner,elo_indoor_carpet_loser,proba_elo_indoor_carpet,match_count_indoor_clay_winner,match_count_indoor_clay_loser,elo_indoor_clay_winner,elo_indoor_clay_loser,proba_elo_indoor_clay,match_count_outdoor_hard_winner,match_count_outdoor_hard_loser,elo_outdoor_hard_winner,elo_outdoor_hard_loser,proba_elo_outdoor_hard,match_count_outdoor_clay_winner,match_count_outdoor_clay_loser,elo_outdoor_clay_winner,elo_outdoor_clay_loser,proba_elo_outdoor_clay,match_count_outdoor_grass_winner,match_count_outdoor_grass_loser,elo_outdoor_grass_winner,elo_outdoor_grass_loser,proba_elo_outdoor_grass,Winner_ID,Loser_ID,Surface_Carpet,Surface_Clay,Surface_Grass,Surface_Hard,Tournament_ID,Series_ATP250,Series_ATP500,Series_Grand Slam,Series_International,Series_International Gold,Series_Masters,Series_Masters 1000,Series_Masters Cup,Court_Indoor,Court_Outdoor,Round_1st Round,Round_2nd Round,Round_3rd Round,Round_4th Round,Round_Quarterfinals,Round_Round Robin,Round_Semifinals,Round_The Final,Best_of_3.0,Best_of_5.0,Year,Month,Day,DayOfWeek
0,1,1.16,4.5,1.241,4.55,0.0,0.0,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,1,465,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2004,1,5,0
1,1,2.0,1.72,2.17,1.752,0.0,0.0,13,0,1569.12595,1500.0,0.598188,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,13,0,1569.12595,1500.0,0.598188,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,2,33,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2004,1,5,0
2,1,1.83,1.83,1.73,2.21,0.0,0.0,1,3,1484.0,1515.297526,0.455081,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,1,3,1484.0,1515.297526,0.455081,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,3,510,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2004,1,5,0
3,1,1.4,2.75,1.459,2.93,0.0,0.0,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,4,303,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2004,1,5,0
4,1,1.8,1.909,1.862,2.0,0.0,0.0,3,0,1457.584184,1500.0,0.43926,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,3,0,1457.584184,1500.0,0.43926,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,5,36,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2004,1,5,0


In [11]:
# Winner/loser-specific columns become PlayerA/PlayerB columns.
# At this stage the winner is always Player A.
player_column_renames = {
    'B365W': 'B365_PlayerA',
    'B365L': 'B365_PlayerB',
    'PSW': 'PS_PlayerA',
    'PSL': 'PS_PlayerB',
    'WPts': 'PlayerA_Pts',
    'LPts': 'PlayerB_Pts',
    'match_count_winner': 'match_count_PlayerA',
    'match_count_loser': 'match_count_PlayerB',
    'elo_winner': 'elo_PlayerA',
    'elo_loser': 'elo_PlayerB',
    'proba_elo': 'proba_elo_PlayerA_Wins',
    'Winner_ID': 'PlayerA_ID',
    'Loser_ID': 'PlayerB_ID',
}

# Columns that are part of the schema but keep their current name
unchanged_columns = [
    'Surface_Carpet', 'Surface_Clay', 'Surface_Grass', 'Surface_Hard',
    'Tournament_ID',
    'Series_ATP250', 'Series_ATP500', 'Series_Grand Slam', 'Series_International',
    'Series_International Gold', 'Series_Masters', 'Series_Masters 1000',
    'Series_Masters Cup',
    'Court_Indoor', 'Court_Outdoor',
    'Round_1st Round', 'Round_2nd Round', 'Round_3rd Round', 'Round_4th Round',
    'Round_Quarterfinals', 'Round_Round Robin', 'Round_Semifinals', 'Round_The Final',
    'Best_of_3.0', 'Best_of_5.0',
    'Year', 'Month', 'Day', 'DayOfWeek',
]

# Full old-name -> new-name mapping. The surface-specific *_winner / *_loser and
# proba_elo_<surface> columns are not listed here and therefore keep their names.
new_names = {
    'ATP': 'ATP',
    **player_column_renames,
    **{column: column for column in unchanged_columns},
}

# Apply the mapping in place on the existing frame object
df_old.rename(columns=new_names, inplace=True)

# Every row is still in winner-first order, so Player A won every match
df_old['PlayerA_Wins'] = True

# Show the first 60 rows of the renamed frame
df_old.head(60)

Unnamed: 0,ATP,B365_PlayerA,B365_PlayerB,PS_PlayerA,PS_PlayerB,PlayerA_Pts,PlayerB_Pts,match_count_PlayerA,match_count_PlayerB,elo_PlayerA,elo_PlayerB,proba_elo_PlayerA_Wins,match_count_indoor_hard_winner,match_count_indoor_hard_loser,elo_indoor_hard_winner,elo_indoor_hard_loser,proba_elo_indoor_hard,match_count_indoor_carpet_winner,match_count_indoor_carpet_loser,elo_indoor_carpet_winner,elo_indoor_carpet_loser,proba_elo_indoor_carpet,match_count_indoor_clay_winner,match_count_indoor_clay_loser,elo_indoor_clay_winner,elo_indoor_clay_loser,proba_elo_indoor_clay,match_count_outdoor_hard_winner,match_count_outdoor_hard_loser,elo_outdoor_hard_winner,elo_outdoor_hard_loser,proba_elo_outdoor_hard,match_count_outdoor_clay_winner,match_count_outdoor_clay_loser,elo_outdoor_clay_winner,elo_outdoor_clay_loser,proba_elo_outdoor_clay,match_count_outdoor_grass_winner,match_count_outdoor_grass_loser,elo_outdoor_grass_winner,elo_outdoor_grass_loser,proba_elo_outdoor_grass,PlayerA_ID,PlayerB_ID,Surface_Carpet,Surface_Clay,Surface_Grass,Surface_Hard,Tournament_ID,Series_ATP250,Series_ATP500,Series_Grand Slam,Series_International,Series_International Gold,Series_Masters,Series_Masters 1000,Series_Masters Cup,Court_Indoor,Court_Outdoor,Round_1st Round,Round_2nd Round,Round_3rd Round,Round_4th Round,Round_Quarterfinals,Round_Round Robin,Round_Semifinals,Round_The Final,Best_of_3.0,Best_of_5.0,Year,Month,Day,DayOfWeek,PlayerA_Wins
0,1,1.16,4.5,1.241,4.55,0.0,0.0,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,1,465,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2004,1,5,0,True
1,1,2.0,1.72,2.17,1.752,0.0,0.0,13,0,1569.12595,1500.0,0.598188,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,13,0,1569.12595,1500.0,0.598188,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,2,33,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2004,1,5,0,True
2,1,1.83,1.83,1.73,2.21,0.0,0.0,1,3,1484.0,1515.297526,0.455081,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,1,3,1484.0,1515.297526,0.455081,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,3,510,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2004,1,5,0,True
3,1,1.4,2.75,1.459,2.93,0.0,0.0,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,4,303,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2004,1,5,0,True
4,1,1.8,1.909,1.862,2.0,0.0,0.0,3,0,1457.584184,1500.0,0.43926,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,3,0,1457.584184,1500.0,0.43926,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,5,36,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2004,1,5,0,True
5,1,1.8,1.9,1.699,2.25,0.0,0.0,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,6,673,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2004,1,6,1,True
6,1,1.533,2.375,1.592,2.51,0.0,0.0,1,0,1484.0,1500.0,0.47699,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,1,0,1484.0,1500.0,0.47699,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,7,22,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2004,1,6,1,True
7,1,1.444,2.625,1.526,2.7,0.0,0.0,3,0,1485.438774,1500.0,0.479057,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,3,0,1485.438774,1500.0,0.479057,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,8,224,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2004,1,6,1,True
8,1,1.615,2.2,1.676,2.3,0.0,0.0,2,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,2,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,9,462,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2004,1,6,1,True
9,1,2.625,1.444,2.7,1.526,0.0,0.0,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,10,212,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2004,1,7,2,True


In [12]:
# Summarise the renamed frame: index range, column names, non-null counts and dtypes
df_old.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48824 entries, 0 to 48823
Data columns (total 74 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   ATP                               48824 non-null  int64  
 1   B365_PlayerA                      48824 non-null  float64
 2   B365_PlayerB                      48824 non-null  float64
 3   PS_PlayerA                        48824 non-null  float64
 4   PS_PlayerB                        48824 non-null  float64
 5   PlayerA_Pts                       48824 non-null  float64
 6   PlayerB_Pts                       48824 non-null  float64
 7   match_count_PlayerA               48824 non-null  int64  
 8   match_count_PlayerB               48824 non-null  int64  
 9   elo_PlayerA                       48824 non-null  float64
 10  elo_PlayerB                       48824 non-null  float64
 11  proba_elo_PlayerA_Wins            48824 non-null  float64
 12  matc

In [13]:
# Print the column/dtype summary of df_old.
# NOTE(review): this repeats the df_old.info() call of the previous cell with no
# change to df_old in between - one of the two cells can probably be removed.
df_old.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48824 entries, 0 to 48823
Data columns (total 74 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   ATP                               48824 non-null  int64  
 1   B365_PlayerA                      48824 non-null  float64
 2   B365_PlayerB                      48824 non-null  float64
 3   PS_PlayerA                        48824 non-null  float64
 4   PS_PlayerB                        48824 non-null  float64
 5   PlayerA_Pts                       48824 non-null  float64
 6   PlayerB_Pts                       48824 non-null  float64
 7   match_count_PlayerA               48824 non-null  int64  
 8   match_count_PlayerB               48824 non-null  int64  
 9   elo_PlayerA                       48824 non-null  float64
 10  elo_PlayerB                       48824 non-null  float64
 11  proba_elo_PlayerA_Wins            48824 non-null  float64
 12  matc

In [14]:
import pandas as pd
import numpy as np

# Purpose: randomly exchange the Player A / Player B roles in roughly half of
# the rows (row order unchanged) and set `PlayerA_Wins` accordingly.

# Fixed seed so that Restart Kernel -> Run All reproduces the same swap mask
# (and therefore the same exported CSV).
SWAP_SEED = 42
swap_rng = np.random.default_rng(SWAP_SEED)

# Work on a copy so df_old stays intact and this cell is safe to re-run
df_shuffled = df_old.copy()

# Complement of Player A's Elo win probability; it is swapped together with
# 'proba_elo_PlayerA_Wins' in the loop below.
df_shuffled["proba_elo_PlayerB_Wins"] = 1.0 - df_shuffled["proba_elo_PlayerA_Wins"]

# Player A columns to swap; the Player B counterpart is derived by name
match_columns = ['B365_PlayerA', 'PS_PlayerA', 'PlayerA_Pts', 'elo_PlayerA', 'match_count_PlayerA', 'proba_elo_PlayerA_Wins', 'PlayerA_ID']

# Boolean mask with one entry per row: True means "swap A and B" (p = 0.5)
swap_mask = swap_rng.random(len(df_shuffled)) < 0.5

# Swap each A/B column pair on the masked rows. Both sides are snapshotted
# first, and np.where keeps the original dtype (no NaN-filled temp column, so
# the int64 count/ID columns never take a float round-trip).
for playerA_col in match_columns:
    playerB_col = playerA_col.replace('PlayerA', 'PlayerB')

    a_values = df_shuffled[playerA_col].to_numpy(copy=True)
    b_values = df_shuffled[playerB_col].to_numpy(copy=True)

    df_shuffled[playerA_col] = np.where(swap_mask, b_values, a_values)
    df_shuffled[playerB_col] = np.where(swap_mask, a_values, b_values)

# Player A wins exactly on the rows that were NOT swapped.
# NOTE(review): this assumes Player A is the winner in every row of df_old - confirm upstream.
df_shuffled['PlayerA_Wins'] = ~swap_mask

# NOTE(review): the surface-specific '*_winner' / '*_loser' columns (e.g.
# 'elo_outdoor_hard_winner') are not swapped and stay keyed by the actual
# winner/loser. Comparing them with 'elo_PlayerA' reveals the match outcome,
# so drop or re-key them before modelling to avoid target leakage.

# Display the updated DataFrame
df_shuffled.head()

Unnamed: 0,ATP,B365_PlayerA,B365_PlayerB,PS_PlayerA,PS_PlayerB,PlayerA_Pts,PlayerB_Pts,match_count_PlayerA,match_count_PlayerB,elo_PlayerA,elo_PlayerB,proba_elo_PlayerA_Wins,match_count_indoor_hard_winner,match_count_indoor_hard_loser,elo_indoor_hard_winner,elo_indoor_hard_loser,proba_elo_indoor_hard,match_count_indoor_carpet_winner,match_count_indoor_carpet_loser,elo_indoor_carpet_winner,elo_indoor_carpet_loser,proba_elo_indoor_carpet,match_count_indoor_clay_winner,match_count_indoor_clay_loser,elo_indoor_clay_winner,elo_indoor_clay_loser,proba_elo_indoor_clay,match_count_outdoor_hard_winner,match_count_outdoor_hard_loser,elo_outdoor_hard_winner,elo_outdoor_hard_loser,proba_elo_outdoor_hard,match_count_outdoor_clay_winner,match_count_outdoor_clay_loser,elo_outdoor_clay_winner,elo_outdoor_clay_loser,proba_elo_outdoor_clay,match_count_outdoor_grass_winner,match_count_outdoor_grass_loser,elo_outdoor_grass_winner,elo_outdoor_grass_loser,proba_elo_outdoor_grass,PlayerA_ID,PlayerB_ID,Surface_Carpet,Surface_Clay,Surface_Grass,Surface_Hard,Tournament_ID,Series_ATP250,Series_ATP500,Series_Grand Slam,Series_International,Series_International Gold,Series_Masters,Series_Masters 1000,Series_Masters Cup,Court_Indoor,Court_Outdoor,Round_1st Round,Round_2nd Round,Round_3rd Round,Round_4th Round,Round_Quarterfinals,Round_Round Robin,Round_Semifinals,Round_The Final,Best_of_3.0,Best_of_5.0,Year,Month,Day,DayOfWeek,PlayerA_Wins,proba_elo_PlayerB_Wins
0,1,1.16,4.5,1.241,4.55,0.0,0.0,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,1,465,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2004,1,5,0,True,0.5
1,1,1.72,2.0,1.752,2.17,0.0,0.0,0,13,1500.0,1569.12595,0.401812,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,13,0,1569.12595,1500.0,0.598188,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,33,2,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2004,1,5,0,False,0.598188
2,1,1.83,1.83,2.21,1.73,0.0,0.0,3,1,1515.297526,1484.0,0.544919,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,1,3,1484.0,1515.297526,0.455081,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,510,3,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2004,1,5,0,False,0.455081
3,1,1.4,2.75,1.459,2.93,0.0,0.0,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,4,303,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2004,1,5,0,True,0.5
4,1,1.909,1.8,2.0,1.862,0.0,0.0,0,3,1500.0,1457.584184,0.56074,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,3,0,1457.584184,1500.0,0.43926,0,0,1500.0,1500.0,0.5,0,0,1500.0,1500.0,0.5,36,5,False,False,False,True,1,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,2004,1,5,0,False,0.43926


In [15]:
# Sanity-check the shuffled frame: the row count should match df_old, with the
# added 'proba_elo_PlayerB_Wins' column accounting for the one extra column.
df_shuffled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48824 entries, 0 to 48823
Data columns (total 75 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   ATP                               48824 non-null  int64  
 1   B365_PlayerA                      48824 non-null  float64
 2   B365_PlayerB                      48824 non-null  float64
 3   PS_PlayerA                        48824 non-null  float64
 4   PS_PlayerB                        48824 non-null  float64
 5   PlayerA_Pts                       48824 non-null  float64
 6   PlayerB_Pts                       48824 non-null  float64
 7   match_count_PlayerA               48824 non-null  int64  
 8   match_count_PlayerB               48824 non-null  int64  
 9   elo_PlayerA                       48824 non-null  float64
 10  elo_PlayerB                       48824 non-null  float64
 11  proba_elo_PlayerA_Wins            48824 non-null  float64
 12  matc

In [16]:
# Persist the final shuffled DataFrame to 'Vahid_final.csv' (path is relative to
# the working directory). to_csv writes the index as an unnamed first column by
# default, so readers of this file should pass index_col=0.
df_shuffled.to_csv('Vahid_final.csv') #### FINAL DATAFRAME