One can read all datasets directly from the disk and avoid doing the time consuming processing

In [1]:
import pandas as pd

# Load data from the disk
elo_rates = pd.read_csv("archive/elo_rates_enriched.csv", index_col=0)
df_atp = pd.read_csv("archive/atp_data_enriched.csv", index_col=0)
# df_conf = pd.read_csv("archive/confidence_data.csv")

df_atp["Date"] = pd.to_datetime(df_atp["Date"])

# Drop 350 rows where bet odds are absent
# We just ignore the odds for losers cause they do not appear in our futher analysis.
df_atp_reduced = df_atp.dropna(subset=["PSW", "B365W"], axis=0)

# Save dataframes on the disk
df_atp_reduced.to_csv("archive/df_atp_reduced.csv")

# Display the first few rows to verify the changes
df_atp_reduced.info()

<class 'pandas.core.frame.DataFrame'>
Index: 49074 entries, 125 to 64485
Data columns (total 53 columns):
 #   Column                            Non-Null Count  Dtype         
---  ------                            --------------  -----         
 0   ATP                               49074 non-null  int64         
 1   Location                          49074 non-null  object        
 2   Tournament                        49074 non-null  object        
 3   Date                              49074 non-null  datetime64[ns]
 4   Series                            49074 non-null  object        
 5   Court                             49074 non-null  object        
 6   Surface                           49074 non-null  object        
 7   Round                             49074 non-null  object        
 8   Best of                           49059 non-null  float64       
 9   Winner                            49074 non-null  object        
 10  Loser                             49074 non-null 

It's time to define some betting strategies (base line):

1. To bet on the higher ranked player (HPBS)
2. To bet on the lower ranked player (LPBS)
3. To bet on the player with higher total elo rate (HRBS)
4. To bet on the player with lower total elo rate (LRBS)
5. To bet on the player with the better elo rate in the appropirate field type (BRFTBS)
6. To bet on a random player (RPBS)
7. To bet on the winner always (WBS)
8. To bet on the loser always (LBS)

Ofcourse, the strategies 7 and 8 are just theoretical and put there to compare the strategies with the theoretical maximum profit/loss.

In [2]:
import pandas as pd
import numpy as np

profit = df_atp_reduced[["Court", "Surface", "Winner", "Loser", "WPts", "LPts", "PSW", "B365W", "proba_elo", "proba_elo_indoor_hard", "proba_elo_indoor_clay", "proba_elo_indoor_carpet", "proba_elo_outdoor_hard", "proba_elo_outdoor_clay", "proba_elo_outdoor_grass"]].copy()

# Strategy 1
strategies = pd.DataFrame(df_atp["WPts"] > df_atp["LPts"], columns=["HPBS"], index=df_atp_reduced.index)
profit[["HPBS_PS", "HPBS_B365"]] = profit.loc[strategies["HPBS"], ["PSW", "B365W"]].rename(columns={"PSW": "HPBS_PS", "B365W": "HPBS_B365"}) - 1.0

# Strategy 2
strategies["LPBS"] = df_atp["WPts"] < df_atp["LPts"]
profit[["LPBS_PS", "LPBS_B365"]] = profit.loc[strategies["LPBS"], ["PSW", "B365W"]].rename(columns={"PSW": "LPBS_PS", "B365W": "LPBS_B365"}) - 1.0

# Strategy 3
strategies["HRBS"] = df_atp["elo_winner"] > df_atp["elo_loser"]
profit[["HRBS_PS", "HRBS_B365"]] = profit.loc[strategies["HRBS"], ["PSW", "B365W"]].rename(columns={"PSW": "HRBS_PS", "B365W": "HRBS_B365"}) - 1.0

# Strategy 4
strategies["LRBS"] = df_atp["elo_winner"] < df_atp["elo_loser"]
profit[["LRBS_PS", "LRBS_B365"]] = profit.loc[strategies["LRBS"], ["PSW", "B365W"]].rename(columns={"PSW": "LRBS_PS", "B365W": "LRBS_B365"}) - 1.0

# Strategy 5
court_surface_type = df_atp[["Court", "Surface"]].drop_duplicates().sort_values(by="Court").to_numpy().tolist()
strategies["BRFTBS"] = False
for pair in court_surface_type:
    col_name = f"proba_elo_{pair[0].lower()}_{pair[1].lower()}"
    strategies.loc[(profit["Court"]==pair[0]) & (profit["Surface"]==pair[1]), "BRFTBS"] = profit.loc[(profit["Court"]==pair[0]) & (profit["Surface"]==pair[1]), col_name] >= .5
profit[["BRFTBS_PS", "BRFTBS_B365"]] = profit.loc[strategies["BRFTBS"], ["PSW", "B365W"]].rename(columns={"PSW": "BRFTBS_PS", "B365W": "BRFTBS_B365"}) - 1.0

# Strategy 6
np.random.seed(13572)
strategies["RPBS"] = np.random.randint(0, 2, size=len(strategies))==1
profit[["RPBS_PS", "RPBS_B365"]] = profit.loc[strategies["RPBS"], ["PSW", "B365W"]].rename(columns={"PSW": "RPBS_PS", "B365W": "RPBS_B365"}) - 1.0

# Strategy 7
strategies["WBS"] = True
profit[["WBS_PS", "WBS_B365"]] = profit.loc[strategies["WBS"], ["PSW", "B365W"]].rename(columns={"PSW": "WBS_PS", "B365W": "WBS_B365"}) - 1.0

# Strategy 8
strategies["LBS"] = False
profit[["LBS_PS", "LBS_B365"]] = profit.loc[strategies["LBS"], ["PSW", "B365W"]].rename(columns={"PSW": "LBS_PS", "B365W": "LBS_B365"}) - 1.0

# profit.dropna(axis=0, subset=["PSW", "B365W"], inplace=True)
profit.fillna(value=-1.0, inplace=True)

profit.head(10)

Unnamed: 0,Court,Surface,Winner,Loser,WPts,LPts,PSW,B365W,proba_elo,proba_elo_indoor_hard,...,LRBS_PS,LRBS_B365,BRFTBS_PS,BRFTBS_B365,RPBS_PS,RPBS_B365,WBS_PS,WBS_B365,LBS_PS,LBS_B365
125,Outdoor,Hard,Dent T.,Horna L.,0.0,0.0,1.241,1.16,0.5,0.5,...,-1.0,-1.0,0.241,0.16,0.241,0.16,0.241,0.16,-1.0,-1.0
126,Outdoor,Hard,Martin A.,Ancic M.,0.0,0.0,2.17,2.0,0.598188,0.5,...,-1.0,-1.0,1.17,1.0,1.17,1.0,1.17,1.0,-1.0,-1.0
127,Outdoor,Hard,Beck K.,Vahaly B.,0.0,0.0,1.73,1.83,0.455081,0.5,...,0.73,0.83,-1.0,-1.0,0.73,0.83,0.73,0.83,-1.0,-1.0
128,Outdoor,Hard,Nieminen J.,Moodie W.,0.0,0.0,1.459,1.4,0.5,0.5,...,-1.0,-1.0,0.459,0.4,0.459,0.4,0.459,0.4,-1.0,-1.0
130,Outdoor,Hard,Arthurs W.,Karlovic I.,0.0,0.0,1.862,1.8,0.43926,0.5,...,0.862,0.8,-1.0,-1.0,-1.0,-1.0,0.862,0.8,-1.0,-1.0
131,Outdoor,Hard,Sanguinetti D.,Elsner D.,0.0,0.0,1.699,1.8,0.5,0.5,...,-1.0,-1.0,0.699,0.8,0.699,0.8,0.699,0.8,-1.0,-1.0
133,Outdoor,Hard,Hrbaty D.,Benneteau J.,0.0,0.0,1.592,1.533,0.47699,0.5,...,0.592,0.533,-1.0,-1.0,0.592,0.533,0.592,0.533,-1.0,-1.0
134,Outdoor,Hard,Clement A.,Hanescu V.,0.0,0.0,1.526,1.444,0.479057,0.5,...,0.526,0.444,-1.0,-1.0,0.526,0.444,0.526,0.444,-1.0,-1.0
135,Outdoor,Hard,Llodra M.,Burgsmuller L.,0.0,0.0,1.676,1.615,0.5,0.5,...,-1.0,-1.0,0.676,0.615,0.676,0.615,0.676,0.615,-1.0,-1.0
136,Outdoor,Hard,Saulnier C.,Ginepri R.,0.0,0.0,2.7,2.625,0.5,0.5,...,-1.0,-1.0,1.7,1.625,-1.0,-1.0,1.7,1.625,-1.0,-1.0


# This is how each strategy would perform on "Pinnacle Sports":

In [3]:
print("The statistics related to bet strategies on \"Pinnacle Sports\"")
profit[["HPBS_PS", "LPBS_PS", "HRBS_PS", "LRBS_PS", "BRFTBS_PS", "RPBS_PS", "WBS_PS", "LBS_PS"]].describe()

The statistics related to bet strategies on "Pinnacle Sports"


Unnamed: 0,HPBS_PS,LPBS_PS,HRBS_PS,LRBS_PS,BRFTBS_PS,RPBS_PS,WBS_PS,LBS_PS
count,49074.0,49074.0,49074.0,49074.0,49074.0,49074.0,49074.0,49074.0
mean,-0.03417,-0.047894,-0.004574,-0.071679,0.00632,-0.030564,0.926528,-1.0
std,0.817617,1.680366,0.876072,1.651572,0.929184,1.346063,1.281506,0.0
min,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.028,-1.0
25%,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.28,-1.0
50%,0.18,-1.0,0.18,-1.0,0.171,0.01,0.56,-1.0
75%,0.476,0.794,0.5,0.7,0.51,0.565,1.15,-1.0
max,11.0,45.0,14.0,45.0,20.0,45.0,45.0,-1.0


# This is how each strategy would perform on "Bet 365":

In [4]:
print("The statistics related to bet strategies on \"Bet 365\"")
profit[["HPBS_B365", "LPBS_B365", "HRBS_B365", "LRBS_B365", "BRFTBS_B365", "RPBS_B365", "WBS_B365", "LBS_B365"]].describe()

The statistics related to bet strategies on "Bet 365"


Unnamed: 0,HPBS_B365,LPBS_B365,HRBS_B365,LRBS_B365,BRFTBS_B365,RPBS_B365,WBS_B365,LBS_B365
count,49074.0,49074.0,49074.0,49074.0,49074.0,49074.0,49074.0,49074.0
mean,-0.066666,-0.105147,-0.03938,-0.127194,-0.029235,-0.076004,0.83608,-1.0
std,0.8321,1.50178,0.880207,1.473254,0.923948,1.237792,1.124247,0.0
min,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.029,-1.0
25%,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.25,-1.0
50%,0.14,-1.0,0.14,-1.0,0.14,0.005,0.5,-1.0
75%,0.4,0.72,0.44,0.61,0.44,0.5,1.1,-1.0
max,33.0,28.0,33.0,28.0,33.0,33.0,33.0,-1.0


Summary of the strategies:
- The self-defined variable -field specific Elo rate- is suprisingly the most relevant variable among the ones we defined.
- Except strategy 5 (To bet on the player with the better elo rate in the appropriate field type (BRFTBS)), all betting strategies perform poorly.
- This low marginal profit (in case of BRFTBS on PS) can turn up negative by choosing an arbitrary sub set of the matches, which means losing money.
- Even in the best case senario, his low margin of profit is increases the risk of losing money.
- Therefore, we have to come up with new betting strategies, to gain some money.

# Constructing the feature set:

In [5]:
import pandas as pd
import numpy as np
import sklearn.preprocessing
import joblib

# Load from disk
df_atp_reduced = pd.read_csv("archive/df_atp_reduced.csv", index_col=0)


def extract_features(df: pd.DataFrame, shuffle_mask: np.ndarray | pd.Series | None=None) -> pd.DataFrame:
    # Let's Suffle the players in a way that there is 50% chance for the player A to win and 50% for player B.
    if shuffle_mask is None:
        np.random.seed(98245)
        shuffle_mask = np.random.randint(0, 2, size=len(df))==1
    elif isinstance(shuffle_mask, pd.Series) and len(shuffle_mask) == len(df):
        shuffle_mask = shuffle_mask.values
    elif not isinstance(shuffle_mask, np.ndarray) or len(shuffle_mask) != len(df):
        raise ValueError(f"inappropirate input type or incorrect length for shuffle_mask; a {type(shuffle_mask).__name__} passed")
        
    features = pd.DataFrame(shuffle_mask, columns=["P1==Winner"], index=df.index)

    # Assign the features of the players acordingly
    features["P1_wins_proba_elo"] = (shuffle_mask - df["proba_elo"]).abs()
    features["P1_match_count"] = pd.merge(df.loc[shuffle_mask, "match_count_winner"], df.loc[~shuffle_mask, "match_count_loser"], how="outer", left_index=True, right_index=True).fillna(value=0).sum(axis=1).astype("int")
    features["P2_match_count"] = pd.merge(df.loc[shuffle_mask, "match_count_loser"], df.loc[~shuffle_mask, "match_count_winner"], how="outer", left_index=True, right_index=True).fillna(value=0).sum(axis=1).astype("int")
    features["P1_pts"] = pd.merge(df.loc[shuffle_mask, "WPts"], df.loc[~shuffle_mask, "LPts"], how="outer", left_index=True, right_index=True).fillna(value=0).sum(axis=1).astype("int")
    features["P2_pts"] = pd.merge(df.loc[shuffle_mask, "LPts"], df.loc[~shuffle_mask, "WPts"], how="outer", left_index=True, right_index=True).fillna(value=0).sum(axis=1).astype("int")
    features["Date"] = df["Date"]

    # Each cell in the dataframe features answers to the question whether the specified strategy predicts that player one wins
    # Before constructing the dataframe we should scramble the players so that sometimes player 1 wins and he loses the other times with equal probabilities.
    court_surface_type = df[["Court", "Surface"]].drop_duplicates().sort_values(by="Court").to_numpy().tolist()


    for pair in court_surface_type:
        field_type = f"{pair[0].lower()}_{pair[1].lower()}"
        features[f"field_type=={field_type}"] = (df["Court"].str.lower() + "_" + df["Surface"].str.lower() == field_type) + 0
        features[f"P1_match_count_{field_type}"] = pd.merge(df.loc[shuffle_mask, f"match_count_{field_type}_winner"], df.loc[~shuffle_mask, f"match_count_{field_type}_loser"], how="outer", left_index=True, right_index=True).fillna(value=0).sum(axis=1).astype("int")
        features[f"P2_match_count_{field_type}"] = pd.merge(df.loc[shuffle_mask, f"match_count_{field_type}_loser"], df.loc[~shuffle_mask, f"match_count_{field_type}_winner"], how="outer", left_index=True, right_index=True).fillna(value=0).sum(axis=1).astype("int")
        features[f"P1_wins_proba_elo_{field_type}"] = (~shuffle_mask - df[f"proba_elo_{field_type}"]).abs()

    return features

# We trow out the data related to the indoor clay and carpet matches, as the elo rates belonging to them are in imature
df_atp_reduced = df_atp_reduced[(df_atp_reduced["Court"]=="Outdoor") | (df_atp_reduced["Surface"]=="Hard")]

# Shuffle P1 & P2 according to a random mask
feature_agg = [extract_features(df=df_atp_reduced)]

# Make another feature set with the exact oposite shuffle mask
feature_agg.append(extract_features(df=df_atp_reduced, shuffle_mask=~feature_agg[0]["P1==Winner"]))

# Aggregate the two feature set
features = pd.concat(feature_agg, axis=0).sort_values(by=["Date"])

# Drop the matches in which either of the players has attended less than 10 game,
# as this is considered as too imature scores.
print(f"row count before droping imature elo rates: {len(features)}")
cutoff = 5
drop_list = ["Date", "P1_match_count", "P2_match_count"]
for pair in df_atp_reduced[["Court", "Surface"]].drop_duplicates().sort_values(by="Court").to_numpy().tolist():
    field_type = f"{pair[0].lower()}_{pair[1].lower()}"
    drop_list.append(f"P1_match_count_{field_type}")
    drop_list.append(f"P2_match_count_{field_type}")
    features = features[(features[f"P1_match_count_{field_type}"] >= cutoff) & (features[f"P2_match_count_{field_type}"] >= cutoff)]
print(f"row count after droping imature elo rates: {len(features)}")

# Drop the match count columns as their main pupose was to determine the maturity of the elo rates
features.drop(labels=drop_list, axis=1, inplace=True)
# features.drop(labels=["Date"], axis=1, inplace=True)

# Save the result on the disk
features.to_csv("archive/features.csv")

features.tail()

row count before droping imature elo rates: 95566
row count after droping imature elo rates: 27998


Unnamed: 0,P1==Winner,P1_wins_proba_elo,P1_pts,P2_pts,field_type==indoor_hard,P1_wins_proba_elo_indoor_hard,field_type==outdoor_hard,P1_wins_proba_elo_outdoor_hard,field_type==outdoor_clay,P1_wins_proba_elo_outdoor_clay,field_type==outdoor_grass,P1_wins_proba_elo_outdoor_grass
63113,False,0.530782,7200,5490,1,0.467043,0,0.543762,0,0.300885,0,0.571744
63114,True,0.255611,11445,8455,1,0.898777,0,0.835576,0,0.626603,0,0.866195
63114,False,0.744389,8455,11445,1,0.101223,0,0.164424,0,0.373397,0,0.133805
63115,False,0.690513,5490,11445,1,0.303754,0,0.158293,0,0.191344,0,0.085269
63115,True,0.309487,11445,5490,1,0.696246,0,0.841707,0,0.808656,0,0.914731


In [6]:
features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27998 entries, 33171 to 63115
Data columns (total 12 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   P1==Winner                       27998 non-null  bool   
 1   P1_wins_proba_elo                27998 non-null  float64
 2   P1_pts                           27998 non-null  int64  
 3   P2_pts                           27998 non-null  int64  
 4   field_type==indoor_hard          27998 non-null  int64  
 5   P1_wins_proba_elo_indoor_hard    27998 non-null  float64
 6   field_type==outdoor_hard         27998 non-null  int64  
 7   P1_wins_proba_elo_outdoor_hard   27998 non-null  float64
 8   field_type==outdoor_clay         27998 non-null  int64  
 9   P1_wins_proba_elo_outdoor_clay   27998 non-null  float64
 10  field_type==outdoor_grass        27998 non-null  int64  
 11  P1_wins_proba_elo_outdoor_grass  27998 non-null  float64
dtypes: bool(1), float64