### Betting Data

In [11]:
import os
import pandas as pd

# Path to the folder containing the betting data files
folder_path = '/content/drive/My Drive/Betting_data'

# Listing all .xls files in the directory
files = [f for f in os.listdir(folder_path) if f.endswith('.xls')]

# Initializing an empty list to hold DataFrames
data_frames = []

# Loading data into a DataFrame
for file in files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_excel(file_path)
    year = file.split('.')[0]  # Assuming the file name contains the year
    df['Year'] = year
    data_frames.append(df)

# Combining all the DataFrames into a single DataFrame
betting_df = pd.concat(data_frames, ignore_index=True)

print(betting_df.head())


   ATP  Location                              Tournament       Date  \
0    1  Adelaide  Next Generation Adelaide International 2007-12-31   
1    1  Adelaide  Next Generation Adelaide International 2007-12-31   
2    1  Adelaide  Next Generation Adelaide International 2007-12-31   
3    1  Adelaide  Next Generation Adelaide International 2007-12-31   
4    1  Adelaide  Next Generation Adelaide International 2007-12-31   

          Series    Court Surface      Round  Best of       Winner  ... SBW  \
0  International  Outdoor    Hard  1st Round        3   Querrey S.  ... NaN   
1  International  Outdoor    Hard  1st Round        3  Sirianni J.  ... NaN   
2  International  Outdoor    Hard  1st Round        3   Russell M.  ... NaN   
3  International  Outdoor    Hard  1st Round        3    Becker B.  ... NaN   
4  International  Outdoor    Hard  1st Round        3    Zverev M.  ... NaN   

  SBL B&WW  B&WL  SJW  SJL  MaxW MaxL AvgW AvgL  
0 NaN  NaN   NaN  NaN  NaN   NaN  NaN  NaN  NaN 

## Data Preparation

In [12]:
# Selecting relevant columns
relevant_columns = ['Year', 'Winner', 'Loser', 'B365W', 'B365L', 'Date']
betting_df = betting_df[relevant_columns]

# Dropping rows with missing odds
betting_df = betting_df.dropna(subset=['B365W', 'B365L'])

# Displaying cleaned DataFrame
print(betting_df.head())


   Year       Winner           Loser  B365W  B365L       Date
0  2008   Querrey S.       Darcis S.   1.53   2.37 2007-12-31
1  2008  Sirianni J.     Stepanek R.   6.50   1.10 2007-12-31
2  2008   Russell M.  Del Potro J.M.   3.75   1.25 2007-12-31
3  2008    Becker B.       Smeets R.   1.40   2.75 2007-12-31
4  2008    Zverev M.        Mayer F.   1.72   2.00 2007-12-31


In [13]:
#Function to calculate implied probabilities
def calculate_probabilities(row):
    alpha = row['B365W']  #Winner odds
    beta = row['B365L']   #Loser odds

    if pd.isna(alpha) or pd.isna(beta):
        return pd.Series([None, None])

    implied_p1 = 1 / alpha
    implied_p2 = 1 / beta

    total = implied_p1 + implied_p2

    normalized_p1 = implied_p1 / total
    normalized_p2 = implied_p2 / total

    return pd.Series([normalized_p1, normalized_p2])

# Normalizing probabilities
betting_df[['normalized_p1', 'normalized_p2']] = betting_df.apply(calculate_probabilities, axis=1)

#Checking the normalized columns
print(betting_df[['normalized_p1', 'normalized_p2']].head())

# Calculating the result column (1 if normalized_p1 > normalized_p2, otherwise 0)
betting_df['result'] = (betting_df['normalized_p1'] > betting_df['normalized_p2']).astype(int)

# Checking the 'result' column
print(betting_df[['normalized_p1', 'normalized_p2', 'result']].head())


   normalized_p1  normalized_p2
0       0.607692       0.392308
1       0.144737       0.855263
2       0.250000       0.750000
3       0.662651       0.337349
4       0.537634       0.462366
   normalized_p1  normalized_p2  result
0       0.607692       0.392308       1
1       0.144737       0.855263       0
2       0.250000       0.750000       0
3       0.662651       0.337349       1
4       0.537634       0.462366       1


## Data Modeling

### ELO model with constant K

In [14]:
from sklearn.metrics import accuracy_score, log_loss

def calculate_EP(elo_1, elo_2):
    return 1 / (1 + 10 ** ((elo_2 - elo_1) / 400))

def update_elo(current_elo, expected_prob, actual_result, k_factor):
    return current_elo + k_factor * (actual_result - expected_prob)

# Function to apply the constant K Elo model
def apply_constant_elo_model(betting_df, k_factor):
    elo_ratings = {}  # Initialize Elo ratings dictionary
    predicted_probs = []

    for index, row in betting_df.iterrows():
        winner_name = row['Winner']
        loser_name = row['Loser']

        winner_elo = elo_ratings.get(winner_name, 1500)
        loser_elo = elo_ratings.get(loser_name, 1500)

        expected_win_prob = calculate_EP(winner_elo, loser_elo)
        predicted_probs.append(expected_win_prob)

        # Update Elo ratings
        elo_ratings[winner_name] = update_elo(winner_elo, expected_win_prob, 1, k_factor)
        elo_ratings[loser_name] = update_elo(loser_elo, expected_win_prob, 0, k_factor)

    betting_df['elo_predicted_prob'] = predicted_probs
    return betting_df, elo_ratings

# Grid search for the best K factor
best_k = None
best_accuracy = 0
best_log_loss = float('inf')
best_calibration = None

k_values = [10, 20, 30, 40]

for k in k_values:
    temp_betting_df, _ = apply_constant_elo_model(betting_df.copy(), k)

    accuracy = accuracy_score(temp_betting_df['result'], temp_betting_df['elo_predicted_prob'] > 0.5)
    log_loss_value = log_loss(temp_betting_df['result'], temp_betting_df['elo_predicted_prob'])
    calibration = temp_betting_df['result'].mean() / temp_betting_df['elo_predicted_prob'].mean()

    if log_loss_value < best_log_loss:
        best_k = k
        best_accuracy = accuracy
        best_log_loss = log_loss_value
        best_calibration = calibration

print(f"Best K: {best_k}")
print(f"(Accuracy, Log-Loss, Calibration): {best_accuracy}, {best_log_loss}, {best_calibration}")


Best K: 10
(Accuracy, Log-Loss, Calibration): 0.7094626042748201, 0.5581992123263912, 1.2266019425029742


### ELO Model with dynamic K

In [15]:
def dynamic_k_factor(matches_played, delta=100, myu=5, sigma=0.1):
    return delta / ((matches_played + myu) ** sigma)

def apply_dynamic_elo_model(betting_df, elo_ratings, matches_played, delta=100, myu=5, sigma=0.1):
    predicted_probs = []

    for index, row in betting_df.iterrows():
        winner_name = row['Winner']
        loser_name = row['Loser']

        winner_elo = elo_ratings.get(winner_name, 1500)
        loser_elo = elo_ratings.get(loser_name, 1500)

        k_factor_winner = dynamic_k_factor(matches_played.get(winner_name, 0), delta, myu, sigma)
        k_factor_loser = dynamic_k_factor(matches_played.get(loser_name, 0), delta, myu, sigma)

        expected_win_prob = calculate_EP(winner_elo, loser_elo)
        predicted_probs.append(expected_win_prob)

        # Updating Elo ratings
        elo_ratings[winner_name] = update_elo(winner_elo, expected_win_prob, 1, k_factor_winner)
        elo_ratings[loser_name] = update_elo(loser_elo, expected_win_prob, 0, k_factor_loser)

        # Updating matches played
        matches_played[winner_name] = matches_played.get(winner_name, 0) + 1
        matches_played[loser_name] = matches_played.get(loser_name, 0) + 1

    betting_df['elo_predicted_prob'] = predicted_probs
    return betting_df

# Initializing Elo ratings and matches played
elo_ratings = {}
matches_played = {}

# Applying the dynamic Elo model
betting_df = apply_dynamic_elo_model(betting_df, elo_ratings, matches_played)

# Re-evaluating the model metrics
accuracy_elo = accuracy_score(betting_df['result'], betting_df['elo_predicted_prob'] > 0.5)
log_loss_elo = log_loss(betting_df['result'], betting_df['elo_predicted_prob'])
calibration_elo = betting_df['result'].mean() / betting_df['elo_predicted_prob'].mean()

print(f"Dynamic Elo Model - Accuracy: {accuracy_elo}, Log-Loss: {log_loss_elo}, Calibration: {calibration_elo}")


Dynamic Elo Model - Accuracy: 0.6773119472987004, Log-Loss: 1.0466574299619376, Calibration: 1.1680353976993483
