# Importing Libraries

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
import matplotlib.pyplot as plt

# Data Loading and Cleaning

In [2]:
# List of file paths for the data from 2005 to 2019

file_paths = [

    '/content/drive/MyDrive/Betting_data/2005.xls',
    '/content/drive/MyDrive/Betting_data/2006.xls',
    '/content/drive/MyDrive/Betting_data/2007.xls',
    '/content/drive/MyDrive/Betting_data/2008.xls',
    '/content/drive/MyDrive/Betting_data/2009.xls',
    '/content/drive/MyDrive/Betting_data/2010.xls',
    '/content/drive/MyDrive/Betting_data/2011.xls',
    '/content/drive/MyDrive/Betting_data/2012.xls',
    '/content/drive/MyDrive/Betting_data/2013.xlsx',
    '/content/drive/MyDrive/Betting_data/2014.xlsx',
    '/content/drive/MyDrive/Betting_data/2015.xlsx',
    '/content/drive/MyDrive/Betting_data/2016.xlsx',
    '/content/drive/MyDrive/Betting_data/2017.xlsx',
    '/content/drive/MyDrive/Betting_data/2018.xlsx',
    '/content/drive/MyDrive/Betting_data/2019.xlsx'
]

# Initializing an empty list to hold DataFrames
data_frames = []

# Loading data into a DataFrame
for file_path in file_paths:
    if os.path.exists(file_path):
        df = pd.read_excel(file_path)
        data_frames.append(df)
    else:
        print(f"File {file_path} not found.")

# Combining all the DataFrames into a single DataFrame
betting_df = pd.concat(data_frames, ignore_index=True)


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


In [3]:
# Selecting Columns and Data Cleaning

# Converting 'Date' into a datetime object
betting_df["Date"] = pd.to_datetime(betting_df["Date"], errors='coerce')

# Selecting only the relevant columns
columns = [
    "Date",
    "Tournament",
    "Surface",
    "Winner",
    "Loser",
    "WRank",
    "WPts",
    "LRank",
    "LPts",
    "B365W",
    "B365L",
    "PSW",
    "PSL"
]
betting_df = betting_df[columns]

# Converting categorical columns to 'category' data type
categorical_columns = ["Tournament", "Surface"]
betting_df[categorical_columns] = betting_df[categorical_columns].astype("category")

# Handling missing values in 'WRank' and 'LRank'
betting_df["WRank"] = betting_df["WRank"].fillna(100000)
betting_df["LRank"] = betting_df["LRank"].fillna(100000)

# Handling missing values in 'WPts' and 'LPts' by imputing with the median
betting_df["WPts"] = betting_df["WPts"].fillna(betting_df["WPts"].median())
betting_df["LPts"] = betting_df["LPts"].fillna(betting_df["LPts"].median())

# Removing remaining NaN values
betting_df.dropna(inplace=True)

# Creating a higher-ranked player column
betting_df["higher_rank_won"] = betting_df["WRank"] < betting_df["LRank"]

# Calculating the difference in ranking points between the higher and lower-ranked players
betting_df["diff"] = (
    betting_df["WPts"] * betting_df["higher_rank_won"] +
    betting_df["LPts"] * (~betting_df["higher_rank_won"])
) - (
    betting_df["LPts"] * betting_df["higher_rank_won"] +
    betting_df["WPts"] * (~betting_df["higher_rank_won"])
)

# Print the last few rows of the DataFrame to verify the output
print(betting_df.tail())


            Date   Tournament Surface         Winner          Loser  WRank  \
40385 2019-11-15  Masters Cup    Hard      Nadal R.   Tsitsipas S.     1.0   
40386 2019-11-15  Masters Cup    Hard     Zverev A.    Medvedev D.     7.0   
40387 2019-11-16  Masters Cup    Hard  Tsitsipas S.     Federer R.     6.0   
40388 2019-11-16  Masters Cup    Hard      Thiem D.      Zverev A.     5.0   
40389 2019-11-17  Masters Cup    Hard  Tsitsipas S.       Thiem D.     6.0   

         WPts  LRank    LPts  B365W  B365L   PSW   PSL  higher_rank_won  \
40385  9585.0    6.0  4000.0   1.44   2.75  1.39  3.26             True   
40386  2945.0    4.0  5705.0   1.90   1.90  2.14  1.79            False   
40387  4000.0    3.0  6190.0   3.50   1.30  3.75  1.33            False   
40388  5025.0    7.0  2945.0   1.80   2.00  1.84  2.10             True   
40389  4000.0    5.0  5025.0   2.00   1.80  2.00  1.93            False   

         diff  
40385  5585.0  
40386  2760.0  
40387  2190.0  
40388  2080.0  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  betting_df[categorical_columns] = betting_df[categorical_columns].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  betting_df["WRank"] = betting_df["WRank"].fillna(100000)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  betting_df["LRank"] = betting_df["LRank"].fillna(100000)


# Data Splitting

In [4]:
# Split the data into training and testing sets based on date
split_date = pd.to_datetime("2019-01-01")
matches_train = betting_df[betting_df["Date"] < split_date]
matches_test = betting_df[betting_df["Date"] >= split_date]

# Prepare the features and target variable
X_train = matches_train[["diff"]]
y_train = matches_train["higher_rank_won"]
X_test = matches_test[["diff"]]
y_test = matches_test["higher_rank_won"]


In [5]:
# Defining the initial Elo rating for all players

initial_elo = 1500
k_factor = 25  # Constant K-factor

# Creating a set of unique player names from 'Winner' and 'Loser'
# Union needed to avoid duplicate entries

unique_player_names = set(matches_train["Winner"]).union(set(matches_train["Loser"]))

# Initialize the Elo ratings dictionary with the initial Elo score
elo_ratings = {player_name: initial_elo for player_name in unique_player_names}


# ELO Functions definition

In [6]:
# Function to calculate expected probability based on Elo ratings
def calculate_EP(elo_1, elo_2):
    return 1 / (1 + 10 ** ((elo_2 - elo_1) / 400))

# Function to update Elo ratings based on match outcomes with a constant K-factor
def winner_update_elo(current_elo, expected_prob, actual_result, k_factor):
    return current_elo + k_factor * (actual_result - expected_prob)

def loser_update_elo(current_elo, expected_prob, actual_result, k_factor):
    return current_elo + k_factor * (actual_result - (1 - expected_prob))

# Updating the ELO ratings for training set

In [7]:
# Update Elo ratings using the constant K-factor model
for index, row in matches_train.iterrows():
    winner_name = row["Winner"]
    loser_name = row["Loser"]

    # Ensure Elo ratings are initialized correctly to avoid KeyErrors
    if winner_name not in elo_ratings:
        elo_ratings[winner_name] = initial_elo
    if loser_name not in elo_ratings:
        elo_ratings[loser_name] = initial_elo

    winner_elo = elo_ratings[winner_name]
    loser_elo = elo_ratings[loser_name]

    expected_win_prob = calculate_EP(winner_elo, loser_elo)

    elo_ratings[winner_name] = winner_update_elo(winner_elo, expected_win_prob, 1, k_factor)
    elo_ratings[loser_name] = loser_update_elo(loser_elo, expected_win_prob, 0, k_factor)


# Updating ELO ratings for testing set

In [8]:
# Ensure all players in the test set are also included in the dictionaries
for index, row in matches_test.iterrows():
    winner_name = row["Winner"]
    loser_name = row["Loser"]

    if winner_name not in elo_ratings:
        elo_ratings[winner_name] = initial_elo

    if loser_name not in elo_ratings:
        elo_ratings[loser_name] = initial_elo

# Calculate expected probabilities for the testing set based on the constant K-factor model
expected_prob_test, expected_outcome_test = [], []

for index, row in matches_test.iterrows():
    winner_name = row["Winner"]
    loser_name = row["Loser"]

    winner_elo = elo_ratings[winner_name]
    loser_elo = elo_ratings[loser_name]

    # Calculate expected probability of the winner defeating the loser
    expected_win_prob = calculate_EP(winner_elo, loser_elo)

    elo_ratings[winner_name] = winner_update_elo(winner_elo, expected_win_prob, 1, k_factor)
    elo_ratings[loser_name] = loser_update_elo(loser_elo, expected_win_prob, 0, k_factor)

    if row["higher_rank_won"] == 1:
        expected_prob_test.append(expected_win_prob)
        expected_outcome_test.append(int(expected_win_prob > 0.5))
    else:
        expected_prob_test.append(1 - expected_win_prob)
        expected_outcome_test.append(int((1 - expected_win_prob) > 0.5))


# Model Evaluation

In [9]:
# Prepare the actual results for the test set
y_test = matches_test["higher_rank_won"].apply(lambda x: 1 if x else 0)  # Ensuring binary labels

# Evaluate the ELO model
elo_accuracy = accuracy_score(y_test, expected_outcome_test)
elo_log_loss = log_loss(y_test, expected_prob_test)
elo_calibration = np.sum(expected_prob_test) / np.sum(y_test)

# Print results
print(f"Elo Model Accuracy: {elo_accuracy}")
print(f"Elo Model Log Loss: {elo_log_loss}")
print(f"Elo Model Calibration: {elo_calibration}")

# Add ELO model results to validation stats
validation_stats = pd.DataFrame({
    "model": ["elo_constant_k"],
    "accuracy": [elo_accuracy],
    "calibration": [elo_calibration],
    "log_loss": [elo_log_loss]
})

print(validation_stats)


Elo Model Accuracy: 0.6312524234199302
Elo Model Log Loss: 0.6336791627253523
Elo Model Calibration: 1.0338132755797784
            model  accuracy  calibration  log_loss
0  elo_constant_k  0.631252     1.033813  0.633679


# Optimizing the Model using different K values

In [10]:
# Defining the initial Elo rating for all players
initial_elo = 1500

# Creating a set of unique player names from 'Winner' and 'Loser'
unique_player_names = set(matches_train["Winner"]).union(set(matches_train["Loser"]))

# Initializing the Elo ratings dictionary with the initial Elo score
elo_ratings = {player_name: initial_elo for player_name in unique_player_names}

In [11]:
# Function to evaluate the ELO model with different K values

def evaluate_elo_model(k_factor):

    # Initialize Elo ratings again to start fresh for each K-factor

    elo_ratings = {player_name: initial_elo for player_name in unique_player_names}

    # Update Elo ratings using the constant K-factor model
    for index, row in matches_train.iterrows():
        winner_name = row["Winner"]
        loser_name = row["Loser"]

        if winner_name not in elo_ratings:
            elo_ratings[winner_name] = initial_elo
        if loser_name not in elo_ratings:
            elo_ratings[loser_name] = initial_elo

        winner_elo = elo_ratings[winner_name]
        loser_elo = elo_ratings[loser_name]

        expected_win_prob = calculate_EP(winner_elo, loser_elo)

        elo_ratings[winner_name] = winner_update_elo(winner_elo, expected_win_prob, 1, k_factor)
        elo_ratings[loser_name] = loser_update_elo(loser_elo, expected_win_prob, 0, k_factor)


    for index, row in matches_test.iterrows():
        winner_name = row["Winner"]
        loser_name = row["Loser"]

        if winner_name not in elo_ratings:
            elo_ratings[winner_name] = initial_elo

        if loser_name not in elo_ratings:
            elo_ratings[loser_name] = initial_elo

    # Calculating expected probabilities for the testing set based on the constant K-factor model
    expected_prob_test, expected_outcome_test = [], []

    for index, row in matches_test.iterrows():
        winner_name = row["Winner"]
        loser_name = row["Loser"]

        winner_elo = elo_ratings[winner_name]
        loser_elo = elo_ratings[loser_name]

        expected_win_prob = calculate_EP(winner_elo, loser_elo)

        elo_ratings[winner_name] = winner_update_elo(winner_elo, expected_win_prob, 1, k_factor)
        elo_ratings[loser_name] = loser_update_elo(loser_elo, expected_win_prob, 0, k_factor)

        if row["higher_rank_won"] == 1:
            expected_prob_test.append(expected_win_prob)
            expected_outcome_test.append(int(expected_win_prob > 0.5))
        else:
            expected_prob_test.append(1 - expected_win_prob)
            expected_outcome_test.append(int((1 - expected_win_prob) > 0.5))

    y_test = matches_test["higher_rank_won"].apply(lambda x: 1 if x else 0)

    # Evaluating the ELO model
    elo_accuracy = accuracy_score(y_test, expected_outcome_test)
    elo_log_loss = log_loss(y_test, expected_prob_test)
    elo_calibration = np.sum(expected_prob_test) / np.sum(y_test)

    return elo_accuracy, elo_log_loss, elo_calibration


In [13]:
# Range of K values
k_values = [10, 15, 20, 25, 30, 35, 40]

# Empty list to Store results
results = []

for k in k_values:
    accuracy, log_loss_value, calibration = evaluate_elo_model(k)
    results.append({
        "k_factor": k,
        "accuracy": accuracy,
        "log_loss": log_loss_value,
        "calibration": calibration
    })

# Converting results to a DataFrame
results_df = pd.DataFrame(results)

# Finding the best K-factor based on the desired metric
best_result = results_df.loc[results_df["log_loss"].idxmax()]

print("Best K-factor based on log_loss:")
print(best_result)

Best K-factor based on log_loss:
k_factor       40.000000
accuracy        0.632416
log_loss        0.638343
calibration     1.050368
Name: 6, dtype: float64


In [14]:
best_k_factor = best_result["k_factor"]
best_accuracy, best_log_loss, best_calibration = evaluate_elo_model(best_k_factor)

# Print final model evaluation with the best K-factor
print(f"Best K-factor: {best_k_factor}")
print(f"Final Elo Model Accuracy: {best_accuracy}")
print(f"Final Elo Model Log Loss: {best_log_loss}")
print(f"Final Elo Model Calibration: {best_calibration}")

Best K-factor: 40.0
Final Elo Model Accuracy: 0.6324156649864289
Final Elo Model Log Loss: 0.6383433765724233
Final Elo Model Calibration: 1.050368017361009
