# Loading Libraries

In [8]:
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
import matplotlib.pyplot as plt

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Data Loading and Cleaning

In [9]:
# List of file paths for the data from 2005 to 2019

file_paths = [

    '/content/drive/MyDrive/Betting_data/2005.xls',
    '/content/drive/MyDrive/Betting_data/2006.xls',
    '/content/drive/MyDrive/Betting_data/2007.xls',
    '/content/drive/MyDrive/Betting_data/2008.xls',
    '/content/drive/MyDrive/Betting_data/2009.xls',
    '/content/drive/MyDrive/Betting_data/2010.xls',
    '/content/drive/MyDrive/Betting_data/2011.xls',
    '/content/drive/MyDrive/Betting_data/2012.xls',
    '/content/drive/MyDrive/Betting_data/2013.xlsx',
    '/content/drive/MyDrive/Betting_data/2014.xlsx',
    '/content/drive/MyDrive/Betting_data/2015.xlsx',
    '/content/drive/MyDrive/Betting_data/2016.xlsx',
    '/content/drive/MyDrive/Betting_data/2017.xlsx',
    '/content/drive/MyDrive/Betting_data/2018.xlsx',
    '/content/drive/MyDrive/Betting_data/2019.xlsx'
]

# Initializing an empty list to hold DataFrames
data_frames = []

# Loading data into a DataFrame
for file_path in file_paths:
    if os.path.exists(file_path):
        df = pd.read_excel(file_path)
        data_frames.append(df)
    else:
        print(f"File {file_path} not found.")

# Combining all the DataFrames into a single DataFrame
betting_df = pd.concat(data_frames, ignore_index=True)


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


In [10]:
# Converting 'Date' into a datetime object
betting_df["Date"] = pd.to_datetime(betting_df["Date"], errors='coerce')

# Converting categorical columns to 'category' data type
categorical_columns = ["Tournament", "Surface"]
betting_df[categorical_columns] = betting_df[categorical_columns].astype("category")

# Handling missing values in 'WRank' and 'LRank'
betting_df["WRank"] = betting_df["WRank"].fillna(100000)
betting_df["LRank"] = betting_df["LRank"].fillna(100000)

# Handling missing values in 'WPts' and 'LPts' by imputing with the median
betting_df["WPts"] = betting_df["WPts"].fillna(betting_df["WPts"].median())
betting_df["LPts"] = betting_df["LPts"].fillna(betting_df["LPts"].median())

# Creating a higher-ranked player column
betting_df["higher_rank_won"] = betting_df["WRank"] < betting_df["LRank"]

# Select only the odds columns
odds_columns = [
    'B365W', 'B365L', 'PSW', 'PSL']
#betting_df = betting_df[odds_columns]

## Checking for missing values

In [11]:


# Calculate the number of missing values in each odds column
missing_values = betting_df[odds_columns].isna().sum()

# Print the number of missing values for each odds column
print(missing_values)


B365W     547
B365L     524
PSW      3150
PSL      3150
dtype: int64


## Replacing the missing values with mean

In [12]:
# Fill missing values in odds columns with the mean of each respective column
betting_df[odds_columns] = betting_df[odds_columns].fillna(betting_df[odds_columns].mean())

# Verify if there are any remaining missing values
missing_values_after = betting_df[odds_columns].isna().sum()
print(missing_values_after)

B365W    0
B365L    0
PSW      0
PSL      0
dtype: int64


## Data Splitting

In [14]:
# Split the data into training and testing sets based on the date
split_date = pd.to_datetime("2019-01-01")
matches_train = betting_df[betting_df["Date"] < split_date]
matches_test = betting_df[betting_df["Date"] >= split_date]

In [15]:
matches_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37797 entries, 0 to 37842
Data columns (total 49 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   ATP              37797 non-null  int64         
 1   Location         37797 non-null  object        
 2   Tournament       37797 non-null  category      
 3   Date             37797 non-null  datetime64[ns]
 4   Series           37797 non-null  object        
 5   Court            37797 non-null  object        
 6   Surface          37797 non-null  category      
 7   Round            37797 non-null  object        
 8   Best of          37797 non-null  int64         
 9   Winner           37797 non-null  object        
 10  Loser            37797 non-null  object        
 11  WRank            37797 non-null  float64       
 12  LRank            37797 non-null  float64       
 13  WPts             37797 non-null  float64       
 14  LPts             37797 non-null  float64   

In [16]:
# Functions involved in BCM model
def implied_probabilities(alpha, beta):
    """
    Calculate implied probabilities from odds.
    """
    p1 = beta / (alpha + beta)
    p2 = alpha / (alpha + beta)
    return p1, p2

def logit(p):
    """
    Compute the logit (log-odds) of a probability.
    """
    return np.log(p / (1 - p))

def inv_logit(y):
    """
    Compute the inverse logit to get the probability.
    """
    return np.exp(y) / (1 + np.exp(y))

# Extracting odds columns for players 1 and 2 from multiple companies
company_odds_dict = {
    'B365': ['B365W', 'B365L'],
    'PS': ['PSW', 'PSL']
}

# Function to calculate BCM probabilities
def calculate_bcm_probabilities(matches):
    probs = []
    for company, (winner_odds, loser_odds) in company_odds_dict.items():
        if winner_odds in matches.columns and loser_odds in matches.columns:
            alpha = matches[winner_odds]
            beta = matches[loser_odds]
            p1, p2 = implied_probabilities(alpha, beta)
            probs.append(p1)

    # Calculate the logit of the average probabilities
    logit_probs = np.mean([logit(p) for p in probs], axis=0)
    consensus_probs = inv_logit(logit_probs)

    return consensus_probs

# Calculating BCM probabilities for the training and testing datasets
train_bcm_probs = calculate_bcm_probabilities(matches_train)
test_bcm_probs = calculate_bcm_probabilities(matches_test)


# Evaluating the BCM model on the testing set
y_test = matches_test['higher_rank_won']

# Calculating accuracy
bcm_accuracy = accuracy_score(y_test, np.round(test_bcm_probs))

# Calculating log loss
bcm_log_loss = log_loss(y_test, test_bcm_probs)

# CalcuCalculatinglate calibration
bcm_calibration = np.sum(test_bcm_probs) / np.sum(y_test)

print(f"BCM Model Accuracy: {bcm_accuracy}")
print(f"BCM Model Log Loss: {bcm_log_loss}")
print(f"BCM Model Calibration: {bcm_calibration}")

# Adding BCM model results to validation stats
validation_stats = pd.DataFrame({
    "model": ["bcm"],
    "accuracy": [bcm_accuracy],
    "calibration": [bcm_calibration],
    "log_loss": [bcm_log_loss]
})

print(validation_stats)

BCM Model Accuracy: 0.786733513305052
BCM Model Log Loss: 0.5032026995207048
BCM Model Calibration: 0.9562794838292791
  model  accuracy  calibration  log_loss
0   bcm  0.786734     0.956279  0.503203
