# Loading Libraries

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
import matplotlib.pyplot as plt

In [2]:
# List of file paths for the data from 2005 to 2019

file_paths = [

    '/content/drive/MyDrive/Betting_data/2005.xls',
    '/content/drive/MyDrive/Betting_data/2006.xls',
    '/content/drive/MyDrive/Betting_data/2007.xls',
    '/content/drive/MyDrive/Betting_data/2008.xls',
    '/content/drive/MyDrive/Betting_data/2009.xls',
    '/content/drive/MyDrive/Betting_data/2010.xls',
    '/content/drive/MyDrive/Betting_data/2011.xls',
    '/content/drive/MyDrive/Betting_data/2012.xls',
    '/content/drive/MyDrive/Betting_data/2013.xlsx',
    '/content/drive/MyDrive/Betting_data/2014.xlsx',
    '/content/drive/MyDrive/Betting_data/2015.xlsx',
    '/content/drive/MyDrive/Betting_data/2016.xlsx',
    '/content/drive/MyDrive/Betting_data/2017.xlsx',
    '/content/drive/MyDrive/Betting_data/2018.xlsx',
    '/content/drive/MyDrive/Betting_data/2019.xlsx'
]

# Initializing an empty list to hold DataFrames
data_frames = []

# Loading data into a DataFrame
for file_path in file_paths:
    if os.path.exists(file_path):
        df = pd.read_excel(file_path)
        data_frames.append(df)
    else:
        print(f"File {file_path} not found.")

# Combining all the DataFrames into a single DataFrame
betting_df = pd.concat(data_frames, ignore_index=True)


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


In [3]:
# Selecting Columns and Data Cleaning

# Converting 'Date' into a datetime object
betting_df["Date"] = pd.to_datetime(betting_df["Date"], errors='coerce')

# Selecting only the relevant columns
columns = [
    "Date",
    "Tournament",
    "Surface",
    "Winner",
    "Loser",
    "WRank",
    "WPts",
    "LRank",
    "LPts",
    "B365W",
    "B365L",
    "PSW",
    "PSL"
]
betting_df = betting_df[columns]

# Converting categorical columns to 'category' data type
categorical_columns = ["Tournament", "Surface"]
betting_df[categorical_columns] = betting_df[categorical_columns].astype("category")

# Handling missing values in 'WRank' and 'LRank'
betting_df["WRank"] = betting_df["WRank"].fillna(100000)
betting_df["LRank"] = betting_df["LRank"].fillna(100000)

# Handling missing values in 'WPts' and 'LPts' by imputing with the median
betting_df["WPts"] = betting_df["WPts"].fillna(betting_df["WPts"].median())
betting_df["LPts"] = betting_df["LPts"].fillna(betting_df["LPts"].median())

# Removing remaining NaN values
betting_df.dropna(inplace=True)

# Creating a higher-ranked player column
betting_df["higher_rank_won"] = betting_df["WRank"] < betting_df["LRank"]

# Calculating the difference in ranking points between the higher and lower-ranked players
betting_df["diff"] = (
    betting_df["WPts"] * betting_df["higher_rank_won"] +
    betting_df["LPts"] * (~betting_df["higher_rank_won"])
) - (
    betting_df["LPts"] * betting_df["higher_rank_won"] +
    betting_df["WPts"] * (~betting_df["higher_rank_won"])
)

# Print the last few rows of the DataFrame to verify the output
print(betting_df.tail())


            Date   Tournament Surface         Winner          Loser  WRank  \
40385 2019-11-15  Masters Cup    Hard      Nadal R.   Tsitsipas S.     1.0   
40386 2019-11-15  Masters Cup    Hard     Zverev A.    Medvedev D.     7.0   
40387 2019-11-16  Masters Cup    Hard  Tsitsipas S.     Federer R.     6.0   
40388 2019-11-16  Masters Cup    Hard      Thiem D.      Zverev A.     5.0   
40389 2019-11-17  Masters Cup    Hard  Tsitsipas S.       Thiem D.     6.0   

         WPts  LRank    LPts  B365W  B365L   PSW   PSL  higher_rank_won  \
40385  9585.0    6.0  4000.0   1.44   2.75  1.39  3.26             True   
40386  2945.0    4.0  5705.0   1.90   1.90  2.14  1.79            False   
40387  4000.0    3.0  6190.0   3.50   1.30  3.75  1.33            False   
40388  5025.0    7.0  2945.0   1.80   2.00  1.84  2.10             True   
40389  4000.0    5.0  5025.0   2.00   1.80  2.00  1.93            False   

         diff  
40385  5585.0  
40386  2760.0  
40387  2190.0  
40388  2080.0  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  betting_df[categorical_columns] = betting_df[categorical_columns].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  betting_df["WRank"] = betting_df["WRank"].fillna(100000)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  betting_df["LRank"] = betting_df["LRank"].fillna(100000)
A valu

In [4]:
# Assuming 'betting_df' is the cleaned DataFrame from your previous step
# Ensure 'higher_rank_won' is correctly set up as a binary column
betting_df["higher_rank_won"] = betting_df["WRank"] < betting_df["LRank"]

n = len(betting_df)  # Total number of rows in the dataset

# Calculate naive accuracy
naive_accuracy = betting_df["higher_rank_won"].mean()  # The probability that the higher-ranked player will win

# Log-Loss for naive model
w = betting_df["higher_rank_won"]  # Outcomes of matches (whether the higher-ranked player won)
pi_naive = naive_accuracy  # Constant probability for the naive model

# Calculate log-loss for the naive model
log_loss_naive = -1 / n * sum(w * np.log(pi_naive) + (1 - w) * np.log(1 - pi_naive))

# Calibration for naive model
calibration_naive = naive_accuracy * n / sum(w)  # Expected to be close to 1 if the model is well-calibrated

# Creating a DataFrame to store validation stats
validation_stats = pd.DataFrame({
    "model": ["naive"],
    "accuracy": [naive_accuracy],
    "calibration": [calibration_naive],
    "log_loss": [log_loss_naive]
})

print(validation_stats)  # Return validation metrics for the naive model


   model  accuracy  calibration  log_loss
0  naive  0.658661          1.0  0.641919
