In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
pd.set_option('display.max_columns', None)

# Historical Odds Import and Conversion

In [2]:
# Creating a formula for odds conversion 
def odds_conversion(x):
    """Convert a single American-format odds value into an implied probability.

    Negative odds are mapped to ``-x / (-x + 100)``; zero and positive odds
    are mapped to ``100 / (x + 100)``.
    """
    # Guard clause: anything that is not strictly negative takes the
    # "100 over (odds + 100)" branch.
    if not x < 0:
        return (100 / (x + 100))

    magnitude = -x
    return magnitude / (magnitude + 100)

In [3]:
# Creating a list of all the race file names. This will be used to loop over for importing the data into dataframes

import os 

path = '../Raw Data/Odds Data/Historical Odds/2022'

# One entry per '.csv' file found in `path`, with the 4-character
# extension sliced off so that only the race name remains.
table_list = [
    entry[:-4]
    for entry in os.listdir(path)
    if entry.endswith('.csv')
]

print(table_list)
   

['Singapore', 'UnitedStates', 'Canadian', 'Brazilian', 'Miami', 'Azerbaijan', 'British', 'Spanish', 'Australian', 'Hungarian', 'Bahrain', 'Italian', 'Dutch', 'Japanese', 'SaudiArabian', 'Austrian', 'Monaco', 'AbuDhabi', 'Belgian', 'MexicoCity', 'EmiliaRomagna', 'French']


In [4]:
# Import CSV loop

# Load each race's odds file ('|'-delimited, header on the first row) into a
# dictionary of dataframes keyed by the race name taken from table_list.
odds_df_dict = {
    race: pd.read_csv(
        '../Raw Data/Odds Data/Historical Odds/2022/'+race+'.csv',
        header = 0,
        sep = '|',
    )
    for race in table_list
}
    


In [5]:
# Converting the odds from American odds format to implied probabilities
# There is also some data cleaning for driver names in this loop

# Columns holding American-format odds that are converted to implied probabilities.
ODDS_COLUMNS = [
    'Odds to Win',
    'Odds to Finish Top Three',
    'Odds to Finish Top Six',
    'Odds to Finish Top Ten',
]

# Driver-name spellings found in the odds files -> the spelling used everywhere else.
# Applied in this order.
DRIVER_NAME_FIXES = {
    'Alex Albon': 'Alexander Albon',
    'Carlos Sainz Jr.': 'Carlos Sainz',
    'Guanyu Zhou': 'Zhou Guanyu',
    'Nick Latifi': 'Nicholas Latifi',
}

# WARNING: this cell overwrites the odds columns in place, so it must be run
# exactly once per load of odds_df_dict. Running it a second time would feed
# the already-converted probabilities back through odds_conversion.
for race in odds_df_dict:
    race_odds = odds_df_dict[race]  # same object as the dict entry; edits are in place

    for column in ODDS_COLUMNS:
        race_odds[column] = race_odds[column].apply(odds_conversion)

    # Data cleaning - making sure the driver name is consistent across files.
    # regex=False: these are literal names, and the '.' in 'Carlos Sainz Jr.'
    # must match a real full stop rather than acting as a regex wildcard.
    for variant, canonical in DRIVER_NAME_FIXES.items():
        race_odds['Driver'] = race_odds['Driver'].str.replace(variant, canonical, regex=False)

# Importing and Formatting Race Information

In [6]:
# Importing race results, the race information, and driver information

# NOTE: the variable names are the reverse of the file names. `results` is
# loaded from races.csv (it carries the 'year' column filtered below) and
# `races` is loaded from results.csv (it carries the 'driverId', 'raceId' and
# 'position' columns used by the backtesting loop). The names are kept as they
# are because other cells reference them.
results = pd.read_csv(
    '../Raw Data/Historical Race Data/1950_to_2022_CSVs/races.csv',
    header = 0,
    sep = ',',
)
races = pd.read_csv(
    '../Raw Data/Historical Race Data/1950_to_2022_CSVs/results.csv',
    header = 0,
    sep = ',',
)
drivers = pd.read_csv(
    '../Raw Data/Historical Race Data/1950_to_2022_CSVs/drivers.csv',
    header = 0,
    sep = ',',
)

# Keep only the rows of `results` whose 'year' is 2022.
is_2022 = results['year'] == 2022
results_2022 = results.loc[is_2022]
#results_2022.head(22)

In [7]:
# Lookup from race file name (the keys of odds_df_dict) to the raceId value
# used to select that race's rows in the historical results data.

raceId_dict = {
    'Singapore': 1091,
    'UnitedStates': 1093,
    'Canadian': 1082,
    'Brazilian': 1095,
    'Miami': 1078,
    'Azerbaijan': 1081,
    'British': 1083,
    'Spanish': 1079,
    'Australian': 1076,
    'Hungarian': 1086,
    'Bahrain': 1074,
    'Italian': 1089,
    'Dutch': 1088,
    'Japanese': 1092,
    'SaudiArabian': 1075,
    'Austrian': 1084,
    'Monaco': 1080,
    'AbuDhabi': 1096,
    'Belgian': 1087,
    'MexicoCity': 1094,
    'EmiliaRomagna': 1077,
    'French': 1085,
}

In [8]:
# Creating a dictionary for the driver Ids and racer names

# Full driver name built as '<forename> <surname>'.
drivers['combined name'] = drivers['forename'].str.cat(drivers['surname'], sep=' ')
# NOTE(review): this preview is not the last expression of the cell, so it is
# presumably not rendered - confirm whether it is still wanted.
drivers.head()

#drivers.loc[drivers['combined name'].isin(odds_df_dict['Australian']['Driver'])] 

# Lookup from driver name (as it appears in the cleaned 'Driver' column of the
# odds data) to the driverId value used in the historical results data.
driverId_dict = {
    'Lewis Hamilton': 1,
    'Fernando Alonso': 4,
    'Sebastian Vettel': 20,
    'Pierre Gasly': 842,
    'Daniel Ricciardo': 817,
    'Valtteri Bottas': 822,
    'Kevin Magnussen': 825,
    'Max Verstappen': 830,
    'Carlos Sainz': 832,
    'Esteban Ocon': 839,
    'Lance Stroll': 840,
    'Charles Leclerc': 844,
    'Lando Norris': 846,
    'George Russell': 847,
    'Nicholas Latifi': 849,
    'Yuki Tsunoda': 852,
    'Mick Schumacher': 854,
    'Zhou Guanyu': 855,
    'Alexander Albon': 848,
    'Sergio Perez': 815,
    'Nico Hulkenberg': 807,
}

# Backtesting - dummy probability predictions for each race

In [9]:
# Creating the dummy probability CSVs programmatically

# Load the even-weighting template once.
EvenWeighting = pd.read_csv(
    '../Processed Data/Dummy Probability Outputs/EvenWeighting.csv',
    header = 0,
    sep = ',',
)

# Write an unmodified copy of the template for every race key in raceId_dict.
for race in raceId_dict.keys():
    race_copy_path = '../Processed Data/Dummy Probability Outputs/Even Weighting All Races/'+race+'_EvenWeighting.csv'
    EvenWeighting.to_csv(race_copy_path, index=False)

In [10]:
# Reading the dummy probabilities into a dictionary of dataframes

# One dataframe of per-race dummy probabilities for each race key in
# raceId_dict, read back from the per-race CSV files.
predictions_df_dict = {
    race: pd.read_csv(
        '../Processed Data/Dummy Probability Outputs/Even Weighting All Races/'+race+'_EvenWeighting.csv',
        header = 0,
        sep = ',',
    )
    for race in raceId_dict
}

In [11]:
# Creating a dictionary of converted predictions 
# Transforming the even weighting dummy file so that it contains odds for 'Odds to Win', 'Odds to Finish Top Three', 
# 'Odds to Finish Top Six', and 'Odds to Finish Top Ten'

converted_predictions_df_dict = {}

for race in raceId_dict:
    position_probs = predictions_df_dict[race]

    # Running total of the per-position columns '1' .. '10', accumulated left
    # to right. cumulative[k] is the sum of columns '1' through str(k).
    running_total = position_probs['1']
    cumulative = {1: running_total}
    for place in range(2, 11):
        running_total = running_total + position_probs[str(place)]
        cumulative[place] = running_total

    # Assemble the four probabilities the backtest compares against the odds.
    converted_predictions_df_dict[race] = pd.DataFrame({
        'Driver': position_probs['Driver'],
        'Probability of Winning': cumulative[1],
        'Probability of Finishing Top Three': cumulative[3],
        'Probability of Finishing Top Six': cumulative[6],
        'Probability of Finishing Top Ten': cumulative[10],
    })

In [12]:
# Creating an empty dataframe for the backtesting log

# Column layout of the backtesting log: one row per bet placed.
backtesting_log_columns = [
    'Race',
    'Driver',
    'Bet placed',
    'Driver race outcome',
    'Implied probability',
    'Estimated probability',
    'Expected value',
    'Bet outcome',
    'Units won',
    'Net units won',
]

BacktestingLog = pd.DataFrame(columns=backtesting_log_columns)


In [13]:
# Creating a double loop over the race implied probability dataframes
# This will perform the backtesting and log the results into a new dataframe

# Backtest: for every race, driver and betting market, place a 1-unit bet
# whenever the estimated probability is higher than the bookmaker's implied
# probability, then settle the bet against the driver's recorded position.
#
# The log is rebuilt from scratch each time this cell runs, so re-running the
# cell does not append duplicate rows to an earlier log.

# (odds column / bet name, prediction column, number of finishing positions that win the bet)
BET_MARKETS = [
    ('Odds to Win', 'Probability of Winning', 1),
    ('Odds to Finish Top Three', 'Probability of Finishing Top Three', 3),
    ('Odds to Finish Top Six', 'Probability of Finishing Top Six', 6),
    ('Odds to Finish Top Ten', 'Probability of Finishing Top Ten', 10),
]

# NOTE: These (race, driver) pairs cover two situations where a driver was
# subbed out last minute for another driver. Because this is a rare scenario,
# they are handled manually rather than with something dynamic.
SUBSTITUTED_DRIVERS = {
    ('Italian', 'Alexander Albon'),
    ('SaudiArabian', 'Sebastian Vettel'),
}

# Collect one dict per bet and build the dataframe once at the end. Growing a
# dataframe with pd.concat inside the loop re-copies the whole log on every bet.
backtest_records = []

for race in odds_df_dict:
    race_odds = odds_df_dict[race]
    race_predictions = converted_predictions_df_dict[race]

    for driver in race_odds['Driver']:

        if (race, driver) in SUBSTITUTED_DRIVERS:
            continue

        # Row selectors for this driver; the first matching row is used below.
        odds_rows = race_odds['Driver'] == driver
        prediction_rows = race_predictions['Driver'] == driver

        for odds_column, prediction_column, winning_places in BET_MARKETS:
            implied = race_odds.loc[odds_rows, odds_column].iloc[0]
            estimated = race_predictions.loc[prediction_rows, prediction_column].iloc[0]

            # Only bet when the estimate beats the implied probability. Written
            # as `not (a > b)` so a NaN on either side also skips the bet.
            if not estimated > implied:
                continue

            # `races` is the dataframe loaded from results.csv: one row per
            # driver per race, with the finishing position stored as text.
            finishing_position = races.loc[
                (races['driverId'] == driverId_dict[driver])
                & (races['raceId'] == raceId_dict[race]),
                'position',
            ].iloc[0]

            # Positions are compared as strings ('1', '2', ...); any other
            # value in the column counts as a losing bet.
            winning_positions = [str(place) for place in range(1, winning_places + 1)]
            bet_won = finishing_position in winning_positions

            # A winning 1-unit bet returns 1 / implied probability; a losing bet returns 0.
            units_won = 1 / implied if bet_won else 0

            backtest_records.append({
                'Race': race,
                'Driver': driver,
                'Bet placed': odds_column,
                'Driver race outcome': finishing_position,
                'Implied probability': implied,
                'Estimated probability': estimated,
                'Expected value': (estimated / implied) - 1,
                'Bet outcome': 1 if bet_won else 0,
                'Units won': units_won,
                'Net units won': units_won - 1,  # subtract the 1-unit stake
            })

# Explicit column list keeps the layout stable even when no bets were placed.
BacktestingLog = pd.DataFrame(backtest_records, columns=[
    'Race',
    'Driver',
    'Bet placed',
    'Driver race outcome',
    'Implied probability',
    'Estimated probability',
    'Expected value',
    'Bet outcome',
    'Units won',
    'Net units won',
])

## Results summary 

In [14]:
# Headline figures of the backtest, read from the three relevant log columns.
bet_outcomes = BacktestingLog['Bet outcome']
net_units = BacktestingLog['Net units won']
expected_values = BacktestingLog['Expected value']

print('Bets placed: ', bet_outcomes.count(), sep='')
print('Bets won: ', bet_outcomes.sum(), sep='')
print('Net units won: ', net_units.sum(), sep='')
# ROI = net units won per bet placed, expressed as a percentage.
print('ROI %: ', (net_units.sum() / bet_outcomes.count()) * 100, sep='')
print('\n')
print('Average expected value: ', expected_values.mean(), sep='')
print('Min expected value: ', expected_values.min(), sep='')
print('Median expected value: ', expected_values.median(), sep='')
print('Max expected value: ', expected_values.max(), sep='')


Bets placed: 1752
Bets won: 439
Net units won: -761.0553841720804
ROI %: -43.43923425639728


Average expected value: 7.84496609555216
Min expected value: -0.9344115004479245
Median expected value: 0.8571428571799999
Max expected value: 427.61904762762


In [15]:
# Persist the backtesting log as a CSV file, leaving out the dataframe index.
BacktestingLog.to_csv(
    '../Processed Data/Backtesting Results/EvenWeighting_BackTestingLog.csv',
    index=False,
)