# Economic experiment testing accuracy- vs. profitability-optimised models on artificial data

## Import

In [None]:
import os
import random
import numpy as np

# Seeded NumPy random generator. The same `rng` object is handed to the rating errors,
# the network factory and the rating functions further down, so a run with a fixed SEED
# is intended to be reproducible.
SEED = 10
rng = np.random.default_rng(SEED)

from dfg_rating.model import factory
from dfg_rating.model.betting.betting import FixedBetting
from dfg_rating.model.betting.betting import KellyBetting
from dfg_rating.model.bookmaker.base_bookmaker import BaseBookmaker
from dfg_rating.model.evaluators.accuracy import RankProbabilityScore, Likelihood, ProbabilityDifference, ProbabilityPointer, FavouriteProbability
from dfg_rating.model.evaluators.profitability import BettingReturnsEvaluator
from dfg_rating.model.evaluators.base_evaluators import BettingActivity
from dfg_rating.model.forecast.true_forecast import LogFunctionForecast
from dfg_rating.model.network.base_network import BaseNetwork
from dfg_rating.model.rating.controlled_trend_rating import ControlledTrendRating, ControlledRandomFunction
from dfg_rating.model.rating.base_rating import RatingFunctionError
from dfg_rating.logic.controller import Controller

from pathlib import Path

import pandas as pd
import time
import math
from tqdm import tqdm
import itertools as it

# Notebook display settings: no limit on the number of displayed columns and rows,
# no fixed display width, and floats printed with five decimals.
def _five_decimals(value):
    """Render a float with exactly five digits after the decimal point."""
    return '%.5f' % value


for _option, _setting in (
    ('display.max_columns', None),
    ('display.max_rows', None),
    ('display.width', None),
    ('display.float_format', _five_decimals),
):
    pd.set_option(_option, _setting)

## Specification of the experimental runs (particularly the bias used)

In [None]:
# Controller is the project helper used further down to load stored networks
# (load_network_from_sql) when config["create_data"] is False. According to the original
# note it also offers the functions to save networks.
main_controller = Controller()

In [None]:
# Bias 1: a home advantage bias implicitly contained in the bookmaker_forecast.
# Compared with the true forecast, both bookmaker coefficients are lowered by 0.3
# while beta is left unchanged.
experimentHA = {
    "test_name": "ExperimentHA",
    "create_data": True,
    # Bookmaker 1: no forecast error, no margin
    "bookmaker_error1": 0.0,
    "bookmaker_margin1": 0.00,
    # Bookmaker 2: no forecast error, margin of 0.10
    "bookmaker_error2": 0.0,
    "bookmaker_margin2": 0.10,
    "betting_bankrole": 100,
    # loc/scale of the rating error used for the bettor's forecasts ...
    "rating_error_loc": 0,
    "rating_error_scale": 50,
    # ... and of the rating error used for the bookmaker forecast
    "bookmaker_rating_error_loc": 0,
    "bookmaker_rating_error_scale": 25,
    "number_of_leagues": 40,
    "number_of_teams": 20,
    "number_of_seasons": 10,
    "true_forecast": {
        "coefficients": [-0.9, 0.3],
        "beta": 0.006,
    },
    "bookmaker_forecast": {
        "coefficients": [-1.2, 0.0],
        "beta": 0.006,
    },
    # Seasons up to and including this one are treated as in-sample
    "in_sample_maximum": 5,
}

In [None]:
# Bias 2: a draw bias implicitly contained in the bookmaker_forecast.
# Compared with the true forecast, the first bookmaker coefficient is raised by 0.1 and
# the second one lowered by 0.1, while beta is left unchanged.
experimentDraw = {
    "test_name": "ExperimentDraw",
    "create_data": True,
    # Bookmaker 1: no forecast error, no margin
    "bookmaker_error1": 0.0,
    "bookmaker_margin1": 0.00,
    # Bookmaker 2: no forecast error, margin of 0.10
    "bookmaker_error2": 0.0,
    "bookmaker_margin2": 0.10,
    "betting_bankrole": 100,
    # loc/scale of the rating error used for the bettor's forecasts ...
    "rating_error_loc": 0,
    "rating_error_scale": 50,
    # ... and of the rating error used for the bookmaker forecast
    "bookmaker_rating_error_loc": 0,
    "bookmaker_rating_error_scale": 25,
    "number_of_leagues": 40,
    "number_of_teams": 20,
    "number_of_seasons": 10,
    "true_forecast": {
        "coefficients": [-0.9, 0.3],
        "beta": 0.006,
    },
    "bookmaker_forecast": {
        "coefficients": [-0.8, 0.2],
        "beta": 0.006,
    },
    # Seasons up to and including this one are treated as in-sample
    "in_sample_maximum": 5,
}

In [None]:
# Bias 3: a favorite longshot bias implicitly contained in the bookmaker_forecast.
# Compared with the true forecast, both bookmaker coefficients are raised by 0.1 and
# beta is lowered from 0.006 to 0.004.
experimentFL = {
    "test_name": "ExperimentFL",
    "create_data": True,
    # Bookmaker 1: no forecast error, no margin
    "bookmaker_error1": 0.0,
    "bookmaker_margin1": 0.00,
    # Bookmaker 2: no forecast error, margin of 0.10
    "bookmaker_error2": 0.0,
    "bookmaker_margin2": 0.10,
    "betting_bankrole": 100,
    # loc/scale of the rating error used for the bettor's forecasts ...
    "rating_error_loc": 0,
    "rating_error_scale": 50,
    # ... and of the rating error used for the bookmaker forecast
    "bookmaker_rating_error_loc": 0,
    "bookmaker_rating_error_scale": 25,
    "number_of_leagues": 40,
    "number_of_teams": 20,
    "number_of_seasons": 10,
    "true_forecast": {
        "coefficients": [-0.9, 0.3],
        "beta": 0.006,
    },
    "bookmaker_forecast": {
        "coefficients": [-0.8, 0.4],
        "beta": 0.004,
    },
    # Seasons up to and including this one are treated as in-sample
    "in_sample_maximum": 5,
}

### !! Choice of the bias used in the current experimental run !!

In [None]:
# Select the experimental configuration for this run (experimentHA, experimentDraw or
# experimentFL). All later cells read their parameters from `config`.
config = experimentFL

### For all runs, different margins and two betting strategies are tested.

We create two bookmakers (differing only in their configured error and margin) and two betting strategies (fixed stakes and Kelly betting) that are going to interact with the generated networks.

In [None]:
# Two bookmakers are created; both are 'simple' bookmakers with a 'factor' forecast error
# and a 'simple' margin, and they differ only in the values taken from the configuration.
def _new_simple_bookmaker(error_value, margin_value) -> BaseBookmaker:
    """Build a 'simple' bookmaker from a forecast-error value and a margin value."""
    forecast_error = factory.new_forecast_error(
        error_type='factor',
        error=error_value,
        scope='positive',
    )
    bookmaker_margin = factory.new_bookmaker_margin('simple', margin=margin_value)
    return factory.new_bookmaker('simple', error=forecast_error, margin=bookmaker_margin)


bookmaker1: BaseBookmaker = _new_simple_bookmaker(
    config["bookmaker_error1"], config["bookmaker_margin1"]
)
bookmaker2: BaseBookmaker = _new_simple_bookmaker(
    config["bookmaker_error2"], config["bookmaker_margin2"]
)

# Two betting strategies sharing the configured bankroll: fixed stakes and Kelly betting
_bankroll = config["betting_bankrole"]
bettingFixed = FixedBetting(_bankroll)
bettingKelly = KellyBetting(_bankroll)

# Normally distributed rating errors for the bettor's forecasts and for the bookmaker
# forecast, both drawing from the shared random number generator
rating_error = RatingFunctionError(
    error='normal',
    random_number_generator=rng,
    loc=config["rating_error_loc"],
    scale=config["rating_error_scale"],
)
rating_error_bookmaker = RatingFunctionError(
    error='normal',
    random_number_generator=rng,
    loc=config["bookmaker_rating_error_loc"],
    scale=config["bookmaker_rating_error_scale"],
)

# Number of leagues (one network per league) to generate or load
numberLeagues = config["number_of_leagues"]

## Generation of artificial data (i.e. competition networks) with the simulation framework

In [None]:
# One network per league: either generated from scratch (schedule, true ratings, true
# forecast, biased bookmaker forecast and the odds of both bookmakers) or loaded from SQL.
def _normal_function(loc, scale):
    """Normally distributed ControlledRandomFunction drawing from the shared rng."""
    return ControlledRandomFunction(
        random_number_generator=rng, distribution='normal', loc=loc, scale=scale
    )


networks = []

gen_start_time = time.time()
for league in range(numberLeagues):
    if not config["create_data"]:
        # Re-use a previously stored network instead of simulating a new one
        print("Loading network data")
        main_controller.load_network_from_sql(
            network_name=f"{config['test_name']}_network_{league}",
            new_network_name=f"{config['test_name']}_network_{league}"
        )
        network: BaseNetwork = main_controller.networks[f"{config['test_name']}_network_{league}"]
    else:
        # Built in the same order as the keyword arguments of the original call so that
        # any use of the shared rng happens in the same sequence
        league_forecast = LogFunctionForecast(
            outcomes=['home', 'draw', 'away'],
            coefficients=config["true_forecast"]["coefficients"],
            beta_parameter=config["true_forecast"]["beta"],
        )
        league_rating = ControlledTrendRating(
            starting_point=_normal_function(loc=1000, scale=100),
            delta=_normal_function(loc=0, scale=3),
            trend=_normal_function(loc=0, scale=20/365),
            season_delta=_normal_function(loc=0, scale=10),
            random_number_generator=rng,
        )
        network: BaseNetwork = factory.new_network(
            'multiple-round-robin',
            random_number_generator=rng,
            teams=config["number_of_teams"],
            days_between_rounds=7,
            seasons=config["number_of_seasons"],
            league_teams=config["number_of_teams"],
            league_promotion=0,
            create=True,
            true_forecast=league_forecast,
            true_rating=league_rating,
        )

        # Biased bookmaker forecast, computed on the true ratings plus the bookmaker rating error
        biased_forecast = LogFunctionForecast(
            outcomes=['home', 'draw', 'away'],
            coefficients=config["bookmaker_forecast"]["coefficients"],
            beta_parameter=config["bookmaker_forecast"]["beta"],
            home_team_error=rating_error_bookmaker,
            away_team_error=rating_error_bookmaker,
        )
        network.add_forecast(
            forecast=biased_forecast,
            forecast_name='bookmaker_forecast',
            base_ranking='true_rating',
        )

        # Odds of both bookmakers are derived from the same biased forecast
        network.add_odds(bookmaker_name="bm1", bookmaker=bookmaker1, base_forecast='bookmaker_forecast')
        network.add_odds(bookmaker_name="bm2", bookmaker=bookmaker2, base_forecast='bookmaker_forecast')
    networks.append(network)
print(f"{numberLeagues} leagues added in {float(time.time() - gen_start_time)} seconds.")

### Some helper functions required in the analysis

In [None]:
# Aggregation of the relevant evaluation measures (likelihood, betting returns, betting activity)
def aggregate_measures(list_of_matches, prefix='', addBettingMetrics=True):
    """Sum the per-match metrics of ``list_of_matches`` into one flat dictionary.

    Every match is indexed as ``match[3]['metrics']``, a dictionary holding
    ``'likelihood'`` and, optionally, entries whose names start with
    ``'betting_returns'`` (a sequence of bets; element 0 of each bet is summed
    as the return and element 2 as the expected return) or with
    ``'betting_activity'`` (a dictionary whose ``'qty'`` entry is summed).

    Args:
        list_of_matches: Matches to aggregate.
        prefix: Text put in front of every key of the result, followed by ``_``.
        addBettingMetrics: When False only the likelihood is aggregated.

    Returns:
        Dictionary with ``<prefix>_likelihood`` and, if requested,
        ``<prefix>_<name>`` / ``<prefix>_<name>_expected`` for each betting-returns
        metric and ``<prefix>_<name>_activity`` for each betting-activity metric.
    """
    totals = {
        prefix + '_likelihood': sum(match[3]['metrics']['likelihood'] for match in list_of_matches)
    }
    if not addBettingMetrics:
        return totals

    for match in list_of_matches:
        metrics = match[3]['metrics']
        returns_names = [name for name in metrics if name.startswith('betting_returns')]
        activity_names = [name for name in metrics if name.startswith('betting_activity')]

        for name in returns_names:
            bets = metrics[name]
            observed_key = f"{prefix}_{name}"
            expected_key = f"{prefix}_{name}_expected"
            totals[observed_key] = sum(bet[0] for bet in bets) + totals.get(observed_key, 0.0)
            totals[expected_key] = sum(bet[2] for bet in bets) + totals.get(expected_key, 0.0)

        for name in activity_names:
            activity_key = f"{prefix}_{name}_activity"
            totals[activity_key] = metrics[name]['qty'] + totals.get(activity_key, 0.0)

    return totals

In [None]:
# Reference likelihoods of the true model and of the bookmaker model, aggregated over all
# matches as well as separately for the in-sample (IS) and out-of-sample (OOS) seasons.
# Drop the splits that are not needed if the IS/OOS distinction is not required.
entire_model_values = {}
# Use the configured split so that these reference values always refer to the same
# seasons as the IS/OOS results of the experiment loop (the thresholds used to be
# hard-coded as "< 6" / "> 5").
in_sample_maximum = config["in_sample_maximum"]
for forecast_pointer in ['true_forecast', 'bookmaker_forecast']:
    all_matches = []
    is_matches = []
    oos_matches = []
    for network in tqdm(networks):
        # Stores the likelihood of `forecast_pointer` under the 'likelihood' metric of each
        # match; aggregate_measures below reads exactly that metric.
        likelihood = Likelihood(outcomes=['home', 'draw', 'away'], forecast_name=forecast_pointer)
        network.add_evaluation([
            (likelihood, 'likelihood')
        ])
        # Traverse the games once and split the materialised list afterwards
        games = [
            (a, h, match_id, match_attributes)
            for a, h, match_id, match_attributes in network.iterate_over_games()
        ]
        all_matches += games
        is_matches += [game for game in games if game[3]['season'] <= in_sample_maximum]
        oos_matches += [game for game in games if game[3]['season'] > in_sample_maximum]

    # Aggregate per forecast before the next forecast overwrites the 'likelihood' metric
    result_all = aggregate_measures(all_matches, f'all_{forecast_pointer}', False)
    result_is = aggregate_measures(is_matches, f'is_{forecast_pointer}', False)
    result_oos = aggregate_measures(oos_matches, f'oos_{forecast_pointer}', False)
    entire_model_values = {
        **entire_model_values,
        **result_is, **result_oos, **result_all
    }

### Start of the experiment looping over all predefined model parameters

In [None]:
# Grid search over the parameters (c0, c1, beta) of the bettor's OLR forecast model.
# For every parameter combination the forecast is added to each network, bets are placed
# with both betting strategies against both bookmakers, and likelihood, betting returns
# and betting activity are aggregated over all / in-sample / out-of-sample matches.
result_list = []
experiment_start_time = time.time()

# Parameter grids (np.arange treats the stop value as exclusive).
# NOTE(review): the original lines carried reminders to "change back" the stop values to
# -0.6, 0.6 and 0.01, which are the values in use here.
c0_grid = np.arange(-1.20, -0.60, 0.1)
c1_grid = np.arange(0.0, 0.60, 0.1)
beta_grid = np.arange(0.002, 0.010, 0.002)

in_sample_maximum = config["in_sample_maximum"]

# it.product iterates in the same order as three nested loops (c0 outermost, beta innermost)
for c0, c1, beta in it.product(c0_grid, c1_grid, beta_grid):
    print(f'Variables: c0: {c0}, c1: {c1} and beta: {beta}')
    forecast_pointer = f"player_forecast_{c0:.2f}_{c1:.2f}_{beta:.3f}"
    cell_start_time = time.time()
    all_matches = []
    is_matches = []
    oos_matches = []

    bm_start_time = time.time()
    for network in networks:
        # Loaded networks are expected to contain the forecast already
        if config["create_data"]:
            network.add_forecast(
                forecast=LogFunctionForecast(
                    outcomes=['home', 'draw', 'away'],
                    coefficients=[c0, c1],
                    beta_parameter=beta,
                    home_team_error=rating_error,
                    away_team_error=rating_error,
                ),
                forecast_name=forecast_pointer,
                base_ranking='true_rating'
            )
        for betting, bettingName in [(bettingFixed, "Fixed"), (bettingKelly, "Kelly")]:
            for i in [1, 2]:
                # Pointer of each bookmaker
                bookmaker = 'bm' + str(i)
                bettor = 'b' + str(i)
                network.add_bets(
                    bettor_name=bettor,
                    bookmaker=bookmaker,
                    betting=betting,
                    base_forecast=forecast_pointer
                )
                # The bettor name is re-used by both strategies, so the bets are evaluated
                # right away and stored under a strategy-specific metric name
                betting_activity = BettingActivity(outcomes=['home', 'draw', 'away'], player_name=bettor)
                betting_returns = BettingReturnsEvaluator(
                    outcomes=['home', 'draw', 'away'],
                    player_name=bettor,
                    true_model='true_forecast',
                    bookmaker_name=bookmaker,
                )
                network.add_evaluation([
                    (betting_returns, f'betting_returns_{bettingName}_{bookmaker}'),
                    (betting_activity, f'betting_activity_{bettingName}_{bookmaker}')
                ])

        likelihood = Likelihood(outcomes=['home', 'draw', 'away'], forecast_name=forecast_pointer)
        network.add_evaluation(
            [
                (likelihood, 'likelihood')
            ]
        )

        # Traverse the games of the network once and split the materialised list afterwards
        games = [
            (a, h, match_id, match_attributes)
            for a, h, match_id, match_attributes in network.iterate_over_games()
        ]
        all_matches += games
        is_matches += [game for game in games if game[3]['season'] <= in_sample_maximum]
        oos_matches += [game for game in games if game[3]['season'] > in_sample_maximum]

    print(f"BMs finished in {float(time.time() - bm_start_time)} seconds")

    # Aggregate before the next parameter combination overwrites the per-match metrics
    am_start_time = time.time()
    result_all = aggregate_measures(all_matches, 'all')
    result_is = aggregate_measures(is_matches, 'is')
    result_oos = aggregate_measures(oos_matches, 'oos')
    # NOTE(review): 'bookmaker' holds whatever the innermost loop assigned last (always
    # 'bm2'), although the row contains the metrics of both bookmakers. The column is kept
    # unchanged so the layout of `df` stays the same; consider dropping it.
    result = {
        'bookmaker': bookmaker, 'c0': c0, 'c1': c1, 'beta': beta,
        **result_is, **result_oos, **result_all, **entire_model_values
    }

    result_list.append(result)
    print(f"Aggregation finished in {float(time.time() - am_start_time)} seconds")

    print(f"Finished in {float(time.time() - cell_start_time)} seconds")

print(f"Experiment finished in {float(time.time() - experiment_start_time)} seconds with {len(result_list)} observations.")

df = pd.DataFrame(result_list)
        

### Evaluation of the results of this run / Storing full results of the run and information needed for Table 3 and Table 4

In [None]:
# Compare, for every combination of betting strategy and bookmaker, the model with the
# highest in-sample likelihood (accuracy-optimised) with the model with the highest
# in-sample betting returns (profitability-optimised) on the out-of-sample matches, and
# store one result table per case as an Excel file.
def _model_label(row):
    """Format the (c0, c1, beta) parameters of a result row for the result table."""
    return "c0:" + str(round(row["c0"], 2)) + " c1: " + str(round(row["c1"], 2)) + " beta: " + str(round(row["beta"], 3))


def _per_bet(total, number_of_bets):
    """Return ``total`` per bet rounded to four decimals, or NaN when no bet was placed."""
    if number_of_bets == 0:
        return float('nan')
    return round(total / number_of_bets, 4)


# The accuracy-optimised model does not depend on bookmaker and betting strategy, so it is
# determined (and labelled) once, outside the loop.
accOpt = df.sort_values(by=['is_likelihood'], ascending=False).iloc[0]
modelAcc = _model_label(accOpt)

# Tables go to <parent of the working directory>/results; create the folder if it is
# missing so that the export does not fail on a fresh checkout.
results_dir = Path(os.getcwd()).parent.absolute() / 'results'
results_dir.mkdir(parents=True, exist_ok=True)

# Loop over four different cases (two betting strategies and two margins)
for case in ['Fixed_bm1', 'Fixed_bm2', 'Kelly_bm1', 'Kelly_bm2']:
    profOpt = df.sort_values(by=['is_betting_returns_' + case], ascending=False).iloc[0]

    # Out-of-sample figures of the two selected models
    modelProf = _model_label(profOpt)
    numberBetsAcc = round(accOpt["oos_betting_activity_" + case + "_activity"], 0)
    numberBetsProf = round(profOpt["oos_betting_activity_" + case + "_activity"], 0)
    observedReturnsAcc = round(accOpt["oos_betting_returns_" + case], 2)
    observedReturnsProf = round(profOpt["oos_betting_returns_" + case], 2)
    observedReturnsPerBetAcc = _per_bet(observedReturnsAcc, numberBetsAcc)
    observedReturnsPerBetProf = _per_bet(observedReturnsProf, numberBetsProf)
    expectedReturnsAcc = round(accOpt["oos_betting_returns_" + case + "_expected"], 2)
    expectedReturnsProf = round(profOpt["oos_betting_returns_" + case + "_expected"], 2)
    expectedReturnsPerBetAcc = _per_bet(expectedReturnsAcc, numberBetsAcc)
    expectedReturnsPerBetProf = _per_bet(expectedReturnsProf, numberBetsProf)

    # Differences are taken between the already rounded figures; rounding again only
    # removes binary floating-point noise (e.g. 0.30000000000000004)
    observedReturnsDiff = round(observedReturnsProf - observedReturnsAcc, 2)
    observedReturnsPerBetDiff = round(observedReturnsPerBetProf - observedReturnsPerBetAcc, 4)
    expectedReturnsDiff = round(expectedReturnsProf - expectedReturnsAcc, 2)
    expectedReturnsPerBetDiff = round(expectedReturnsPerBetProf - expectedReturnsPerBetAcc, 4)

    # Construct results table
    resultTable = pd.DataFrame({'Method': ['Model Selection', 'Number of Bets', 'Observed Returns', 'Observed Returns per bet', 'Expected Returns', 'Expected Returns per bet']})
    resultTable["Accuracy"] = [modelAcc, numberBetsAcc, observedReturnsAcc, observedReturnsPerBetAcc, expectedReturnsAcc, expectedReturnsPerBetAcc]
    resultTable["Profitability"] = [modelProf, numberBetsProf, observedReturnsProf, observedReturnsPerBetProf, expectedReturnsProf, expectedReturnsPerBetProf]
    resultTable["Difference"] = ["", "", observedReturnsDiff, observedReturnsPerBetDiff, expectedReturnsDiff, expectedReturnsPerBetDiff]

    # Save result table for the specific case
    resultTable.to_excel(results_dir / ("artificial_table_" + config["test_name"] + "_" + case + ".xlsx"))

# Uncomment to also save the raw results of all parameter combinations
#df.to_excel(results_dir / ("artificial_raw_" + config["test_name"] + ".xlsx"))

In [None]:
# Display the result table of the last case handled by the loop above ('Kelly_bm2')
resultTable