In [20]:
import json
import os
import numpy as np
import pandas as pd

def get_file_paths(directory_path):
    """Collect the path of every file found beneath ``directory_path``.

    The directory tree is walked recursively with ``os.walk``; each file name
    is joined onto the directory it was found in.

    Args:
        directory_path: Root directory to search.

    Returns:
        A list of file paths. The list is empty when ``directory_path`` does
        not exist or contains no files.
    """
    if os.path.exists(directory_path):
        return [
            os.path.join(parent, filename)
            for parent, _subdirs, filenames in os.walk(directory_path)
            for filename in filenames
        ]
    return []

import json
import numpy as np
import pandas as pd

def open_data(filepath):
    """Read a JSON file and return the decoded Python object.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # JSON text is UTF-8; naming the encoding keeps decoding independent of
    # the platform's locale default (e.g. cp1252 on Windows).
    with open(filepath, 'r', encoding='utf-8') as file:
        return json.load(file)

def find_best_odds(df):
    """Pick, for every match, the highest team1 and team2 price on offer.

    Every column whose name contains ``'team1'`` (respectively ``'team2'``) is
    treated as one bookmaker's price for that side. Prices are compared
    numerically; missing values (``None``/NaN) and values that cannot be
    converted to ``float`` are ignored. When several columns share the highest
    price, the left-most column wins.

    Args:
        df: DataFrame with a ``'match'`` column plus the per-bookmaker odds
            columns. It is not modified.

    Returns:
        A new DataFrame, indexed like ``df``, with the columns ``'match'``,
        ``'best_team1_odds'``, ``'best_team1_source'``, ``'best_team2_odds'``
        and ``'best_team2_source'``. For a row with no usable price the odds
        are NaN and the source is ``None``.
    """
    result_cols = ('best_team1_odds', 'best_team1_source',
                   'best_team2_odds', 'best_team2_source')

    def to_price(value):
        """Return ``value`` as a float, or None when missing or non-numeric."""
        try:
            price = float(value)
        except (TypeError, ValueError):
            return None
        # pandas stores a missing numeric price as NaN; comparisons against
        # NaN are always False, so it must never become the running maximum.
        return None if np.isnan(price) else price

    def best_per_row(tag):
        """Return (best prices, source columns) over the columns matching ``tag``."""
        # This function's own output names also contain 'team1'/'team2';
        # skip them so a frame that already carries them is not misread.
        cols = [col for col in df.columns if tag in col and col not in result_cols]
        columns = [df[col].tolist() for col in cols]

        best_prices, best_sources = [], []
        for i in range(len(df)):
            best_price, best_source = None, None
            for col, values in zip(cols, columns):
                price = to_price(values[i])
                # Strict '>' keeps the first column on ties.
                if price is not None and (best_price is None or price > best_price):
                    best_price, best_source = price, col
            best_prices.append(np.nan if best_price is None else best_price)
            best_sources.append(best_source)
        return best_prices, best_sources

    team1_odds, team1_source = best_per_row('team1')
    team2_odds, team2_source = best_per_row('team2')

    return pd.DataFrame(
        {
            'match': df['match'].tolist(),
            'best_team1_odds': np.asarray(team1_odds, dtype=float),
            'best_team1_source': team1_source,
            'best_team2_odds': np.asarray(team2_odds, dtype=float),
            'best_team2_source': team2_source,
        },
        index=df.index,
    )

def arbitrage(row):
    """Flag whether the best team1 and team2 odds sum to under 100% implied probability.

    The implied probability of each side is ``1 / odds``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing
            ``'best_team1_odds'`` and ``'best_team2_odds'``.

    Returns:
        1 when both odds are usable and their implied probabilities sum to
        less than 1, otherwise 0. Odds that are missing, non-numeric, NaN,
        zero or negative are unusable and give 0.
    """
    implied_total = 0.0
    for key in ('best_team1_odds', 'best_team2_odds'):
        try:
            odds = float(row[key])
        except (TypeError, ValueError):
            return 0
        # 'not odds > 0' also rejects NaN, for which every comparison is False.
        # Zero would divide by zero and a negative price would lower the total
        # and raise a false positive.
        if not odds > 0:
            return 0
        implied_total += 1 / odds
    return 1 if implied_total < 1 else 0


def get_names(data):
    """Return the distinct ``'id'`` values found across all datasets.

    Args:
        data: Iterable of datasets, each an iterable of mappings with an
            ``'id'`` key.

    Returns:
        The array produced by ``np.unique`` over every ``'id'`` encountered
        (unique values in sorted order).
    """
    all_ids = [entry['id'] for dataset in data for entry in dataset]
    return np.unique(all_ids)


In [29]:

# Load one dataset per bookmaker file and build a match x bookmaker odds table.
bookie_names = []
data = []
# os.walk gives no ordering guarantee; sorting keeps the column order (and so
# the tie-breaking between bookmakers) the same on every run.
filepaths = sorted(get_file_paths('data/rugby_union'))

for path in filepaths:
    data.append(open_data(path))
    # The bookmaker name is the file name up to its first '.'.
    # os.path.basename understands the separator os.path.join used on this
    # platform, which a literal split on '/' does not on Windows.
    bookie_names.append(os.path.basename(path).split('.')[0])

match_names = get_names(data)

df = pd.DataFrame({'match': match_names})

for d, name in zip(data, bookie_names):
    # NOTE(review): odds[0] and odds[2] are used and odds[1] is skipped,
    # presumably the draw price of a three-way market. If so, a team1 + team2
    # implied probability below 1 does not cover every outcome. Confirm
    # against the data files.
    team1_dict = {x['id']: x['odds'][0] for x in d}
    team2_dict = {x['id']: x['odds'][2] for x in d}

    # None marks a match this bookmaker does not list.
    team1 = [team1_dict.get(match) for match in match_names]
    team2 = [team2_dict.get(match) for match in match_names]

    df[f'{name}_team1'] = team1
    df[f'{name}_team2'] = team2

best_odds = find_best_odds(df).copy()
# Built row by row (not DataFrame.apply) so an empty table still yields a
# plain, empty column.
best_odds['arbitrage'] = [arbitrage(row) for _, row in best_odds.iterrows()]
mask = best_odds['arbitrage'] == 1
best_odds[mask]



Unnamed: 0,match,best_team1_odds,best_team1_source,best_team2_odds,best_team2_source,arbitrage
16,Hurricanes Reds,1.3,TAB_team1,4.4,sportsbet_team2,1
