In [1]:
import json
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter


import os
import re

In [2]:
#GLOBAL CONFIG VALUES
min_traded_volume = 10000

In [3]:
#Auxiliar Functions

def convertLongToDateTime(utcDate):
    return datetime.fromtimestamp(utcDate/1000)

def subtract_18H(date):
    return date - timedelta(hours=18)

def convertISOToDateTime(date):
    return datetime.fromisoformat(date[:-1]).strftime('%Y-%m-%d %H:%M:%S')

def has_no_trading_volume(runner_change):
    return runner_change['tv'] == 0.0

def has_no_last_traded_price(runner_change):
    return runner_change['ltp'] == 0.0

def getMarketMetadata(data_frame):
    top_row = df.head(n=1)
    # The market definition is nested in mc
    market_definition = top_row['mc'][0][0]['marketDefinition']
    runners = market_definition['runners']
    event_name = market_definition['eventName']
    event_id = market_definition['eventId']
    market_time = convertISOToDateTime(market_definition['marketTime'])

    runner1 = {"id": runners[0]['id'], "name": runners[0]['name']}
    runner2 = {"id": runners[1]['id'], "name": runners[1]['name']}
    return (
        runner1,
        runner2,
        event_name,
        event_id,
        market_time
    )

def parseData(df):
    
    df['pt'] = df['pt'].apply(convertLongToDateTime)

    inPlayTime = 0
    inPlayTimeIndex = 0
    firstTimeInPlayFound = False
    indices_to_delete = []
    total_tv = 0
    for idx, row in df.iterrows():
        for r in row['mc']:
            if ('marketDefinition' in r.keys() and r['marketDefinition']['inPlay'] == True):
                if (firstTimeInPlayFound != True):
                    inPlayTime = row['pt']
                    inPlayTimeIndex = idx
                    firstTimeInPlayFound = True
            elif ('rc' in r.keys()):
                for rc in r['rc']:
                    if (has_no_trading_volume(rc) or has_no_last_traded_price(rc)):
                        indices_to_delete.append(idx)
                    elif rc['id'] == runner1['id']:
                        df.loc[idx, f"ltp_{runner1['name']}"] = rc['ltp']
                        df.loc[idx, f"tv_{runner1['name']}"] = rc['tv']
                        total_tv += rc['tv']
                    elif rc['id'] == runner2['id']:
                        df.loc[idx, f"ltp_{runner2['name']}"] = rc['ltp']
                        df.loc[idx, f"tv_{runner2['name']}"] = rc['tv']
                        total_tv += rc['tv']
                        
    
    df.drop(['op', 'clk', 'mc'], axis=1, inplace=True)
    
    indices_no_duplicates = list(set(indices_to_delete))
    filter_indices_iterator = filter(lambda number: number < inPlayTimeIndex, indices_no_duplicates)
    df_to_plot = df[0:inPlayTimeIndex].drop(list(filter_indices_iterator) , inplace=False, axis=0).dropna()
    
    if total_tv < min_traded_volume:
        return pd.DataFrame()

    return df_to_plot, inPlayTime, inPlayTimeIndex

def findFavourite(df):
    
    columns = df.columns

    favourite = ''
    for col in columns:
        if 'ltp' in col:
            if favourite == '':
                favourite = col
            elif (df[col].min() < df[favourite].min()):
                favourite = df[col].min()
                
    return favourite

def getAvgTimeToVolume(df, percentage_of_volume):
    num_of_games = df['Game_#'].unique()
    
    volume_breakpoints = np.array([])
    for num in num_of_games:
        favourite = findFavourite(df)
        currentGame = df[df['Game_#'] == num]
        volume_breakpoint = currentGame[favourite].max() * percentage_of_volume
        firstDateTime = currentGame[df[favourite] > volume_breakpoint].first_valid_index()
        lastDateTime = currentGame.last_valid_index()
        difference = lastDateTime - firstDateTime
        volume_breakpoints = np.insert(volume_breakpoints, difference.seconds)
    
    average = volume_breakpoints.mean()
    hours, remainder = divmod(average.seconds, 3600)
    return (hours, remainder)

In [4]:
import os

walkDir = "ADVANCED_test"

team_list = ["Dominic Thiem"]
str_values = ["" for x in range(len(team_list))]
team_paths_dict = dict(zip(team_list, str_values))

game_dfs = []
for subdir, dirs, files in os.walk(walkDir):
    for file in files:
        filepath = subdir + os.sep + file
        game_event_matched = re.match(r'^([\s\d]+)$', file.split(".")[-1])
        if bool(game_event_matched):  
            df = pd.read_json(filepath,lines=True)
            (runner1, runner2, event_name, event_id, market_time) = getMarketMetadata(df)
            
            print('RUNNER 1', runner1)
            print('RUNNER 2', runner2)
            print('EVENT 1', event_id)
            print("EVENT NAME",event_name)
            print("MARKET TIME", market_time)
            
            (parsedData, inPlayTime, inPlayTimeIndex) = parseData(df)
            
            if parsedData.empty:
                print("file ignored: ", filepath)
                continue
                
            for team in team_list:
                if (runner1['name'] == team) or (runner2['name'] == team):
                    if type(team_paths_dict[team]) != str:
                        parsedData['Game_#'] = team_paths_dict[team]['Game_#'].max() + 1
                        team_paths_dict[team] = team_paths_dict[team].append(parsedData, ignore_index = True)
                    else:
                        parsedData['Game_#'] = 1
                        team_paths_dict[team] = parsedData

In [5]:
first = team_paths_dict['Dominic Thiem'].first_valid_index()
last= team_paths_dict['Dominic Thiem'].index[100]
last2= team_paths_dict['Dominic Thiem'].last_valid_index()

team_paths_dict['Dominic Thiem']

AttributeError: 'str' object has no attribute 'first_valid_index'

In [6]:
currentPlayer = team_paths_dict['Dominic Thiem'].set_index('pt')
getAvgTimeToVolume(currentPlayer, 0.1)

AttributeError: 'str' object has no attribute 'set_index'

In [7]:
# For windows
# file_path = r'/Users/miguel/Documents/code/betting/Betfair-Dragoon/ADVANCED_test/Jan/31/29676224/1.168065827'

# Get current Directory
current_dir = os.getcwd()

# Use this if you want to look into specific file - replace each string with a directory folder
# This allows the path to work for both windows, mac and linux
file_path = os.path.join(current_dir, 'ADVANCED_test','Jan','31','29676224','1.168065827')
print('Analyzing file', file_path)

df = pd.read_json(file_path, lines=True)
market_metadata = getMarketMetadata(df)
(runner1, runner2, event_name, event_id, market_time) = market_metadata
print('RUNNER 1', runner1)
print('RUNNER 2', runner2)
print('EVENT 1', event_id)
print("EVENT NAME",event_name)
print("MARKET TIME", market_time)

parsedData, inPlayTime, inPlayTimeIndex = parseData(df)


print("IN PLAY TIME", inPlayTime)
eighteen_h_ago = subtract_18H(inPlayTime)
print('IN PLAY INDEX', inPlayTimeIndex)
print("18H before", eighteen_h_ago)

Analyzing file C:\Users\WilsonYeung.AzureAD\ADVANCED_test\Jan\31\29676224\1.168065827


ValueError: Expected object or value

In [8]:
pd.set_option('display.max_rows', 10000)

# Compute the differences between rows
diff = parsedData.diff()

parsedData[f"tv_diff_{runner1['name']}"] = diff[f"tv_{runner1['name']}"]
parsedData[f"tv_diff_{runner2['name']}"] = diff[f"tv_{runner2['name']}"]

# Changed the order of columns
reindexed = parsedData[['pt', f"ltp_{runner1['name']}", f"tv_{runner1['name']}", f"tv_diff_{runner1['name']}", f"ltp_{runner2['name']}", f"tv_{runner2['name']}", f"tv_diff_{runner2['name']}"]]

reindexed[ (reindexed[f"tv_diff_{runner1['name']}"] != 0.0) & (reindexed[f"tv_d

SyntaxError: EOL while scanning string literal (<ipython-input-8-29a059f9fc79>, line 12)

In [9]:
df_to_plot = reindexed.set_index('pt')


fig, axes = plt.subplots(2, 1, figsize=(16,16))

# default grid appearance
axes[0].plot(df_to_plot.index,df_to_plot[f"ltp_{runner1['name']}"], lw=2)
axes[0].grid(True)
axes[0].yaxis.set_major_formatter(FormatStrFormatter('%.2f'))


ax_player1 = axes[0].twinx()  # instantiate a second axes that shares the same x-axis

color = 'tab:red'
ax_player1.set_ylabel('Volume Traded', color=color)  # we already handled the x-label with ax1
ax_player1.plot(df_to_plot.index,df_to_plot[f"tv_{runner1['name']}"], color=color)
ax_player1.set_title(f"{runner1['name']}")
ax_player1.tick_params(axis='y', labelcolor=color)
ax_player1.axvline(x=inPlayTime, ymin=0, ymax=1, color='tab:purple')
ax_player1.axvline(x=eighteen_h_ago, ymin=0, ymax=1, color='tab:purple')

# custom grid appearance
axes[1].plot(df_to_plot.index,df_to_plot[f"ltp_{runner2['name']}"], color='r', lw=2)
axes[1].grid(True)
axes[1].yaxis.set_major_formatter(FormatStrFormatter('%.2f'))

ax_player2 = axes[1].twinx()  

color = 'tab:green'
ax_player2.set_ylabel('Volume Traded', color=color)  
ax_player2.plot(df_to_plot.index,df_to_plot[f"tv_{runner2['name']}"], color=color)
ax_player2.set_title(f"{runner2['name']}")
ax_player2.tick_params(axis='y', labelcolor=color)
ax_player2.axvline(x=inPlayTime, ymin=0, ymax=1, color='tab:purple')
ax_player2.axvline(x=eighteen_h_ago, ymin=0, ymax=1, color='tab:purple')

NameError: name 'reindexed' is not defined

In [10]:
# Plot the 18h prior to in-play
df_18h_plot = df_to_plot[df_to_plot.index >= eighteen_h_ago]

fig, axes = plt.subplots(2, 1, figsize=(32,32))

# default grid appearance
axes[0].plot(df_18h_plot.index,df_18h_plot[f"ltp_{runner1['name']}"], lw=2)
axes[0].grid(True)
axes[0].yaxis.set_major_formatter(FormatStrFormatter('%.2f'))


ax_player1 = axes[0].twinx()  # instantiate a second axes that shares the same x-axis

color = 'tab:red'
ax_player1.set_ylabel('Volume Traded', color=color)  # we already handled the x-label with ax1
ax_player1.plot(df_18h_plot.index,df_18h_plot[f"tv_{runner1['name']}"], color=color)
ax_player1.set_title(f"{runner1['name']}")
ax_player1.tick_params(axis='y', labelcolor=color)

# custom grid appearance
axes[1].plot(df_18h_plot.index,df_18h_plot[f"ltp_{runner2['name']}"], color='r', lw=2)
axes[1].grid(True)
axes[1].yaxis.set_major_formatter(FormatStrFormatter('%.2f'))

ax_player2 = axes[1].twinx()  

color = 'tab:green'
ax_player2.set_ylabel('Volume Traded', color=color)  
ax_player2.plot(df_18h_plot.index,df_18h_plot[f"tv_{runner2['name']}"], color=color)
ax_player2.set_title(f"{runner2['name']}")
ax_player2.tick_params(axis='y', labelcolor=color)

NameError: name 'df_to_plot' is not defined

In [11]:
df_to_plot.count()

NameError: name 'df_to_plot' is not defined