In [None]:
import os
# Django raises SynchronousOnlyOperation when synchronous ORM code runs in a
# thread that has a running event loop, which is the case inside a Jupyter
# kernel. Setting this variable disables that guard.
# NOTE(review): presumably required because the cataclop dataset loader goes
# through the Django ORM - confirm.
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

In [None]:
import keras
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm

from sklearn import preprocessing

from cataclop.ml import preprocessing as cataclop_preprocessing

In [None]:
from cataclop.ml.pipeline import factories

# Date window handed to the dataset factory.
dataset_params = {
    "from": "2019-02-01",
    "to": "2019-12-31",
}

dataset = factories.Dataset.factory('default', params=dataset_params)

# NOTE(review): force=True presumably bypasses any cached copy - confirm
# against the Dataset implementation.
dataset.load(force=True)

In [None]:
# Numeric columns shared by every runner of a race.
race_features = ['prize', 'declared_player_count'] + ['odds_{:d}'.format(k) for k in range(10)]

# Numeric per-horse columns: age, hist_1_pos .. hist_6_pos, plus every
# aggregated dataset feature whose name does not start with 'final_odds'.
# The '<feature>_r' and per-aggregate-function variants are deliberately not
# included (they were disabled in an earlier revision of this cell).
horse_features = ['age']
horse_features.extend('hist_{}_pos'.format(n) for n in range(1, 7))
horse_features.extend(
    name for name in dataset.agg_features if not name.startswith('final_odds')
)

# Categorical columns, one-hot encoded in a later cell.
horse_cat_features = ['horse_sex', 'horse_breed']
race_cat_features = ['category', 'sub_category']

# De-duplicate and sort so that the column order is deterministic.
race_features = sorted(set(race_features))
horse_features = sorted(set(horse_features))
horse_cat_features = sorted(set(horse_cat_features))
race_cat_features = sorted(set(race_cat_features))

features = race_features + horse_features
cat_features = race_cat_features + horse_cat_features

print(features)
print(cat_features)


In [None]:
# Value substituted for missing numeric features before scaling.
NAN_FLAG = 0


def _is_usable_race(race):
    """Return a truthy value when every condition below holds for the race.

    * fewer than half of the declared runners have trueskill_mu == 25
      (NOTE(review): 25 is the TrueSkill default mean, so this presumably
      rejects races dominated by unrated horses - confirm),
    * the best recorded position is 1,
    * the race has a positive winner dividend,
    * neither odds_0 nor odds_1 has a minimum equal to the dataset's nan_flag.
    """
    return (
        (race['trueskill_mu'] == 25).sum() < race['declared_player_count'].max() / 2
        and race['position'].min() == 1
        and race['winner_dividend'].max() > 0
        and race['odds_0'].min() != dataset.params['nan_flag']
        and race['odds_1'].min() != dataset.params['nan_flag']
    )


df = dataset.players
df = df.groupby('race_id').filter(_is_usable_race)
# Move the previous index into a column and restart from a 0..n-1 index.
df = df.reset_index()

df.loc[:, features] = df.loc[:, features].fillna(NAN_FLAG)

# Runners without a recorded position get the sentinel 20.
df['position'] = df['position'].fillna(20)

# Rescale every numeric feature to [0, 1].
# NOTE(review): the scaler is fit on all races, before the train/test split
# done further down, so test-set statistics leak into the scaling.
scaler = preprocessing.MinMaxScaler()
scaled = scaler.fit_transform(df.loc[:, features].values)
df.loc[:, features] = scaled

In [None]:
# One-hot encode the horse-level categorical columns and append them to df.
# NOTE(review): `limit=5` presumably caps the number of categories kept per
# column - confirm against cataclop.ml.preprocessing.
# NOTE: re-running this cell on the same df appends the dummy columns again.
horse_dummies = cataclop_preprocessing.get_dummies(df, horse_cat_features, limit=5)
df_horse_dummies = cataclop_preprocessing.get_dummy_values(df, horse_dummies)
df = pd.concat([df, df_horse_dummies], axis=1)

# Same for the race-level categorical columns.
race_dummies = cataclop_preprocessing.get_dummies(df, race_cat_features, limit=5)
df_race_dummies = cataclop_preprocessing.get_dummy_values(df, race_dummies)
df = pd.concat([df, df_race_dummies], axis=1)

# Final input columns. They are sorted (not just de-duplicated through a set)
# because set iteration order for strings varies between interpreter
# sessions, which would silently change the column layout of the model
# input from one kernel restart to the next. key=str keeps the sort safe
# should a dummy column name not be a string.
all_horse_features = sorted(set(horse_features + list(df_horse_dummies.columns)), key=str)
all_race_features = sorted(set(race_features + list(df_race_dummies.columns)), key=str)

print(all_horse_features, all_race_features)

In [None]:

def make_Xy(df, train=True, max_train_position=6):
    """Build the pairwise (horse A vs horse B) samples for every race in df.

    For each ordered pair (i, j), i != j, of runners of the same race one
    sample is emitted whose inputs are
    ``[features of i, features of j, race features]`` and whose label is 1
    when i finished ahead of j, 0 otherwise. A position of 0 on either side
    always yields label 0.

    Reads the module-level ``all_horse_features`` and ``all_race_features``
    column lists.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per runner; must contain 'race_id', 'position' and all the
        feature columns.
    train : bool
        When True, runners whose position is 0 or greater than
        ``max_train_position`` are left out of the pairs entirely.
    max_train_position : int
        Worst finishing position kept when ``train`` is True (default 6,
        the cut-off previously hard-coded).

    Returns
    -------
    (X, y, back_idx)
        X : float32 array, one row per pair (shape ``(0, n_inputs)`` when
            no pair was produced).
        y : float32 array of 0/1 labels.
        back_idx : list of ``(index label of i, index label of j)`` tuples
            mapping every sample back to its two rows of ``df``.
    """
    races = df.groupby('race_id')
    n_inputs = 2 * len(all_horse_features) + len(all_race_features)

    X = []
    y = []

    # keep track of the raw data position in the dataset
    back_idx = []

    for race_id, race in tqdm(races, total=len(races)):
        # Pull the columns out once per race instead of slicing a pandas row
        # for every pair inside the quadratic loop below.
        horse_values = race[all_horse_features].values.astype(np.float32)
        race_values = race[all_race_features].values.astype(np.float32)
        positions = race['position'].values.astype(np.float64)
        labels = race.index

        if train:
            skipped = (positions == 0) | (positions > max_train_position)
        else:
            skipped = np.zeros(len(race), dtype=bool)
        candidates = np.flatnonzero(~skipped)

        for i in candidates:
            for j in candidates:
                if j == i:
                    continue

                ranked = positions[i] != 0 and positions[j] != 0
                row_y = 1 if ranked and positions[i] < positions[j] else 0

                X.append(np.concatenate((horse_values[i], horse_values[j], race_values[i])))
                y.append(row_y)
                back_idx.append((labels[i], labels[j]))

    if X:
        X = np.asarray(X, dtype=np.float32)
    else:
        # Keep the result two-dimensional even when no pair was produced.
        X = np.empty((0, n_inputs), dtype=np.float32)

    y = np.asarray(y, dtype=np.float32)

    return (X, y, back_idx)
    

In [None]:
# Split by race (never by row) so that all runners of a race end up on the
# same side: the first 30% of the race ids form the test set, the rest the
# training set.
race_ids = df['race_id'].unique()
test_portion = int(len(race_ids) * 0.3)
test_race_ids = race_ids[:test_portion]
# Start exactly at test_portion: starting at test_portion + 1 dropped one
# race from both sets.
train_race_ids = race_ids[test_portion:]

df_train = df[df['race_id'].isin(train_race_ids)].copy()
df_test = df[df['race_id'].isin(test_race_ids)].copy()

In [None]:
# Build the pairwise training samples (train=True by default, which applies
# make_Xy's position filter); the row back-references are not needed here.
X, y, _ = make_Xy(df_train)

In [None]:
# The dense network takes a 2-D (samples, inputs) matrix, so the shape is kept
# as it is. The Conv1D variant would need a trailing channel axis instead:
# X.reshape((X.shape[0], X.shape[1], 1)).
X = X.reshape((X.shape[0], X.shape[1]))

In [None]:
# Only the last expression of a cell is displayed, so the shape has to be
# printed explicitly to be visible next to the first sample.
print(X.shape)
X[0:1]

In [None]:
from keras.models import Sequential
from keras.layers import Activation, Dropout, Flatten, Dense
from keras.layers import Conv1D, MaxPooling1D
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint, EarlyStopping

# Binary classifier over a pair of horses: one hidden layer of 100 units with
# dropout and ReLU, followed by a single sigmoid output.
model = Sequential([
    Dense(100, input_dim=X.shape[1]),
    Dropout(0.5),
    Activation('relu'),
    Dense(1),
    Activation('sigmoid'),
])

# Disabled convolutional variant (needs X reshaped to (samples, inputs, 1)):
#
#   model.add(Conv1D(32, 9, input_shape=(X.shape[1], 1)))
#   model.add(Activation('sigmoid'))
#   #model.add(Dropout(0.5))
#   model.add(MaxPooling1D(pool_size=2))
#
#   model.add(Conv1D(64, 3))
#   model.add(Activation('sigmoid'))
#   #model.add(Dropout(0.5))
#   model.add(MaxPooling1D(pool_size=2))
#
#   model.add(Flatten())
#
#   model.add(Dense(20))
#   model.add(Activation('relu'))
#   model.add(Dropout(0.5))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
          

In [None]:
# Show the layer stack of the compiled model.
model.summary()

In [None]:
# Train for 10 epochs with mini-batches of 16 pairs, holding out 10% of the
# samples for validation.
# NOTE(review): Keras takes the validation split from the end of the arrays
# before shuffling, so the held-out pairs presumably come from the last
# training races - confirm this is intended.
model.fit(X, y, batch_size=16, epochs=10, validation_split=0.1)

In [None]:
# Build pairs for the held-out races. train=False disables make_Xy's position
# filter, so every runner is paired; back_idx maps each pair back to the
# index labels of its two rows in df_test.
X_test, y_test, back_idx = make_Xy(df_test, train=False)

In [None]:
# One sigmoid output per (first horse, second horse) pair of X_test.
predictions = model.predict(X_test)

In [None]:
# Score of a runner = sum of the model outputs over all pairs in which it is
# the first horse. Computed with one grouped sum instead of a per-pair
# `.loc` read/write loop; runners that never appear as first horse get 0.0.
first_labels = [pair[0] for pair in back_idx[:len(predictions)]]
pair_scores = pd.Series(
    np.asarray(predictions, dtype=np.float64).reshape(-1),
    index=first_labels,
)
df_test['score'] = (
    pair_scores.groupby(level=0).sum()
    .reindex(df_test.index, fill_value=0.0)
    .values
)
    

In [None]:
from cataclop.ml.exploration import random_race

# NOTE(review): presumably displays the listed columns for one randomly
# chosen race of df_test, as a sanity check of the scores - confirm against
# cataclop.ml.exploration.
random_race(df_test, ['position', 'sub_category', 'num', 'final_odds', 'final_odds_ref', 'score'])

In [None]:
def fast_bet(r):
    """Rank the runners of one race by descending 'score'.

    Parameters
    ----------
    r : pandas.DataFrame
        Rows of a single race; must contain a 'score' column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``r`` with a 'pn' column: 0 for the highest score, 1 for
        the next one, and so on. Ties keep their row order and NaN scores
        rank last. The input frame is left untouched, and the result does
        not depend on how the index of ``r`` is ordered.
    """
    p = 'score'

    scores = r[p].values.astype(float)
    # Stable sort on the negated scores: best score first, ties in row order.
    order = np.argsort(-scores, kind='stable')

    # Invert the permutation: ranks[k] is the rank of the k-th row of r.
    ranks = np.empty(len(r), dtype=int)
    ranks[order] = np.arange(len(r))

    return r.assign(pn=ranks)

In [None]:
# Rank the runners inside every race. group_keys=False keeps the original row
# index: without it, recent pandas versions prepend 'race_id' as an extra
# index level, which then clashes with the 'race_id' column.
df_test['pn'] = 0
df_test = df_test.groupby('race_id', group_keys=False).apply(fast_bet)

In [None]:
# Flat stake of one unit on every row.
df_test['bet'] = 1

# Profit of a one-unit bet: winner_dividend / 100 minus the stake, a missing
# dividend counting as 0.
# NOTE(review): assumes winner_dividend is expressed per 100 units staked and
# is empty or zero for losing runners - confirm.
payout = df_test['winner_dividend'].fillna(0.) / 100.
df_test['profit'] = payout - 1.0

In [None]:
# Profit distribution when backing only the top-ranked runner of a race
# (pn == 0) and only when its final odds exceed 10.
top_pick_longshots = (df_test['pn'] == 0) & (df_test['final_odds'] > 10.)
df_test.loc[top_pick_longshots, 'profit'].describe()