In [2]:
import pandas as pd
import numpy as np
from cataclop.ml.pipeline import factories

In [3]:
d = factories.Dataset.factory('default')

In [4]:
d.load(force=True)
d.players.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,age,earnings,final_odds,final_odds_ref,handicap_distance,handicap_weight,herder_id,horse_id,imported_at_player,is_first_timer,...,start_at,sub_category,date,hippodrome_id,imported_at_session,num_session,code,country,imported_at_hippo,name
race_id,id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,2,5,15748400,5.1,5.8,2100.0,,,2,2018-05-23 15:29:04.165302,False,...,2017-10-20 19:43:00,EUROPEENNE_AUTOSTART,2017-10-20,1,2018-05-23 15:29:03.883002,1,VIN,FRA,2018-05-21 14:53:00.375766,VINCENNES
1,3,4,19096000,11.8,12.4,2100.0,,2.0,3,2018-05-23 15:29:03.962649,False,...,2017-10-20 19:43:00,EUROPEENNE_AUTOSTART,2017-10-20,1,2018-05-23 15:29:03.883002,1,VIN,FRA,2018-05-21 14:53:00.375766,VINCENNES
1,4,4,7796500,29.0,21.5,2100.0,,3.0,4,2018-05-23 15:29:03.992521,False,...,2017-10-20 19:43:00,EUROPEENNE_AUTOSTART,2017-10-20,1,2018-05-23 15:29:03.883002,1,VIN,FRA,2018-05-21 14:53:00.375766,VINCENNES
1,5,5,18067000,17.7,14.5,2100.0,,4.0,5,2018-05-23 15:29:04.025540,False,...,2017-10-20 19:43:00,EUROPEENNE_AUTOSTART,2017-10-20,1,2018-05-23 15:29:03.883002,1,VIN,FRA,2018-05-21 14:53:00.375766,VINCENNES
1,6,5,20673000,63.3,36.1,2100.0,,5.0,6,2018-05-23 15:29:04.055841,False,...,2017-10-20 19:43:00,EUROPEENNE_AUTOSTART,2017-10-20,1,2018-05-23 15:29:03.883002,1,VIN,FRA,2018-05-21 14:53:00.375766,VINCENNES


In [5]:
import time

df = d.players.groupby('race_id').filter(lambda race: race['race_count'].max() > 0)

races = df.groupby('race_id')

agg_features = ['race_count', 
                'victory_count', 
                'placed_2_count', 
                'placed_3_count',
                'victory_earnings',
                'prev_year_earnings',
                'handicap_distance',
                'handicap_weight'
               ]

stats = races[agg_features].agg([np.mean, np.std])
stats.columns = ['_'.join(col) for col in stats.columns.values]

df = df.join(stats, how='inner')

for f in agg_features:
    df['{}_r'.format(f)] = df[f] - df['{}_mean'.format(f)] / df['{}_std'.format(f)]
    
agg_features = ['{}_r'.format(f) for f in agg_features]

df[agg_features] = df[agg_features].replace([np.inf, -np.inf], np.nan)
df[agg_features] = df[agg_features].fillna(1000)

start = time.time()

odds = pd.DataFrame(columns=['odds_{:d}'.format(i) for i in range(20)], index=df.index)

races = df.groupby('race_id')

for (id, race) in races:
    odds_sorted = sorted(race['final_odds_ref'].values)
    odds.loc[race.index, ['odds_{:d}'.format(i) for i, v in enumerate(odds_sorted)]] = odds_sorted
    
end = time.time()
print(end - start)

df = pd.concat([df,odds], axis=1)

df[['odds_{:d}'.format(i) for i in range(20)]] = df[['odds_{:d}'.format(i) for i in range(20)]].fillna(1000.0)

df['speed'] = (df['distance'] / df['time']).fillna(1000)
df['final_odds_ref_inv'] = (1. / df['final_odds_ref']).fillna(0.)

df = df.sort_index()

76.61699604988098


In [22]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold, GroupKFold
from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler

groups = df.index.droplevel(1).values

group_kfold = GroupKFold(n_splits=3)

features = agg_features

features = features + ['declared_player_count', 'speed']

features = features + ['odds_{:d}'.format(i) for i in range(20)]

df['pred_win'] = 0.0
df['y_win'] = (df['position'] == 1).astype('int32')

splits = list(group_kfold.split(df.values, df['position'].values, groups))

from cataclop.ml import preprocessing

models = [
    {
        'name': 'rf_20',
        'model': RandomForestClassifier(n_estimators=20),
        'scaler': False
    },
    {
        'name': 'rf_100',
        'model': RandomForestClassifier(n_estimators=100),
        'scaler': False
    },
    {
        'name': 'knn_5',
        'model': KNeighborsClassifier(n_neighbors=5),
        'scaler': False
    },
    {
        'name': 'knn_20',
        'model': KNeighborsClassifier(n_neighbors=20),
        'scaler': False
    },
    {
        'name': 'ada',
        'model': AdaBoostClassifier(),
        'scaler': False
    },
    {
        'name': 'svc',
        'model': SVC(probability=True),
        'scaler': True
    }
]

for train_index, test_index in splits:
    
    for model in models:

        X_train = df[features].iloc[train_index].copy()
        y_train = df['y_win'].iloc[train_index]

        dummies = preprocessing.get_dummies(df.iloc[train_index], ['sub_category'])
        X_train = pd.concat([X_train, preprocessing.get_dummy_values(df.iloc[train_index], dummies)], axis=1)
        
        X_train = X_train[df.iloc[train_index]['final_odds_ref'] < 20]
        y_train = df['y_win'].iloc[train_index][df.iloc[train_index]['final_odds_ref'] < 20]

        X_test = df[features].iloc[test_index].copy()
        y_test = df['y_win'].iloc[test_index]

        X_test = pd.concat([X_test, preprocessing.get_dummy_values(df.iloc[test_index], dummies)], axis=1)

        X_train = X_train.values
        X_test = X_test.values

        idx = df.iloc[test_index].index

        if model['scaler']:
            scaler = Normalizer()
            scaler.fit(X_train)

            X_train = scaler.transform(X_train)
            X_test = scaler.transform(X_test)

        clf = model['model']
        #clf = MLPClassifier(activation='logistic', hidden_layer_sizes=(100,), alpha=1.0, max_iter=500)
        #clf = KNeighborsClassifier(n_neighbors=20)
        clf.fit(X_train, y_train.values)

        p = clf.predict_proba(X_test)

        print( model['name'] )
        print( log_loss(y_test, p) )

        df.loc[idx, 'pred_{}'.format(model['name'])] = p[:, list(clf.classes_).index(1)]
    
    

rf_20
0.43292347495540084
rf_100
0.3006817967627792
knn_5
1.4967287064628296
knn_20
0.4684580623937895
ada
0.6618436844528162
svc
0.26141826343236246
rf_20
0.5400690921655733
rf_100
0.3031024283057852
knn_5
1.6108644975591
knn_20
0.4655013526414776
ada
0.6638252165313495
svc
0.2705596619901211
rf_20
0.4463096014355933
rf_100
0.34192085283774865
knn_5
1.695563835683812
knn_20
0.5613817583885871
ada
0.6642403393161309
svc
0.28352647783728113


In [23]:

stacked_features = ['pred_{}'.format(model['name']) for model in models]

#stacked_features = stacked_features + ['final_odds_ref_inv']

print(df[stacked_features].head())

for train_index, test_index in splits:

    X_train = df[stacked_features].iloc[train_index].copy()
    y_train = df['y_win'].iloc[train_index]

    dummies = preprocessing.get_dummies(df.iloc[train_index], ['sub_category'])
    X_train = pd.concat([X_train, preprocessing.get_dummy_values(df.iloc[train_index], dummies)], axis=1)
    
    X_train = X_train[df.iloc[train_index]['final_odds_ref'] < 20]
    y_train = df['y_win'].iloc[train_index][df.iloc[train_index]['final_odds_ref'] < 20]


    X_test = df[stacked_features].iloc[test_index].copy()
    y_test = df['y_win'].iloc[test_index]

    X_test = pd.concat([X_test, preprocessing.get_dummy_values(df.iloc[test_index], dummies)], axis=1)

    X_train = X_train.values
    X_test = X_test.values

    idx = df.iloc[test_index].index

    #clf = MLPClassifier(hidden_layer_sizes=5)
    clf = RandomForestClassifier(n_estimators=100)

    clf.fit(X_train, y_train.values)

    p = clf.predict_proba(X_test)

    print( 'stacked' )
    print( log_loss(y_test, p) )

    df.loc[idx, 'pred_win'] = p[:, list(clf.classes_).index(1)]



            pred_rf_20  pred_rf_100  pred_knn_5  pred_knn_20  pred_ada  \
race_id id                                                               
1       2         0.25         0.28         0.2         0.15  0.493306   
        3         0.10         0.13         0.0         0.00  0.492012   
        4         0.35         0.18         0.0         0.15  0.489785   
        5         0.05         0.13         0.0         0.00  0.489181   
        6         0.05         0.11         0.0         0.05  0.492012   

            pred_svc  
race_id id            
1       2   0.115099  
        3   0.116199  
        4   0.115367  
        5   0.115118  
        6   0.115523  
stacked
0.30901171782467424
stacked
0.3296592437992215
stacked
0.34530089395281227


In [25]:
print( log_loss(df['y_win'], df['pred_win']) )

df[['category', 'speed', 'victory_earnings_r', 'num', 'position', 'final_odds_ref', 'pred_win']+['pred_{}'.format(m['name']) for m in models]][:100]

0.3279906185255693


Unnamed: 0_level_0,Unnamed: 1_level_0,category,speed,victory_earnings_r,num,position,final_odds_ref,pred_win,pred_rf_20,pred_rf_100,pred_knn_5,pred_knn_20,pred_ada,pred_svc
race_id,id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,2,ATTELE,0.013444,1.574840e+07,2,2.0,5.8,0.120000,0.250,0.280000,0.2,0.15,0.493306,0.115099
1,3,ATTELE,0.013389,1.909600e+07,3,5.0,12.4,0.060000,0.100,0.130000,0.0,0.00,0.492012,0.116199
1,4,ATTELE,1000.000000,7.796496e+06,4,,21.5,0.140000,0.350,0.180000,0.0,0.15,0.489785,0.115367
1,5,ATTELE,0.013378,1.806700e+07,5,6.0,14.5,0.110000,0.050,0.130000,0.0,0.00,0.489181,0.115118
1,6,ATTELE,0.013273,2.067300e+07,6,7.0,36.1,0.080000,0.050,0.110000,0.0,0.05,0.492012,0.115523
1,7,ATTELE,0.013472,1.634700e+07,7,,11.8,0.204667,0.300,0.160000,0.0,0.10,0.492498,0.120995
1,8,ATTELE,0.013412,1.491420e+07,8,4.0,48.7,0.320000,0.350,0.280000,0.0,0.05,0.491507,0.118419
1,5154,ATTELE,0.013483,1.981000e+07,1,1.0,1.6,0.030000,0.150,0.140000,0.0,0.05,0.490213,0.118088
14,51,PLAT,1000.000000,3.644999e+06,1,2.0,9.7,0.360000,0.200,0.140000,0.4,0.15,0.490364,0.103912
14,52,PLAT,1000.000000,6.999989e+05,2,4.0,6.5,0.075000,0.100,0.060000,0.0,0.10,0.491878,0.106006


In [18]:
fi = pd.Series(clf.feature_importances_, index=stacked_features+preprocessing.get_dummy_features(dummies))
print(fi.sort_values(ascending=False))
    

pred_ada                                               0.267438
pred_svc                                               0.256575
pred_rf_100                                            0.170019
pred_rf_20                                             0.096006
pred_knn_20                                            0.079866
pred_knn_5                                             0.038717
sub_category_value_COURSE_A_CONDITIONS                 0.015327
sub_category_value_AUTOSTART                           0.014629
sub_category_value_HANDICAP                            0.013412
sub_category_value_HANDICAP_DIVISE                     0.012174
sub_category_value_A_RECLAMER                          0.010238
sub_category_value_HANDICAP_CATEGORIE_DIVISE           0.006732
sub_category_value_GROUPE_I                            0.006464
sub_category_value_INTERNATIONALE_AUTOSTART            0.005910
sub_category_value_HANDICAP_DE_CATEGORIE               0.004570
sub_category_value_APPRENTIS_LADS_JOCKEY

In [34]:
races = df.groupby('race_id')

import random

bets = []

for (id, race) in races:
    
    bet = 1
    
    r = race.sort_values(by='pred_rf_100', ascending=False)
    
    r = r[r['final_odds_ref']<20.]
    
    if len(r) == 0:
        continue
    
    #num = random.randint(0,min(len(race)-1, 3))
    player = r.iloc[0]
    
    profit = player['winner_dividend']/100.0 * bet - bet
    
    bets.append( (id, bet, player['final_odds'], player['num'], profit) )
    
bets = pd.DataFrame(bets, columns=['id', 'bet', 'odds', 'num', 'profit'])

bets['stash'] = bets['profit'].cumsum()

bets


Unnamed: 0,id,bet,odds,num,profit,stash
0,1,1,5.1,2,-1.0,-1.0
1,14,1,4.3,5,3.4,2.4
2,16,1,1.4,2,0.4,2.8
3,18,1,3.1,2,-1.0,1.8
4,21,1,9.8,1,-1.0,0.8
5,22,1,1.8,4,0.6,1.4
6,23,1,1.9,1,0.9,2.3
7,24,1,22.1,8,-1.0,1.3
8,25,1,7.1,5,-1.0,0.3
9,26,1,6.0,2,-1.0,-0.7
