In [2]:
import pandas as pd
import numpy as np
from cataclop.ml.pipeline import factories

In [3]:
d = factories.Dataset.factory('default')

In [4]:
d.load(force=True)
d.players.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,age,earnings,final_odds,final_odds_ref,handicap_distance,handicap_weight,herder_id,horse_id,imported_at_player,is_first_timer,...,start_at,sub_category,date,hippodrome_id,imported_at_session,num_session,code,country,imported_at_hippo,name
race_id,id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,2,5,15748400,5.1,5.8,2100.0,,,2,2018-05-23 15:29:04.165302,False,...,2017-10-20 19:43:00,EUROPEENNE_AUTOSTART,2017-10-20,1,2018-05-23 15:29:03.883002,1,VIN,FRA,2018-05-21 14:53:00.375766,VINCENNES
1,3,4,19096000,11.8,12.4,2100.0,,2.0,3,2018-05-23 15:29:03.962649,False,...,2017-10-20 19:43:00,EUROPEENNE_AUTOSTART,2017-10-20,1,2018-05-23 15:29:03.883002,1,VIN,FRA,2018-05-21 14:53:00.375766,VINCENNES
1,4,4,7796500,29.0,21.5,2100.0,,3.0,4,2018-05-23 15:29:03.992521,False,...,2017-10-20 19:43:00,EUROPEENNE_AUTOSTART,2017-10-20,1,2018-05-23 15:29:03.883002,1,VIN,FRA,2018-05-21 14:53:00.375766,VINCENNES
1,5,5,18067000,17.7,14.5,2100.0,,4.0,5,2018-05-23 15:29:04.025540,False,...,2017-10-20 19:43:00,EUROPEENNE_AUTOSTART,2017-10-20,1,2018-05-23 15:29:03.883002,1,VIN,FRA,2018-05-21 14:53:00.375766,VINCENNES
1,6,5,20673000,63.3,36.1,2100.0,,5.0,6,2018-05-23 15:29:04.055841,False,...,2017-10-20 19:43:00,EUROPEENNE_AUTOSTART,2017-10-20,1,2018-05-23 15:29:03.883002,1,VIN,FRA,2018-05-21 14:53:00.375766,VINCENNES


In [5]:
import time

df = d.players.groupby('race_id').filter(lambda race: race['race_count'].max() > 0)

races = df.groupby('race_id')

agg_features = ['race_count', 
                'victory_count', 
                'placed_2_count', 
                'placed_3_count',
                'victory_earnings',
                'prev_year_earnings',
                'handicap_distance',
                'handicap_weight'
               ]

stats = races[agg_features].agg([np.mean, np.std])
stats.columns = ['_'.join(col) for col in stats.columns.values]

df = df.join(stats, how='inner')

for f in agg_features:
    df['{}_r'.format(f)] = df[f] - df['{}_mean'.format(f)] / df['{}_std'.format(f)]
    
agg_features = ['{}_r'.format(f) for f in agg_features]

df[agg_features] = df[agg_features].replace([np.inf, -np.inf], np.nan)
df[agg_features] = df[agg_features].fillna(1000)

start = time.time()

odds = pd.DataFrame(columns=['odds_{:d}'.format(i) for i in range(20)], index=df.index)

races = df.groupby('race_id')

for (id, race) in races:
    odds_sorted = sorted(race['final_odds_ref'].values)
    odds.loc[race.index, ['odds_{:d}'.format(i) for i, v in enumerate(odds_sorted)]] = odds_sorted
    
end = time.time()
print(end - start)

df = pd.concat([df,odds], axis=1)

df[['odds_{:d}'.format(i) for i in range(20)]] = df[['odds_{:d}'.format(i) for i in range(20)]].fillna(1000.0)

df['speed'] = (df['distance'] / df['time']).fillna(1000)
df['final_odds_ref_inv'] = (1. / df['final_odds_ref']).fillna(0.)

df = df.sort_index()

76.61699604988098


In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold, GroupKFold
from sklearn.preprocessing import MinMaxScaler, Normalizer

groups = df.index.droplevel(1).values

group_kfold = GroupKFold(n_splits=3)

features = agg_features

features = features + ['declared_player_count', 'speed']

features = features + ['odds_{:d}'.format(i) for i in range(20)]

df['pred_win'] = 0.0
df['y_win'] = (df['position'] == 1).astype('int32')

splits = list(group_kfold.split(df.values, df['position'].values, groups))

from cataclop.ml import preprocessing

models = [
    {
        'name': 'rf_20',
        'model': RandomForestClassifier(n_estimators=20),
    },
    {
        'name': 'rf_100',
        'model': RandomForestClassifier(n_estimators=100),
    },
    {
        'name': 'knn_5',
        'model': KNeighborsClassifier(n_neighbors=5),
    },
    {
        'name': 'knn_20',
        'model': KNeighborsClassifier(n_neighbors=20),
    }
]

for train_index, test_index in splits:
    
    for model in models:

        X_train = df[features].iloc[train_index].copy()
        y_train = df['y_win'].iloc[train_index]

        dummies = preprocessing.get_dummies(df.iloc[train_index], ['sub_category'])
        X_train = pd.concat([X_train, preprocessing.get_dummy_values(df.iloc[train_index], dummies)], axis=1)

        X_test = df[features].iloc[test_index].copy()
        y_test = df['y_win'].iloc[test_index]

        X_test = pd.concat([X_test, preprocessing.get_dummy_values(df.iloc[test_index], dummies)], axis=1)

        X_train = X_train.values
        X_test = X_test.values

        idx = df.iloc[test_index].index


        #scaler = Normalizer()
        #scaler.fit(X_train.values)

        #X_train = scaler.transform(X_train.values)
        #X_test = scaler.transform(X_test.values)
        #X_train = X_train.values
        #X_test = X_test.values


        clf = model['model']
        #clf = MLPClassifier(activation='logistic', hidden_layer_sizes=(100,), alpha=1.0, max_iter=500)
        #clf = KNeighborsClassifier(n_neighbors=20)
        clf.fit(X_train, y_train.values)

        p = clf.predict_proba(X_test)

        print( model['name'] )
        print( log_loss(y_test, p) )

        df.loc[idx, 'pred_{}'.format(model['name'])] = p[:, list(clf.classes_).index(1)]
    
    

rf_20
0.5060115240620294
rf_100
0.24339083734100636
knn_5
1.6651813177495367
knn_20
0.6376529993615907
rf_20
0.6470385123800579
rf_100
0.2791671201656047
knn_5
1.8992720213060599
knn_20
0.8575552343462115
rf_20
0.5444154303260733
rf_100
0.2729601517707549
knn_5
1.9810136886674472
knn_20
0.7827758478620378


In [7]:

stacked_features = ['pred_{}'.format(model['name']) for model in models]

#stacked_features = stacked_features + ['final_odds_ref_inv']

print(df[stacked_features].head())

for train_index, test_index in splits:

    X_train = df[stacked_features].iloc[train_index].copy()
    y_train = df['y_win'].iloc[train_index]

    dummies = preprocessing.get_dummies(df.iloc[train_index], ['sub_category'])
    X_train = pd.concat([X_train, preprocessing.get_dummy_values(df.iloc[train_index], dummies)], axis=1)

    X_test = df[stacked_features].iloc[test_index].copy()
    y_test = df['y_win'].iloc[test_index]

    X_test = pd.concat([X_test, preprocessing.get_dummy_values(df.iloc[test_index], dummies)], axis=1)

    X_train = X_train.values
    X_test = X_test.values

    idx = df.iloc[test_index].index

    #clf = MLPClassifier(hidden_layer_sizes=5)
    clf = RandomForestClassifier(n_estimators=100)

    clf.fit(X_train, y_train.values)

    p = clf.predict_proba(X_test)

    print( 'stacked' )
    print( log_loss(y_test, p) )

    df.loc[idx, 'pred_win'] = p[:, list(clf.classes_).index(1)]



            pred_rf_20  pred_rf_100  pred_knn_5  pred_knn_20
race_id id                                                  
1       2         0.15         0.20         0.0         0.05
        3         0.10         0.10         0.0         0.00
        4         0.10         0.11         0.0         0.10
        5         0.10         0.08         0.0         0.00
        6         0.05         0.05         0.0         0.00
stacked
0.707625956403324
stacked
0.8594576631624679
stacked
0.9344503408067227


In [8]:
print( log_loss(df['y_win'], df['pred_win']) )

df[['category', 'speed', 'victory_earnings_r', 'num', 'position', 'final_odds_ref', 'pred_win']][:100]

0.8338446534575049


Unnamed: 0_level_0,Unnamed: 1_level_0,category,speed,victory_earnings_r,num,position,final_odds_ref,pred_win
race_id,id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,2,ATTELE,0.013444,1.574840e+07,2,2.0,5.8,0.031429
1,3,ATTELE,0.013389,1.909600e+07,3,5.0,12.4,0.024167
1,4,ATTELE,1000.000000,7.796496e+06,4,,21.5,0.020000
1,5,ATTELE,0.013378,1.806700e+07,5,6.0,14.5,0.015000
1,6,ATTELE,0.013273,2.067300e+07,6,7.0,36.1,0.028214
1,7,ATTELE,0.013472,1.634700e+07,7,,11.8,0.005000
1,8,ATTELE,0.013412,1.491420e+07,8,4.0,48.7,0.016667
1,5154,ATTELE,0.013483,1.981000e+07,1,1.0,1.6,0.030000
14,51,PLAT,1000.000000,3.644999e+06,1,2.0,9.7,0.690000
14,52,PLAT,1000.000000,6.999989e+05,2,4.0,6.5,0.058500
