In [None]:
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

from cataclop.ml import preprocessing
from cataclop.ml import exploration

# Raise pandas' display limits (200 columns, 500 rows) so wide frames are shown without truncation.
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 500)

from cataclop.ml.pipeline import factories

In [2]:
# Instantiate the 'default' program, version 1.1, through the pipeline factory.
program = factories.Program.factory('default', version='1.1')

In [None]:
# Train the program on the dataset described by these parameters.
# NOTE(review): 'from'/'to' look like an inclusive date window and 'categories' like a
# race-category filter — confirm against the dataset implementation.
program.train(dataset_params = {
    #'from': '2013-05-10'
    'from': '2016-01-01',
    'to': '2018-12-31',
    'categories': ['PLAT']
})

In [None]:
# Persist the dataset and the trained model, printing the hash of each after it is saved.
program.dataset.save()
print(program.dataset.hash)
program.model.save()
print(program.model.hash)

In [None]:
# Number of distinct races in the program frame and the time span they cover.
print('{} races from {} to {}'.format(
    program.df.race_id.nunique(),
    program.df.start_at.min(),
    program.df.start_at.max(),
))

In [None]:

# For every base model, print the top 100 entries of whichever importance-like
# attributes ('feature_importances_', 'coef_') the final step of its last pipeline exposes.
for model in program.model.models:
    last_estimator = model['estimators'][-1]
    clf = last_estimator['pipeline'].steps[-1][1]
    for attr in ('feature_importances_', 'coef_'):
        if not hasattr(clf, attr):
            continue
        # Index = plain features followed by the dummy (one-hot) feature names.
        fi = pd.Series(getattr(clf, attr), index=program.model.features+preprocessing.get_dummy_features(last_estimator['dummies']))
        print(model['name'])
        print(fi.sort_values(ascending=False)[0:100])
        print("---\n")


In [None]:
# Working frame used by the rest of the notebook.
df = program.df

# Average first-level prediction of each base model.
for model in program.model.models:
    pred_col = 'pred_{}_1'.format(model['name'])
    print(model['name'], df[pred_col].mean())

In [None]:
# Per-race spread of each base model's prediction: every runner of a race receives the
# race-level std / min / max of `pred_<model>_1`.
# `transform` broadcasts the group statistic back onto the rows, which replaces the
# Python-level loop over every race (and no longer rebinds the builtin `id`).
races = df.groupby('race_id')
for model in program.model.models:
    race_preds = races['pred_{}_1'.format(model['name'])]
    for stat in ('std', 'min', 'max'):
        df['pred_{}_{}'.format(model['name'], stat)] = race_preds.transform(stat)

In [None]:
# Inputs of the second-level model: a few raw columns plus, for every base model,
# its prediction and the per-race std / min / max of that prediction.
stacked_features = ['declared_player_count'] + ['odds_{:d}'.format(i) for i in range(10)]
#stacked_features = program.model.features
for suffix in ('1', 'std', 'min', 'max'):
    stacked_features += ['pred_{}_{}'.format(model['name'], suffix) for model in program.model.models]

In [None]:
# Keep the reference odds out of the second-level inputs.
# The column is only present when `stacked_features` is built from `program.model.features`;
# the guard keeps this cell from raising ValueError otherwise, and makes it safe to re-run.
if 'final_odds_ref' in stacked_features:
    stacked_features.remove('final_odds_ref')

In [None]:
# Display the final list of second-level feature columns.
stacked_features

In [None]:
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn import linear_model
from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler, MinMaxScaler, RobustScaler
from cataclop.ml import preprocessing
from xgboost import XGBRegressor

from keras.models import Sequential
from keras.layers import Dense, Dropout


# Out-of-fold stacking.
# For each position cut-off `pos`, a second-level regressor is fitted on the base-model
# predictions of the training folds and its predictions for the held-out races are
# written to `pred_stacked_{pos+1}`.
# `estimator` and `dummies` from the last fold stay available after the loop.

from sklearn.model_selection import KFold, GroupKFold

# Folds are grouped by race so that runners of one race never sit on both sides of a split.
# The assignment depends only on the race ids, so it is built once and reused for every
# cut-off instead of being recomputed on each iteration.
groups = df['race_id'].values
group_kfold = GroupKFold(n_splits=program.model.params['kfolds'])
splits = list(group_kfold.split(df.values, df['position'].values, groups))

for pos in range(1,8):

    stacked_col = 'pred_stacked_{}'.format(pos+1)

    # Float placeholder: the held-out rows of each fold are filled with float predictions below.
    df[stacked_col] = 0.0

    for train_index, test_index in splits:

        train_df = df.iloc[train_index]
        test_df = df.iloc[test_index]

        # Training rows: target is not the NaN flag, reference odds below 300 and
        # finishing position within the cut-off.
        #idx = (df.iloc[train_index]['final_odds_ref'] < 20) & ()
        idx = (train_df['target_returns'] != program.model.params['nan_flag']) & (train_df['final_odds_ref'] < 300) & ((train_df['position'] == 1) | (train_df['position'] <= pos))

        # Dummy encoding is derived from the training fold only and re-applied to the test fold.
        dummies = preprocessing.get_dummies(train_df, program.model.categorical_features)

        X_train = pd.concat([train_df[stacked_features].copy(), preprocessing.get_dummy_values(train_df, dummies)], axis=1)
        X_train = X_train[idx]
        y_train = train_df['target_returns'][idx]

        X_test = pd.concat([test_df[stacked_features].copy(), preprocessing.get_dummy_values(test_df, dummies)], axis=1)
        y_test = test_df['target_returns']

        X_train = X_train.values
        X_test = X_test.values

        scaler = RobustScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # Scale both targets by the training maximum. The factor is captured before
        # y_train is rescaled; otherwise y_test would be divided by the max of the
        # already normalised y_train instead of the original one.
        y_scale = y_train.max()
        y_train = y_train / y_scale
        y_test = y_test / y_scale

        #estimator = MLPRegressor(activation='logistic', hidden_layer_sizes=(100,), random_state=program.model.params['seed'])
        #estimator = RandomForestRegressor(n_estimators=100)
        #estimator = linear_model.Ridge()
        estimator = XGBRegressor()

        estimator.fit(X_train, y_train.values)

        p = estimator.predict(X_test)

        df.loc[test_df.index, stacked_col] = p



In [None]:


# Top 100 entries of whichever importance-like attributes the current `estimator`
# (the one left over from the stacking loop) exposes.
for attr in ('feature_importances_', 'coef_'):
    if hasattr(estimator, attr):
        # Index = stacked features followed by the dummy (one-hot) feature names.
        fi = pd.Series(getattr(estimator, attr), index=stacked_features+preprocessing.get_dummy_features(dummies))
        print(fi.sort_values(ascending=False)[0:100])

In [None]:
# Sum the per-cut-off stacked predictions (columns pred_stacked_2 .. pred_stacked_8) into a single score.
df['pred_stacked'] = df[['pred_stacked_{}'.format(pos+1) for pos in range(1,8)]].sum(axis=1)

In [None]:
# Inspect one race (presumably drawn at random by `random_race` — confirm in
# cataclop.ml.exploration), ordered by the last stacked prediction.
# The base-model column names at the end are hard-coded.
exploration.random_race(
    df,
    cols=(
        ['position', 'declared_player_count', 'sub_category', 'num', 'final_odds', 'final_odds_ref', 'pred_stacked']
        + ['pred_stacked_{}'.format(pos+1) for pos in range(1,8)]
        + ['pred_xgb_100_1', 'pred_rf_100_1', 'pred_knn_5_1', 'pred_mlp_100_1', 'pred_ridge_1_1']
    ),
).sort_values(by='pred_stacked_8', ascending=False)



In [None]:
# Simple ensemble score: the sum of every base model's first-level prediction.
df['pred_sum'] = df[['pred_{}_1'.format(model['name']) for model in program.model.models ]].sum(axis=1)

In [None]:
# Random baseline score in [0, 1), used as a ranking column when simulating bets.
# A dedicated, seeded generator keeps the baseline identical across kernel restarts
# without touching numpy's global random state.
df['pred_rnd'] = np.random.RandomState(0).rand(df.shape[0])

In [None]:
def bet(df, features, categorical_features, targets, N=1, max_odds=20, break_on_bet=True, break_on_odds=False):
    """Simulate betting on the best-ranked runners of every race.

    For each race and each column in `targets`, the runners are ranked by that column
    (highest value first) and the `N` best-ranked ones are considered for a bet.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per runner. Columns read here: `race_id`, `start_at`, `date`, `num`,
        `final_odds_ref`, `final_odds`, `winner_dividend`, plus every name listed in
        `targets`, `features` and `categorical_features`.
    features, categorical_features : sequence of str
        Extra columns copied from the selected runner into the result (any sequence,
        not only lists).
    targets : sequence of str
        Ranking columns. The value of the selected runner also drives the stake.
    N : int
        Number of best-ranked runners examined per race and target. Races with `N`
        runners or fewer are skipped altogether.
    max_odds : float or None
        Runners whose `final_odds_ref` is above this value are not bet on; `None`
        disables the filter.
    break_on_bet : bool
        Stop examining a race/target as soon as one bet has been placed.
    break_on_odds : bool
        When a runner is above `max_odds`, give up on this race/target instead of
        moving on to the next-ranked runner.

    Returns
    -------
    pandas.DataFrame
        One row per bet, indexed by and sorted on `date`, with columns `id`, `date`,
        `num`, `odds_ref`, `odds_final`, `target`, `pred`, `pred_std`, `bet`, `profit`,
        `next_pred_1..3`, the requested feature columns, and the running totals
        `bets` (cumulative stake) and `stash` (cumulative profit).
    """

    features = list(features)
    categorical_features = list(categorical_features)
    extra_cols = features + categorical_features

    rows = []

    for race_id, race in df.sort_values('start_at').groupby('race_id'):

        # Not enough runners to pick N of them: nothing is bet on this race.
        if len(race) <= N:
            continue

        for target in targets:

            ranked = race.sort_values(by=target, ascending=False)

            for n in range(N):

                player = ranked.iloc[n]

                odds = player['final_odds_ref']

                if max_odds is not None and odds > max_odds:
                    if break_on_odds:
                        break
                    continue

                # A negative score ends the search for this race/target.
                if player[target] < 0:
                    break

                # Stake grows with the score: 1.5 * round(1 + clip(score / 100, 0, 10)),
                # i.e. always between 1.5 and 16.5.
                stake = np.round(1 + np.clip(player[target]/100.0, 0, 10)) * 1.5

                # NOTE(review): assumes `winner_dividend` is the payout per 100 staked and
                # is 0 for runners that did not win — confirm against the dataset.
                profit = player['winner_dividend']/100.0 * stake - stake

                row = [race_id, player['date'], player['num'], odds, player['final_odds'], target, player[target], ranked[target].std(), stake, profit]

                # Scores of the next three runners in the ranking (NaN when there are fewer).
                for nn in range(1,4):
                    if n+nn < len(ranked):
                        row.append(ranked.iloc[n+nn][target])
                    else:
                        row.append(np.nan)

                row.extend(player[f] for f in extra_cols)

                rows.append(row)

                if break_on_bet:
                    break

    cols = ['id', 'date', 'num', 'odds_ref', 'odds_final', 'target', 'pred', 'pred_std', 'bet', 'profit']
    cols = cols + ['next_pred_{}'.format(nn) for nn in range(1,4)] + extra_cols

    bets = pd.DataFrame(rows, columns=cols)

    # Chronological order is needed for the running totals below.
    bets.index = bets['date']
    bets = bets.sort_index()

    bets['bets'] = bets['bet'].cumsum()
    bets['stash'] = bets['profit'].cumsum()

    return bets



In [None]:
# Simulate one bet per race and ranking column, with a reference-odds cap of 30.
# Ranking columns: the random baseline, the reference odds, the last stacked prediction,
# the summed base predictions and every base model's own prediction.
bets = bet(df, program.model.features, program.model.categorical_features, 
           N=1, max_odds=30, break_on_bet=True, break_on_odds=True, 
           targets=['pred_rnd', 'final_odds_ref', 'pred_stacked_8', 'pred_sum'] + ['pred_{}_1'.format(model['name']) for model in program.model.models]
          )

In [None]:
#program.bet(N=1, max_odds=30, break_on_bet=True, break_on_odds=True, targets=['pred_rnd', 'final_odds_ref', 'pred_stacked_8', 'pred_sum'] + ['pred_{}_1'.format(model['name']) for model in program.model.models])
#bets = program.bets
#bets['profit'].mean()

In [None]:
# Summary statistics of every base model's first-level prediction.
df[['pred_{}_1'.format(model['name']) for model in program.model.models]].describe()

In [None]:
# Replace the date index set by bet() with a plain positional index; the 'date' column itself is kept.
bets = bets.reset_index(drop=True)

In [None]:

# Profitability breakdown of the `pred_sum` bets: one summary table and one chart of
# cumulative profit vs. cumulative stake per value of column `f`.
# `prize`, `sub_category` and `declared_player_count` are expected among the feature
# columns that were passed to bet().

#bb = bets.groupby('id').filter(lambda r: r['num'].nunique() <= 4  ).copy()
#bb = bets.copy()
bb = bets[(bets['target'].str.contains('^pred_sum')) ].copy()

f = 'sub_category'

# Bucketed helper columns for ad-hoc slicing.
bb['prize_round'] = np.round(np.log(bb['prize']))
bb['pred_round'] = np.round(bb['pred'].clip(-1, 1), 1)

bb['odds_round'] = np.round(bb['odds_final'].clip(1, 30)/5.)

results = []

for s in bb[f].value_counts().index:
    results = []
    x = bb[ (bb[f] == s) & (bb['pred_std'] > 0) & (bb['odds_ref']<20) & (bb['declared_player_count'] > 1)].copy()
    if len(x) == 0:
        continue

    # Draw on an explicit axes rather than on whatever the current figure happens to be.
    fig, axs = plt.subplots(1, 1, figsize=(10, 2))
    axs.set_title('{} = {}'.format(f, s))

    for t in x['target'].value_counts().index:
        bbb = x[(x['target'] == t)]
        # Keep only the bets whose prediction is above the 85th percentile for this target.
        bbb = bbb[ (bbb['pred']>bbb['pred'].quantile(0.85)) ].copy()

        total_profit = bbb.profit.sum()
        total_bet = bbb.bet.sum()

        results.append({
            'idx_1': s,
            'idx_2': t,
            'profit': total_profit,
            'bet': total_bet,
            'bets': bbb.bet.count(),
            # The quantile filter can leave no rows at all; report NaN rather than dividing 0 by 0.
            'avg': total_profit / total_bet if total_bet else np.nan
        })

        #print("{:10s}: \t {:+.2f} \t {:+.0f} \t {:+.2f}".format(t, bbb.profit.sum(), bbb.profit.count(), bbb.profit.mean()) )

        bbb['stash'] = bbb['profit'].cumsum()
        bbb['cbet'] = bbb['bet'].cumsum()

        bbb['stash'].plot(ax=axs, label='{} stash'.format(t))
        bbb['cbet'].plot(ax=axs, label='{} cbet'.format(t))

    axs.legend()

    results = pd.DataFrame(results)
    results = results.set_index(['idx_1', 'idx_2'])
    print(results.sort_values(by='avg', ascending=False))

    plt.show()
    
    
    

In [None]:
# Number of pred_sum bets in sub-category COURSE_A_CONDITIONS with reference odds below 20
# and a prediction above 300.
len(bb[ (bb['target']=='pred_sum') & (bb['sub_category']=='COURSE_A_CONDITIONS') & (bb['odds_ref']<20) & (bb['pred'] > 300)])

In [None]:
# 85th percentile of the prediction over the same selection (without the `pred > 300` cut).
bb[ (bb['target']=='pred_sum') & (bb['sub_category']=='COURSE_A_CONDITIONS') & (bb['odds_ref']<20)]['pred'].quantile(0.85)

In [None]:
# Same simulation as the earlier bet(...) cell, with identical arguments. Re-running it
# rebuilds `bets` with the date index that the reset_index cell dropped.
# `bb` is not refreshed by this cell.
bets = bet(df, program.model.features, program.model.categorical_features, N=1, max_odds=30, break_on_bet=True, break_on_odds=True, targets=['pred_rnd', 'final_odds_ref', 'pred_stacked_8', 'pred_sum'] + ['pred_{}_1'.format(model['name']) for model in program.model.models])

In [None]:
# Independent copy of the filtered bets, used for the plot below.
d = bb.copy()

In [None]:
import seaborn as sns
#fig, axs = plt.subplots(1,1)
#sns.heatmap(d[['pred', 'profit']].values, annot=True)
#sns.jointplot('profit', 'pred', data=d)
# Bivariate density of prize against profit for the selected bets.
# NOTE(review): both series are passed positionally. Recent seaborn releases appear to
# accept them only as keywords (x=..., y=...) — confirm against the installed version.
sns.kdeplot(d['prize'], d['profit'])
plt.show()
