In [None]:
# Stretch the notebook container to the full browser width so wide DataFrames fit.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
import pandas as pd
import numpy as np

import os
# NOTE(review): presumably required because the cataclop pipeline uses the Django ORM
# from inside the notebook's running event loop — confirm against the Django docs.
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

from cataclop.ml import preprocessing
from cataclop.ml import exploration

# Raise the pandas display limits (200 columns, 500 rows) so wide frames are not truncated.
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 500)

from cataclop.ml.pipeline import factories

In [None]:
# Build the 'default' program, version '1.2', through the pipeline factory.
program = factories.Program.factory('default', version='1.2')

In [None]:
# Dataset selection passed to the program (presumably a date window plus category filters).
dataset_params = {
    'from': '2018-01-01',
    'to': '2021-03-01',
    #'hippodrome': ['DEAUVILLE']
    #'countries': ['FRA'],
    #'categories': ['PLAT'],
    'sub_categories': ['HANDICAP'],
    #'prize_min': 40000
}

# Model hyper-parameters passed to the program.
model_params = {
    'seed': 123456,
    'kfolds': 3,
    'nan_flag': 0,
    'n_targets': 1,
}

# Train the 'default' model on the 'default' dataset, forcing a dataset reload.
program.train(
    dataset_params=dataset_params,
    model_params=model_params,
    dataset_reload=True,
    model='default',
    dataset='default',
)

In [None]:
# Summary statistics of the program's data frame.
program.df.describe()

In [None]:
# Feature list exposed by the trained model.
program.model.features

In [None]:
# Report how many rows the frame holds and how many features the model exposes.
n_samples, n_features = program.df.shape[0], len(program.model.features)
print('{} samples, {} features'.format(n_samples, n_features))

In [None]:
# Models without a stacking stage get an empty `stacked_models` list and a zero
# `target_stacked` column, so the cells below can treat both kinds of model alike.
try:
    program.model.stacked_models
except AttributeError:
    program.model.stacked_models = []
    program.df['target_stacked'] = 0

In [None]:
# Names of the base models followed by the names of the stacked models.
[entry["name"] for group in (program.model.models, program.model.stacked_models) for entry in group]

In [None]:
# Number of distinct races and the earliest / latest start time in the frame.
race_count = program.df.race_id.nunique()
first_start = program.df.start_at.min()
last_start = program.df.start_at.max()
print('{} races from {} to {}'.format(race_count, first_start, last_start))

In [None]:
# For every base model, take the final step of its last estimator's pipeline and,
# when that step exposes `classes_`, print them.
final_steps = (
    entry['estimators'][-1]['pipeline'].steps[-1][1]
    for entry in program.model.models
)
for clf in final_steps:
    if not hasattr(clf, 'classes_'):
        continue
    print(clf.classes_)

In [None]:
from cataclop.ml import preprocessing
# Use the fully qualified option name: the bare 'precision' pattern is not reliably
# accepted by recent pandas releases. Setting it once is enough.
pd.set_option('display.precision', 7)


def _print_top_weights(model, feature_attr, top=100):
    """Print the largest feature importances and/or coefficients of one trained model.

    The weights are read from the final step of the pipeline of the model's last
    estimator. Nothing is printed when that step exposes neither
    `feature_importances_` nor `coef_`.

    Parameters
    ----------
    model : dict
        Entry of `program.model.models` / `program.model.stacked_models`; the keys
        'name' and 'estimators' are read.
    feature_attr : str
        Name of the `program.model` attribute holding the base feature list
        ('features' or 'stacked_features'). It is only resolved when there is
        something to print.
    top : int
        Number of leading rows (sorted by decreasing weight) to print.
    """
    last_estimator = model['estimators'][-1]
    clf = last_estimator['pipeline'].steps[-1][1]
    for weight_attr in ('feature_importances_', 'coef_'):
        if not hasattr(clf, weight_attr):
            continue
        # Labels: the model-level feature list followed by this estimator's dummy columns.
        labels = getattr(program.model, feature_attr) + preprocessing.get_dummy_features(last_estimator['dummies'])
        fi = pd.Series(getattr(clf, weight_attr), index=labels)
        print(model['name'])
        print(fi.sort_values(ascending=False)[0:top])
        print("---\n")


for model in program.model.models:
    _print_top_weights(model, 'features')

if hasattr(program.model, "stacked_models"):
    for model in program.model.stacked_models:
        _print_top_weights(model, 'stacked_features')



In [None]:
# `df` is the program's frame itself (not a copy).
df = program.df

# Mean first-target prediction of every base model.
for model in program.model.models:
    pred_col = 'pred_{}_1'.format(model['name'])
    print(model['name'], df[pred_col].mean())

In [None]:
from sklearn import preprocessing

# Disabled experiment, kept for reference: min-max scale each model's prediction and
# multiply them together (it started from `df['pred_sum_1'] = 1`).
#for model in program.model.models:
#    m = model['name']
#    s = preprocessing.MinMaxScaler()
#    scaled = s.fit_transform(df[['pred_{}_1'.format(m)]].values)
#    df['pred_{}_s_1'.format(m)] = scaled
#    df['pred_sum_1'] = df['pred_sum_1'] * df['pred_{}_s_1'.format(m)]

# Ensemble baseline: row-wise sum of the base models' predictions divided by the model count.
base_pred_cols = ['pred_{}_1'.format(model['name']) for model in program.model.models]
df['pred_sum_1'] = df[base_pred_cols].sum(axis=1) / len(program.model.models)

# Same for the stacked models.
# NOTE(review): with an empty `stacked_models` list this divides a zero sum by 0,
# presumably yielding NaN — confirm that is acceptable downstream.
stacked_pred_cols = ['pred_stacked_{}_1'.format(model['name']) for model in program.model.stacked_models]
df['pred_stacked_sum_1'] = df[stacked_pred_cols].sum(axis=1) / len(program.model.stacked_models)

# Reference "predictions": reference odds, uniform noise (unseeded) and the trueskill mean.
df['pred_odds_1'] = df['final_odds_ref']
df['pred_rnd_1'] = np.random.rand(df.shape[0])
df['pred_trueskill_mu_1'] = df['trueskill_mu']

In [None]:
# Fixed columns shown for the sampled race.
base_cols = [
    'position',
    'declared_player_count',
    'sub_category',
    'num',
    'final_odds',
    'final_odds_ref',
    'final_odds_ref_unibet',
    'target',
    'target_stacked',
    'pred_sum_1',
    'pred_stacked_sum_1',
    'race_winner_dividend',
    'winner_dividend',
    'placed_dividend',
    'trueskill_mu',
    'hist_1_pos',
]
# One prediction column per base model, then one per stacked model.
model_pred_cols = ['pred_{}_1'.format(entry['name']) for entry in program.model.models]
stacked_pred_cols = ['pred_stacked_{}_1'.format(entry['name']) for entry in program.model.stacked_models]
cols = base_cols + model_pred_cols + stacked_pred_cols

# Show the race returned by `exploration.random_race`, ordered by finishing position.
race_sample = exploration.random_race(df, cols=cols)
race_sample.sort_values(by='position', ascending=True)

In [None]:
# more exploration of a single race features
#(df.reset_index(drop=True).set_index(['race_id', df.index]).loc[65509][['position'] + program.model.features]).sort_values(by='position', ascending=True)

In [None]:
# `dd` is an alias, not a copy: until `dd` is rebound, columns written through `dd`
# also appear on `df` (which is `program.df`).
dd = df

In [None]:
# Evaluate the base models, a set of synthetic baselines, and the stacked models
# (the latter under a "stacked_" prefix).
baseline_names = ["sum", "maj", "stacked_maj", "stacked_sum", "rnd", "odds", "trueskill_mu"]
models = program.model.models + [{"name": baseline} for baseline in baseline_names]
#dd = df.groupby('race_id').filter(lambda r: r['pred_sum'].std()!=0)
models = models + [{"name": "stacked_{}".format(model["name"])} for model in program.model.stacked_models]

# Zero-initialise the per-model bookkeeping columns.
for model in models:
    m = model['name']
    for template in ('bet_{}', 'profit_{}', 'n_{}', 'n_odds_{}', 'pred_{}_std'):
        dd[template.format(m)] = 0.
    

In [None]:
# Fresh uniform noise for the random baseline (unseeded, so it changes on every run).
dd['pred_rnd_1'] = np.random.rand(dd.shape[0])

# Placeholders for the majority-vote columns; the two `pred_*` ones are filled in a later cell.
for placeholder in ('maj_1', 'pred_maj_1', 'stacked_maj_1', 'pred_stacked_maj_1'):
    dd[placeholder] = 0.

In [None]:
# Names of every entry that will be evaluated (base models, baselines, stacked models).
[model["name"] for model in models]

In [None]:
def fast_bet(r):
    """Annotate the rows of one race with each model's stake and within-race ranks.

    Intended to be applied per `race_id` group. For every entry of the notebook-level
    `models` list it writes three columns on `r`:

    * ``bet_<name>``    -- the prediction ``pred_<name>_1`` clipped to [0, 1];
    * ``n_<name>``      -- 0-based rank of the row when the race is sorted by that
      prediction (ascending, or descending when the name starts with 'stacked');
    * ``n_odds_<name>`` -- 0-based rank of the row when sorted by ``final_odds_ref``
      ascending.

    The ranks are assigned positionally, so the rows of `r` are expected to be in
    ascending index order. `r` is modified in place and returned.
    """
    def rank_positions(column, ascending):
        # Sort the race by `column`, then ask where each index label ended up:
        # the indexer of the re-sorted index is the rank of every row.
        ordered = r.sort_values(by=column, ascending=ascending)
        return ordered.index.sort_values(ascending=True, return_indexer=True)[1]

    # The odds ranking does not depend on the model: compute it once, on first use.
    odds_rank = None

    for model in models:
        name = model['name']
        p = 'pred_{}_1'.format(name)
        asc = not name.startswith('stacked')

        pred_rank = rank_positions(p, asc)
        if odds_rank is None:
            odds_rank = rank_positions('final_odds_ref', True)

        r['bet_{}'.format(name)] = r[p].clip(lower=0., upper=1.)
        r['n_{}'.format(name)] = pred_rank
        r['n_odds_{}'.format(name)] = odds_rank
    return r

In [None]:
# Rank and stake every runner, race by race. `group_keys=False` keeps the original row
# index: since pandas 2.0 the default would add a `race_id` index level next to the
# `race_id` column, even though `fast_bet` returns each group with its index unchanged.
dd = dd.groupby('race_id', group_keys=False).apply(fast_bet)

# Profit per row = stake * (return - 1).
for model in models:
    name = model['name']
    dd['profit_{}'.format(name)] = dd['bet_{}'.format(name)] * 1.0 * (dd['target_returns']-1.0)

cols = ['profit_{}'.format(model['name']) for model in models]

In [None]:

# "Majority" scores: sum the within-race ranks of a family of models, divide by the
# declared number of runners and round to one decimal. The result is stored both as
# the rank column and as the matching pseudo-prediction column.
vote_groups = (
    ('n_maj', 'pred_maj_1', ('nn_', 'knn_', 'xgb_')),
    ('n_stacked_maj', 'pred_stacked_maj_1', ('stacked_stacked',)),
)
for rank_col, pred_col, prefixes in vote_groups:
    mm = [entry['name'] for entry in models if entry['name'].startswith(prefixes)]
    rank_share = dd[['n_{}'.format(name) for name in mm]].sum(axis=1) / dd['declared_player_count']
    dd[rank_col] = rank_share.round(1)
    dd[pred_col] = dd[rank_col]


# Flat unit stake for every model; profit = stake * (return - 1).
for model in models:
    m = model['name']
    bet_col = 'bet_{}'.format(m)
    profit_col = 'profit_{}'.format(m)
    #dd['profit_{}'.format(m)] = np.clip(dd['pred_{}_1'.format(m)], a_min=0., a_max=10.) * 1.0 * (dd['target_returns']-1.0)
    #dd['profit_{}'.format(m)] = 1.0 * (dd['target_returns']-1.0)
    dd[bet_col] = 1#np.ceil(0.1 * np.clip((df['pred_{}_1'.format(m)]), a_min=0., a_max=10.) * np.log(df['n_odds_{}'.format(m)]+1.) )
    dd[profit_col] = dd[bet_col] * 1.0 * (dd['target_returns']-1.0)


In [None]:
# Fixed columns shown for the sampled race.
base_cols = [
    'position',
    'n_maj',
    'n_stacked_maj',
    'declared_player_count',
    'sub_category',
    'num',
    'final_odds',
    'final_odds_ref',
    'final_odds_ref_unibet',
    'target',
    'target_stacked',
    'race_winner_dividend',
    'trueskill_mu',
]
# Per-model columns, grouped by kind: all ranks, then predictions, stakes and profits.
per_model_cols = [
    template.format(entry['name'])
    for template in ('n_{}', 'pred_{}_1', 'bet_{}', 'profit_{}')
    for entry in models
]
cols = base_cols + per_model_cols

# Show the race returned by `exploration.random_race`, ordered by finishing position.
race_sample = exploration.random_race(dd, cols=cols)
race_sample.sort_values(by='position', ascending=True)

In [None]:
#s = dd.groupby('race_id')['final_odds_offline'].std()
#s = s.to_frame()
#s.columns = ['final_odds_offline_std']

In [None]:
#dd = dd.join(s, how='left', on='race_id')

In [None]:
#dd['prize'].apply(np.exp).describe()

In [None]:
# Median (0.5 quantile) of every evaluated prediction column.
for entry in models:
    name = entry['name']
    print(name)
    print(dd['pred_{}_1'.format(name)].quantile(0.5))

In [None]:
# Back-test every model separately for each value of column `f`.
f = 'category'
summary_cols = ['r', 'bets_mean', 'bets_max', 'bets_min', 'profit_mean', 'profit_max', 'profit_min', 'stash_min', 'stash_max', 'count']

for s in dd[f].value_counts().index:
    r = pd.DataFrame(columns=summary_cols, index=[model['name'] for model in models])
    print(s)

    # Rows of the current group; identical for every model, so sliced once.
    ddd = dd[dd[f]==s]
    # Odds filter shared by all models: offline reference odds present and above the
    # offline final odds, reference odds strictly between 20 and 50.
    odds_filter = (
        (ddd['final_odds_ref_offline'] > ddd['final_odds_offline'])
        & (ddd['final_odds_ref'] > 20)
        & (ddd['final_odds_ref'] < 50)
        & (ddd['final_odds_ref_offline'].notnull())
    )

    for model in models:
        m = model['name']
        profit_col = 'profit_{}'.format(m)
        bet_col = 'bet_{}'.format(m)
        pred_col = 'pred_{}_1'.format(m)
        stash_col = 'stash_{}'.format(m)
        bets_col = 'bets_{}'.format(m)

        # Keep only the rows ranked first (rank <= 0) by this model.
        #ddd = ddd.groupby('race_id').filter(lambda r: r['pred_{}_1'.format(m)].std() > 0.01)
        #g = ddd[(ddd['n_{}'.format(m)]==0)].copy()
        g = ddd.loc[
            odds_filter & (ddd['n_{}'.format(m)] <= 0),
            ['start_at', 'race_id', 'position', 'final_odds', 'final_odds_ref', profit_col, bet_col, pred_col],
        ].copy()

        # Running profit and running stake, plotted on the same axes.
        g[stash_col] = g[profit_col].cumsum()
        g[bets_col] = g[bet_col].cumsum()
        g[stash_col].plot(figsize=(16, 4), title=m)
        g[bets_col].plot()
        plt.grid(True)
        plt.show()

        # Write each cell with a single `.loc[row, col]` call: the chained form
        # `r.loc[m]['r'] = ...` assigns into a temporary row object and is not
        # guaranteed to update `r` (it never does under pandas Copy-on-Write).
        r.loc[m, 'r'] = g[profit_col].sum() / g[bet_col].sum()
        r.loc[m, 'bets_mean'] = g[bet_col].mean()
        r.loc[m, 'bets_min'] = g[bet_col].min()
        r.loc[m, 'bets_max'] = g[bet_col].max()
        r.loc[m, 'profit_mean'] = g[profit_col].mean()
        r.loc[m, 'profit_min'] = g[profit_col].min()
        r.loc[m, 'profit_max'] = g[profit_col].max()
        r.loc[m, 'stash_min'] = g[stash_col].min()
        r.loc[m, 'stash_max'] = g[stash_col].max()
        r.loc[m, 'count'] = g[stash_col].count()

    print(r[['r', 'count']])


In [None]:
def bet(df, min_pred=13.6, min_odds=20, max_odds=50):
    """Simulate the 'mlp_30' betting strategy and return the bets it selects.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per runner. Columns read: ``country``, ``sub_category``, ``category``,
        ``race_id``, ``start_at``, ``num``, ``position``, ``final_odds``,
        ``final_odds_ref``, ``final_odds_ref_offline``, ``final_odds_offline``,
        ``target_returns`` and ``pred_mlp_30_1``. The frame is not modified; the
        function works on a filtered copy.
    min_pred : float
        Minimum ``pred_mlp_30_1`` (inclusive) for a row to be selected.
    min_odds, max_odds : float
        Exclusive bounds on ``final_odds_ref`` for a row to be selected.

    Returns
    -------
    pandas.DataFrame
        The selected rows, in input order, with uniquely named columns: identifying
        columns, ``bet`` / ``profit`` (summed over the models), the per-model
        ``profit_*`` / ``bet_*`` / ``pred_*_1`` columns, ``date`` (parsed
        ``start_at``) and the running totals ``bets`` and ``stash``.
    """
    models = [{"name": 'mlp_30'}]

    # Only French handicap races with a known race id; copy so the caller's frame is untouched.
    # (The per-race grouping ignores rows without a race id, so they are dropped up front.)
    df = df[
        (df['country'] == 'FRA')
        & (df['sub_category'].isin(['HANDICAP', 'HANDICAP_DIVISE']))
        & (df['race_id'].notnull())
    ].copy()

    # 0-based rank of every runner inside its race by ascending reference odds.
    # Ties keep row order and missing odds rank last.
    odds_rank = df.groupby('race_id')['final_odds_ref'].rank(method='first', na_option='bottom') - 1

    for model in models:
        m = model['name']
        df['n_odds_{}'.format(m)] = odds_rank
        # Stake grows with the prediction (scaled by 1/10, capped to [0, 10]) and with the
        # log of the odds rank, rounded up to a whole unit; the favourite (rank 0) gets 0.
        scaled_pred = (df['pred_{}_1'.format(m)] / 10.).clip(lower=0., upper=10.)
        df['bet_{}'.format(m)] = np.ceil(0.1 * scaled_pred * np.log(df['n_odds_{}'.format(m)] + 1.))
        # Profit = stake * (return - 1).
        df['profit_{}'.format(m)] = df['bet_{}'.format(m)] * 1.0 * (df['target_returns'] - 1.0)

    df['bet'] = df[['bet_{}'.format(model['name']) for model in models]].sum(axis=1)
    df['profit'] = df[['profit_{}'.format(model['name']) for model in models]].sum(axis=1)
    # Label identifying the strategy that produced the bet (replaces any existing `target`).
    df['target'] = 'mlp_30_1'

    selected = (
        (df['pred_mlp_30_1'] >= min_pred)
        & (df['final_odds_ref_offline'] > df['final_odds_offline'])
        & (df['final_odds_ref'] > min_odds)
        & (df['final_odds_ref'] < max_odds)
    )

    # Each column is listed once so the result has unique labels.
    columns = ['race_id', 'start_at', 'bet', 'category', 'sub_category', 'country', 'target',
               'profit', 'num', 'position', 'final_odds_ref', 'final_odds']
    for model in models:
        m = model['name']
        columns += ['profit_{}'.format(m), 'bet_{}'.format(m), 'pred_{}_1'.format(m)]

    bets = df.loc[selected, columns].copy()
    bets['date'] = pd.to_datetime(bets['start_at'])

    #bets = bets.set_index(bets['date'])
    #bets = bets.sort_index()

    # Running totals of stake and profit, in row order.
    bets['bets'] = bets['bet'].cumsum()
    bets['stash'] = bets['profit'].cumsum()

    return bets

In [None]:
#bets = bet(df, program.model.features, program.model.categorical_features, 
#    N=1, max_odds=30, break_on_bet=True, break_on_odds=False, 
#    targets=['pred_rnd_1', 'final_odds_ref', 'pred_sum_1'] + ['pred_{}_{}'.format(model['name'], i+1) for i in range(program.model.params['n_targets']) for model in program.model.models]# + ['pred_stacked_{}_1'.format(model['name']) for model in program.model.stacked_models] 
#   )   

In [None]:
# Run the betting simulation on the prediction frame.
bets = bet(df)

In [None]:
#bets = bets.set_index(bets['date'])
#bets = bets.sort_index()

In [None]:
# Relabel every bet with the single target name 'fg' (`graph_bb` draws one curve per
# distinct `target` value), then list the available columns.
bets['target'] = 'fg'
list(bets.columns)

In [None]:
def graph_bb(bb, f):
    """Print a profit summary and plot cumulative profit for each value of column `f`.

    For every distinct value of ``bb[f]`` (most frequent first) one figure is drawn with
    one cumulative-profit curve per distinct ``target`` value, and a table is printed
    with, per target: total profit, total placed profit, total stake, number of bets and
    profit per unit staked (``avg``), sorted by decreasing ``avg``.

    Parameters
    ----------
    bb : pandas.DataFrame
        Bets; columns ``target``, ``profit`` and ``bet`` are read, ``profit_placed`` is
        used when present and treated as 0 otherwise. `bb` itself is not modified.
    f : str
        Name of the column to group by.
    """
    for s in bb[f].value_counts().index:
        x = bb[bb[f] == s].copy()
        if len(x) == 0:
            continue
        #print("---\n{}\t{:+.2f}\t{:+.2f}\t{:+.2f}\n---".format(s, x['profit'].sum(), x['bet'].sum(), len(x)))

        results = []
        # Explicit axes: every target of this group is drawn on the same figure.
        fig, ax = plt.subplots(1, 1, figsize=(10, 2))
        ax.set_title('{} = {}'.format(f, s))

        for t in x['target'].value_counts().index:
            bbb = x[x['target'] == t].copy()
            #bbb = bbb[ (bbb['pred']>bbb['pred'].quantile(0.85)) ].copy()

            if 'profit_placed' not in bbb.columns:
                bbb['profit_placed'] = 0

            results.append({
                'idx_1': s,
                'idx_2': t,
                'profit': bbb.profit.sum(),
                'profit_place': bbb.profit_placed.sum(),
                'bet': bbb.bet.sum(),
                'bets': bbb.bet.count(),
                'avg': bbb.profit.sum() / bbb.bet.sum()
            })

            #print("{:10s}: \t {:+.2f} \t {:+.0f} \t {:+.2f}".format(t, bbb.profit.sum(), bbb.profit.count(), bbb.profit.mean()) )

            bbb['stash'] = bbb['profit'].cumsum()
            bbb['stash'].plot(ax=ax)

        if not results:
            # No usable `target` value in this group: nothing to tabulate or draw.
            plt.close(fig)
            continue

        results = pd.DataFrame(results).set_index(['idx_1', 'idx_2'])
        print(results.sort_values(by='avg', ascending=False))

        plt.show()
        

In [None]:
# Profit summary and cumulative-profit plot per country.
graph_bb(bets, 'country')

In [None]:
#countries = list(bets['country'].value_counts()[0:10].index)
#sub_categories = list(bets['sub_category'].value_counts()[0:10].index)

#%store countries
#%store sub_categories

In [None]:
from skopt.space import Real, Integer
from skopt.utils import use_named_args

# Search space for the skopt optimisation of the bet filters.
# NOTE(review): this cell looks tied to an older `bets` layout — the frame built by
# `bet(df)` above selects no 'pred' column and its `target` was overwritten with 'fg',
# and `countries` / `sub_categories` are only produced by commented-out cells.
# Confirm where these are expected to come from before running it.
target = 'pred_knn_1_1'

# Range of the chosen target's predictions, used to bound the pred dimensions below.
c_min = bets[(bets['target'] == target)]['pred'].min()
c_mean = bets[(bets['target'] == target)]['pred'].mean()
c_max = bets[(bets['target'] == target)]['pred'].max()

# Dimension order matters: `x_to_params` maps positions back to these names.
space = [
          Real(1, 10, name='min_odds'),
          Real(10, 60, name='max_odds'),
          Real(c_min, c_mean, name='min_pred'),
          Real(c_mean, c_max, name='max_pred'),
          Integer(5, 25, name='max_players')] + [
          Integer(0,1, name='country_{}'.format(country)) for country in countries] + [
          Integer(0,1, name='sub_category_{}'.format(sub_category)) for sub_category in sub_categories] + [
          Integer(0,1, name='nb_{}'.format(n)) for n in range(0,6)]
         
def x_to_params(x):
    """Turn a positional point of the search space into a ``{name: value}`` dict.

    Positions 0-4 hold min_odds, max_odds, min_pred, max_pred and max_players; they
    are followed by one flag per entry of the notebook-level `countries`, one per
    entry of `sub_categories`, and six ``nb_<n>`` flags (n = 0..5).
    """
    # Fixed head of the vector; key order matches the order the keys are inserted in.
    head = (('min_odds', 0), ('max_odds', 1), ('max_pred', 3), ('min_pred', 2), ('max_players', 4))
    params = {key: x[position] for key, position in head}

    # Variable-length flag groups follow one another; `offset` tracks where each starts.
    offset = 5
    flag_groups = (
        ('country', countries),
        ('sub_category', sub_categories),
        ('nb', range(0, 6)),
    )
    for prefix, values in flag_groups:
        for i, value in enumerate(values):
            params['{}_{}'.format(prefix, value)] = x[offset + i]
        offset += len(values)

    return params

# Objective handed to the optimiser: `use_named_args(space)` is applied so the function
# receives the search-space values as keyword arguments.
@use_named_args(space)
def f(**params):
    """Forward the named search-space values to `ff` and return its result."""
    return ff(params)
    
def ff(params, train=True):
    """Apply the bet filters described by `params` and score the surviving bets.

    Reads the notebook-level `bets`, `target`, `countries` and `sub_categories`.

    Parameters
    ----------
    params : dict
        Keys read: 'min_odds', 'max_odds', 'max_players', one 'country_<c>' flag per
        entry of `countries`, one 'sub_category_<s>' flag per entry of
        `sub_categories` and 'nb_0' .. 'nb_5'. A flag equal to 0 drops the matching
        rows. 'min_pred' / 'max_pred' are currently not applied (filter commented out).
    train : bool
        When true, return the negated total profit (a value to minimise); otherwise
        return the filtered bets.

    Returns
    -------
    0 when the surviving stake is zero (whatever `train` is), else ``-profit`` when
    `train` is true, else the filtered DataFrame.
    NOTE(review): returning 0 instead of an empty frame when `train` is false looks
    inconsistent — confirm what callers expect before changing it.
    """
    print(params)

    b = bets[(bets['pred'] != 0) & (bets['target'] == target) & (bets['pred_std'] != 0)]

    b = b[
        (b['odds_ref'] > params['min_odds'])
        & (b['odds_ref'] < params['max_odds'])
        & (b['declared_player_count'] > 1)
        & (b['declared_player_count'] <= params['max_players'])
    ]

    #b = b[(b['pred'] >= params['min_pred']) & (b['pred'] <= params['max_pred'])]

    # For each flagged column, gather the disabled values and drop them with one mask
    # instead of re-slicing the frame once per value.
    flag_groups = (
        ('country', countries),
        ('sub_category', sub_categories),
        ('nb', range(0, 6)),
    )
    for column, values in flag_groups:
        disabled = [value for value in values if params['{}_{}'.format(column, value)] == 0]
        if disabled:
            b = b[~b[column].isin(disabled)]

    p = b['profit'].sum()
    # Named `staked` so it does not shadow the notebook-level `bet` function.
    staked = b['bet'].sum()

    print("{:+.2f} {:+.2f}\n".format(p, staked))

    if staked == 0:
        return 0

    if not train:
        return b

    return - (p)
    
    

In [None]:
#c_mean, c_min, c_max

In [None]:
#from skopt import dummy_minimize, gp_minimize, gbrt_minimize

#res = dummy_minimize(f, space, n_calls=500)

In [None]:
#x_to_params(res.x), res.fun

In [None]:
#params = x_to_params(res.x)
#%store params
#params

In [None]:
#%store -r params
#%store -r countries
#%store -r sub_categories

In [None]:
#bb = ff(params, False)

In [None]:
#bb = bets[(bets['pred'] > 1.) & (bets['next_pred_1'] != bets['next_pred_2'])]
#bb = bets[(bets['pred_std'] < 0.1) & (bets['pred'] < 1.)]
#bb = bets[ (bets['odds_ref'] > 1) & (bets['odds_ref'] < 30) & (bets['pred'] > 1) & (bets['target'] == 'pred_xgb_100_1') ]
#graph_bb( bb[bb['target']=='pred_knn_1_1'], 'category')
#graph_bb( bets, 'category')

In [None]:
# Keep only the first occurrence of each column label.
bets = bets.iloc[:,~bets.columns.duplicated()]

In [None]:
#bb = bets[(bets['sub_category']=='HANDICAP') & (bets['country'] != 'FRA')]
#bb = bb[(bb['pred_mlp_10_1']>118) & (bb['final_odds_ref']>=bb['final_odds'])].copy()
# `bb` is the same object as `bets`, so the refreshed `stash` column lands on `bets` too.
bb = bets
# Running profit over all bets, in row order.
bb['stash'] = bb['profit'].cumsum()
bb['stash'].plot(figsize=(12, 4))

In [None]:
# to save the program when the model looks good, call: program.lock('2020-03-03')

# a copy of the program, model and dataset class will be created

# after this, open the new program class in *ml/pipeline/programs* and update the `model_params` in the `run` function

# add any additional bet filter and bet strategy to the program bet function

# add the new program to the better command *pmu/management/commands/bet.py*: programs = ['2020-03-03']

In [None]:
# Lock the current program under the name 'position_prediction' (see the notes above
# for the follow-up steps).
program.lock('position_prediction')