In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import log_loss
from collections import defaultdict

In [3]:
%matplotlib inline
%config Completer.use_jedi = False

In [4]:
import os

# Directory holding the competition CSV files. It stays a plain string because
# later cells build file names with `base_path + '/...'`.
# Set the RAIF_DATA_DIR environment variable to run the notebook on another
# machine; without it the original author's local path is used unchanged.
base_path = os.environ.get('RAIF_DATA_DIR', '/Users/sergmiller/Documents/my/raif/data')

In [5]:
data = pd.read_csv(base_path + '/raw_train.csv', index_col=0)

In [6]:
data

Unnamed: 0,Division,Time,home_team,away_team,full_time_home_goals,full_time_away_goals,half_time_home_goals,half_time_away_goals,Referee,home_shots,...,away_fouls,home_corners,away_corners,home_yellow_cards,away_yellow_cards,home_red_cards,away_red_cards,home_coef,draw_coef,away_coef
0,0,,152.0,426.0,0.0,0.0,0.0,0.0,-1,,...,,,,,,,,3.627690,3.577721,2.198600
1,0,,216.0,341.0,2.0,2.0,0.0,1.0,-1,,...,,,,,,,,1.608321,4.251999,6.533805
2,0,,223.0,119.0,2.0,2.0,2.0,0.0,-1,,...,,,,,,,,2.166811,3.486596,3.821467
3,0,,68.0,137.0,2.0,1.0,0.0,1.0,-1,,...,,,,,,,,2.259244,3.592401,3.460103
4,1,,425.0,225.0,2.0,2.0,1.0,0.0,-1,13.0,...,15.0,4.0,10.0,3.0,4.0,1.0,0.0,2.424278,3.414191,3.282876
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39995,15,,231.0,309.0,0.0,1.0,0.0,0.0,-1,4.0,...,17.0,3.0,3.0,3.0,1.0,0.0,0.0,2.102645,3.404282,4.155226
39996,15,,248.0,398.0,4.0,0.0,2.0,0.0,-1,15.0,...,8.0,8.0,4.0,1.0,2.0,0.0,0.0,1.340170,5.956310,10.423542
39997,15,,453.0,126.0,1.0,2.0,1.0,1.0,-1,15.0,...,7.0,6.0,5.0,2.0,0.0,0.0,0.0,1.963689,3.506588,4.638715
39998,13,,164.0,403.0,2.0,2.0,1.0,2.0,-1,11.0,...,22.0,5.0,7.0,0.0,0.0,0.0,0.0,4.050577,3.852988,1.985771


In [7]:
data.columns

Index(['Division', 'Time', 'home_team', 'away_team', 'full_time_home_goals',
       'full_time_away_goals', 'half_time_home_goals', 'half_time_away_goals',
       'Referee', 'home_shots', 'away_shots', 'home_shots_on_target',
       'away_shots_on_target', 'home_fouls', 'away_fouls', 'home_corners',
       'away_corners', 'home_yellow_cards', 'away_yellow_cards',
       'home_red_cards', 'away_red_cards', 'home_coef', 'draw_coef',
       'away_coef'],
      dtype='object')

In [130]:
# Columns used as the raw model inputs (X). Compared with `data.columns` this
# leaves out the goal counts and the in-match statistics (shots, fouls, cards...).
available_columns = ['Division', 'Time', 'home_team', 'away_team', 'Referee', 'home_coef', 'draw_coef', 'away_coef']
# Bookmaker odds ordered away / draw / home. The order matters: it lines up with
# the class ids produced by `simple_target` (0 = away win, 1 = draw, 2 = home win),
# so `coefs[row, y[row]]` is the odds of the outcome that actually happened.
coef_columns = ['away_coef' , 'draw_coef', 'home_coef']

In [129]:
def simple_target(goals_home: np.array, goal_away: np.array):
    """Encode match results as class ids.

    Returns 0 where the away side scored more, 1 for a draw and 2 where the
    home side scored more. Works element-wise on arrays of goal counts.
    """
    is_draw = goals_home == goal_away
    is_home_win = goals_home > goal_away
    # The flags behave as 0/1 in arithmetic and are mutually exclusive, so the
    # sum is 1 for a draw, 2 for a home win and 0 otherwise.
    return is_draw + 2 * is_home_win

In [131]:
# Raw feature matrix: only the columns listed in `available_columns`.
X = data[available_columns].values
# Class labels from the full-time score (0 = away win, 1 = draw, 2 = home win).
y = simple_target(data['full_time_home_goals'].values, data['full_time_away_goals'].values)
# Odds per match in away / draw / home order, so column k holds the odds of class k.
coefs = data[coef_columns].values
# Sanity check: all three must have the same number of rows.
X.shape, y.shape, coefs.shape

((39992, 8), (39992,), (39992, 3))

In [11]:
list(mode([[1,2,2],[2,3,3]], axis=0).mode.reshape(-1))

[1, 2, 2]

In [None]:
def get_team_profitability(teams_profitability: dict, team_id: int, n_matches: int):
    """Average betting profit of a team over its most recent matches.

    Args:
        teams_profitability: maps team id -> list of per-match profits, oldest
            first (the layout maintained by `update_teams_profitability`).
        team_id: team to look up.
        n_matches: size of the trailing window.

    Returns:
        Mean of the last `n_matches` recorded profits, or 0 when there is
        nothing to average: the team is unknown, its history is empty, or the
        window is not positive.
    """
    history = teams_profitability.get(team_id)
    if history is None or len(history) == 0:
        return 0
    # `history[-0:]` is the whole list, not an empty window, and a negative
    # window would silently drop the oldest entries instead; treat both as
    # "no matches requested".
    if n_matches <= 0:
        return 0
    return np.mean(history[-n_matches:])


def get_profitability(teams_profitability: dict, home_team_id, away_team_id, n_matches=5):
    """Return the recent profitability of both sides of a fixture.

    The result is a `(home, away)` tuple; each entry is whatever
    `get_team_profitability` yields for that team over the last `n_matches`.
    """
    return tuple(
        get_team_profitability(teams_profitability, team_id, n_matches)
        for team_id in (home_team_id, away_team_id)
    )


def get_profitability_multiple_n_matches(teams_profitability: dict, home_team_id, away_team_id, n_matches_range: list):
    """Collect home/away profitability for several window sizes.

    Returns a flat list laid out as
    `[home@n0, away@n0, home@n1, away@n1, ...]`, following the order of
    `n_matches_range`.
    """
    answer = []
    for window in n_matches_range:
        home_value, away_value = get_profitability(
            teams_profitability, home_team_id, away_team_id, window
        )
        answer.extend((home_value, away_value))
    return answer


def update_teams_profitability(home_coef, away_coef, home_goals, away_goals, home_team, away_team, teams_profitability):
    """Append one match's unit-stake betting result to both teams' histories.

    For each side the recorded value is `odds - 1` if that side won and `-1`
    otherwise (a draw counts as a loss for both). `teams_profitability` is
    modified in place; teams seen for the first time get a new list.
    """
    # Home side: the win flag multiplies the odds, then the stake is subtracted.
    home_result = home_coef * (home_goals > away_goals) - 1
    teams_profitability.setdefault(home_team, []).append(home_result)

    # Away side, same rule with the comparison reversed.
    away_result = away_coef * (home_goals < away_goals) - 1
    teams_profitability.setdefault(away_team, []).append(away_result)



In [114]:
import tqdm
from scipy.stats import mode

K = 5

def process_in_time(rows: np.array):
    """Build per-match features from each team's recent history, replaying rows in order.

    Every output row is the 8 pre-match columns followed by a summary of the
    home team's stored history and then the away team's. The summaries are
    taken before the current match is added to the histories, so a row's
    features depend only on rows that came before it.

    Args:
        rows: 2-D array with the 24 columns in the order unpacked below (the
            order of `data.columns`). Assumed to be sorted chronologically --
            TODO confirm, nothing in this function sorts it.

    Returns:
        (features, stats): `features` is an array with one row per match and
        8 + 2 * (12 + 12) = 56 values; `stats` maps team id -> list of that
        team's most recent per-match tuples, capped at the module-level `K`.
    """
    # Number of values in each per-team tuple appended to `stats` below.
    n_team_stats = 12

    def calc_f(seq):
        """Summarise a history: column-wise mode, then the most recent entry."""
        if len(seq) == 0:
            # Team not seen yet: use a single placeholder row of -1s.
            seq = [np.ones(n_team_stats) * (-1)]
        seq = np.array(seq)
        return list(mode(seq, axis=0).mode.reshape(-1)) \
            + list(seq[-1])

    stats = defaultdict(list)
    F = []
    for line in tqdm.tqdm(rows, position=0):
        Division, Time, home_team, away_team, full_time_home_goals, \
        full_time_away_goals, half_time_home_goals, half_time_away_goals, \
        Referee, home_shots, away_shots, home_shots_on_target, \
        away_shots_on_target, home_fouls, away_fouls, home_corners, \
        away_corners, home_yellow_cards, away_yellow_cards, \
        home_red_cards, away_red_cards, home_coef, draw_coef, \
        away_coef = line
        features = [Division, Time, home_team, away_team, Referee, home_coef, draw_coef, away_coef]

        # Read both histories BEFORE recording this match (no target leakage).
        features += calc_f(stats[home_team])
        features += calc_f(stats[away_team])
        F.append(features)

        # Record the match from each side's point of view.
        stats[home_team].append((Division, full_time_home_goals,
                                 half_time_home_goals,
                                 Referee, home_shots, home_shots_on_target,
                                 home_fouls, home_corners,
                                 home_yellow_cards,
                                 home_red_cards, home_coef, draw_coef))

        stats[away_team].append((Division,
                                 full_time_away_goals, half_time_away_goals,
                                 Referee, away_shots,
                                 away_shots_on_target, away_fouls,
                                 away_corners, away_yellow_cards,
                                 away_red_cards, away_coef, draw_coef))

        # Keep only the K most recent matches per team.
        for team in (home_team, away_team):
            if len(stats[team]) > K:
                stats[team] = stats[team][-K:]

    return np.array(F), stats

In [101]:
defaultdict(list, {1:2})[0]

[]

In [115]:
X_ext, stats = process_in_time(data.values)

100%|██████████| 39992/39992 [00:31<00:00, 1262.12it/s]


In [116]:
X_ext.shape

(39992, 56)

In [125]:
# Persist the per-team histories returned by `process_in_time`.
# NOTE(review): JSON object keys are always strings, so the team ids used as
# keys in `stats` will come back as str when this file is loaded -- convert
# them back before looking teams up.
import json
with open("submit/stats_v4.json", "w") as f:
    json.dump(stats, f)

In [96]:
np.savez_compressed("X_ext_v4.npz", X_ext)

In [19]:
import catboost

In [20]:
from sklearn.model_selection import TimeSeriesSplit

In [21]:
tscv = TimeSeriesSplit()

In [184]:
# cb_params = {"has_time":True, "auto_class_weights": "Balanced"}
cb_params = {"learning_rate":1e-2, "has_time":True, "max_bin": 8}

In [71]:
from sklearn.ensemble import RandomForestClassifier

In [148]:
y

array([1, 1, 1, ..., 0, 1, 1])

In [150]:
coefs[np.arange(len(coefs)), y]

array([3.57772141, 4.25199924, 3.48659608, ..., 4.63871489, 3.85298814,
       3.92419353])

In [198]:
from sklearn.linear_model import LogisticRegression

# Per-fold results: fold number -> {'y_true', 'y_pred', 'index', 'model'}.
logs = {}

# The split indices are generated from X but applied to X_ext; this relies on
# both having the same number of rows.
for i, (train_index, test_index) in enumerate(tscv.split(X)):
    X_train, X_test = X_ext[train_index], X_ext[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Odds of the outcome that actually happened, per row. They are only
    # consumed as sample weights by the commented-out CatBoost pools below.
    _coefs = coefs[np.arange(len(coefs)), y][train_index]
    _coefs_test = coefs[np.arange(len(coefs)), y][test_index]
#     model = catboost.CatBoostClassifier(**cb_params)
    model = LogisticRegression(tol=1e-3,max_iter=1000)
#     pool = catboost.Pool(X_train, label=y_train, weight=_coefs)
#     val_pool = catboost.Pool(X_test, label=y_test, weight=_coefs_test)
#     model.fit(pool, eval_set=val_pool, early_stopping_rounds=100)
    # NaNs are replaced with 0 before fitting and predicting.
    # NOTE(review): the recorded output shows the solver stopping at the
    # iteration limit on the unscaled features; the fitted coefficients may
    # not be converged.
    model.fit(np.nan_to_num(X_train), y_train)
    y_pred = model.predict_proba(np.nan_to_num(X_test))
    logs[i] = {'y_true': y_test, 'y_pred': y_pred, 'index': test_index, 'model': model}

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [199]:
model.coef_

array([[-8.60233171e-03,  0.00000000e+00,  4.42378424e-05,
        -1.20872753e-04,  7.59756599e-04,  1.51225098e-01,
         3.64128499e-02, -1.27671963e-01,  2.52676534e-02,
         1.13590237e-02, -4.48509465e-03, -5.13140242e-05,
         4.05243899e-04, -1.53880667e-03,  7.08178588e-04,
         4.93561530e-03,  3.60321994e-03,  8.46938828e-03,
         1.75688233e-02,  9.08163268e-03, -2.23469793e-02,
         8.83902051e-04, -1.24600761e-02, -1.00372638e-03,
        -3.30612936e-03,  1.51234665e-03, -2.56317598e-03,
        -5.10428982e-03, -2.62995865e-03,  9.09256535e-03,
         7.25644856e-04,  8.66741106e-03, -1.44343716e-02,
        -2.49197900e-02,  2.18451138e-02,  1.95090605e-03,
        -1.13925263e-03,  2.10463266e-03,  3.74820244e-03,
         2.48080105e-03,  1.06927125e-02,  9.47357037e-03,
        -5.88296808e-02,  1.31417014e-02,  1.46004837e-02,
        -4.54228436e-03, -7.93664654e-03, -4.57033497e-04,
         4.37363077e-03, -1.55337298e-03,  2.56158600e-0

In [186]:
model.feature_importances_

array([1.88352917, 0.        , 2.71391951, 3.93700683, 1.52177586,
       2.26379625, 9.33367393, 2.46340628, 1.81025035, 1.79741957,
       1.06573736, 1.54716433, 1.32985724, 0.36068038, 1.70607775,
       1.77226785, 0.17927129, 0.44683975, 4.69251646, 2.38227188,
       1.03442105, 0.4612277 , 1.87226544, 1.08204135, 1.19488169,
       2.24636396, 0.6896161 , 0.84611332, 0.91676009, 1.17518442,
       4.41106478, 1.09630686, 3.03649864, 3.2500209 , 1.06235497,
       1.01054359, 1.35850759, 1.98100582, 1.25871911, 1.3725485 ,
       1.8762374 , 0.96474361, 1.78194187, 3.94831101, 2.52089276,
       2.02660625, 0.74067264, 0.21782086, 0.86865919, 0.67934283,
       2.23333639, 1.81986723, 1.82717837, 0.61767857, 1.92454956,
       1.38825353])

In [153]:
logs

{0: {'y_true': array([2, 1, 2, ..., 2, 0, 2]),
  'y_pred': array([[0.14101672, 0.37980723, 0.47917605],
         [0.41088468, 0.40500917, 0.18410615],
         [0.40733843, 0.12574334, 0.46691823],
         ...,
         [0.20825531, 0.33274437, 0.45900033],
         [0.23588312, 0.49212544, 0.27199144],
         [0.39297958, 0.33223362, 0.2747868 ]]),
  'index': array([ 6667,  6668,  6669, ..., 13329, 13330, 13331]),
  'model': <catboost.core.CatBoostClassifier at 0x7ff9fbcbf8d0>},
 1: {'y_true': array([1, 1, 0, ..., 1, 0, 0]),
  'y_pred': array([[0.30717796, 0.39334743, 0.29947461],
         [0.22026469, 0.41316086, 0.36657445],
         [0.40609855, 0.39476376, 0.1991377 ],
         ...,
         [0.62150327, 0.08739822, 0.29109851],
         [0.14697006, 0.27970642, 0.57332352],
         [0.43687529, 0.31208269, 0.25104202]]),
  'index': array([13332, 13333, 13334, ..., 19994, 19995, 19996]),
  'model': <catboost.core.CatBoostClassifier at 0x7ffa02c55910>},
 2: {'y_true': array([2,

In [154]:
def calc_score(target, coefs, decision):
    """Score a set of unit-stake betting decisions against the real outcomes.

    Args:
        target: 1-D array of true class ids, used to index the columns of `coefs`.
        coefs: array of shape (n, 3) with the odds of each class per match.
        decision: 1-D array of chosen class ids; -1 means "no bet".

    Returns:
        (res, maxx, n) where `res` is the net profit (every placed bet costs 1
        and pays the odds of the true outcome when the decision matches it),
        `maxx` is the sum of `odds_of_true_outcome - 1` over all matches (the
        profit of betting correctly on every match) and `n` is `len(target)`.

    Raises:
        AssertionError: if the inputs disagree in length or `coefs` does not
            have exactly 3 columns.
    """
    assert target.shape[0] == coefs.shape[0] == decision.shape[0]
    assert coefs.shape[1] == 3
    res = 0
    maxx = 0
    for t, c, d in zip(target, coefs, decision):
        # Cast before the first use as an index: float-typed labels (e.g. 2.0)
        # would otherwise raise IndexError on `c[t]`.
        t = int(t)
        d = int(d)
        maxx += (c[t] - 1)
        if d == -1:
            # No bet placed: nothing staked, nothing won.
            continue
        if d == t:
            res += c[t]
        res -= 1
    return res, maxx, len(target)

In [155]:
coefs.shape

(39992, 3)

In [166]:
# Total profit per threshold, summed over folds (only populated, never shown:
# the final print is commented out).
summ = defaultdict(lambda:0.0)

for i in range(len(logs)):
    t_ = logs[i]['y_true']
    # Odds of the rows that belong to this fold's test split.
    coefs_ = coefs[logs[i]['index']]
    loss = log_loss(logs[i]['y_true'], logs[i]['y_pred'])
    # Bet on the most probable class of each match.
    decision_argmax = np.argmax(logs[i]['y_pred'], axis=1)
    # Predicted probability of the chosen class times its odds, i.e. the
    # expected payout per unit stake according to the model.
    bet_cost = logs[i]['y_pred'][np.arange(coefs_.shape[0]), decision_argmax] * coefs_[np.arange(coefs_.shape[0]), decision_argmax]
    for th in [-1]:
        # Keep the bet where bet_cost exceeds the threshold, otherwise -1
        # ("no bet" for calc_score). With th = -1 no bet is dropped in the
        # recorded run: s1 and s2 are identical in the output.
        decision_argmax_cliped = decision_argmax * (bet_cost > th) + (-1) * (bet_cost <= th)

        s1 = calc_score(t_, coefs_, decision_argmax)
        s2 = calc_score(t_, coefs_, decision_argmax_cliped)
        summ[th] += s2[0]
        print(i, th, s1, s2, loss)
# print(summ)

0 -1 (-6.672643941713661, 13059.14363894807, 6665) (-6.672643941713661, 13059.14363894807, 6665) 1.0984564086796795
1 -1 (135.77230387736864, 12920.115236405249, 6665) (135.77230387736864, 12920.115236405249, 6665) 1.093038242518073
2 -1 (101.52457972625318, 12995.348236897227, 6665) (101.52457972625318, 12995.348236897227, 6665) 1.0940798919035815
3 -1 (13.292422549018712, 12754.609639256461, 6665) (13.292422549018712, 12754.609639256461, 6665) 1.0953533416928123
4 -1 (53.85982627478271, 12748.03111277741, 6665) (53.85982627478271, 12748.03111277741, 6665) 1.0960794534226428


In [200]:
# NOTE(review): this cell repeats the code of the In [166] evaluation cell and
# scores whatever `logs` currently holds -- presumably the LogisticRegression
# folds, given the execution order; confirm. A shared helper function would
# avoid the copy.
summ = defaultdict(lambda:0.0)

for i in range(len(logs)):
    t_ = logs[i]['y_true']
    # Odds of the rows that belong to this fold's test split.
    coefs_ = coefs[logs[i]['index']]
    loss = log_loss(logs[i]['y_true'], logs[i]['y_pred'])
    # Bet on the most probable class of each match.
    decision_argmax = np.argmax(logs[i]['y_pred'], axis=1)
    # Predicted probability of the chosen class times its odds.
    bet_cost = logs[i]['y_pred'][np.arange(coefs_.shape[0]), decision_argmax] * coefs_[np.arange(coefs_.shape[0]), decision_argmax]
    for th in [-1]:
        # Bets whose bet_cost does not exceed `th` become -1, which calc_score
        # treats as "no bet".
        decision_argmax_cliped = decision_argmax * (bet_cost > th) + (-1) * (bet_cost <= th)

        s1 = calc_score(t_, coefs_, decision_argmax)
        s2 = calc_score(t_, coefs_, decision_argmax_cliped)
        summ[th] += s2[0]
        print(i, th, s1, s2, loss)
# print(summ)

0 -1 (-125.20269534659397, 13059.14363894807, 6665) (-125.20269534659397, 13059.14363894807, 6665) 1.0220306973684374
1 -1 (-8.164399649312621, 12920.115236405249, 6665) (-8.164399649312621, 12920.115236405249, 6665) 1.007567199967968
2 -1 (-68.35736367888515, 12995.348236897227, 6665) (-68.35736367888515, 12995.348236897227, 6665) 1.0134736870692385
3 -1 (121.47703298407068, 12754.609639256461, 6665) (121.47703298407068, 12754.609639256461, 6665) 1.001781583627602
4 -1 (68.2803463828375, 12748.03111277741, 6665) (68.2803463828375, 12748.03111277741, 6665) 0.9930522669131258


In [202]:
calc_score(t_, coefs_, decision_argmax)

(68.2803463828375, 12748.03111277741, 6665)

In [201]:
calc_score(t_, coefs_, decision_argmax_cliped)

(68.2803463828375, 12748.03111277741, 6665)

1.096607647775357

In [99]:
logs[4]['model'].save_model('submit/model_v8_1.cbm')

In [123]:
# Train five models on the full data set and save each one for submission.
for i in range(5):
    # Give every model its own seed. With identical parameters and CatBoost's
    # fixed default seed the five fits would otherwise be (near-)identical,
    # which makes saving five files pointless. A seed already present in
    # cb_params is respected.
    params = dict(cb_params)
    if "random_seed" not in params and "random_state" not in params:
        params["random_seed"] = i
    model = catboost.CatBoostClassifier(**params)
    model.fit(X_ext, y)
    model.save_model('submit/model_v9_{}.cbm'.format(i+1))

Learning rate set to 0.095505
0:	learn: 1.0827684	total: 17.9ms	remaining: 17.8s
1:	learn: 1.0694410	total: 31.1ms	remaining: 15.5s
2:	learn: 1.0585042	total: 44.2ms	remaining: 14.7s
3:	learn: 1.0497287	total: 55.8ms	remaining: 13.9s
4:	learn: 1.0424199	total: 67.5ms	remaining: 13.4s
5:	learn: 1.0365728	total: 80ms	remaining: 13.3s
6:	learn: 1.0315130	total: 92.6ms	remaining: 13.1s
7:	learn: 1.0270859	total: 105ms	remaining: 13s
8:	learn: 1.0234911	total: 117ms	remaining: 12.9s
9:	learn: 1.0202912	total: 133ms	remaining: 13.2s
10:	learn: 1.0178578	total: 151ms	remaining: 13.6s
11:	learn: 1.0157556	total: 166ms	remaining: 13.6s
12:	learn: 1.0140614	total: 177ms	remaining: 13.5s
13:	learn: 1.0126168	total: 190ms	remaining: 13.4s
14:	learn: 1.0113709	total: 205ms	remaining: 13.5s
15:	learn: 1.0101093	total: 220ms	remaining: 13.5s
16:	learn: 1.0090847	total: 235ms	remaining: 13.6s
17:	learn: 1.0081434	total: 253ms	remaining: 13.8s
18:	learn: 1.0072955	total: 265ms	remaining: 13.7s
19:	lear