In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
from tqdm import tqdm_notebook
import warnings
from statsmodels.tsa.stattools import kpss
from statsmodels.stats.multitest import multipletests
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from sklearn.preprocessing import StandardScaler

from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_absolute_error
import time
from catboost import CatBoostRegressor, CatBoostClassifier

import lightgbm as lgb
import xgboost as xgb
import catboost as ctb

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import gc

from scripts.feature_engineering import reduce_mem_usage, add_rolling_features
from scripts.feature_engineering import exponential_smoothing, signal_shifts
from scripts.feature_engineering import batch_stats2, add_minus_signal
from scripts.feature_engineering import delete_objects_after_rolling

from scripts.feature_engineering import add_quantiles, add_target_encoding

from copy import copy

from bayes_opt import BayesianOptimization

warnings.filterwarnings('ignore')
sns.set_style('whitegrid')

## Обучение catboost.

Попробуем обучить дефолтный катбуст на сгенерированных ранее признаках.

In [2]:
train = pd.read_csv('data-without-drift/train_clean.csv')
test = pd.read_csv('data-without-drift/test_clean.csv')

In [3]:
train.tail()

Unnamed: 0,time,signal,open_channels
4999995,499.9996,2.919274,7
4999996,499.9997,2.697906,7
4999997,499.9998,4.516337,8
4999998,499.9999,5.639669,9
4999999,500.0,5.3792,9


In [4]:
test.tail()

Unnamed: 0,time,signal
1999995,699.9996,-2.9092
1999996,699.9997,-2.7422
1999997,699.9998,-2.8285
1999998,699.9999,-2.9092
1999999,700.0,-2.7422


Добавим признаки и применим сжатие данных.

In [5]:
def prepare_df(df, window_sizes, alphas, shifts, batch_sizes):
    df = reduce_mem_usage(df)
    df = add_rolling_features(df, window_sizes)
    df = reduce_mem_usage(df)
    df = exponential_smoothing(df, alphas)
    df = reduce_mem_usage(df)
    df = signal_shifts(df, shifts)
    df = reduce_mem_usage(df)
    df = add_minus_signal(df)
    df = reduce_mem_usage(df)
    df = batch_stats2(df, batch_sizes)
    df = reduce_mem_usage(df)
    df = add_minus_signal(df)
    df = reduce_mem_usage(df)
    
    if 'open_channels' in df.columns:
        y = df['open_channels']
        df = df.drop(columns=['time'])
        return df, y
    else:
        df = df.drop(columns=['time'])
        return df

In [6]:
window_sizes = [5, 100, 5000]
alphas = [0.5, 0.1]
shifts = [1,2,-1,-2]
batch_sizes = [50000, 25000, 5000]
quantiles = [3, 7, 15]

X_train, y_train = prepare_df(train, window_sizes, alphas, shifts, batch_sizes)
X_test = prepare_df(test, window_sizes, alphas, shifts, batch_sizes)

add_quantiles(X_train, X_test, [3, 7, 15])
add_target_encoding(X_train, X_test, [3, 7, 15])

X_train = X_train.drop(columns=['open_channels'])

Mem. usage decreased to 23.84 Mb (79.2% reduction)


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


Mem. usage decreased to 295.64 Mb (72.9% reduction)
Mem. usage decreased to 314.71 Mb (15.4% reduction)
Mem. usage decreased to 352.86 Mb (0.0% reduction)
Mem. usage decreased to 677.11 Mb (0.0% reduction)
Mem. usage decreased to 991.82 Mb (48.8% reduction)
Mem. usage decreased to 1630.78 Mb (0.0% reduction)
Mem. usage decreased to  7.63 Mb (75.0% reduction)


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


Mem. usage decreased to 120.16 Mb (72.4% reduction)
Mem. usage decreased to 127.79 Mb (15.2% reduction)
Mem. usage decreased to 143.05 Mb (0.0% reduction)
Mem. usage decreased to 276.57 Mb (0.0% reduction)
Mem. usage decreased to 402.45 Mb (48.4% reduction)
Mem. usage decreased to 661.85 Mb (0.0% reduction)


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




Для улучшения качества модели будем передавать наименования колонок категориальных признаков.

In [7]:
quant_columns = np.array(quantiles).astype(str)

for i in range(len(quant_columns)):
    quant_columns[i] = 'quant_' + quant_columns[i]
    
quant_columns = quant_columns.tolist()

In [8]:
def MacroF1Metric(preds, dtrain):
    labels = dtrain.get_label()
    preds = np.round(np.clip(preds, 0, 10)).astype(int)
    score = f1_score(labels, preds, average = 'macro')
    return ('MacroF1Metric', score, True)

Применим кросс-вадидацию. Будем каждый раз обучать модель на 4 фолдах из 5. В конце усредним предсказания.

Также, сохраним out of fold предсказания для дальнейшего стекинга моделей.

In [10]:
def catboost_cv_loop(X_train, y_train, X_test, params, num_iterations):
    n_fold = 5
    folds = KFold(n_splits=n_fold, shuffle=True, random_state=17)
   
    oof = np.zeros(len(X_train))
    prediction = np.zeros(len(X_test))
    scores = []
   
    for training_index, validation_index in tqdm_notebook(folds.split(X_train), total=n_fold):
        # разбиение на трэйн и валидацию
        X_train_ = X_train.iloc[training_index]
        y_train_ = y_train[training_index]
        X_valid = X_train.iloc[validation_index]
        y_valid = y_train[validation_index]
        
        model = CatBoostRegressor(
            random_seed=17,
            verbose=6,
            cat_features=quant_columns
        )
       
        # обучение модели
        model.fit(X_train_, y_train_, )
 
        # скор на валидации
        preds = model.predict(X_valid)
        oof[validation_index] = preds.reshape(-1,)
        
        preds = np.round(np.clip(preds, 0, 10)).astype(int)
        score = f1_score(y_valid, preds, average = 'macro')
        scores.append(score)
       
        # предсказание на тесте
        preds = model.predict(X_test)
        prediction += preds
       
        print(f'score: {score}')
       
    prediction /= n_fold
    # prediction = np.round(np.clip(prediction, 0, 10)).astype(int)
   
    return scores, oof, prediction

In [11]:
scores, oof, prediction = catboost_cv_loop(X_train, y_train, X_test, np.nan, np.nan)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

Learning rate set to 0.170831
0:	learn: 2.2208776	total: 2.51s	remaining: 41m 43s
6:	learn: 0.7656598	total: 13.3s	remaining: 31m 21s
12:	learn: 0.3227851	total: 23.2s	remaining: 29m 22s
18:	learn: 0.2142144	total: 33.4s	remaining: 28m 44s
24:	learn: 0.1915270	total: 42.9s	remaining: 27m 52s
30:	learn: 0.1843314	total: 52.8s	remaining: 27m 30s
36:	learn: 0.1807013	total: 1m 2s	remaining: 27m 2s
42:	learn: 0.1782891	total: 1m 11s	remaining: 26m 27s
48:	learn: 0.1761109	total: 1m 20s	remaining: 25m 56s
54:	learn: 0.1745443	total: 1m 29s	remaining: 25m 38s
60:	learn: 0.1731750	total: 1m 39s	remaining: 25m 26s
66:	learn: 0.1718982	total: 1m 49s	remaining: 25m 20s
72:	learn: 0.1707438	total: 1m 59s	remaining: 25m 21s
78:	learn: 0.1699044	total: 2m 10s	remaining: 25m 19s
84:	learn: 0.1692466	total: 2m 21s	remaining: 25m 23s
90:	learn: 0.1684047	total: 2m 30s	remaining: 25m 8s
96:	learn: 0.1677844	total: 2m 41s	remaining: 25m 7s
102:	learn: 0.1671540	total: 2m 51s	remaining: 24m 52s
108:	lear

900:	learn: 0.1547772	total: 26m 18s	remaining: 2m 53s
906:	learn: 0.1547532	total: 26m 27s	remaining: 2m 42s
912:	learn: 0.1547268	total: 26m 36s	remaining: 2m 32s
918:	learn: 0.1547065	total: 26m 45s	remaining: 2m 21s
924:	learn: 0.1546806	total: 26m 53s	remaining: 2m 10s
930:	learn: 0.1546556	total: 27m 3s	remaining: 2m
936:	learn: 0.1546320	total: 27m 12s	remaining: 1m 49s
942:	learn: 0.1546088	total: 27m 21s	remaining: 1m 39s
948:	learn: 0.1545798	total: 27m 30s	remaining: 1m 28s
954:	learn: 0.1545527	total: 27m 39s	remaining: 1m 18s
960:	learn: 0.1545279	total: 27m 48s	remaining: 1m 7s
966:	learn: 0.1545026	total: 27m 57s	remaining: 57.3s
972:	learn: 0.1544795	total: 28m 7s	remaining: 46.8s
978:	learn: 0.1544457	total: 28m 16s	remaining: 36.4s
984:	learn: 0.1544194	total: 28m 26s	remaining: 26s
990:	learn: 0.1543939	total: 28m 35s	remaining: 15.6s
996:	learn: 0.1543683	total: 28m 44s	remaining: 5.19s
999:	learn: 0.1543566	total: 28m 48s	remaining: 0us
score: 0.9371865577191034
Le

786:	learn: 0.1550926	total: 22m 27s	remaining: 6m 4s
792:	learn: 0.1550663	total: 22m 36s	remaining: 5m 54s
798:	learn: 0.1550421	total: 22m 44s	remaining: 5m 43s
804:	learn: 0.1550128	total: 22m 54s	remaining: 5m 32s
810:	learn: 0.1549861	total: 23m 3s	remaining: 5m 22s
816:	learn: 0.1549538	total: 23m 12s	remaining: 5m 11s
822:	learn: 0.1549220	total: 23m 22s	remaining: 5m 1s
828:	learn: 0.1548886	total: 23m 31s	remaining: 4m 51s
834:	learn: 0.1548630	total: 23m 41s	remaining: 4m 40s
840:	learn: 0.1548343	total: 23m 50s	remaining: 4m 30s
846:	learn: 0.1548055	total: 23m 59s	remaining: 4m 19s
852:	learn: 0.1547770	total: 24m 8s	remaining: 4m 9s
858:	learn: 0.1547456	total: 24m 17s	remaining: 3m 59s
864:	learn: 0.1547218	total: 24m 26s	remaining: 3m 48s
870:	learn: 0.1546905	total: 24m 35s	remaining: 3m 38s
876:	learn: 0.1546628	total: 24m 44s	remaining: 3m 28s
882:	learn: 0.1546406	total: 24m 54s	remaining: 3m 18s
888:	learn: 0.1546163	total: 25m 5s	remaining: 3m 7s
894:	learn: 0.154

672:	learn: 0.1557859	total: 20m 22s	remaining: 9m 53s
678:	learn: 0.1557515	total: 20m 32s	remaining: 9m 42s
684:	learn: 0.1557120	total: 20m 42s	remaining: 9m 31s
690:	learn: 0.1556802	total: 20m 52s	remaining: 9m 20s
696:	learn: 0.1556359	total: 21m 2s	remaining: 9m 8s
702:	learn: 0.1555956	total: 21m 12s	remaining: 8m 57s
708:	learn: 0.1555634	total: 21m 21s	remaining: 8m 46s
714:	learn: 0.1555348	total: 21m 31s	remaining: 8m 34s
720:	learn: 0.1555080	total: 21m 40s	remaining: 8m 23s
726:	learn: 0.1554748	total: 21m 50s	remaining: 8m 12s
732:	learn: 0.1554367	total: 22m 1s	remaining: 8m 1s
738:	learn: 0.1554063	total: 22m 11s	remaining: 7m 50s
744:	learn: 0.1553768	total: 22m 21s	remaining: 7m 39s
750:	learn: 0.1553428	total: 22m 34s	remaining: 7m 29s
756:	learn: 0.1553109	total: 22m 46s	remaining: 7m 18s
762:	learn: 0.1552804	total: 22m 58s	remaining: 7m 8s
768:	learn: 0.1552489	total: 23m 8s	remaining: 6m 57s
774:	learn: 0.1552193	total: 23m 22s	remaining: 6m 47s
780:	learn: 0.15

564:	learn: 0.1567914	total: 15m 18s	remaining: 11m 47s
570:	learn: 0.1567522	total: 15m 28s	remaining: 11m 37s
576:	learn: 0.1567110	total: 15m 37s	remaining: 11m 27s
582:	learn: 0.1566718	total: 15m 46s	remaining: 11m 17s
588:	learn: 0.1566257	total: 15m 56s	remaining: 11m 7s
594:	learn: 0.1565881	total: 16m 5s	remaining: 10m 56s
600:	learn: 0.1565467	total: 16m 18s	remaining: 10m 49s
606:	learn: 0.1565100	total: 16m 28s	remaining: 10m 40s
612:	learn: 0.1564770	total: 16m 37s	remaining: 10m 29s
618:	learn: 0.1564442	total: 16m 46s	remaining: 10m 19s
624:	learn: 0.1564093	total: 16m 55s	remaining: 10m 9s
630:	learn: 0.1563734	total: 17m 3s	remaining: 9m 58s
636:	learn: 0.1563368	total: 17m 12s	remaining: 9m 48s
642:	learn: 0.1562938	total: 17m 21s	remaining: 9m 38s
648:	learn: 0.1562488	total: 17m 31s	remaining: 9m 28s
654:	learn: 0.1562097	total: 17m 40s	remaining: 9m 18s
660:	learn: 0.1561668	total: 17m 49s	remaining: 9m 8s
666:	learn: 0.1561234	total: 17m 59s	remaining: 8m 58s
672:

456:	learn: 0.1580222	total: 13m 58s	remaining: 16m 36s
462:	learn: 0.1579484	total: 14m 9s	remaining: 16m 25s
468:	learn: 0.1578807	total: 14m 19s	remaining: 16m 13s
474:	learn: 0.1578204	total: 14m 29s	remaining: 16m 1s
480:	learn: 0.1577599	total: 14m 39s	remaining: 15m 49s
486:	learn: 0.1577183	total: 14m 50s	remaining: 15m 37s
492:	learn: 0.1576623	total: 15m	remaining: 15m 25s
498:	learn: 0.1576149	total: 15m 9s	remaining: 15m 13s
504:	learn: 0.1575605	total: 15m 19s	remaining: 15m 1s
510:	learn: 0.1575124	total: 15m 28s	remaining: 14m 48s
516:	learn: 0.1574595	total: 15m 43s	remaining: 14m 41s
522:	learn: 0.1574136	total: 15m 54s	remaining: 14m 30s
528:	learn: 0.1573712	total: 16m 4s	remaining: 14m 18s
534:	learn: 0.1573165	total: 16m 15s	remaining: 14m 7s
540:	learn: 0.1572666	total: 16m 31s	remaining: 14m 1s
546:	learn: 0.1572206	total: 16m 42s	remaining: 13m 50s
552:	learn: 0.1571743	total: 16m 54s	remaining: 13m 40s
558:	learn: 0.1571306	total: 17m 7s	remaining: 13m 30s
564:

Средний скор на валидации:

In [13]:
np.mean(scores)

0.9378753275122685

In [16]:
prediction

array([-0.0239915 ,  0.02709293,  0.09398148, ..., -0.00213793,
       -0.00281308, -0.0207176 ])

Сохраним предсказания.

In [14]:
def pred_proc(pred):
    pred = np.round(np.clip(pred, 0, 10))
    return pred.astype(int)

In [17]:
y_catboost_pred = pred_proc(prediction)

sample_df = pd.read_csv("data/sample_submission.csv", dtype={'time':str})
sample_df['open_channels'] = y_catboost_pred
sample_df.to_csv("catboost_3_default.csv", index=False, float_format='%.4f')

In [19]:
np.save('preds_best_catboost', prediction)
np.save('oof_best_catboost', oof)

***0.940 LB*** 

Неплохой результат для дефолтной модели.