In [15]:
import numpy as np
import pandas as pd
from os import listdir
from tqdm.notebook import tqdm
import optuna
import catboost as cb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Number of consecutive myo samples stacked into one feature window;
# passed as `w_size` to load_data() and predict_test() in the cells below.
WINDOW_SIZE = 10

In [2]:
def window_generator(myo, w_size=200):
    """Build one fixed-length window of preceding samples for every row of ``myo``.

    The signal is left-padded with ``w_size`` copies of its first row and
    window ``i`` is ``padded[i:i + w_size]``, i.e. the ``w_size`` rows that
    come *before* row ``i`` of the original signal (row ``i`` itself is not
    part of its own window).
    NOTE(review): confirm that excluding the current sample is intended.

    Parameters
    ----------
    myo : array-like, shape (n_samples, n_channels)
        Two-dimensional signal, one row per time step.
    w_size : int, default 200
        Number of rows per window; must be non-negative.

    Returns
    -------
    numpy.ndarray, shape (n_samples, w_size, n_channels)
        Newly allocated array of windows with the dtype of ``myo``. Empty
        input yields an empty array instead of raising ``IndexError``.

    Raises
    ------
    ValueError
        If ``myo`` is not two-dimensional or ``w_size`` is negative.
    """
    myo = np.asarray(myo)
    if myo.ndim != 2:
        raise ValueError(
            f"myo must be 2-D (n_samples, n_channels), got shape {myo.shape}"
        )
    if w_size < 0:
        raise ValueError(f"w_size must be non-negative, got {w_size}")

    n_samples = myo.shape[0]
    # Repeat the first row in front so that the earliest windows are full-length.
    padded = np.concatenate((np.repeat(myo[:1], w_size, axis=0), myo), axis=0)
    # Row i of the index grid is [i, i + 1, ..., i + w_size - 1]; a single
    # indexing pass replaces the per-sample Python loop and avoids building
    # the truncated tail windows that were previously created and discarded.
    offsets = np.arange(n_samples)[:, None] + np.arange(w_size)[None, :]
    return padded[offsets]

In [3]:
def load_data(train_root="data_train", slice_data=False, slice_size=10, w_size=200):
    """Load the recordings under ``train_root`` into flat feature/target matrices.

    Every file is opened with ``np.load`` and must provide two arrays:
    ``data_myo``, which is windowed with ``window_generator`` and flattened to
    one feature row per window, and ``data_vr``, a 3-D array whose last two
    axes are flattened to one target row per first-axis entry.

    Parameters
    ----------
    train_root : str, default "data_train"
        Directory containing the recordings.
    slice_data : bool, default False
        If true, only the first ``slice_size`` files (in sorted name order)
        are loaded.
    slice_size : int, default 10
        Number of files kept when ``slice_data`` is true.
    w_size : int, default 200
        Window length forwarded to ``window_generator``.

    Returns
    -------
    X : numpy.ndarray
        Row-wise stack of the flattened windows of all loaded files.
    y : numpy.ndarray
        Row-wise stack of the flattened ``data_vr`` arrays of all loaded files.

    Raises
    ------
    ValueError
        If there is no file to load.
    """
    # Sort so that the subset selected by `slice_data` does not depend on the
    # arbitrary order in which the OS happens to list the directory.
    files = sorted(listdir(train_root))
    if slice_data:
        files = files[:slice_size]
    if not files:
        raise ValueError(f"no files to load from {train_root!r}")

    pre_X, pre_y = [], []

    for file in tqdm(files):
        # np.load keeps the .npz archive open; the context manager closes it
        # once both arrays have been read into memory.
        with np.load(f"{train_root}/{file}") as npz:
            myo = npz["data_myo"]
            vr = npz["data_vr"]

        window_myo = window_generator(myo, w_size=w_size)
        # (n_windows, w_size, n_channels) -> (n_windows, w_size * n_channels)
        myo_reshaped = np.reshape(
            window_myo,
            (window_myo.shape[0], window_myo.shape[1] * window_myo.shape[2]),
        )
        # Collapse the two trailing axes of the targets into one flat row.
        vr_reshaped = np.reshape(vr, (vr.shape[0], vr.shape[1] * vr.shape[2]))
        pre_X.append(myo_reshaped)
        pre_y.append(vr_reshaped)

    X = np.vstack(pre_X)
    y = np.vstack(pre_y)

    return X, y

In [4]:
%%time

# Quick run: slice_data=True restricts loading to the first `slice_size`
# files (default 10); each feature row is a window of WINDOW_SIZE samples.
X, y = load_data(slice_data=True, w_size=WINDOW_SIZE)

  0%|          | 0/10 [00:00<?, ?it/s]

CPU times: total: 312 ms
Wall time: 310 ms


In [5]:
def objective(trial):
    """Optuna objective: fit a short CatBoost run and return its validation MAE.

    Reads the module-level ``X`` and ``y``, splits them 70/30, samples a
    CatBoost configuration from ``trial``, trains a 10-iteration
    ``MultiRMSE`` regressor and scores it with ``mean_absolute_error`` on the
    30 % part.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    float
        Mean absolute error on the validation part (lower is better).
    """
    # A fixed seed gives every trial the same train/validation partition, so
    # trial scores differ because of the hyper-parameters, not the split.
    # NOTE(review): rows of X are overlapping windows, so a shuffled split puts
    # near-duplicate rows on both sides; consider splitting by recording.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    train_pool = cb.Pool(
        data=X_train,
        label=y_train,
    )

    valid_pool = cb.Pool(
        data=X_test,
        label=y_test,
    )

    param = {
        "loss_function": 'MultiRMSE',
        "iterations": 10,
        "allow_const_label": True,
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e0),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-2, 1e0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 1, 10),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 2, 20),
    }

    # These two parameters are only sampled for the bootstrap type they are
    # paired with here.
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    reg = cb.CatBoostRegressor(**param)
    # With only 10 iterations the 100-round patience can never trigger; it is
    # kept so that raising "iterations" does not require touching this call.
    reg.fit(train_pool, eval_set=valid_pool, verbose=5, early_stopping_rounds=100)
    y_pred = reg.predict(X_test)
    return mean_absolute_error(y_test, y_pred)

In [6]:
# Seed the sampler (TPE is Optuna's default for single-objective studies) so
# the sequence of suggested hyper-parameters is reproducible across restarts.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
# Stop after 10 trials or 600 seconds, whichever comes first.
study.optimize(objective, n_trials=10, timeout=600)
print("Number of completed trials: {}".format(len(study.trials)))
print("Best trial:")
# `trial` is read again by the final-training cell, so the name is kept.
trial = study.best_trial

print("\tBest Score: {}".format(trial.value))
print("\tBest Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[32m[I 2023-02-02 20:30:40,591][0m A new study created in memory with name: no-name-ccce4677-5742-4b63-8314-7992bc36eacb[0m


0:	learn: 3.3967768	test: 3.4001484	best: 3.4001484 (0)	total: 432ms	remaining: 3.89s
5:	learn: 3.3122835	test: 3.3158931	best: 3.3158931 (5)	total: 1.67s	remaining: 1.11s
9:	learn: 3.2776741	test: 3.2813762	best: 3.2813762 (9)	total: 2.71s	remaining: 0us

bestTest = 3.281376213
bestIteration = 9



[32m[I 2023-02-02 20:30:49,676][0m Trial 0 finished with value: 0.261163134530084 and parameters: {'learning_rate': 0.19173148418014033, 'l2_leaf_reg': 0.21911766758313617, 'colsample_bylevel': 0.06778045558781984, 'depth': 5, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'min_data_in_leaf': 19, 'bagging_temperature': 1.7800495011524942}. Best is trial 0 with value: 0.261163134530084.[0m


0:	learn: 3.3552632	test: 3.3573795	best: 3.3573795 (0)	total: 1.07s	remaining: 9.64s
5:	learn: 3.2563076	test: 3.2680328	best: 3.2680328 (5)	total: 6.08s	remaining: 4.05s
9:	learn: 3.2264073	test: 3.2437818	best: 3.2437818 (9)	total: 9.59s	remaining: 0us

bestTest = 3.243781806
bestIteration = 9



[32m[I 2023-02-02 20:31:05,802][0m Trial 1 finished with value: 0.248996841661711 and parameters: {'learning_rate': 0.9787267616158092, 'l2_leaf_reg': 0.5641321164386649, 'colsample_bylevel': 0.03388650556773101, 'depth': 6, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS', 'min_data_in_leaf': 4}. Best is trial 1 with value: 0.248996841661711.[0m


0:	learn: 3.4127910	test: 3.4103557	best: 3.4103557 (0)	total: 199ms	remaining: 1.79s
5:	learn: 3.3966670	test: 3.3943308	best: 3.3943308 (5)	total: 1.16s	remaining: 777ms
9:	learn: 3.3861561	test: 3.3835547	best: 3.3835547 (9)	total: 1.84s	remaining: 0us

bestTest = 3.383554665
bestIteration = 9



[32m[I 2023-02-02 20:31:13,838][0m Trial 2 finished with value: 0.27453897819585793 and parameters: {'learning_rate': 0.07819705925365857, 'l2_leaf_reg': 0.9409985077555518, 'colsample_bylevel': 0.02492701355202956, 'depth': 3, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS', 'min_data_in_leaf': 3}. Best is trial 1 with value: 0.248996841661711.[0m


0:	learn: 3.3379933	test: 3.3546254	best: 3.3546254 (0)	total: 1.1s	remaining: 9.91s
5:	learn: 3.1587394	test: 3.2529050	best: 3.2529050 (5)	total: 8.18s	remaining: 5.46s
9:	learn: 3.1044141	test: 3.2476147	best: 3.2476147 (9)	total: 12.4s	remaining: 0us

bestTest = 3.247614731
bestIteration = 9



[32m[I 2023-02-02 20:31:32,647][0m Trial 3 finished with value: 0.2463218231704269 and parameters: {'learning_rate': 0.8485299335398006, 'l2_leaf_reg': 0.2776958314297208, 'colsample_bylevel': 0.09654312765931418, 'depth': 10, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'min_data_in_leaf': 20, 'subsample': 0.17838567977247483}. Best is trial 3 with value: 0.2463218231704269.[0m


0:	learn: 3.3902699	test: 3.3863725	best: 3.3863725 (0)	total: 139ms	remaining: 1.25s
5:	learn: 3.3109493	test: 3.3046036	best: 3.3046036 (5)	total: 806ms	remaining: 537ms
9:	learn: 3.2829145	test: 3.2759431	best: 3.2759431 (9)	total: 1.36s	remaining: 0us

bestTest = 3.275943086
bestIteration = 9



[32m[I 2023-02-02 20:31:40,292][0m Trial 4 finished with value: 0.2584629867017103 and parameters: {'learning_rate': 0.7489626746641415, 'l2_leaf_reg': 0.6730329516677731, 'colsample_bylevel': 0.06292780916755267, 'depth': 2, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'min_data_in_leaf': 20, 'subsample': 0.28919602122144394}. Best is trial 3 with value: 0.2463218231704269.[0m


0:	learn: 3.3396750	test: 3.3411206	best: 3.3411206 (0)	total: 4.33s	remaining: 39s
5:	learn: 3.2044964	test: 3.2231671	best: 3.2231671 (5)	total: 23.3s	remaining: 15.6s
9:	learn: 3.1615980	test: 3.1901837	best: 3.1901837 (9)	total: 36.5s	remaining: 0us

bestTest = 3.190183691
bestIteration = 9



[32m[I 2023-02-02 20:32:23,788][0m Trial 5 finished with value: 0.24562259254855356 and parameters: {'learning_rate': 0.5058476946387673, 'l2_leaf_reg': 0.6570550340041426, 'colsample_bylevel': 0.08376255404387904, 'depth': 8, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS', 'min_data_in_leaf': 2}. Best is trial 5 with value: 0.24562259254855356.[0m


0:	learn: 3.3523546	test: 3.3522364	best: 3.3522364 (0)	total: 11.1s	remaining: 1m 40s
5:	learn: 3.2260176	test: 3.2394165	best: 3.2394165 (5)	total: 36.6s	remaining: 24.4s
9:	learn: 3.1935972	test: 3.2131758	best: 3.2131758 (9)	total: 56.6s	remaining: 0us

bestTest = 3.213175808
bestIteration = 9



[32m[I 2023-02-02 20:33:29,421][0m Trial 6 finished with value: 0.2504205027893407 and parameters: {'learning_rate': 0.36705837529278046, 'l2_leaf_reg': 0.2695731183360748, 'colsample_bylevel': 0.05233534300098308, 'depth': 9, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'min_data_in_leaf': 14, 'subsample': 0.6877310995305187}. Best is trial 5 with value: 0.24562259254855356.[0m


0:	learn: 3.3975961	test: 3.3947380	best: 3.3947380 (0)	total: 875ms	remaining: 7.87s
5:	learn: 3.3122444	test: 3.3115829	best: 3.3115829 (5)	total: 4.78s	remaining: 3.19s
9:	learn: 3.2795739	test: 3.2814777	best: 3.2814777 (9)	total: 8.11s	remaining: 0us

bestTest = 3.281477747
bestIteration = 9



[32m[I 2023-02-02 20:33:45,561][0m Trial 7 finished with value: 0.2609564014002005 and parameters: {'learning_rate': 0.2589737866718724, 'l2_leaf_reg': 0.6128525903243071, 'colsample_bylevel': 0.06562318107847116, 'depth': 4, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'min_data_in_leaf': 9, 'bagging_temperature': 2.3519922071537724}. Best is trial 5 with value: 0.24562259254855356.[0m


0:	learn: 3.4119792	test: 3.4121977	best: 3.4121977 (0)	total: 247ms	remaining: 2.22s
5:	learn: 3.3601373	test: 3.3630267	best: 3.3630267 (5)	total: 1.4s	remaining: 936ms
9:	learn: 3.3271594	test: 3.3304806	best: 3.3304806 (9)	total: 2.38s	remaining: 0us

bestTest = 3.330480583
bestIteration = 9



[32m[I 2023-02-02 20:33:56,732][0m Trial 8 finished with value: 0.26771571671109873 and parameters: {'learning_rate': 0.3307788853063746, 'l2_leaf_reg': 0.15269810317000113, 'colsample_bylevel': 0.05466288763630568, 'depth': 3, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'min_data_in_leaf': 4, 'bagging_temperature': 5.033229509903421}. Best is trial 5 with value: 0.24562259254855356.[0m


0:	learn: 3.4106268	test: 3.4168127	best: 3.4168127 (0)	total: 548ms	remaining: 4.93s
5:	learn: 3.3884686	test: 3.3946702	best: 3.3946702 (4)	total: 3.52s	remaining: 2.35s
9:	learn: 3.3725745	test: 3.3789845	best: 3.3789845 (9)	total: 6.08s	remaining: 0us

bestTest = 3.378984514
bestIteration = 9



[32m[I 2023-02-02 20:34:09,768][0m Trial 9 finished with value: 0.27354457246638986 and parameters: {'learning_rate': 0.06944717774726203, 'l2_leaf_reg': 0.7632484547915874, 'colsample_bylevel': 0.048033216041651325, 'depth': 3, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'min_data_in_leaf': 8, 'subsample': 0.6304842906772133}. Best is trial 5 with value: 0.24562259254855356.[0m


Number of completed trials: 10
Best trial:
	Best Score: 0.24562259254855356
	Best Params: 
    learning_rate: 0.5058476946387673
    l2_leaf_reg: 0.6570550340041426
    colsample_bylevel: 0.08376255404387904
    depth: 8
    boosting_type: Ordered
    bootstrap_type: MVS
    min_data_in_leaf: 2


In [9]:
# 70/30 split for the final model; seeded so the reported score is reproducible.
# NOTE(review): this split is drawn from the same X, y that were used during
# tuning, so the resulting score is not an independent test estimate.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

train_pool = cb.Pool(
    data=X_train,
    label=y_train,
)

valid_pool = cb.Pool(
    data=X_test,
    label=y_test,
)

# Build a new dict instead of adding keys to `trial.params` in place, so the
# tuned parameters stored on the trial object stay untouched and this cell
# can be re-run safely.
best_params = {
    **trial.params,
    "loss_function": 'MultiRMSE',
    "iterations": 100,
    "allow_const_label": True,
}

model = cb.CatBoostRegressor(**best_params)
model.fit(train_pool, eval_set=valid_pool, verbose=5)

# Get predictions
preds = model.predict(X_test)

0:	learn: 3.3414599	test: 3.3454272	best: 3.3454272 (0)	total: 3.83s	remaining: 6m 18s
5:	learn: 3.2045444	test: 3.2231072	best: 3.2231072 (5)	total: 20.5s	remaining: 5m 21s
10:	learn: 3.1556501	test: 3.1864697	best: 3.1864697 (10)	total: 37s	remaining: 4m 59s
15:	learn: 3.1287666	test: 3.1665317	best: 3.1665317 (15)	total: 54.3s	remaining: 4m 44s
20:	learn: 3.1041692	test: 3.1534108	best: 3.1534108 (20)	total: 1m 12s	remaining: 4m 32s
25:	learn: 3.0789815	test: 3.1387868	best: 3.1387868 (25)	total: 1m 29s	remaining: 4m 15s
30:	learn: 3.0521234	test: 3.1242730	best: 3.1242730 (30)	total: 1m 45s	remaining: 3m 55s
35:	learn: 3.0286097	test: 3.1129749	best: 3.1129749 (35)	total: 2m 3s	remaining: 3m 39s
40:	learn: 3.0038104	test: 3.1027142	best: 3.1027142 (40)	total: 2m 21s	remaining: 3m 23s
45:	learn: 2.9824349	test: 3.0931602	best: 3.0931602 (45)	total: 2m 37s	remaining: 3m 5s
50:	learn: 2.9673148	test: 3.0895588	best: 3.0895588 (50)	total: 2m 53s	remaining: 2m 46s
55:	learn: 2.9496213	t

In [10]:
# MAE of the final model on the 30 % split; with scikit-learn's default
# multi-output setting the per-column errors are averaged uniformly.
mean_absolute_error(y_test, preds)

0.22777496149604284

#### Для итога

In [13]:
def prepare_predictions_for_csv(list_predictions, step=10):
    """Downsample each prediction array along time and flatten all to 1-D.

    Parameters
    ----------
    list_predictions : iterable of array-like
        One array per file, laid out as ``[Time, 16, 4]`` by the caller in
        this notebook; any shape whose first axis is time is accepted.
    step : int, default 10
        Keep every ``step``-th entry along the first axis.

    Returns
    -------
    numpy.ndarray
        1-D array with N values: the kept rows of every array, flattened in
        row-major order and concatenated in input order. An empty input
        yields an empty 1-D array.
    """
    flat_chunks = [
        np.reshape(np.asarray(pred)[::step], -1) for pred in list_predictions
    ]
    if not flat_chunks:
        # np.concatenate rejects an empty sequence; return an empty vector.
        return np.array([])
    return np.concatenate(flat_chunks)

def predict_test(model=None, sub_root = "data_submission", w_size=200):
    """Run ``model`` on every file in ``sub_root`` and build the submission table.

    Files are processed in sorted name order. For each one the ``data_myo``
    array is windowed with ``window_generator``, flattened to one feature row
    per window, passed to ``model.predict`` and reshaped to
    ``(n_rows, 16, 4)``. All predictions are then downsampled and flattened
    by ``prepare_predictions_for_csv``.

    Parameters
    ----------
    model : object with a ``predict`` method
        Fitted regressor. Required, despite the ``None`` default.
    sub_root : str, default "data_submission"
        Directory containing the submission recordings.
    w_size : int, default 200
        Window length; presumably must match the one used for training.

    Returns
    -------
    pandas.DataFrame
        Columns ``Id`` (the running row index) and ``Predicted``.

    Raises
    ------
    ValueError
        If ``model`` is None.
    """
    if model is None:
        # Fail up front instead of with an AttributeError inside the loop.
        raise ValueError("predict_test requires a fitted model with a predict() method")

    list_preds = []

    for p in tqdm(sorted(listdir(sub_root))):
        # Close the .npz archive as soon as the array has been read.
        with np.load(f"{sub_root}/{p}") as file_data:
            myo_data = file_data['data_myo']
        window_myo = window_generator(myo_data, w_size=w_size)
        myo_reshaped = np.reshape(
            window_myo,
            (window_myo.shape[0], window_myo.shape[1] * window_myo.shape[2]),
        )
        pr = model.predict(myo_reshaped)
        # Assumes the model emits 64 values per row, arranged as 16 x 4 to
        # mirror the flattened training targets - TODO confirm.
        sbm = np.reshape(pr, (pr.shape[0], 16, 4))

        list_preds.append(sbm)

    final_preds = prepare_predictions_for_csv(list_preds)
    df = pd.DataFrame({'Predicted': final_preds})
    df.insert(0, "Id", df.index)

    return df

In [16]:
# Predict on the submission files with the same window length that was used
# to build the training features, then preview the Id/Predicted table.
df = predict_test(model=model, w_size=WINDOW_SIZE)
df.head()

  0%|          | 0/15 [00:00<?, ?it/s]

Unnamed: 0,Id,Predicted
0,0,2.0026619999999998e-19
1,1,-4.641773e-19
2,2,2.735191e-22
3,3,-1.0
4,4,-0.02883891
