In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
import optuna

In [2]:
prefixes = ['lgbm', 'catboost', 'xgboost']
valids = [pd.read_csv(f"output/{p}_valid.csv") for p in prefixes]
submissions = [pd.read_csv(f"output/{p}.csv") for p in prefixes]

# BLENDING

In [3]:
def objective(trial: optuna.Trial):
    p_lgbm = trial.suggest_uniform('lgbm', 0.0, 1.0)
    p_cat = trial.suggest_uniform('cat', 0.0, 1.0)
    p_xgb = trial.suggest_uniform('xgb', 0.0, 1.0)
    y = valids[0]['rating'].values
    a_lgbm = valids[0].drop('rating', axis=1).values
    a_cat = valids[1].drop('rating', axis=1).values
    a_xgb = valids[2].drop('rating', axis=1).values
    a = p_lgbm * a_lgbm + p_cat * a_cat + p_xgb * a_xgb
    b = np.argmax(a, axis=1)

    return f1_score(y, b, average='micro')

study = optuna.create_study(direction='maximize')
study.optimize(objective, timeout=10)

[32m[I 2022-05-13 09:23:47,382][0m A new study created in memory with name: no-name-44ce4259-f049-4047-bffd-6749bc9b1084[0m
[32m[I 2022-05-13 09:23:47,486][0m Trial 0 finished with value: 0.34185690345032194 and parameters: {'lgbm': 0.44832090628803345, 'cat': 0.3720287477372366, 'xgb': 0.7922331402590965}. Best is trial 0 with value: 0.34185690345032194.[0m
[32m[I 2022-05-13 09:23:47,586][0m Trial 1 finished with value: 0.34002870590876655 and parameters: {'lgbm': 0.02942117850429482, 'cat': 0.5286009084328184, 'xgb': 0.8172915373177925}. Best is trial 0 with value: 0.34185690345032194.[0m
[32m[I 2022-05-13 09:23:47,685][0m Trial 2 finished with value: 0.34178072855275715 and parameters: {'lgbm': 0.3745884402849907, 'cat': 0.8803684363265535, 'xgb': 0.27522654493247123}. Best is trial 0 with value: 0.34185690345032194.[0m
[32m[I 2022-05-13 09:23:47,786][0m Trial 3 finished with value: 0.3409067218333293 and parameters: {'lgbm': 0.6284578301246474, 'cat': 0.21607901278991

In [4]:
study.trials_dataframe().sort_values(by='value', ascending=False).head()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_cat,params_lgbm,params_xgb,state
33,33,0.341993,2022-05-13 09:23:50.892405,2022-05-13 09:23:50.998765,0 days 00:00:00.106360,0.543277,0.442373,0.256432,COMPLETE
64,64,0.341973,2022-05-13 09:23:54.223293,2022-05-13 09:23:54.326835,0 days 00:00:00.103542,0.8194,0.589193,0.251004,COMPLETE
37,37,0.341965,2022-05-13 09:23:51.324407,2022-05-13 09:23:51.436561,0 days 00:00:00.112154,0.543704,0.391812,0.168807,COMPLETE
62,62,0.341965,2022-05-13 09:23:54.005936,2022-05-13 09:23:54.116359,0 days 00:00:00.110423,0.754245,0.504166,0.091476,COMPLETE
21,21,0.341961,2022-05-13 09:23:49.566799,2022-05-13 09:23:49.684320,0 days 00:00:00.117521,0.794032,0.543212,0.185263,COMPLETE


In [5]:
study.best_params

{'lgbm': 0.44237294030562296,
 'cat': 0.5432766882012364,
 'xgb': 0.25643191190565584}

In [6]:
p_lgbm = study.best_params['lgbm']
p_cat = study.best_params['cat']
p_xgb = study.best_params['xgb']

a_lgbm = submissions[0].drop(['id', 'rating'], axis=1).values
a_cat = submissions[1].drop(['id', 'rating'], axis=1).values
a = p_lgbm * a_lgbm + p_cat * a_cat

sub = submissions[0][['id']].reset_index(drop=True)
sub['rating'] = a.argmax(axis=1) + 1
sub.to_csv('blend_submission.csv', index=False)

# STACKING

In [7]:
import lightgbm as lgb
from catboost import CatBoostClassifier, Pool

  from pandas import MultiIndex, Int64Index


In [8]:
a = valids[0].drop('rating', axis=1)
a = a.rename(columns={col: f'{col}_0' for col in a.columns})
b = valids[1].drop('rating', axis=1)
b = b.rename(columns={col: f'{col}_1' for col in b.columns})
c = valids[2].drop('rating', axis=1)
c = c.rename(columns={col: f'{col}_2' for col in c.columns})

mgd_val = pd.concat([a, b, c], axis=1)
mgd_val['rating'] = valids[0]['rating']

a = submissions[0].drop(['id', 'rating'], axis=1)
a = a.rename(columns={col: f'{col}_0' for col in a.columns})
b = submissions[1].drop(['id', 'rating'], axis=1)
b = b.rename(columns={col: f'{col}_1' for col in b.columns})
c = submissions[2].drop(['id', 'rating'], axis=1)
c = c.rename(columns={col: f'{col}_2' for col in c.columns})

mgd_sub = pd.concat([a, b, c], axis=1)
mgd_sub['id'] = submissions[0]['id']

In [9]:
X_train = mgd_val.drop('rating', axis=1)
y_train = mgd_val['rating']

X = mgd_sub.drop('id', axis=1)

params = {
    "iterations": 1000,
    "learning_rate": 0.05,
    "loss_function": 'MultiClass',
}

train_dataset = Pool(data=X_train, label=y_train)
test_dataset = Pool(data=X)

model = CatBoostClassifier(**params)
model.fit(train_dataset)
y = model.predict_proba(test_dataset)

0:	learn: 2.2411645	total: 126ms	remaining: 2m 5s
1:	learn: 2.1918750	total: 200ms	remaining: 1m 39s
2:	learn: 2.1496648	total: 270ms	remaining: 1m 29s
3:	learn: 2.1121944	total: 342ms	remaining: 1m 25s
4:	learn: 2.0790793	total: 415ms	remaining: 1m 22s
5:	learn: 2.0503005	total: 490ms	remaining: 1m 21s
6:	learn: 2.0233694	total: 565ms	remaining: 1m 20s
7:	learn: 1.9994586	total: 637ms	remaining: 1m 18s
8:	learn: 1.9778480	total: 707ms	remaining: 1m 17s
9:	learn: 1.9578926	total: 782ms	remaining: 1m 17s
10:	learn: 1.9403901	total: 857ms	remaining: 1m 17s
11:	learn: 1.9236346	total: 930ms	remaining: 1m 16s
12:	learn: 1.9082739	total: 1.01s	remaining: 1m 16s
13:	learn: 1.8943540	total: 1.08s	remaining: 1m 16s
14:	learn: 1.8819563	total: 1.16s	remaining: 1m 16s
15:	learn: 1.8697716	total: 1.24s	remaining: 1m 15s
16:	learn: 1.8587094	total: 1.31s	remaining: 1m 15s
17:	learn: 1.8485171	total: 1.39s	remaining: 1m 15s
18:	learn: 1.8386895	total: 1.47s	remaining: 1m 15s
19:	learn: 1.8296933	to

In [10]:
sub = mgd_sub[['id']].reset_index(drop=True)
sub['rating'] = y.argmax(axis=1) + 1
assert sub['rating'].min() == 1 and sub['rating'].max() == 10
sub.to_csv('stack_submission.csv', index=False)