In [88]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
import optuna

In [89]:
prefixes = ['lgbm', 'catboost', 'xgboost']
valids = [pd.read_csv(f"output/{p}_valid.csv") for p in prefixes]
submissions = [pd.read_csv(f"output/{p}.csv") for p in prefixes]

# BLENDING

In [90]:
def objective(trial: optuna.Trial):
    p_lgbm = trial.suggest_uniform('lgbm', 0.0, 1.0)
    p_cat = trial.suggest_uniform('cat', 0.0, 1.0)
    p_xgb = trial.suggest_uniform('xgb', 0.0, 1.0)
    y = valids[0]['rating'].values
    a_lgbm = valids[0].drop('rating', axis=1).values
    a_cat = valids[1].drop('rating', axis=1).values
    a_xgb = valids[1].drop('rating', axis=1).values
    a = p_lgbm * a_lgbm + p_cat * a_cat + p_xgb * a_xgb
    b = np.argmax(a, axis=1)

    return f1_score(y, b, average='micro')

study = optuna.create_study(direction='maximize')
study.optimize(objective, timeout=10)

[32m[I 2022-05-13 09:12:02,985][0m A new study created in memory with name: no-name-2d5cffd5-dafa-4f74-8852-de9e5041fef8[0m
[32m[I 2022-05-13 09:12:03,083][0m Trial 0 finished with value: 0.3405058013198303 and parameters: {'lgbm': 0.0873896053366432, 'cat': 0.9079219863163073, 'xgb': 0.4266751265409895}. Best is trial 0 with value: 0.3405058013198303.[0m
[32m[I 2022-05-13 09:12:03,176][0m Trial 1 finished with value: 0.3419290691427518 and parameters: {'lgbm': 0.9298296608648309, 'cat': 0.3843218174915358, 'xgb': 0.9950914827064218}. Best is trial 1 with value: 0.3419290691427518.[0m
[32m[I 2022-05-13 09:12:03,268][0m Trial 2 finished with value: 0.34134372519304323 and parameters: {'lgbm': 0.910103329257229, 'cat': 0.5266096525872395, 'xgb': 0.5685204066897753}. Best is trial 1 with value: 0.3419290691427518.[0m
[32m[I 2022-05-13 09:12:03,360][0m Trial 3 finished with value: 0.34185289424518694 and parameters: {'lgbm': 0.4242047128956985, 'cat': 0.7857657769778151, 'xgb

In [91]:
study.trials_dataframe().sort_values(by='value', ascending=False).head()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_cat,params_lgbm,params_xgb,state
94,94,0.342009,2022-05-13 09:12:12.319600,2022-05-13 09:12:12.418184,0 days 00:00:00.098584,0.975768,0.661476,0.221589,COMPLETE
45,45,0.341977,2022-05-13 09:12:07.436945,2022-05-13 09:12:07.536747,0 days 00:00:00.099802,0.996766,0.707076,0.159588,COMPLETE
51,51,0.341977,2022-05-13 09:12:08.044101,2022-05-13 09:12:08.146344,0 days 00:00:00.102243,0.997785,0.729843,0.187839,COMPLETE
62,62,0.341973,2022-05-13 09:12:09.142556,2022-05-13 09:12:09.240650,0 days 00:00:00.098094,0.928536,0.66869,0.190512,COMPLETE
39,39,0.341973,2022-05-13 09:12:06.822820,2022-05-13 09:12:06.922316,0 days 00:00:00.099496,0.852883,0.707921,0.204512,COMPLETE


In [92]:
study.best_params

{'lgbm': 0.6614758414186555,
 'cat': 0.9757680801497848,
 'xgb': 0.22158885412844143}

In [93]:
p_lgbm = study.best_params['lgbm']
p_cat = study.best_params['cat']
p_xgb = study.best_params['xgb']

a_lgbm = submissions[0].drop(['id', 'rating'], axis=1).values
a_cat = submissions[1].drop(['id', 'rating'], axis=1).values
a = p_lgbm * a_lgbm + p_cat * a_cat

sub = submissions[0][['id']].reset_index(drop=True)
sub['rating'] = a.argmax(axis=1) + 1
sub.to_csv('blend_submission.csv', index=False)

# STACKING

In [82]:
import lightgbm as lgb
from catboost import CatBoostClassifier, Pool

In [97]:
a = valids[0].drop('rating', axis=1)
a = a.rename(columns={col: f'{col}_0' for col in a.columns})
b = valids[1].drop('rating', axis=1)
b = b.rename(columns={col: f'{col}_1' for col in b.columns})
c = valids[2].drop('rating', axis=1)
c = c.rename(columns={col: f'{col}_2' for col in c.columns})

mgd_val = pd.concat([a, b, c], axis=1)
mgd_val['rating'] = valids[0]['rating']

a = submissions[0].drop(['id', 'rating'], axis=1)
a = a.rename(columns={col: f'{col}_0' for col in a.columns})
b = submissions[1].drop(['id', 'rating'], axis=1)
b = b.rename(columns={col: f'{col}_1' for col in b.columns})
c = submissions[2].drop(['id', 'rating'], axis=1)
c = c.rename(columns={col: f'{col}_2' for col in c.columns})

mgd_sub = pd.concat([a, b, c], axis=1)
mgd_sub['id'] = submissions[0]['id']

In [98]:
X_train = mgd_val.drop('rating', axis=1)
y_train = mgd_val['rating']

X = mgd_sub.drop('id', axis=1)

params = {
    "iterations": 1000,
    "learning_rate": 0.05,
    "loss_function": 'MultiClass',
}

train_dataset = Pool(data=X_train, label=y_train)
test_dataset = Pool(data=X)

model = CatBoostClassifier(**params)
model.fit(train_dataset)
y = model.predict_proba(test_dataset)

0:	learn: 2.2411645	total: 71.7ms	remaining: 1m 11s
1:	learn: 2.1918750	total: 143ms	remaining: 1m 11s
2:	learn: 2.1496648	total: 218ms	remaining: 1m 12s
3:	learn: 2.1121944	total: 288ms	remaining: 1m 11s
4:	learn: 2.0790793	total: 355ms	remaining: 1m 10s
5:	learn: 2.0503005	total: 423ms	remaining: 1m 10s
6:	learn: 2.0233694	total: 496ms	remaining: 1m 10s
7:	learn: 1.9994586	total: 576ms	remaining: 1m 11s
8:	learn: 1.9778480	total: 645ms	remaining: 1m 10s
9:	learn: 1.9578926	total: 715ms	remaining: 1m 10s
10:	learn: 1.9403901	total: 784ms	remaining: 1m 10s
11:	learn: 1.9236346	total: 854ms	remaining: 1m 10s
12:	learn: 1.9082739	total: 927ms	remaining: 1m 10s
13:	learn: 1.8943540	total: 996ms	remaining: 1m 10s
14:	learn: 1.8819563	total: 1.06s	remaining: 1m 9s
15:	learn: 1.8697716	total: 1.14s	remaining: 1m 9s
16:	learn: 1.8587094	total: 1.2s	remaining: 1m 9s
17:	learn: 1.8485171	total: 1.27s	remaining: 1m 9s
18:	learn: 1.8386895	total: 1.34s	remaining: 1m 9s
19:	learn: 1.8296933	total:

In [99]:
sub = mgd_sub[['id']].reset_index(drop=True)
sub['rating'] = y.argmax(axis=1) + 1
assert sub['rating'].min() == 1 and sub['rating'].max() == 10
sub.to_csv('stack_submission.csv', index=False)