In [59]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
import optuna

In [60]:
prefixes = ['lgbm', 'catboost']
valids = [pd.read_csv(f"output/{p}_valid.csv") for p in prefixes]
submissions = [pd.read_csv(f"output/{p}.csv") for p in prefixes]

# BLENDING

In [61]:
def objective(trial: optuna.Trial):
    p_lgbm = trial.suggest_uniform('lgbm', 0.0, 1.0)
    p_cat = trial.suggest_uniform('cat', 0.0, 1.0)
    y = valids[0]['rating'].values
    a_lgbm = valids[0].drop('rating', axis=1).values
    a_cat = valids[1].drop('rating', axis=1).values
    a = p_lgbm * a_lgbm + p_cat * a_cat
    b = np.argmax(a, axis=1)

    return f1_score(y, b, average='micro')

study = optuna.create_study(direction='maximize')
study.optimize(objective, timeout=10)

[32m[I 2022-05-13 08:57:11,437][0m A new study created in memory with name: no-name-7b854b91-11ec-408c-8d36-a3f2c40579d6[0m
[32m[I 2022-05-13 08:57:11,529][0m Trial 0 finished with value: 0.34144395532141797 and parameters: {'lgbm': 0.1946314813748523, 'cat': 0.24179975234255557}. Best is trial 0 with value: 0.34144395532141797.[0m
[32m[I 2022-05-13 08:57:11,617][0m Trial 1 finished with value: 0.341419900090608 and parameters: {'lgbm': 0.2514533817354314, 'cat': 0.8894002445417409}. Best is trial 0 with value: 0.34144395532141797.[0m
[32m[I 2022-05-13 08:57:11,705][0m Trial 2 finished with value: 0.3198664132849021 and parameters: {'lgbm': 0.9111634585070012, 'cat': 0.057808038934680006}. Best is trial 0 with value: 0.34144395532141797.[0m
[32m[I 2022-05-13 08:57:11,793][0m Trial 3 finished with value: 0.33720221628859864 and parameters: {'lgbm': 0.3030733468102875, 'cat': 0.1828658395725239}. Best is trial 0 with value: 0.34144395532141797.[0m
[32m[I 2022-05-13 08:57:

In [62]:
study.trials_dataframe().sort_values(by='value', ascending=False).head()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_cat,params_lgbm,state
58,58,0.341981,2022-05-13 08:57:16.566502,2022-05-13 08:57:16.654383,0 days 00:00:00.087881,0.948013,0.579407,COMPLETE
102,102,0.341977,2022-05-13 08:57:20.462895,2022-05-13 08:57:20.550916,0 days 00:00:00.088021,0.904059,0.556613,COMPLETE
59,59,0.341977,2022-05-13 08:57:16.655153,2022-05-13 08:57:16.743001,0 days 00:00:00.087848,0.959855,0.582287,COMPLETE
66,66,0.341969,2022-05-13 08:57:17.275673,2022-05-13 08:57:17.363317,0 days 00:00:00.087644,0.830317,0.49934,COMPLETE
39,39,0.341965,2022-05-13 08:57:14.884768,2022-05-13 08:57:14.972242,0 days 00:00:00.087474,0.227769,0.138056,COMPLETE


In [78]:
p_lgbm = 0.579407
p_cat = 0.948013

a_lgbm = submissions[0].drop(['id', 'rating'], axis=1).values
a_cat = submissions[1].drop(['id', 'rating'], axis=1).values
a = p_lgbm * a_lgbm + p_cat * a_cat

sub = submissions[0][['id']].reset_index(drop=True)
sub['rating'] = a.argmax(axis=1) + 1
sub.to_csv('blend_submission.csv', index=False)

# STACKING

In [82]:
import lightgbm as lgb
from catboost import CatBoostClassifier, Pool

In [83]:
a = valids[0].drop('rating', axis=1)
a = a.rename(columns={c: f'{c}_0' for c in a.columns})
b = valids[1].drop('rating', axis=1)
b = b.rename(columns={c: f'{c}_1' for c in b.columns})
mgd_val = pd.concat([a, b], axis=1)
mgd_val['rating'] = valids[0]['rating']

a = submissions[0].drop(['id', 'rating'], axis=1)
a = a.rename(columns={c: f'{c}_0' for c in a.columns})
b = submissions[1].drop(['id', 'rating'], axis=1)
b = b.rename(columns={c: f'{c}_1' for c in b.columns})
mgd_sub = pd.concat([a, b], axis=1)
mgd_sub['id'] = submissions[0]['id']

In [84]:
X_train = mgd_val.drop('rating', axis=1)
y_train = mgd_val['rating']

X = mgd_sub.drop('id', axis=1)

params = {
    "iterations": 1000,
    "learning_rate": 0.05,
    "loss_function": 'MultiClass',
}

train_dataset = Pool(data=X_train, label=y_train)
test_dataset = Pool(data=X)

model = CatBoostClassifier(**params)
model.fit(train_dataset)
y = model.predict_proba(test_dataset)

0:	learn: 2.2420238	total: 141ms	remaining: 2m 20s
1:	learn: 2.1917796	total: 229ms	remaining: 1m 54s
2:	learn: 2.1486238	total: 317ms	remaining: 1m 45s
3:	learn: 2.1110073	total: 402ms	remaining: 1m 40s
4:	learn: 2.0786613	total: 494ms	remaining: 1m 38s
5:	learn: 2.0494784	total: 576ms	remaining: 1m 35s
6:	learn: 2.0238064	total: 665ms	remaining: 1m 34s
7:	learn: 1.9997636	total: 750ms	remaining: 1m 33s
8:	learn: 1.9777117	total: 828ms	remaining: 1m 31s
9:	learn: 1.9579535	total: 915ms	remaining: 1m 30s
10:	learn: 1.9399007	total: 1s	remaining: 1m 30s
11:	learn: 1.9232254	total: 1.08s	remaining: 1m 29s
12:	learn: 1.9081928	total: 1.17s	remaining: 1m 29s
13:	learn: 1.8938540	total: 1.26s	remaining: 1m 29s
14:	learn: 1.8809343	total: 1.35s	remaining: 1m 28s
15:	learn: 1.8691431	total: 1.44s	remaining: 1m 28s
16:	learn: 1.8579804	total: 1.52s	remaining: 1m 27s
17:	learn: 1.8479084	total: 1.61s	remaining: 1m 27s
18:	learn: 1.8382287	total: 1.7s	remaining: 1m 27s
19:	learn: 1.8292395	total

In [85]:
sub = mgd_sub[['id']].reset_index(drop=True)
sub['rating'] = y.argmax(axis=1) + 1
assert sub['rating'].min() == 1 and sub['rating'].max() == 10
sub.to_csv('stack_submission.csv', index=False)