In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
import optuna

In [None]:
# Base models whose predictions are combined below. Each one has two files
# under output/: "<prefix>_valid.csv" (later cells read its 'rating' column)
# and "<prefix>.csv" (later cells read its 'id' and 'rating' columns).
prefixes = ['lgbm', 'catboost', 'xgboost']

valid_paths = [f"output/{name}_valid.csv" for name in prefixes]
submission_paths = [f"output/{name}.csv" for name in prefixes]

# Both lists are index-aligned with `prefixes` (0 = lgbm, 1 = catboost, 2 = xgboost).
valids = list(map(pd.read_csv, valid_paths))
submissions = list(map(pd.read_csv, submission_paths))

# BLENDING

In [None]:
def objective(trial: optuna.Trial):
    """Score one candidate set of blending weights on the validation predictions.

    One weight in [0, 1] is sampled per base model. Every column except
    'rating' of each validation frame is multiplied by that model's weight,
    the frames are summed, and the arg-max column index is used as the
    predicted label.

    Args:
        trial: Optuna trial used to sample the weights. The parameter names
            ('lgbm', 'cat', 'xgb') are the keys later read back from
            ``study.best_params``, so they must not change.

    Returns:
        Micro-averaged F1 between ``valids[0]['rating']`` and the blended
        arg-max prediction.
    """
    # suggest_float replaces suggest_uniform, which Optuna deprecated in v3.0.
    # The sampling order (lgbm, cat, xgb) matches the order of `valids`.
    weights = [
        trial.suggest_float(name, 0.0, 1.0)
        for name in ('lgbm', 'cat', 'xgb')
    ]

    # Ground truth is taken from the first frame only.
    # NOTE(review): assumes all validation files share the same row order and
    # that labels equal the 0-based column position -- confirm.
    y = valids[0]['rating'].values

    blended = sum(
        weight * frame.drop('rating', axis=1).values
        for weight, frame in zip(weights, valids)
    )
    b = np.argmax(blended, axis=1)

    return f1_score(y, b, average='micro')

# Maximise validation F1 over the blending weights. TPESampler is Optuna's
# default sampler; it is created explicitly only so that it can be seeded and
# the sequence of suggested weights is repeatable across kernel restarts.
# NOTE: the search is bounded by wall-clock time (60 s), so the number of
# completed trials -- and therefore the best weights -- can still vary with
# machine speed.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, timeout=60)

In [None]:
# Show the top rows of the trial table, sorted by the 'value' column in descending order.
study.trials_dataframe().sort_values(by='value', ascending=False).head()

In [None]:
# Parameters of the best trial; the keys are the names used in objective(): 'lgbm', 'cat', 'xgb'.
study.best_params

In [None]:
# Apply the tuned blending weights to the test-set predictions.
p_lgbm = study.best_params['lgbm']
p_cat = study.best_params['cat']
p_xgb = study.best_params['xgb']

# Keep only the score columns; `submissions` is ordered like `prefixes`
# (0 = lgbm, 1 = catboost, 2 = xgboost).
a_lgbm = submissions[0].drop(['id', 'rating'], axis=1).values
a_cat = submissions[1].drop(['id', 'rating'], axis=1).values
a_xgb = submissions[2].drop(['id', 'rating'], axis=1).values

# All three models must take part: the weights were tuned jointly in
# objective(), so leaving the XGBoost term out would submit a different blend
# from the one that was scored on the validation set.
a = p_lgbm * a_lgbm + p_cat * a_cat + p_xgb * a_xgb

sub = submissions[0][['id']].reset_index(drop=True)
# argmax yields a 0-based column index; the submitted rating is that index + 1.
sub['rating'] = a.argmax(axis=1) + 1
sub.to_csv('blend_submission.csv', index=False)

# STACKING

In [None]:
import lightgbm as lgb
from catboost import CatBoostClassifier, Pool

In [None]:
# Build the stacker's inputs. For every base model, take its prediction columns
# and suffix them with the model's position in `valids` / `submissions`
# ("<col>_0", "<col>_1", ...) so the names stay unique after concatenation.
val_parts = [
    frame.drop('rating', axis=1).rename(columns=lambda col, i=i: f'{col}_{i}')
    for i, frame in enumerate(valids)
]
sub_parts = [
    frame.drop(['id', 'rating'], axis=1).rename(columns=lambda col, i=i: f'{col}_{i}')
    for i, frame in enumerate(submissions)
]

# pd.concat(axis=1) aligns on the index and pads missing rows with NaN, so a
# prediction file with a different number of rows would otherwise go unnoticed.
assert len({len(part) for part in val_parts}) == 1, "validation prediction files differ in length"
assert len({len(part) for part in sub_parts}) == 1, "submission prediction files differ in length"

# Training table for the stacker: all models' validation predictions + the label.
mgd_val = pd.concat(val_parts, axis=1)
mgd_val['rating'] = valids[0]['rating']

# Inference table for the stacker: all models' test predictions + the row id.
mgd_sub = pd.concat(sub_parts, axis=1)
mgd_sub['id'] = submissions[0]['id']

In [None]:
# Second-level (stacking) model: CatBoost is fitted on the merged validation
# predictions and then applied to the merged test-set predictions.
y_train = mgd_val['rating']
X_train = mgd_val.drop(columns='rating')

X = mgd_sub.drop(columns='id')

params = dict(
    iterations=2000,
    learning_rate=0.05,
    loss_function='MultiClass',
)

train_dataset = Pool(data=X_train, label=y_train)
test_dataset = Pool(data=X)

model = CatBoostClassifier(**params)
model.fit(train_dataset)
# Output of predict_proba for the test rows; the next cell reads it as `y`.
y = model.predict_proba(test_dataset)

In [None]:
# Turn the stacker's output into a submission: the rating is the arg-max
# column index shifted by one.
predicted = y.argmax(axis=1) + 1

sub = mgd_sub[['id']].reset_index(drop=True)
sub['rating'] = predicted

# Sanity check: the smallest predicted rating must be 1 and the largest 10.
assert (sub['rating'].min(), sub['rating'].max()) == (1, 10)
sub.to_csv('stack_submission.csv', index=False)

# STACKING V2

In [None]:
# Rebuild the merged prediction tables from scratch so this section does not
# depend on what earlier cells did to `mgd_val` / `mgd_sub`. Each model's
# columns get the model's position as a suffix ("<col>_0", "<col>_1", ...).
val_parts = [
    frame.drop('rating', axis=1).rename(columns=lambda col, i=i: f'{col}_{i}')
    for i, frame in enumerate(valids)
]
sub_parts = [
    frame.drop(['id', 'rating'], axis=1).rename(columns=lambda col, i=i: f'{col}_{i}')
    for i, frame in enumerate(submissions)
]

# Column-wise concat aligns on the index and fills gaps with NaN; fail loudly
# instead if the prediction files do not all have the same number of rows.
assert len({len(part) for part in val_parts}) == 1, "validation prediction files differ in length"
assert len({len(part) for part in sub_parts}) == 1, "submission prediction files differ in length"

# Validation predictions of every model plus the label column.
mgd_val = pd.concat(val_parts, axis=1)
mgd_val['rating'] = valids[0]['rating']

# Test predictions of every model plus the row id.
mgd_sub = pd.concat(sub_parts, axis=1)
mgd_sub['id'] = submissions[0]['id']

In [None]:
from sklearn.model_selection import StratifiedKFold
# Feature tables; only the columns used as extra stacker features are read.
book_cols = ['book_idx', 'title_idx', 'author_idx', 'publisher_idx']
user_cols = ['user_idx', 'age', 'city_idx', 'province_idx', 'country_idx']
rating_cols = ['id', 'user_idx', 'book_idx']

books = pd.read_csv('input/transformed/books.csv', usecols=book_cols)
users = pd.read_csv('input/transformed/users.csv', usecols=user_cols)
train_ratings = pd.read_csv('input/transformed/train_ratings.csv', usecols=rating_cols + ['rating'])
test_ratings = pd.read_csv('input/transformed/test_ratings.csv', usecols=rating_cols)

# Attach user and book attributes to each rating, order the rows by rating id,
# then drop the id so it is not used as a feature.
train = train_ratings.merge(users, on=['user_idx'])
train = train.merge(books, on=['book_idx'])
train = train.sort_values(by='id').drop('id', axis=1)

# Shift the labels down by one; the submission cells add the one back.
train['rating'] = train['rating'] - 1

In [None]:
# Re-create the 3-fold stratified split and stack the held-out rows of every
# fold, in fold order.
# NOTE(review): this is only row-aligned with the "*_valid.csv" predictions if
# the base models produced them with the same split, seed and row order -- confirm.
feature_cols = ['book_idx', 'user_idx', 'title_idx', 'author_idx', 'publisher_idx', 'age', 'city_idx', 'province_idx', 'country_idx']

skf = StratifiedKFold(n_splits=3, random_state=42, shuffle=True)
X, y = train.drop('rating', axis=1), train['rating']

# Only the held-out rows are needed, so the training side of each split is not
# materialised.
fold_frames = [
    X.iloc[test_index][feature_cols].reset_index(drop=True)
    for _, test_index in skf.split(X, y)
]
validations = pd.concat(fold_frames)

In [None]:
# Test-side features, using the same columns in the same order as `validations`.
# NOTE(review): unlike `train`, the rows are not sorted by 'id' here and both
# merges are inner joins -- confirm the row order and count still match the
# submission files these features are later concatenated with.
test_ratings = pd.read_csv('input/transformed/test_ratings.csv', usecols=['id', 'user_idx', 'book_idx'])

test_feature_cols = ['book_idx', 'user_idx', 'title_idx', 'author_idx', 'publisher_idx', 'age', 'city_idx', 'province_idx', 'country_idx']
test = test_ratings.merge(users, on=['user_idx'])
test = test.merge(books, on=['book_idx'])
test = test[test_feature_cols].reset_index(drop=True)

In [None]:
# Replace missing ids with the sentinel -1 and cast the columns to integers,
# for both the validation and the test features.
id_cols = ['city_idx', 'province_idx', 'country_idx', 'author_idx', 'publisher_idx']
for frame in (validations, test):
    for col in id_cols:
        frame[col] = frame[col].fillna(-1).astype(int)

In [None]:
# Append the user/book features to the merged base-model predictions.
# pd.concat(axis=1) aligns on the index and pads missing rows with NaN, so every
# frame gets a fresh 0..n-1 index and the row counts are checked first.
mgd_val = mgd_val.reset_index(drop=True)
validations = validations.reset_index(drop=True)
mgd_sub = mgd_sub.reset_index(drop=True)
test = test.reset_index(drop=True)

assert len(mgd_val) == len(validations), "validation predictions and features differ in length"
assert len(mgd_sub) == len(test), "test predictions and features differ in length"
# Running this cell twice would append the feature columns a second time.
assert not set(mgd_val.columns) & set(validations.columns), "features already attached; rebuild mgd_val first"
assert not set(mgd_sub.columns) & set(test.columns), "features already attached; rebuild mgd_sub first"

mgd_val = pd.concat([mgd_val, validations], axis=1)
mgd_sub = pd.concat([mgd_sub, test], axis=1)

In [None]:
# Second-level model, v2: same CatBoost setup, but the inputs now also carry
# the user/book feature columns appended to the base-model predictions.
y_train = mgd_val['rating']
X_train = mgd_val.drop(columns='rating')

X = mgd_sub.drop(columns='id')

params = dict(
    iterations=2000,
    learning_rate=0.05,
    loss_function='MultiClass',
)

train_dataset = Pool(data=X_train, label=y_train)
test_dataset = Pool(data=X)

model = CatBoostClassifier(**params)
model.fit(train_dataset)
# Output of predict_proba for the test rows; the next cell reads it as `y`.
y = model.predict_proba(test_dataset)

In [None]:
# Turn the v2 stacker's output into a submission: the rating is the arg-max
# column index shifted by one.
predicted = y.argmax(axis=1) + 1

sub = mgd_sub[['id']].reset_index(drop=True)
sub['rating'] = predicted

# Sanity check: the smallest predicted rating must be 1 and the largest 10.
assert (sub['rating'].min(), sub['rating'].max()) == (1, 10)
sub.to_csv('stackv2_submission.csv', index=False)