In [None]:
import numpy as np
import pandas as pd
from catboost import Pool, CatBoostClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

In [None]:
# Read the pre-transformed inputs, keeping only the columns listed for each table.
book_columns = ['book_idx', 'title_idx', 'author_idx', 'publisher_idx']
user_columns = ['user_idx', 'age', 'city_idx', 'province_idx', 'country_idx']
train_rating_columns = ['id', 'user_idx', 'book_idx', 'rating']
test_rating_columns = ['id', 'user_idx', 'book_idx']

books = pd.read_csv('input/transformed/books.csv', usecols=book_columns)
users = pd.read_csv('input/transformed/users.csv', usecols=user_columns)
train_ratings = pd.read_csv('input/transformed/train_ratings.csv', usecols=train_rating_columns)
test_ratings = pd.read_csv('input/transformed/test_ratings.csv', usecols=test_rating_columns)

In [None]:
# Replace missing id values with the sentinel -1 and store the columns as
# integers, one table at a time (the frames are updated in place).
for frame, id_columns in (
    (users, ['city_idx', 'province_idx', 'country_idx']),
    (books, ['author_idx', 'publisher_idx']),
):
    for c in id_columns:
        frame[c] = frame[c].fillna(-1).astype(int)

In [None]:
# Attach user and book attributes to each training rating (default inner
# joins), order the rows by rating id, then discard the id column.
train = (
    pd.merge(train_ratings, users, on='user_idx')
    .merge(books, on='book_idx')
    .sort_values('id')
    .drop(columns='id')
)
# Shift ratings down by one so they can serve as class labels; the prediction
# cell adds the 1 back (presumably 1..10 -> 0..9 -- confirm against the data).
train['rating'] = train['rating'] - 1

In [None]:
# Stratified 3-fold cross-validation of a CatBoost multi-class model.
# Each fold prints its micro-averaged F1 and contributes its hold-out class
# probabilities to output/catboost_valid.csv.
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
X = train.drop(columns='rating')
y = train['rating']

cat_features = ['city_idx', 'province_idx', 'country_idx', 'title_idx', 'author_idx', 'publisher_idx']
class_columns = [f'class{i}' for i in range(10)]

# The training settings are the same for every fold; `params` and the last
# fold's `model` are read again by the cell that fits the final model.
params = {
    "iterations": 1000,
    "learning_rate": 0.05,
    "loss_function": 'MultiClass',
}

validations = []

for fit_rows, holdout_rows in skf.split(X, y):
    X_train, y_train = X.iloc[fit_rows], y.iloc[fit_rows]
    X_test, y_test = X.iloc[holdout_rows], y.iloc[holdout_rows]

    fit_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
    holdout_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)

    # The hold-out fold doubles as the eval set used to pick the best iteration.
    model = CatBoostClassifier(**params)
    model.fit(fit_pool, eval_set=holdout_pool, use_best_model=True)

    holdout_proba = model.predict_proba(holdout_pool)
    holdout_labels = np.argmax(holdout_proba, axis=1)
    print(f1_score(y_test, holdout_labels, average='micro'))

    # True label first, then one probability column per class.
    fold_frame = pd.DataFrame(holdout_proba, columns=class_columns)
    fold_frame.insert(0, 'rating', y_test.values)
    validations.append(fold_frame)

valid_predictions = pd.concat(validations)
valid_predictions.to_csv("output/catboost_valid.csv", index=False)


In [None]:
# Refit on the full training set and predict a rating for every test row.
test_ratings = pd.read_csv('input/transformed/test_ratings.csv', usecols=['id', 'user_idx', 'book_idx'])
# Left joins keep every test rating; the default inner join would silently
# drop any rating whose user or book is absent from the lookup tables, leaving
# the submission short.
test = test_ratings.merge(users, on=['user_idx'], how='left').merge(books, on=['book_idx'], how='left')
# Unmatched rows come back as NaN, which also turns the id columns into floats;
# restore integer ids using the same -1 "missing" sentinel that the
# preprocessing cell applies to users and books.
for c in cat_features:
    test[c] = test[c].fillna(-1).astype(int)
X_test = test.drop('id', axis=1)

train_dataset = Pool(data=X, label=y, cat_features=cat_features)
test_dataset = Pool(data=X_test, cat_features=cat_features)

# At this point `model` is still the classifier from the last cross-validation
# fold. get_best_iteration() is a zero-based iteration index, so the number of
# trees that fold kept is one more than it; passing the raw index would train
# one tree too few (and request zero iterations if the best index were 0).
best_iteration = model.get_best_iteration()
if best_iteration is None:
    # Presumably what a model fit without an eval set reports, e.g. when this
    # cell is re-run after `model` has been rebound below -- TODO confirm.
    raise RuntimeError('model has no best iteration; re-run the cross-validation cell first')
params['iterations'] = best_iteration + 1
params['use_best_model'] = False
model = CatBoostClassifier(**params)

model.fit(train_dataset)
y_pred = model.predict_proba(test_dataset)
y_pred_idx = y_pred.argmax(axis=1)

# One probability column per class index. Training ratings were shifted down
# by one, so the winning index is converted back to a rating by adding 1.
pred = pd.concat([test[['id']], pd.DataFrame(y_pred, columns=[f'class{i}' for i in range(10)])], axis=1)
pred['rating'] = y_pred_idx + 1

In [None]:
# Sanity check before exporting: the predicted ratings must span exactly 1..10.
# Note this also fails if the model never predicts one of the two extremes.
lowest_rating = pred['rating'].min()
highest_rating = pred['rating'].max()
assert lowest_rating == 1 and highest_rating == 10

# Full output (id, class probabilities, rating) and the two-column submission.
pred.to_csv('output/catboost.csv', index=False)
submission = pred[['id', 'rating']]
submission.to_csv('output/catboost_submission.csv', index=False)