In [33]:
import numpy as np
import pandas as pd
from catboost import Pool, CatBoostClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

In [26]:
books = pd.read_csv('input/transformed/books.csv', usecols=['book_idx', 'title_idx', 'author_idx', 'publisher_idx'])
users = pd.read_csv('input/transformed/users.csv', usecols=['user_idx', 'age', 'city_idx', 'province_idx', 'country_idx'])
train_ratings = pd.read_csv('input/transformed/train_ratings.csv', usecols=['id', 'user_idx', 'book_idx', 'rating'])
test_ratings = pd.read_csv('input/transformed/test_ratings.csv', usecols=['id', 'user_idx', 'book_idx'])

In [27]:
for c in ['city_idx', 'province_idx', 'country_idx']:
    users[c] = users[c].fillna(-1).astype(int)
for c in ['author_idx', 'publisher_idx']:
    books[c] = books[c].fillna(-1).astype(int)

In [28]:
train = train_ratings.merge(users, on=['user_idx']).merge(books, on=['book_idx']).sort_values(by='id').drop('id', axis=1)
train['rating'] -= 1

In [35]:
skf = StratifiedKFold(n_splits=3, random_state=42, shuffle=True)
X, y = train.drop('rating', axis=1), train['rating']

validations = []

cat_features = ['city_idx', 'province_idx', 'country_idx', 'title_idx', 'author_idx', 'publisher_idx']

for fold_idx, (train_index, test_index) in enumerate(skf.split(X, y)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    train_dataset = Pool(data=X_train, label=y_train, cat_features=cat_features)
    eval_dataset = Pool(data=X_test, label=y_test, cat_features=cat_features)

    params = {
        "iterations": 100,
        "learning_rate": 0.5,
        "loss_function": 'MultiClass',
    }
    model = CatBoostClassifier(**params)
    model.fit(train_dataset, eval_set=eval_dataset, use_best_model=True)
    y_pred = model.predict_proba(eval_dataset)

    y_pred_idx = y_pred.argmax(axis=1)
    score = f1_score(y_test, y_pred_idx, average='micro')
    print(score)

    validations.append(pd.concat([
        pd.DataFrame({
            'rating': y_test.values,
        }),
        pd.DataFrame(y_pred, columns=[f'class{i}' for i in range(10)])
    ], axis=1))
pd.concat(validations).to_csv("output/catboost_valid.csv", index=False)


0:	learn: 2.0305687	test: 2.0300892	best: 2.0300892 (0)	total: 130ms	remaining: 1.17s
1:	learn: 1.9102704	test: 1.9046784	best: 1.9046784 (1)	total: 267ms	remaining: 1.07s
2:	learn: 1.8662993	test: 1.8589518	best: 1.8589518 (2)	total: 339ms	remaining: 791ms
3:	learn: 1.8393925	test: 1.8319446	best: 1.8319446 (3)	total: 455ms	remaining: 683ms
4:	learn: 1.8208145	test: 1.8119979	best: 1.8119979 (4)	total: 575ms	remaining: 575ms
5:	learn: 1.8117012	test: 1.8020406	best: 1.8020406 (5)	total: 678ms	remaining: 452ms
6:	learn: 1.8062150	test: 1.7966439	best: 1.7966439 (6)	total: 778ms	remaining: 333ms
7:	learn: 1.7999680	test: 1.7914735	best: 1.7914735 (7)	total: 909ms	remaining: 227ms
8:	learn: 1.7967003	test: 1.7886678	best: 1.7886678 (8)	total: 1.01s	remaining: 113ms
9:	learn: 1.7940248	test: 1.7858889	best: 1.7858889 (9)	total: 1.12s	remaining: 0us

bestTest = 1.785888876
bestIteration = 9

0.3036972889754877
0:	learn: 2.0292283	test: 2.0273281	best: 2.0273281 (0)	total: 138ms	remaining: 

In [36]:
test_ratings = pd.read_csv('input/transformed/test_ratings.csv', usecols=['id', 'user_idx', 'book_idx'])
test = test_ratings.merge(users, on=['user_idx']).merge(books, on=['book_idx'])
X_test = test.drop('id', axis=1)

train_dataset = Pool(data=X, label=y, cat_features=cat_features)
test_dataset = Pool(data=X_test, cat_features=cat_features)

params['iterations'] = model.get_best_iteration()
params['use_best_model'] = False
model = CatBoostClassifier(**params)

model.fit(train_dataset)
y_pred = model.predict_proba(test_dataset)
y_pred_idx = y_pred.argmax(axis=1)

pred = pd.concat([test[['id']], pd.DataFrame(y_pred, columns=[f'class{i}' for i in range(10)])], axis=1)
pred['rating'] = y_pred_idx + 1

0:	learn: 2.0204989	total: 218ms	remaining: 1.74s
1:	learn: 1.8991168	total: 437ms	remaining: 1.53s
2:	learn: 1.8498923	total: 622ms	remaining: 1.24s
3:	learn: 1.8197105	total: 814ms	remaining: 1.02s
4:	learn: 1.8062857	total: 993ms	remaining: 795ms
5:	learn: 1.7986047	total: 1.17s	remaining: 585ms
6:	learn: 1.7936274	total: 1.35s	remaining: 387ms
7:	learn: 1.7910994	total: 1.55s	remaining: 194ms
8:	learn: 1.7886305	total: 1.73s	remaining: 0us


In [37]:
assert pred['rating'].min() == 1 and pred['rating'].max() == 10
pred.to_csv('output/catboost.csv', index=False)
pred[['id', 'rating']].to_csv('output/catboost_submission.csv', index=False)