In [1]:
import numpy as np
import pandas as pd
from catboost import Pool, CatBoostClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

  from pandas import MultiIndex, Int64Index


In [2]:
books = pd.read_csv('input/transformed/books.csv', usecols=['book_idx', 'title_idx', 'author_idx', 'publisher_idx'])
users = pd.read_csv('input/transformed/users.csv', usecols=['user_idx', 'age', 'city_idx', 'province_idx', 'country_idx'])
train_ratings = pd.read_csv('input/transformed/train_ratings.csv', usecols=['id', 'user_idx', 'book_idx', 'rating'])
test_ratings = pd.read_csv('input/transformed/test_ratings.csv', usecols=['id', 'user_idx', 'book_idx'])

In [3]:
for c in ['city_idx', 'province_idx', 'country_idx']:
    users[c] = users[c].fillna(-1).astype(int)
for c in ['author_idx', 'publisher_idx']:
    books[c] = books[c].fillna(-1).astype(int)

In [4]:
train = train_ratings.merge(users, on=['user_idx']).merge(books, on=['book_idx']).sort_values(by='id').drop('id', axis=1)
train['rating'] -= 1

In [5]:
skf = StratifiedKFold(n_splits=3, random_state=42, shuffle=True)
X, y = train.drop('rating', axis=1), train['rating']

validations = []

cat_features = ['city_idx', 'province_idx', 'country_idx', 'title_idx', 'author_idx', 'publisher_idx']
cat_features += ['user_idx', 'book_idx']

for fold_idx, (train_index, test_index) in enumerate(skf.split(X, y)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    train_dataset = Pool(data=X_train, label=y_train, cat_features=cat_features)
    eval_dataset = Pool(data=X_test, label=y_test, cat_features=cat_features)

    params = {
        "iterations": 1000,
        "learning_rate": 0.05,
        "loss_function": 'MultiClass',
    }
    model = CatBoostClassifier(**params)
    model.fit(train_dataset, eval_set=eval_dataset, use_best_model=True)
    y_pred = model.predict_proba(eval_dataset)

    y_pred_idx = y_pred.argmax(axis=1)
    score = f1_score(y_test, y_pred_idx, average='micro')
    print(score)

    validations.append(pd.concat([
        pd.DataFrame({
            'rating': y_test.values,
        }),
        pd.DataFrame(y_pred, columns=[f'class{i}' for i in range(10)])
    ], axis=1))
pd.concat(validations).to_csv("output/catboost_valid.csv", index=False)


0:	learn: 2.2587287	test: 2.2579004	best: 2.2579004 (0)	total: 436ms	remaining: 7m 15s
1:	learn: 2.2216391	test: 2.2204391	best: 2.2204391 (1)	total: 756ms	remaining: 6m 17s
2:	learn: 2.1881296	test: 2.1862355	best: 2.1862355 (2)	total: 983ms	remaining: 5m 26s
3:	learn: 2.1575536	test: 2.1548448	best: 2.1548448 (3)	total: 1.21s	remaining: 5m
4:	learn: 2.1306973	test: 2.1275994	best: 2.1275994 (4)	total: 1.49s	remaining: 4m 56s
5:	learn: 2.1059120	test: 2.1020943	best: 2.1020943 (5)	total: 1.78s	remaining: 4m 54s
6:	learn: 2.0835911	test: 2.0793685	best: 2.0793685 (6)	total: 2.21s	remaining: 5m 13s
7:	learn: 2.0637062	test: 2.0591837	best: 2.0591837 (7)	total: 2.48s	remaining: 5m 7s
8:	learn: 2.0393550	test: 2.0327681	best: 2.0327681 (8)	total: 2.74s	remaining: 5m 1s
9:	learn: 2.0174834	test: 2.0091949	best: 2.0091949 (9)	total: 3s	remaining: 4m 56s
10:	learn: 1.9980072	test: 1.9887820	best: 1.9887820 (10)	total: 3.36s	remaining: 5m 2s
11:	learn: 1.9801322	test: 1.9696324	best: 1.969632

In [None]:
test_ratings = pd.read_csv('input/transformed/test_ratings.csv', usecols=['id', 'user_idx', 'book_idx'])
test = test_ratings.merge(users, on=['user_idx']).merge(books, on=['book_idx'])
X_test = test.drop('id', axis=1)

train_dataset = Pool(data=X, label=y, cat_features=cat_features)
test_dataset = Pool(data=X_test, cat_features=cat_features)

params['iterations'] = model.get_best_iteration()
params['use_best_model'] = False
model = CatBoostClassifier(**params)

model.fit(train_dataset)
y_pred = model.predict_proba(test_dataset)
y_pred_idx = y_pred.argmax(axis=1)

pred = pd.concat([test[['id']], pd.DataFrame(y_pred, columns=[f'class{i}' for i in range(10)])], axis=1)
pred['rating'] = y_pred_idx + 1

In [None]:
assert pred['rating'].min() == 1 and pred['rating'].max() == 10
pred.to_csv('output/catboost.csv', index=False)
pred[['id', 'rating']].to_csv('output/catboost_submission.csv', index=False)