In [6]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

In [7]:
# Read the pre-transformed tables, keeping only the columns used below.
books = pd.read_csv(
    'input/transformed/books.csv',
    usecols=['book_idx', 'title_idx', 'author_idx', 'publisher_idx'],
)
users = pd.read_csv(
    'input/transformed/users.csv',
    usecols=['user_idx', 'age', 'city_idx', 'province_idx', 'country_idx'],
)
# The training ratings carry the target column `rating`; the test ratings do not.
train_ratings = pd.read_csv(
    'input/transformed/train_ratings.csv',
    usecols=['id', 'user_idx', 'book_idx', 'rating'],
)
test_ratings = pd.read_csv(
    'input/transformed/test_ratings.csv',
    usecols=['id', 'user_idx', 'book_idx'],
)

In [8]:
# Replace missing values in the listed index columns with the sentinel -1 and
# store the columns as integers.
user_index_cols = ['city_idx', 'province_idx', 'country_idx']
book_index_cols = ['author_idx', 'publisher_idx']
for frame, index_cols in ((users, user_index_cols), (books, book_index_cols)):
    for col in index_cols:
        frame[col] = frame[col].fillna(-1).astype(int)

In [9]:
# Attach user and book features to every training rating, order rows by `id`,
# then drop `id` so it is not used as a feature.
train = (
    train_ratings
    .merge(users, on=['user_idx'])
    .merge(books, on=['book_idx'])
    .sort_values(by='id')
    .drop(columns='id')
    # Shift the target down by one so the smallest label becomes 0-based.
    .assign(rating=lambda frame: frame['rating'] - 1)
)

In [11]:
# 3-fold stratified cross-validation of an XGBoost multi-class model.
# For every fold: train on two thirds, predict class probabilities on the held-out
# third, print the micro-F1 of the arg-max class, and collect the out-of-fold
# probabilities. The collected predictions are written to output/xgboost_valid.csv.
skf = StratifiedKFold(n_splits=3, random_state=42, shuffle=True)
X, y = train.drop('rating', axis=1), train['rating']

validations = []

# NOTE(review): this list is not passed to XGBoost anywhere in this cell; the
# DMatrix objects below are built from the plain columns of X.
cat_features = ['city_idx', 'province_idx', 'country_idx', 'title_idx', 'author_idx', 'publisher_idx']

# The parameters do not depend on the fold, so they are defined once.
# 'multi:softprob' makes Booster.predict return one probability per class, which
# the arg-max and the class{i} columns below require; 'multi:softmax' would only
# return the predicted label.
xgb_params = {
    'objective': 'multi:softprob',
    'num_class': 10,
    'eval_metric': 'mlogloss',
}

for fold_idx, (train_index, test_index) in enumerate(skf.split(X, y)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    # The last entry of `evals` is the one early stopping monitors.
    evals = [(dtrain, 'train'), (dtest, 'eval')]
    evals_result = {}

    # NOTE(review): num_boost_round=1 means early_stopping_rounds=20 can never
    # trigger; this looks like a leftover smoke-test value - confirm and raise it.
    model = xgb.train(
        xgb_params,
        dtrain,
        num_boost_round=1,
        early_stopping_rounds=20,
        evals=evals,
        evals_result=evals_result,
    )

    # xgb.train returns a Booster, which exposes predict() (there is no
    # predict_proba on Booster). best_iteration is 0-based, hence the +1 for the
    # exclusive upper bound of iteration_range.
    y_pred = model.predict(dtest, iteration_range=(0, model.best_iteration + 1))

    y_pred_idx = y_pred.argmax(axis=1)
    score = f1_score(y_test, y_pred_idx, average='micro')
    print(score)

    # One row per held-out sample: true label followed by the class probabilities.
    validations.append(pd.concat([
        pd.DataFrame({
            'rating': y_test.values,
        }),
        pd.DataFrame(y_pred, columns=[f'class{i}' for i in range(10)])
    ], axis=1))

pd.concat(validations).to_csv("output/xgboost_valid.csv", index=False)


CUDARuntimeError: cudaErrorUnknown: unknown error

In [43]:
# Fit the final model on all training data and predict the test set.
# Relies on `users`, `books`, `X`, `y`, `xgb_params` and the early-stopped `model`
# produced by the cells above.
test_ratings = pd.read_csv('input/transformed/test_ratings.csv', usecols=['id', 'user_idx', 'book_idx'])

# Left joins keep every test id even when a user or book has no feature row; an
# inner join would silently drop those ids from the submission.
test = (
    test_ratings
    .merge(users, on=['user_idx'], how='left')
    .merge(books, on=['book_idx'], how='left')
)
assert len(test) == len(test_ratings), "merge changed the number of test rows"

# Select exactly the training feature columns, in the training order.
X_test = test[list(X.columns)]

dtrain = xgb.DMatrix(X, label=y)
dtest = xgb.DMatrix(X_test)

# Force the probability objective here so the arg-max and the class{i} columns
# below work regardless of the objective stored in xgb_params.
final_params = {**xgb_params, 'objective': 'multi:softprob'}

# best_iteration is a 0-based index, so the number of rounds to train is one more.
best_rounds = model.best_iteration + 1

# Kept under a separate name so `model` (the early-stopped CV booster) is not
# overwritten and this cell can be re-run.
final_model = xgb.train(
    final_params,
    dtrain,
    num_boost_round=best_rounds,
)
# Booster has no predict_proba; predict() returns the per-class probabilities
# under 'multi:softprob'.
y_pred = final_model.predict(dtest)

y_pred_idx = y_pred.argmax(axis=1)

pred = pd.concat([test[['id']], pd.DataFrame(y_pred, columns=[f'class{i}' for i in range(10)])], axis=1)
# Undo the -1 shift applied to the training target.
pred['rating'] = y_pred_idx + 1

0:	learn: 2.0204989	total: 243ms	remaining: 17.5s
1:	learn: 1.8991168	total: 477ms	remaining: 16.9s
2:	learn: 1.8498923	total: 683ms	remaining: 15.9s
3:	learn: 1.8197105	total: 900ms	remaining: 15.5s
4:	learn: 1.8062857	total: 1.08s	remaining: 14.7s
5:	learn: 1.7986047	total: 1.27s	remaining: 14.2s
6:	learn: 1.7936274	total: 1.46s	remaining: 13.8s
7:	learn: 1.7910994	total: 1.67s	remaining: 13.6s
8:	learn: 1.7886305	total: 1.86s	remaining: 13.2s
9:	learn: 1.7862501	total: 2.05s	remaining: 12.9s
10:	learn: 1.7842038	total: 2.24s	remaining: 12.6s
11:	learn: 1.7829434	total: 2.44s	remaining: 12.4s
12:	learn: 1.7806936	total: 2.64s	remaining: 12.2s
13:	learn: 1.7792523	total: 2.81s	remaining: 11.9s
14:	learn: 1.7777990	total: 3s	remaining: 11.6s
15:	learn: 1.7768612	total: 3.18s	remaining: 11.3s
16:	learn: 1.7759711	total: 3.37s	remaining: 11.1s
17:	learn: 1.7750725	total: 3.56s	remaining: 10.9s
18:	learn: 1.7741600	total: 3.74s	remaining: 10.6s
19:	learn: 1.7732871	total: 3.92s	remaining:

In [44]:
# Sanity check: the predicted ratings must span exactly 1 through 10.
rating_low, rating_high = pred['rating'].min(), pred['rating'].max()
assert rating_low == 1 and rating_high == 10

# Full output (id, class probabilities, rating) and the two-column submission.
pred.to_csv('output/xgboost.csv', index=False)
submission = pred[['id', 'rating']]
submission.to_csv('output/xgboost_submission.csv', index=False)