In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

  from pandas import MultiIndex, Int64Index


In [2]:
# Load the pre-transformed tables, reading only the listed columns from each file.
books = pd.read_csv(
    'input/transformed/books.csv',
    usecols=['book_idx', 'title_idx', 'author_idx', 'publisher_idx'],
)
users = pd.read_csv(
    'input/transformed/users.csv',
    usecols=['user_idx', 'age', 'city_idx', 'province_idx', 'country_idx'],
)
# Train ratings carry the target column; test ratings only identify (user, book) pairs.
train_ratings = pd.read_csv(
    'input/transformed/train_ratings.csv',
    usecols=['id', 'user_idx', 'book_idx', 'rating'],
)
test_ratings = pd.read_csv(
    'input/transformed/test_ratings.csv',
    usecols=['id', 'user_idx', 'book_idx'],
)

In [3]:
# Replace missing index values with the sentinel -1 and cast each column to int.
# `age` is not in either list, so it is left untouched here.
for frame, index_columns in (
    (users, ['city_idx', 'province_idx', 'country_idx']),
    (books, ['author_idx', 'publisher_idx']),
):
    for column in index_columns:
        frame[column] = frame[column].fillna(-1).astype(int)

In [4]:
# Attach user and book features to every training rating, restore the original
# row order via `id`, then drop `id` so it is not used as a feature.
# NOTE(review): both merges use pandas' default inner join, so a rating whose
# user or book is absent from the lookup tables would be dropped — confirm none are.
train = (
    train_ratings
    .merge(users, on=['user_idx'])
    .merge(books, on=['book_idx'])
    .sort_values(by='id')
    .drop('id', axis=1)
)
# Shift labels down by one so they start at 0 for the classifier.
train['rating'] = train['rating'] - 1

In [5]:
# 3-fold stratified cross-validation; the fixed seed keeps fold assignment reproducible.
skf = StratifiedKFold(n_splits=3, random_state=42, shuffle=True)
X, y = train.drop('rating', axis=1), train['rating']

# One frame per fold: true label plus the predicted probability of each class.
validations = []

# NOTE(review): this list is never passed to the model in this cell, so these
# columns reach XGBoost as plain integer features — confirm that is intended.
cat_features = ['city_idx', 'province_idx', 'country_idx', 'title_idx', 'author_idx', 'publisher_idx']

# Hyper-parameters are identical for every fold, so they are built once here
# instead of inside the loop. The final-fit cell reuses this same dict.
xgb_params = {
    'objective': 'multi:softmax',
    'num_class': 10,  # labels were shifted to start at 0 when `train` was built
    'eval_metric': 'mlogloss',
    'learning_rate': 0.05,
    'tree_method': 'gpu_hist',
}

for fold_idx, (train_index, test_index) in enumerate(skf.split(X, y)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Upper bound of 1000 rounds; early stopping on the held-out fold decides
    # how many are actually kept.
    model = xgb.XGBClassifier(
        n_estimators=1000,
        **xgb_params
    )

    model.fit(
        X=X_train, y=y_train,
        eval_set=[(X_test, y_test)],
        early_stopping_rounds=20,)
    y_pred = model.predict_proba(X_test)

    # Hard prediction = most probable class; scored with micro-averaged F1.
    y_pred_idx = y_pred.argmax(axis=1)
    score = f1_score(y_test, y_pred_idx, average='micro')
    print(score)

    # `.values` drops the original index so the labels line up positionally
    # with the freshly indexed probability frame.
    validations.append(pd.concat([
        pd.DataFrame({
            'rating': y_test.values,
        }),
        pd.DataFrame(y_pred, columns=[f'class{i}' for i in range(10)])
    ], axis=1))

# Persist the out-of-fold labels and class probabilities of all folds in one file.
pd.concat(validations).to_csv("output/xgboost_valid.csv", index=False)


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[0]	validation_0-mlogloss:2.27963
[1]	validation_0-mlogloss:2.25840
[2]	validation_0-mlogloss:2.23862
[3]	validation_0-mlogloss:2.22024
[4]	validation_0-mlogloss:2.20283
[5]	validation_0-mlogloss:2.18659
[6]	validation_0-mlogloss:2.17131
[7]	validation_0-mlogloss:2.15687
[8]	validation_0-mlogloss:2.14318
[9]	validation_0-mlogloss:2.13023
[10]	validation_0-mlogloss:2.11771
[11]	validation_0-mlogloss:2.10594
[12]	validation_0-mlogloss:2.09468
[13]	validation_0-mlogloss:2.08405
[14]	validation_0-mlogloss:2.07379
[15]	validation_0-mlogloss:2.06410
[16]	validation_0-mlogloss:2.05471
[17]	validation_0-mlogloss:2.04588
[18]	validation_0-mlogloss:2.03733
[19]	validation_0-mlogloss:2.02921
[20]	validation_0-mlogloss:2.02143
[21]	validation_0-mlogloss:2.01389
[22]	validation_0-mlogloss:2.00679
[23]	validation_0-mlogloss:1.99988
[24]	validation_0-mlogloss:1.99335
[25]	validation_0-mlogloss:1.98708
[26]	validation_0-mlogloss:1.98105
[27]	validation_0-mlogloss:1.97523
[28]	validation_0-mlogloss:1.9

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[0]	validation_0-mlogloss:2.27935
[1]	validation_0-mlogloss:2.25805
[2]	validation_0-mlogloss:2.23812
[3]	validation_0-mlogloss:2.21958
[4]	validation_0-mlogloss:2.20232
[5]	validation_0-mlogloss:2.18594
[6]	validation_0-mlogloss:2.17025
[7]	validation_0-mlogloss:2.15578
[8]	validation_0-mlogloss:2.14207
[9]	validation_0-mlogloss:2.12868
[10]	validation_0-mlogloss:2.11636
[11]	validation_0-mlogloss:2.10457
[12]	validation_0-mlogloss:2.09336
[13]	validation_0-mlogloss:2.08250
[14]	validation_0-mlogloss:2.07249
[15]	validation_0-mlogloss:2.06283
[16]	validation_0-mlogloss:2.05337
[17]	validation_0-mlogloss:2.04461
[18]	validation_0-mlogloss:2.03586
[19]	validation_0-mlogloss:2.02791
[20]	validation_0-mlogloss:2.01997
[21]	validation_0-mlogloss:2.01255
[22]	validation_0-mlogloss:2.00547
[23]	validation_0-mlogloss:1.99863
[24]	validation_0-mlogloss:1.99189
[25]	validation_0-mlogloss:1.98575
[26]	validation_0-mlogloss:1.97972
[27]	validation_0-mlogloss:1.97390
[28]	validation_0-mlogloss:1.9

In [None]:
# Build the test feature matrix with the same joins used for the training data.
test_ratings = pd.read_csv('input/transformed/test_ratings.csv', usecols=['id', 'user_idx', 'book_idx'])
# NOTE(review): default inner joins — a test row whose user or book is missing
# from the lookup tables would be dropped from the submission; confirm none are.
test = test_ratings.merge(users, on=['user_idx']).merge(books, on=['book_idx'])
X_test = test.drop('id', axis=1)

# `model` is the early-stopped classifier left over from the last CV fold.
# `best_iteration` is a 0-based round index, so the number of boosting rounds
# to train is best_iteration + 1.
best_n_estimators = model.best_iteration + 1

# Refit on the full training set with the tuned round count. The final model
# gets its own name so the CV `model` stays intact and this cell can be re-run
# without reading `best_iteration` from a model fitted without early stopping.
final_model = xgb.XGBClassifier(
    n_estimators=best_n_estimators,
    **xgb_params,
)
final_model.fit(
    X=X, y=y,
)
y_pred = final_model.predict_proba(X_test)

# Most probable class index per row.
y_pred_idx = y_pred.argmax(axis=1)

# Both frames carry a fresh 0..n-1 index (merge output / new DataFrame), so the
# column-wise concat pairs each id with its own probability row.
pred = pd.concat([test[['id']], pd.DataFrame(y_pred, columns=[f'class{i}' for i in range(10)])], axis=1)
# Undo the -1 label shift applied to the training target.
pred['rating'] = y_pred_idx + 1

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [None]:
# Sanity check before writing: the predicted ratings must span exactly 1..10.
# NOTE(review): this also fails when the model simply never predicts a 1 or a
# 10 — confirm that strictness is wanted rather than a plain range check.
lowest_rating, highest_rating = pred['rating'].min(), pred['rating'].max()
assert lowest_rating == 1 and highest_rating == 10

# Full output (id, class probabilities, rating), then the two-column submission file.
pred.to_csv('output/xgboost.csv', index=False)
submission = pred[['id', 'rating']]
submission.to_csv('output/xgboost_submission.csv', index=False)