In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

  from pandas import MultiIndex, Int64Index


In [2]:
# Load the pre-transformed inputs, keeping only the columns used downstream.
books = pd.read_csv(
    'input/transformed/books.csv',
    usecols=['book_idx', 'title_idx', 'author_idx', 'publisher_idx'],
)
users = pd.read_csv(
    'input/transformed/users.csv',
    usecols=['user_idx', 'age', 'city_idx', 'province_idx', 'country_idx'],
)
# Ratings: the training file carries the target column, the test file does not.
train_ratings = pd.read_csv(
    'input/transformed/train_ratings.csv',
    usecols=['id', 'user_idx', 'book_idx', 'rating'],
)
test_ratings = pd.read_csv(
    'input/transformed/test_ratings.csv',
    usecols=['id', 'user_idx', 'book_idx'],
)

In [3]:
# Replace missing index codes with the sentinel -1 and store the columns as
# integers. Each column is reassigned individually so its dtype really changes.
for frame, code_columns in (
    (users, ['city_idx', 'province_idx', 'country_idx']),
    (books, ['author_idx', 'publisher_idx']),
):
    for c in code_columns:
        filled = frame[c].fillna(-1)
        frame[c] = filled.astype(int)

In [4]:
# Attach user and book features to every training rating, restore the original
# row order via `id`, then discard `id` so it is not used as a feature.
# Ratings are shifted down by one so the labels start at 0.
train = (
    train_ratings
    .merge(users, on=['user_idx'])
    .merge(books, on=['book_idx'])
    .sort_values(by='id')
    .drop('id', axis=1)
    .assign(rating=lambda frame: frame['rating'] - 1)
)

In [19]:
# 3-fold stratified cross-validation of an XGBoost multiclass classifier.
# For each fold the model is fitted with early stopping on the held-out part,
# the micro-averaged F1 is printed, and the held-out class scores are collected;
# all folds are then written to output/xgboost_valid.csv.
skf = StratifiedKFold(n_splits=3, random_state=42, shuffle=True)
X, y = train.drop('rating', axis=1), train['rating']

validations = []

# NOTE: this list is not passed to the model in this cell.
cat_features = ['city_idx', 'province_idx', 'country_idx', 'title_idx', 'author_idx', 'publisher_idx']

# Loop-invariant, so it is built once. 'multi:softprob' is the objective whose
# prediction is a per-class probability vector, which is what predict_proba()
# and the class0..class9 columns below rely on. With 'multi:softmax' the output
# of predict_proba() depends on the installed xgboost release (some return raw
# margins, some raise). The arg-max class is the same for both objectives.
xgb_params = {
    'objective': 'multi:softprob',
    'num_class': 10,
    'eval_metric': 'mlogloss',
}
class_columns = [f'class{i}' for i in range(xgb_params['num_class'])]

for fold_idx, (train_index, test_index) in enumerate(skf.split(X, y)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model = xgb.XGBClassifier(
        n_estimators=100,
        **xgb_params
    )

    # Training stops once the held-out mlogloss has not improved for 20 rounds.
    # NOTE(review): the same held-out fold drives early stopping and the score
    # below, so the reported F1 is slightly optimistic.
    # NOTE(review): newer xgboost releases expect `early_stopping_rounds` on the
    # estimator constructor instead of fit() - adjust when upgrading.
    model.fit(
        X=X_train, y=y_train,
        eval_set=[(X_test, y_test)],
        early_stopping_rounds=20,
    )
    y_pred = model.predict_proba(X_test)

    y_pred_idx = y_pred.argmax(axis=1)
    score = f1_score(y_test, y_pred_idx, average='micro')
    print(f'fold {fold_idx}: micro-F1 = {score}')

    # One row per held-out sample: the true label followed by the class scores.
    validations.append(pd.concat([
        pd.DataFrame({
            'rating': y_test.values,
        }),
        pd.DataFrame(y_pred, columns=class_columns)
    ], axis=1))

pd.concat(validations).to_csv("output/xgboost_valid.csv", index=False)


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[0]	validation_0-mlogloss:2.17535
[1]	validation_0-mlogloss:2.09380
[2]	validation_0-mlogloss:2.03322
[3]	validation_0-mlogloss:1.98821
[4]	validation_0-mlogloss:1.95438
[5]	validation_0-mlogloss:1.92748
[6]	validation_0-mlogloss:1.90671
[7]	validation_0-mlogloss:1.89081
[8]	validation_0-mlogloss:1.87760
[9]	validation_0-mlogloss:1.86602
[10]	validation_0-mlogloss:1.85712
[11]	validation_0-mlogloss:1.85042
[12]	validation_0-mlogloss:1.84474
[13]	validation_0-mlogloss:1.84024
[14]	validation_0-mlogloss:1.83592
[15]	validation_0-mlogloss:1.83203
[16]	validation_0-mlogloss:1.82935
[17]	validation_0-mlogloss:1.82728
[18]	validation_0-mlogloss:1.82463
[19]	validation_0-mlogloss:1.82294
[20]	validation_0-mlogloss:1.82199
[21]	validation_0-mlogloss:1.81893
[22]	validation_0-mlogloss:1.81822
[23]	validation_0-mlogloss:1.81715
[24]	validation_0-mlogloss:1.81616
[25]	validation_0-mlogloss:1.81574
[26]	validation_0-mlogloss:1.81513
[27]	validation_0-mlogloss:1.81458
[28]	validation_0-mlogloss:1.8

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[0]	validation_0-mlogloss:2.17382
[1]	validation_0-mlogloss:2.09188
[2]	validation_0-mlogloss:2.03094
[3]	validation_0-mlogloss:1.98701
[4]	validation_0-mlogloss:1.95208
[5]	validation_0-mlogloss:1.92551
[6]	validation_0-mlogloss:1.90518
[7]	validation_0-mlogloss:1.88756
[8]	validation_0-mlogloss:1.87483
[9]	validation_0-mlogloss:1.86441
[10]	validation_0-mlogloss:1.85589
[11]	validation_0-mlogloss:1.84912
[12]	validation_0-mlogloss:1.84214
[13]	validation_0-mlogloss:1.83797
[14]	validation_0-mlogloss:1.83410
[15]	validation_0-mlogloss:1.83090
[16]	validation_0-mlogloss:1.82741
[17]	validation_0-mlogloss:1.82387
[18]	validation_0-mlogloss:1.82169
[19]	validation_0-mlogloss:1.81991
[20]	validation_0-mlogloss:1.81875
[21]	validation_0-mlogloss:1.81747
[22]	validation_0-mlogloss:1.81624
[23]	validation_0-mlogloss:1.81557
[24]	validation_0-mlogloss:1.81455
[25]	validation_0-mlogloss:1.81338
[26]	validation_0-mlogloss:1.81274
[27]	validation_0-mlogloss:1.81204
[28]	validation_0-mlogloss:1.8

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[0]	validation_0-mlogloss:2.17511
[1]	validation_0-mlogloss:2.09301
[2]	validation_0-mlogloss:2.03336
[3]	validation_0-mlogloss:1.98835
[4]	validation_0-mlogloss:1.95407
[5]	validation_0-mlogloss:1.92633
[6]	validation_0-mlogloss:1.90570
[7]	validation_0-mlogloss:1.88893
[8]	validation_0-mlogloss:1.87559
[9]	validation_0-mlogloss:1.86518
[10]	validation_0-mlogloss:1.85652
[11]	validation_0-mlogloss:1.84952
[12]	validation_0-mlogloss:1.84319
[13]	validation_0-mlogloss:1.83763
[14]	validation_0-mlogloss:1.83418
[15]	validation_0-mlogloss:1.83061
[16]	validation_0-mlogloss:1.82741
[17]	validation_0-mlogloss:1.82521
[18]	validation_0-mlogloss:1.82316
[19]	validation_0-mlogloss:1.82160
[20]	validation_0-mlogloss:1.82034
[21]	validation_0-mlogloss:1.81910
[22]	validation_0-mlogloss:1.81730
[23]	validation_0-mlogloss:1.81634
[24]	validation_0-mlogloss:1.81523
[25]	validation_0-mlogloss:1.81415
[26]	validation_0-mlogloss:1.81322
[27]	validation_0-mlogloss:1.81250
[28]	validation_0-mlogloss:1.8

In [21]:
# Refit on the complete training set and score the test set.
test_ratings = pd.read_csv('input/transformed/test_ratings.csv', usecols=['id', 'user_idx', 'book_idx'])
test = test_ratings.merge(users, on=['user_idx']).merge(books, on=['book_idx'])
# The merges are inner joins: a test rating whose user or book is missing from
# the lookup tables would silently vanish from the submission. Fail loudly instead.
assert len(test) == len(test_ratings), 'merging users/books changed the number of test rows'
X_test = test.drop('id', axis=1)

# `model` is the classifier left over from the last cross-validation fold.
# `best_iteration` is the 0-based index of its best boosting round, so the
# number of trees to grow is that index plus one.
n_rounds = model.best_iteration + 1

# predict_proba() needs the probability-vector objective; override it here so
# the class0..class9 columns hold probabilities whatever `xgb_params` specifies.
final_params = {**xgb_params, 'objective': 'multi:softprob'}

# A separate name keeps the early-stopped `model` intact, so this cell can be
# re-run without losing `best_iteration`.
final_model = xgb.XGBClassifier(
    n_estimators=n_rounds,
    **final_params,
)
final_model.fit(
    X=X, y=y,
)
y_pred = final_model.predict_proba(X_test)

y_pred_idx = y_pred.argmax(axis=1)

# `test` has a fresh 0..n-1 index after the merges, so the column-wise concat
# lines each id up with its prediction row.
pred = pd.concat([test[['id']], pd.DataFrame(y_pred, columns=[f'class{i}' for i in range(10)])], axis=1)
# Class indices are zero-based; ratings on disk start at 1.
pred['rating'] = y_pred_idx + 1

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [22]:
# Sanity check: the predicted ratings must span exactly 1 through 10.
lowest_rating = pred['rating'].min()
highest_rating = pred['rating'].max()
assert lowest_rating == 1 and highest_rating == 10

# Full output (id, class scores, rating) plus the slim id/rating submission file.
pred.to_csv('output/xgboost.csv', index=False)
submission = pred[['id', 'rating']]
submission.to_csv('output/xgboost_submission.csv', index=False)