In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

  from pandas import MultiIndex, Int64Index


In [2]:
# Load the pre-transformed CSV inputs, reading only the listed columns of each file.
books = pd.read_csv(
    'input/transformed/books.csv',
    usecols=['book_idx', 'title_idx', 'author_idx', 'publisher_idx'],
)
users = pd.read_csv(
    'input/transformed/users.csv',
    usecols=['user_idx', 'age', 'city_idx', 'province_idx', 'country_idx'],
)
train_ratings = pd.read_csv(
    'input/transformed/train_ratings.csv',
    usecols=['id', 'user_idx', 'book_idx', 'rating'],
)
test_ratings = pd.read_csv(
    'input/transformed/test_ratings.csv',
    usecols=['id', 'user_idx', 'book_idx'],
)

In [3]:
# Replace missing index values with the sentinel -1 and cast the columns to int.
# `age` is not touched here, so any missing ages stay as they were loaded.
for frame, index_columns in (
    (users, ['city_idx', 'province_idx', 'country_idx']),
    (books, ['author_idx', 'publisher_idx']),
):
    frame[index_columns] = frame[index_columns].fillna(-1).astype(int)

In [4]:
# Attach user and book features to every training rating, order the rows by
# `id`, and then drop `id` so it is not used as a feature.
train = (
    train_ratings
    .merge(users, on=['user_idx'])
    .merge(books, on=['book_idx'])
    .sort_values(by='id')
    .drop('id', axis=1)
)
# Shift ratings down by one; presumably this maps 1..10 ratings onto the
# 0-based class labels the classifier is trained on -- confirm against the data.
train['rating'] = train['rating'] - 1

In [5]:
# 3-fold stratified cross-validation of the XGBoost rating classifier.
# For every fold: fit with early stopping on the held-out part, print the
# micro-averaged F1 of the arg-max prediction, and keep the per-class
# probabilities. All held-out predictions are written to one CSV at the end.
skf = StratifiedKFold(n_splits=3, random_state=42, shuffle=True)
X, y = train.drop('rating', axis=1), train['rating']

validations = []

cat_features = ['city_idx', 'province_idx', 'country_idx', 'title_idx', 'author_idx', 'publisher_idx']

# The booster settings do not change between folds, so they are built once,
# before the loop. They are also read again when the final model is trained.
xgb_params = {
    # `multi:softprob` is the objective that outputs one probability per class,
    # which is what `predict_proba` below relies on (`multi:softmax` outputs
    # class labels instead).
    'objective': 'multi:softprob',
    'num_class': 10,
    'eval_metric': 'mlogloss',
    'learning_rate': 0.05,
    'tree_method': 'gpu_hist',
}
class_columns = [f'class{i}' for i in range(xgb_params['num_class'])]

for fold_idx, (train_index, test_index) in enumerate(skf.split(X, y)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model = xgb.XGBClassifier(
        n_estimators=1000,
        **xgb_params
    )

    # Stop adding trees once the held-out mlogloss has not improved for 20 rounds.
    model.fit(
        X=X_train, y=y_train,
        eval_set=[(X_test, y_test)],
        early_stopping_rounds=20,
    )
    y_pred = model.predict_proba(X_test)

    y_pred_idx = y_pred.argmax(axis=1)
    score = f1_score(y_test, y_pred_idx, average='micro')
    print(score)

    # Both frames get a fresh 0..n-1 index, so the column-wise concat lines up
    # each true label with its predicted probabilities.
    validations.append(pd.concat([
        pd.DataFrame({
            'rating': y_test.values,
        }),
        pd.DataFrame(y_pred, columns=class_columns)
    ], axis=1))
pd.concat(validations).to_csv("output/xgboost_valid.csv", index=False)


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[0]	validation_0-mlogloss:2.27969
[1]	validation_0-mlogloss:2.25854
[2]	validation_0-mlogloss:2.23884
[3]	validation_0-mlogloss:2.22051
[4]	validation_0-mlogloss:2.20313
[5]	validation_0-mlogloss:2.18699
[6]	validation_0-mlogloss:2.17156
[7]	validation_0-mlogloss:2.15712
[8]	validation_0-mlogloss:2.14343
[9]	validation_0-mlogloss:2.13034
[10]	validation_0-mlogloss:2.11798
[11]	validation_0-mlogloss:2.10614
[12]	validation_0-mlogloss:2.09486
[13]	validation_0-mlogloss:2.08426
[14]	validation_0-mlogloss:2.07408
[15]	validation_0-mlogloss:2.06422
[16]	validation_0-mlogloss:2.05503
[17]	validation_0-mlogloss:2.04605
[18]	validation_0-mlogloss:2.03771
[19]	validation_0-mlogloss:2.02962
[20]	validation_0-mlogloss:2.02176
[21]	validation_0-mlogloss:2.01435
[22]	validation_0-mlogloss:2.00711
[23]	validation_0-mlogloss:2.00031
[24]	validation_0-mlogloss:1.99369
[25]	validation_0-mlogloss:1.98736
[26]	validation_0-mlogloss:1.98133
[27]	validation_0-mlogloss:1.97551
[28]	validation_0-mlogloss:1.9

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[3]	validation_0-mlogloss:2.22011
[4]	validation_0-mlogloss:2.20283
[5]	validation_0-mlogloss:2.18613
[6]	validation_0-mlogloss:2.17077
[7]	validation_0-mlogloss:2.15623
[8]	validation_0-mlogloss:2.14223
[9]	validation_0-mlogloss:2.12922
[10]	validation_0-mlogloss:2.11685
[11]	validation_0-mlogloss:2.10487
[12]	validation_0-mlogloss:2.09382
[13]	validation_0-mlogloss:2.08324
[14]	validation_0-mlogloss:2.07309
[15]	validation_0-mlogloss:2.06318
[16]	validation_0-mlogloss:2.05411
[17]	validation_0-mlogloss:2.04534
[18]	validation_0-mlogloss:2.03663
[19]	validation_0-mlogloss:2.02864
[20]	validation_0-mlogloss:2.02085
[21]	validation_0-mlogloss:2.01350
[22]	validation_0-mlogloss:2.00643
[23]	validation_0-mlogloss:1.99939
[24]	validation_0-mlogloss:1.99287
[25]	validation_0-mlogloss:1.98674
[26]	validation_0-mlogloss:1.98077
[27]	validation_0-mlogloss:1.97474
[28]	validation_0-mlogloss:1.96908
[29]	validation_0-mlogloss:1.96379
[30]	validation_0-mlogloss:1.95856
[31]	validation_0-mlogloss:

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[3]	validation_0-mlogloss:2.22049
[4]	validation_0-mlogloss:2.20320
[5]	validation_0-mlogloss:2.18690
[6]	validation_0-mlogloss:2.17166
[7]	validation_0-mlogloss:2.15715
[8]	validation_0-mlogloss:2.14320
[9]	validation_0-mlogloss:2.13009
[10]	validation_0-mlogloss:2.11766
[11]	validation_0-mlogloss:2.10593
[12]	validation_0-mlogloss:2.09479
[13]	validation_0-mlogloss:2.08393
[14]	validation_0-mlogloss:2.07375
[15]	validation_0-mlogloss:2.06408
[16]	validation_0-mlogloss:2.05466
[17]	validation_0-mlogloss:2.04587
[18]	validation_0-mlogloss:2.03731
[19]	validation_0-mlogloss:2.02931
[20]	validation_0-mlogloss:2.02149
[21]	validation_0-mlogloss:2.01399
[22]	validation_0-mlogloss:2.00690
[23]	validation_0-mlogloss:2.00011
[24]	validation_0-mlogloss:1.99366
[25]	validation_0-mlogloss:1.98717
[26]	validation_0-mlogloss:1.98122
[27]	validation_0-mlogloss:1.97535
[28]	validation_0-mlogloss:1.96981
[29]	validation_0-mlogloss:1.96444
[30]	validation_0-mlogloss:1.95938
[31]	validation_0-mlogloss:

In [6]:
# Train the final model on the full training set and predict the test set.
test_ratings = pd.read_csv('input/transformed/test_ratings.csv', usecols=['id', 'user_idx', 'book_idx'])

# Left joins keep every test rating even if its user or book has no feature
# row (the missing features then become NaN); an inner join would silently
# drop such rows from the predictions.
test = (
    test_ratings
    .merge(users, on=['user_idx'], how='left')
    .merge(books, on=['book_idx'], how='left')
)
assert len(test) == len(test_ratings), 'merging features changed the number of test rows'

# Select the features by name so they are in exactly the order used for training.
X_test = test[X.columns]

# `best_iteration` is a 0-based index, so the matching number of trees is one
# more. It is read from the `model` left by the cross-validation loop; the final
# estimator gets its own name so re-running this cell does not overwrite it.
final_params = {**xgb_params, 'objective': 'multi:softprob'}  # predict_proba needs probabilities
final_model = xgb.XGBClassifier(
    n_estimators=model.best_iteration + 1,
    **final_params,
)
final_model.fit(
    X=X, y=y,
)
y_pred = final_model.predict_proba(X_test)


y_pred_idx = y_pred.argmax(axis=1)

# Give the probability frame the same index as `test` so the column-wise
# concat pairs every id with its own prediction row.
class_columns = [f'class{i}' for i in range(xgb_params['num_class'])]
pred = pd.concat([test[['id']], pd.DataFrame(y_pred, columns=class_columns, index=test.index)], axis=1)
# Class indices are 0-based; add 1 to turn them back into ratings.
pred['rating'] = y_pred_idx + 1

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [None]:
# Sanity-check the predictions, then write the full table (id, class
# probabilities, rating) and the two-column submission file.
# Only the range is checked: a valid prediction set does not have to contain
# both the lowest and the highest rating.
assert len(pred) > 0, 'no predictions to write'
assert pred['rating'].min() >= 1 and pred['rating'].max() <= 10, \
    'predicted ratings must lie between 1 and 10'
pred.to_csv('output/xgboost.csv', index=False)
pred[['id', 'rating']].to_csv('output/xgboost_submission.csv', index=False)