# Load data

Note: I'm relatively new to XGBoost, please feel free to fix my code in the comments.

In [None]:
import pandas as pd
import numpy as np

# Competition training table; the "id" column is removed as soon as it is read.
df = pd.read_csv("../input/tabular-playground-series-jun-2021/train.csv").drop(columns="id")

# Precomputed KNN features, wrapped in a frame with columns knn_1 ... knn_9.
# thanks to https://www.kaggle.com/remekkinas/keras-tuner-knn-features-simplex-optimization?scriptVersionId=66707477
knn_column_names = [f"knn_{i}" for i in range(1, 10)]
features = np.load("../input/tps6-boost-your-score-with-knn-features/add_feat_train.npy")
features_df = pd.DataFrame(features, columns=knn_column_names)

In [None]:
# Show the KNN feature frame as this cell's output.
features_df

In [None]:
# Append the KNN feature columns to the right of the training columns
# (rows are matched on their index labels).
df = pd.concat((df, features_df), axis="columns")

In [None]:
# Remove the stray "Unnamed: 0" column when it is present.
# errors="ignore" tolerates only the "column is missing" case, so any other
# failure (or a keyboard interrupt) still surfaces instead of being silently
# swallowed by a bare `except`.
df = df.drop(columns="Unnamed: 0", errors="ignore")

In [None]:
# Show the combined frame (training columns plus KNN features) as this cell's output.
df

# Data preprocessing

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit a MinMaxScaler on every column except the label and keep the transformed
# values in `scaled` (the array returned by fit_transform).
feature_frame = df.drop(columns="target")
sc = MinMaxScaler()
scaled = sc.fit_transform(feature_frame)

In [None]:
import numpy as np

# Set the labels aside first: `df` is rebound below, so this has to happen
# **before** the frame is replaced.
target = df["target"]

# Every column name except the label, in the original order.
columns = df.columns.to_numpy()
columns_without_target = columns[columns != "target"]

# Rebuild the frame from the scaled array, then put the label column back.
df = pd.DataFrame(scaled, columns=columns_without_target).assign(target=target)

In [None]:
# Show the rebuilt frame (scaled features plus the untouched target column).
df

In [None]:
# Make target variable more machine-friendly: strip the "Class_" text, read the
# remainder as an integer and shift it down by one.
class_numbers = df["target"].str.replace("Class_", "", regex=False).astype("int64")
df["target"] = class_numbers - 1

In [None]:
# Show the frame again, now with the integer-encoded target column.
df

# Train/validation split

In [None]:
# Split into training and validation
from sklearn.model_selection import train_test_split

# 10% of the rows are held out for validation, stratified on the label.
# The shuffle is seeded (same value as the model's random_state used in this
# notebook) so that the split - and therefore every validation loss computed
# from it - is identical on each run instead of changing with every execution.
train_df, val_df = train_test_split(
    df,
    test_size=0.1,
    stratify=df["target"],
    random_state=42,
)

# Training

Let's run a hyperparameter optimization to get the best parameters. This takes a while.

Thank you https://www.kaggle.com/prashant111/a-guide-on-xgboost-hyperparameters-tuning, https://www.kaggle.com/stuarthallows/using-xgboost-with-scikit-learn, https://www.kaggle.com/alexisbcook/xgboost and https://www.kaggle.com/dstuerzer/optimization-of-xgboost!

In [None]:
import xgboost as xgb
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

# Hyperopt search space, split into two groups so that `objective` can unpack
# each group into the call it belongs to:
#   - "classifier_params" is passed to the XGBClassifier constructor,
#   - "fit_params" is passed to XGBClassifier.fit().
# Each hp.* label is the same string as its dictionary key; those labels are
# the keys that later appear in the dict returned by fmin.
# NOTE(review): the "default ..." remarks below are the original author's;
# verify them against the installed xgboost version.
space = {
    "classifier_params": {
        "colsample_bytree": hp.uniform("colsample_bytree", 0.3, 0.7),
        "gamma": hp.uniform("gamma", 0, 0.5),
        "learning_rate": hp.uniform("learning_rate", 0.03, 0.3), # default 0.1 
        "max_depth": hp.randint("max_depth", 2, 6), # default 3
        "n_estimators": hp.randint("n_estimators", 100, 650), # default 100
        "subsample": hp.uniform("subsample", 0.4, 0.6),
        # quantised with step 1 (last argument)
        "reg_alpha" : hp.quniform("reg_alpha", 40,180,1),
        "reg_lambda" : hp.uniform("reg_lambda", 0,1),
        # quantised with step 1 (last argument)
        "min_child_weight" : hp.quniform("min_child_weight", 0, 10, 1),
    },
    "fit_params": {
        # the early-stopping patience is searched over as well
        "early_stopping_rounds": hp.randint("early_stopping_rounds", 3, 15),
    },
}

def objective(space):
    """Train one XGBoost candidate and report its validation loss to hyperopt.

    Reads the module-level ``train_df`` and ``val_df`` frames; the "target"
    column of each is used as the label and the remaining columns as features.

    Parameters
    ----------
    space : dict
        One sampled point of the search space. ``space["classifier_params"]``
        is unpacked into the ``XGBClassifier`` constructor and
        ``space["fit_params"]`` into ``fit``.

    Returns
    -------
    dict
        ``{"loss": <float>, "status": STATUS_OK}`` where the loss is the last
        entry of the "mlogloss" history recorded for the validation set (the
        final recorded round, not necessarily the lowest value in the history).
    """
    train_features = train_df.drop("target", axis=1)
    train_labels = train_df["target"]
    validation_pair = (val_df.drop("target", axis=1), val_df["target"])

    # https://stackoverflow.com/a/62302697
    xgb_model = xgb.XGBClassifier(
        objective="multi:softprob",
        random_state=42,
        use_label_encoder=False,
        tree_method='gpu_hist',
        gpu_id=0,
        verbosity=0,
        n_jobs=-1,
        **space["classifier_params"],
    )
    xgb_model.fit(
        train_features,
        train_labels,
        eval_set=[validation_pair],
        verbose=False,
        **space["fit_params"],
    )

    # "validation_0" is the first (and only) entry of eval_set above.
    loss_history = xgb_model.evals_result()["validation_0"]["mlogloss"]
    return {"loss": loss_history[-1], "status": STATUS_OK}

In [None]:
# Run 100 evaluations of `objective` using the TPE suggestion algorithm;
# every evaluation is recorded in `trials`.
trials = Trials()

best_hyperparams = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials,
)

In [None]:
# The message and the "\n" stay two separate print arguments so the emitted
# text (including the joining space) is unchanged.
usage_note = "The best hyperparameters are(early_stopping_rounds should be passed into fit(), others should be passed into the XGBClassifier class): "
print(usage_note, "\n")
print(best_hyperparams)

Great! Let's save these parameters (and also the random state) for other people to use.

In [None]:
import json

# Bundle the tuned values with the fixed seed so the file carries both.
params = dict(best_hyperparams, random_state=42)

# Serialise straight into the file handle.
with open("best-params.json", "w") as params_file:
    json.dump(params, params_file)

Now, we need to recreate the model:

In [None]:
# recreate the model
import copy

# Everything fmin returned except early_stopping_rounds is a constructor
# argument; early_stopping_rounds is handed to fit() further down.
clf_params = {
    name: value
    for name, value in best_hyperparams.items()
    if name != "early_stopping_rounds"
}

# https://stackoverflow.com/a/62302697
xgb_model = xgb.XGBClassifier(
    objective="multi:softprob",
    random_state=42,
    use_label_encoder=False,
    tree_method='gpu_hist',
    gpu_id=0,
    verbosity=0,
    **clf_params,
)

# Same training data and validation set as used during the search.
xgb_model.fit(
    train_df.drop("target", axis=1),
    train_df["target"],
    eval_set=[(val_df.drop("target", axis=1), val_df["target"])],
    verbose=False,
    early_stopping_rounds=best_hyperparams["early_stopping_rounds"],
)

We have the exact same results:

In [None]:
# Last recorded "mlogloss" value for the validation set of the refit model
# ("validation_0" is the first entry of the eval_set passed to fit()).
xgb_model.evals_result()["validation_0"]["mlogloss"][-1]

Let's also save the model.

In [None]:
# Persist the refit model to two files in the working directory.
# https://mljar.com/blog/xgboost-save-load-python/
xgb_model.save_model("model.json")
# NOTE(review): the on-disk format of this second file is decided by xgboost
# (presumably from the file extension) - confirm that ".txt" produces the
# format downstream users expect.
xgb_model.save_model("model.txt")