# XGB

In [3]:
import os
import pandas as pd
import numpy as np
import copy
from sklearn.model_selection import train_test_split, RandomizedSearchCV, TimeSeriesSplit
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, mean_squared_error, r2_score, make_scorer
from xgboost import XGBRegressor
import joblib, json


In [4]:
#load

csv_path = "./data/train.csv"
df = pd.read_csv(csv_path)
df.head(10)

Unnamed: 0,time,A,B,C,D,E,F,G,H,I,J,K,L,M,N,Y1,Y2
0,0,0.207366,-0.159951,-0.634176,-0.580962,-0.266505,0.060173,-0.475257,-1.486516,-0.332594,-0.671466,-0.226149,-0.187624,-0.780237,-0.785965,-0.935902,-0.310081
1,1,0.188828,-0.265508,0.042143,-0.550442,-0.132319,-0.185219,0.028295,0.09321,-0.518139,-0.251917,-0.347845,-0.359069,-0.161254,0.020401,-0.089707,-0.305374
2,2,-0.144261,-0.577142,-0.214634,-0.747391,-0.184255,-0.464831,-0.085181,0.700449,-0.603438,0.197773,-0.566696,-0.580799,0.202726,0.135261,-0.077855,-0.631485
3,3,0.208982,-0.310449,0.513708,-0.562868,0.742308,-0.305487,0.762246,1.36302,-0.384575,0.525556,-0.348514,-0.428099,0.548993,0.471031,0.941271,-0.535212
4,4,0.09332,-0.358156,0.173188,-0.687296,-0.161461,-0.116062,-0.245748,0.863372,-0.655588,-0.263358,-0.557428,-0.481214,0.083602,0.003087,-0.039582,-0.490561
5,5,1.78297,-0.252194,0.116323,-0.102982,0.378195,1.149888,0.581119,1.363041,-0.28758,0.36912,0.021691,-0.107325,0.559566,0.410777,0.789008,0.653683
6,6,0.91553,-0.155423,-0.250044,-0.374257,-0.236949,-0.00497,-0.777184,-0.32488,-0.546901,-0.261098,-0.250561,-0.264035,-0.013362,-0.539118,-0.348067,-0.189152
7,7,1.235649,-0.125195,1.158698,0.2718,1.991149,0.399857,1.059887,2.958697,-0.410572,1.137326,-0.376244,0.287351,1.201722,1.075639,1.320876,-0.307015
8,8,0.269377,-0.248585,-0.443582,-0.394082,-1.274764,-0.300611,-0.322923,-1.195282,-0.469093,-0.432799,-0.563879,-0.209269,-1.044754,-0.45409,-0.590008,-0.312297
9,9,0.226154,-0.534394,-0.594548,-0.599246,-0.753823,-0.379495,-0.342842,-0.666217,-0.469873,-0.475859,-0.577494,-0.288018,-0.812432,-0.440029,-0.872044,-0.494128


In [5]:
# ===== Y1 ======
X = df.loc[:, "A":"N"].copy()
y = df["Y1"].copy()

# split
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state = 1480061
)



In [None]:
model = XGBRegressor(
    n_estimators = 2000,
    learning_rate = 0.03,
    max_depth = 14,
    subsample = 0.7,
    colsample_bytree = 0.7,
    reg_lambda = 1.0,
    reg_alpha = 0.0,
    objective = "reg:squarederror",
    tree_method = "hist",
    # device = "cuda",
    eval_metric = "rmse",
    early_stopping_rounds = 100,
    random_state = 1480061,
)

model.fit(
    X_train, y_train,
    eval_set = [(X_valid, y_valid)],
    verbose = 50
)



[0]	validation_0-rmse:0.95471
[50]	validation_0-rmse:0.57155
[100]	validation_0-rmse:0.47648
[150]	validation_0-rmse:0.45483
[200]	validation_0-rmse:0.45070
[250]	validation_0-rmse:0.45083
[300]	validation_0-rmse:0.45187
[317]	validation_0-rmse:0.45221


0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.7
,device,'cuda'
,early_stopping_rounds,100
,enable_categorical,False


In [7]:
y_pred = model.predict(X_valid)
rmse = mean_squared_error(y_valid, y_pred)
r2 = r2_score(y_valid, y_pred)
print({"RMSE for Y1": round(rmse, 10)})
print({"r2 score Y1": round(r2, 10)})



{'RMSE for Y1': 0.2027449344}
{'r2 score Y1': 0.7830956785}


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


In [8]:
# ====== Y2 ======
X2 = df.loc[:, "A":"N"].copy()
y2 = df["Y2"].copy()

X2_train, X2_valid, y2_train, y2_valid = train_test_split(
    X2, y2, test_size = 0.2, random_state = 1480061
)



In [9]:
model_y2 = XGBRegressor(
    n_estimators = 2000,
    learning_rate = 0.03,
    max_depth = 6,
    subsample = 0.9,
    colsample_bytree = 0.9,
    reg_lambda = 1.0,
    reg_alpha = 0.0,
    objective = "reg:squarederror",
    tree_method = "hist",
    eval_metric = "rmse",
    early_stopping_rounds = 100,
    random_state = 1480061,
)

model_y2.fit(
    X2_train, y2_train,
    eval_set = [(X2_valid, y2_valid)],
    verbose = 200,
)

[0]	validation_0-rmse:0.92449
[200]	validation_0-rmse:0.47753
[250]	validation_0-rmse:0.47783


0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.9
,device,
,early_stopping_rounds,100
,enable_categorical,False


In [10]:
y2_pred = model_y2.predict(X2_valid)
rmse_y2 = np.sqrt(mean_squared_error(y2_valid, y2_pred))
r2_y2 = r2_score(y2_valid, y2_pred)
print(f"rmse {rmse_y2}, r2 {r2_y2}")

rmse 0.47694439491191787, r2 0.7443000780526772


In [11]:
fi_y2 = pd.DataFrame({
    "feature": X2.columns,
    "importance": model_y2.feature_importances_
}).sort_values("importance", ascending = False)
display(fi_y2)

Unnamed: 0,feature,importance
10,K,0.293022
0,A,0.158857
1,B,0.139778
3,D,0.132175
8,I,0.068738
5,F,0.040254
11,L,0.030056
12,M,0.025603
4,E,0.022672
6,G,0.021089


# Gridsearch

In [None]:
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

rmse_scorer = make_scorer(rmse, greater_is_better = False)

cv = TimeSeriesSplit(n_splits = 5, gap = 10)

# base model

xgb_base = XGBRegressor(
    objective = "reg:squarederror",
    tree_method = "hist",
    device = "cuda",
    random_state = 1480061
)

param_dist = {
    "n_estimators": [400, 800, 1200, 1600, 2000],
    "learning_rate": [0.005, 0.01, 0.02, 0.03, 0.05],
    "max_depth": [3, 4, 5, 6, 8, 10, 12],
    "min_child_weight": [1, 3, 5, 7, 10],
    "subsample": [0.6, 0.7, 0.8, 0.9, 1.0],
    "colsample_bytree": [0.6, 0.7, 0.8, 0.9, 1.0],
    "reg_lambda": [0.5, 1.0, 2.0, 3.0, 5.0],
    "reg_alpha": [0.0, 0.1, 0.2, 0.5]
}

search = RandomizedSearchCV(
    estimator = xgb_base,
    param_distributions = param_dist,
    n_iter = 40,
    scoring = rmse_scorer,
    cv = cv,
    n_jobs = -1,
    verbose = 1,
    refit = True,
    random_state = 1480061
)

search.fit(X_train, y_train)

print(f"best: {search.best_params_}")
print(f"best cv rmse: {-search.best_score_}")

""" KFold
Fitting 5 folds for each of 40 candidates, totalling 200 fits
best: {'subsample': 0.9, 'reg_lambda': 1.0, 'reg_alpha': 0.2, 'n_estimators': 400, 'min_child_weight': 10, 'max_depth': 8, 'learning_rate': 0.01, 'colsample_bytree': 0.7}
best cv rmse: 0.4897944136307305
"""


Fitting 5 folds for each of 40 candidates, totalling 200 fits


In [None]:
best_params = search.best_params_.copy()

model_tuned = XGBRegressor(
    **best_params,
    objective = "reg:squarederror",
    tree_method = "hist",
    #device = "cuda",
    random_state = 1480061,
    eval_metric = "rmse",
    early_stopping_rounds = 100
)

model_tuned.fit(
    X_train, y_train,
    eval_set = [(X_valid, y_valid)],
    verbose = 50
)

y_valid_pred = model_tuned.predict(X_valid)
rmse_valid = np.sqrt(mean_squared_error(y_valid, y_valid_pred))
r2_valid = r2_score(y_valid, y_valid_pred)
print(f"valid rmse: {rmse_valid}, valid r2: {r2_valid}, best iteration: {model_tuned.best_iteration}")


""" KFold
[0]	validation_0-rmse:0.96041
[50]	validation_0-rmse:0.69884
[100]	validation_0-rmse:0.56274
[150]	validation_0-rmse:0.49735
[200]	validation_0-rmse:0.46739
[250]	validation_0-rmse:0.45373
[300]	validation_0-rmse:0.44798
[350]	validation_0-rmse:0.44525
[399]	validation_0-rmse:0.44417
valid rmse: 0.44417100426208156, valid r2: 0.7889338437963596, best iteration: 399
"""

[0]	validation_0-rmse:0.96041
[50]	validation_0-rmse:0.69884
[100]	validation_0-rmse:0.56274
[150]	validation_0-rmse:0.49735
[200]	validation_0-rmse:0.46739
[250]	validation_0-rmse:0.45373
[300]	validation_0-rmse:0.44798
[350]	validation_0-rmse:0.44525
[399]	validation_0-rmse:0.44417
valid rmse: 0.44417100426208156, valid r2: 0.7889338437963596, best iteration: 399


In [None]:
# save
# os.makedirs("artifacts", exist_ok = True)
model_tuned.save_model("artifacts/xgb_Y1_tuned.json")
joblib.dump(model_tuned, "artifacts/xgb_Y1_tuned.pkl")

with open("artifacts/xgb_Y1_features.json", "w") as f:
    json.dump(list(X_train.columns), f)
    
meta = {
    "best_params": best_params,
    "best_iteration": int(getattr(model_tuned, "best_iteration", -1))
}

with open("artifacts/xgb_Y1_meta.json", "w") as f:
    json.dump(meta, f, indent = 2)
    

In [None]:
# load
model_y1 = XGBRegressor()
model_y1.load_model("artifacts/xgb_Y1_tuned.json")

with open("artifacts/xgb_Y1_features.json") as f:
    feat_order = json.load(f)

In [None]:
# plot performance vs hyperparameter

import matplotlib.pyplot as plt

cvres = pd.DataFrame(search.cv_results_)

cvres["rmse"] = -cvres["mean_test_score"]

def plot(param_name):
    x = cvres["param_" + param_name]
    y = cvres["rmse"]
    plt.figure(figsize = (6, 4))
    plt.scatter(x, y)
    plt.xlabel(param_name)
    plt.ylabel("cv rmse")
    plt.title(f"cv rmse vs {param_name}")
    plt.tight_layout()
    plt.show()
    
for p in ["n_estimators", "learning_rate", "max_depth", "min_child_weight", "subsample", "colsample_bytree", "reg_lambda", "reg_alpha"]:
    pass
    #plot(p)

In [None]:
# y2 grid search

xgb_base_y2 = XGBRegressor(
    objective = "reg:squarederror",
    tree_method = "hist",
    # device = "cuda",
    eval_metric = "rmse",
    random_state = 1480061
)

param_dist_y2 = {
    "n_estimators": [400, 800, 1200, 1600, 2000],
    "learning_rate": [0.005, 0.01, 0.02, 0.03, 0.05],
    "max_depth": [3, 4, 5, 6, 8, 10, 12],
    "min_child_weight": [1, 3, 5, 7, 10],
    "subsample": [0.6, 0.7, 0.8, 0.9, 1.0],
    "colsample_bytree": [0.6, 0.7, 0.8, 0.9, 1.0],
    "reg_lambda": [0.5, 1.0, 2.0, 3.0, 5.0],
    "reg_alpha": [0.0, 0.1, 0.2, 0.5]
}

search_y2 = RandomizedSearchCV(
    estimator = xgb_base_y2,
    param_distributions = param_dist_y2,
    n_iter = 40,
    scoring = rmse_scorer,
    cv = cv,
    n_jobs = -1,
    verbose = 1,
    refit = True,
    random_state = 1480061
)

search_y2.fit(X2_train, y2_train)

print(f"best: {search.best_params_}")
print(f"best cv rmse: {-search.best_score_}")


""" KFold
Fitting 5 folds for each of 40 candidates, totalling 200 fits
best: {'subsample': 0.9, 'reg_lambda': 1.0, 'reg_alpha': 0.2, 'n_estimators': 400, 'min_child_weight': 10, 'max_depth': 8, 'learning_rate': 0.01, 'colsample_bytree': 0.7}
best cv rmse: 0.4897944136307305
"""

Fitting 5 folds for each of 40 candidates, totalling 200 fits
best: {'subsample': 0.9, 'reg_lambda': 1.0, 'reg_alpha': 0.2, 'n_estimators': 400, 'min_child_weight': 10, 'max_depth': 8, 'learning_rate': 0.01, 'colsample_bytree': 0.7}
best cv rmse: 0.4897944136307305


In [None]:
best_params_y2 = search_y2.best_params_.copy()
model_y2_tuned = XGBRegressor(
    **best_params_y2,
    objective = "reg:squarederror",
    tree_method = "hist",
    eval_metric = "rmse",
    random_state = 1480061,
    
    early_stopping_rounds = 100
)

model_y2_tuned.fit(
    X2_train, y2_train,
    eval_set = [(X2_valid, y2_valid)],
    verbose = 50
    
)

y2_pred_valid = model_y2_tuned.predict(X2_valid)
rmse_y2_valid = rmse(y2_valid, y2_pred_valid)
r2_y2_valid = r2_score(y2_valid, y2_pred_valid)
print(rmse_y2_valid, r2_y2_valid, getattr(model_y2_tuned, "best_iteration"))


"""KFold
[0]	validation_0-rmse:0.93691
[50]	validation_0-rmse:0.71001
[100]	validation_0-rmse:0.59085
[150]	validation_0-rmse:0.53053
[200]	validation_0-rmse:0.50154
[250]	validation_0-rmse:0.48729
[300]	validation_0-rmse:0.48064
[350]	validation_0-rmse:0.47709
[399]	validation_0-rmse:0.47513
0.47513021355670604 0.7462416200044015 399
"""


[0]	validation_0-rmse:0.93691
[50]	validation_0-rmse:0.71001
[100]	validation_0-rmse:0.59085
[150]	validation_0-rmse:0.53053
[200]	validation_0-rmse:0.50154
[250]	validation_0-rmse:0.48729
[300]	validation_0-rmse:0.48064
[350]	validation_0-rmse:0.47709
[399]	validation_0-rmse:0.47513
0.47513021355670604 0.7462416200044015 399


In [None]:
model_y2_tuned.save_model("artifacts/xgb_Y2_tuned.json")
joblib.dump(model_y2_tuned, "artifacts/xgb_Y2_tuned.pkl")

with open("artifacts/xgb_Y2_features.json", "w") as f:
    json.dump(list(X2_train.columns), f)
    
meta = {
    "best_params": best_params_y2,
    "best_iteration": int(getattr(model_y2_tuned, "best_iteration", -1))
}

with open("artifacts/xgb_Y2_meta.json", "w") as f:
    json.dump(meta, f, indent = 2)

In [None]:
# Output 

test_path = "./data/test.csv"
test_df = pd.read_csv(test_path)

with open("artifacts/xgb_Y1_features.json") as f:
    feat_y1 = json.load(f)

with open("artifacts/xgb_Y2_features.json") as f:
    feat_y2 = json.load(f)
    
m1 = XGBRegressor()
m1.load_model("artifacts/xgb_Y1_tuned.json")
m2 = XGBRegressor()
m2.load_model("artifacts/xgb_Y2_tuned.json")

ids = test_df["id"].astype(int)
X1 = test_df.loc[:, feat_y1]
X2 = test_df.loc[:, feat_y2]

y1_pred = m1.predict(X1)
y2_pred = m2.predict(X2)

submission = pd.DataFrame({
    "id": ids,
    "Y1": y1_pred,
    "Y2": y2_pred
})

submission.to_csv("submission.csv", index = False)
