# XGBoost Sparse

In [14]:
tol = 1e-6
nonzero = coef_table[abs.(coef_table.coef) .> tol, :]

selected_features = Symbol.(nonzero.feature)

println("Using ", length(selected_features), " non-zero (sparse) features for XGBoost.")

Using 32 non-zero (sparse) features for XGBoost.


In [9]:
# Subset X_train and X_test to selected features only
X_train_xgb = select(X_train, selected_features)
X_test_xgb  = select(X_test,  selected_features)

# Labels: use original binary/int targets from train_df / test_df
y_train_xgb = collect(y_train_raw)   
y_test_xgb  = collect(y_test_raw)

println("Size of XGBoost training set: ", size(X_train_xgb), ", labels length: ", length(y_train_xgb))
println("Size of XGBoost test set:     ", size(X_test_xgb),  ", labels length: ", length(y_test_xgb))

Size of XGBoost training set: (137343, 32), labels length: 137343
Size of XGBoost test set:     (5168, 32), labels length: 5168


In [10]:
# XGBoost model w grid search
seed = 42

max_depths        = [3, 5, 7, 9]
min_child_weights = [5, 10, 20, 30]
n_estimators_list = [20, 25, 50, 100]

grid_search_xgb = IAI.GridSearch(
    IAI.XGBoostClassifier(
        random_seed = seed,
    ),
    max_depth        = max_depths,
    min_child_weight = min_child_weights,
    n_estimators     = n_estimators_list,
)

IAI.fit_cv!(
    grid_search_xgb,
    X_train_xgb,
    y_train_xgb;
    validation_criterion = :auc,
    positive_label       = 1,             # HighSeverity = 1
    sample_weight        = :autobalance,  # handle class imbalance
    n_folds              = 5,
    verbose              = true,
)

# Best model + hyperparameters
best_model_xgb = IAI.get_learner(grid_search_xgb)

println("\nBest XGBoost hyperparameters:")
println("  Max depth:        ", best_model_xgb.max_depth)
println("  Min child weight: ", best_model_xgb.min_child_weight)
println("  n_estimators:     ", best_model_xgb.n_estimators)

fold 1; train: 0.5439; valid: 0.8811; min_child_weight=>5 n_estimators=>20 max_depth=>3
fold 1; train: 0.5415; valid: 0.8808; min_child_weight=>10 n_estimators=>20 max_depth=>3
fold 1; train: 0.539; valid: 0.8807; min_child_weight=>20 n_estimators=>20 max_depth=>3
fold 1; train: 0.5364; valid: 0.879; min_child_weight=>30 n_estimators=>20 max_depth=>3
fold 1; train: 0.5439; valid: 0.8811; min_child_weight=>5 n_estimators=>25 max_depth=>3
fold 1; train: 0.5415; valid: 0.8808; min_child_weight=>10 n_estimators=>25 max_depth=>3
fold 1; train: 0.539; valid: 0.8807; min_child_weight=>20 n_estimators=>25 max_depth=>3
fold 1; train: 0.5364; valid: 0.879; min_child_weight=>30 n_estimators=>25 max_depth=>3
fold 1; train: 0.5439; valid: 0.8811; min_child_weight=>5 n_estimators=>50 max_depth=>3
fold 1; train: 0.5415; valid: 0.8808; min_child_weight=>10 n_estimators=>50 max_depth=>3
fold 1; train: 0.539; valid: 0.8807; min_child_weight=>20 n_estimators=>50 max_depth=>3
fold 1; train: 0.5364; valid:

In [11]:
# Metrics = AUC, Accuracy, Recall
# Training metrics 
train_auc_xgb = IAI.score(
    best_model_xgb, X_train_xgb, y_train_xgb;
    criterion      = :auc,
    positive_label = 1,
)

train_acc_xgb = IAI.score(
    best_model_xgb, X_train_xgb, y_train_xgb;
    criterion      = :accuracy,
    positive_label = 1,
)

train_recall_xgb = IAI.score(
    best_model_xgb, X_train_xgb, y_train_xgb;
    criterion      = :sensitivity,   # recall
    positive_label = 1,
)

println("\nXGBoost Training AUC:      ", round(train_auc_xgb, digits=4))
println("XGBoost Training Accuracy: ", round(train_acc_xgb, digits=4))
println("XGBoost Training Recall:   ", round(train_recall_xgb, digits=4))

# Test metrics
test_auc_xgb = IAI.score(
    best_model_xgb, X_test_xgb, y_test_xgb;
    criterion      = :auc,
    positive_label = 1,
)

test_acc_xgb = IAI.score(
    best_model_xgb, X_test_xgb, y_test_xgb;
    criterion      = :accuracy,
    positive_label = 1,
)

test_recall_xgb = IAI.score(
    best_model_xgb, X_test_xgb, y_test_xgb;
    criterion      = :sensitivity,   # recall
    positive_label = 1,
)

println("\nXGBoost Test AUC:          ", round(test_auc_xgb, digits=4))
println("XGBoost Test Accuracy:     ", round(test_acc_xgb, digits=4))
println("XGBoost Test Recall:       ", round(test_recall_xgb, digits=4))

# Predictions & probabilities
y_pred_train_xgb = IAI.predict(best_model_xgb, X_train_xgb)
y_pred_test_xgb  = IAI.predict(best_model_xgb, X_test_xgb)

y_prob_train_xgb = IAI.predict_proba(best_model_xgb, X_train_xgb)
y_prob_test_xgb  = IAI.predict_proba(best_model_xgb, X_test_xgb)


XGBoost Training AUC:      0.9994
XGBoost Training Accuracy: 0.9834
XGBoost Training Recall:   1.0

XGBoost Test AUC:          0.6819
XGBoost Test Accuracy:     0.9646
XGBoost Test Recall:       0.0


Row,0,1
Unnamed: 0_level_1,Float64,Float64
1,0.777604,0.222396
2,0.982732,0.0172677
3,0.377769,0.622231
4,0.971726,0.0282745
5,0.739691,0.260309
6,0.472083,0.527917
7,0.996734,0.00326593
8,0.939842,0.0601579
9,0.855948,0.144052
10,0.998596,0.0014041


In [12]:
# Take P(y=1) from train and test
prob_train = y_prob_train_xgb[:, 2]
prob_test  = y_prob_test_xgb[:, 2]

# Train on top, test 2023 on bottom
prob_all = vcat(prob_train, prob_test)

prob_df = DataFrame(
    prob_class1 = prob_all,
)

CSV.write("sparseyhat_xg.csv", prob_df)

"sparseyhat_xg.csv"