# XGBoost Classifier Baseline (Model Comparison) v3

- 모델 종류 체급 비교 목적

평가 (Test set):
- PR-AUC
- Recall (Churn)
- Accuracy
- Confusion Matrix
- Classification Report
- Feature Importance


In [79]:
import numpy as np
import pandas as pd

# Lift both display limits so DataFrame output is never truncated
# (neither columns nor rows).
for _display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_display_option, None)


In [80]:
# Load the v3 feature table from its parquet file (path is relative to the
# notebook's working directory).
_feature_table_path = "../data/kkbox_train_feature_v3.parquet"
df = pd.read_parquet(_feature_table_path)


In [81]:
# Seed shared by the train/test split and the XGBoost estimator.
RANDOM_STATE = 719

ID_COL = "msno"
TARGET_COL = "is_churn"

# Columns that are later cast to pandas "category" dtype for XGBoost.
CATEGORICAL_COLS = [
    "city",
    "gender",
    "registered_via",
    "last_payment_method",
    "has_ever_paid",
    "has_ever_cancelled",
    "is_auto_renew_last",
    "is_free_user",
]

# Every stem below exists once per look-back window, named "<stem>_<window>".
_WINDOW_SUFFIXES = ("w7", "w14", "w21", "w30")
_WINDOW_STEMS = (
    "num_days_active",
    "total_secs",
    "avg_secs_per_day",
    "std_secs",
    "num_songs",
    "avg_songs_per_day",
    "num_unq",
    "num_25",
    "num_100",
    "short_play",
    "skip_ratio",
    "completion_ratio",
    "short_play_ratio",
    "variety_ratio",
)
# Window-major order: all w7 columns first, then w14, w21 and w30.
_WINDOW_COLS = [
    f"{stem}_{suffix}"
    for suffix in _WINDOW_SUFFIXES
    for stem in _WINDOW_STEMS
]

# Cross-window trend columns.
_TREND_COLS = [
    "secs_trend_w7_w30",
    "secs_trend_w14_w30",
    "days_trend_w7_w14",
    "days_trend_w7_w30",
    "songs_trend_w7_w30",
    "songs_trend_w14_w30",
    "skip_trend_w7_w30",
    "completion_trend_w7_w30",
]

# Payment / subscription history columns.
_TRANSACTION_COLS = [
    "days_since_last_payment",
    "days_since_last_cancel",
    "last_plan_days",
    "total_payment_count",
    "total_amount_paid",
    "avg_amount_per_payment",
    "unique_plan_count",
    "subscription_months_est",
    "payment_count_last_30d",
    "payment_count_last_90d",
]

NUMERICAL_COLS = ["reg_days", *_WINDOW_COLS, *_TREND_COLS, *_TRANSACTION_COLS]

# Model input order: categorical columns first, numerical columns after.
FEATURE_COLS = CATEGORICAL_COLS + NUMERICAL_COLS

# Independent copies of the feature matrix and the integer-typed target.
X = df[FEATURE_COLS].copy()
y = df[TARGET_COL].astype(int).copy()

## 1. Train / Test Split


In [82]:
from sklearn.model_selection import train_test_split

# Feature matrix and integer target taken straight from the loaded frame.
X, y = df[FEATURE_COLS], df[TARGET_COL].astype(int)

# 80/20 hold-out split, stratified on the target; the fixed seed keeps the
# partition reproducible across runs.
_split_options = {
    "test_size": 0.2,
    "random_state": RANDOM_STATE,
    "stratify": y,
}
X_train, X_test, y_train, y_test = train_test_split(X, y, **_split_options)

print(X_train.shape, X_test.shape)


(688772, 83) (172194, 83)


## 2. Column Groups


In [83]:
# Keep only the configured columns that are actually present in the training
# frame, preserving the order of the configuration lists.
_available_cols = set(X_train.columns)
cat_cols = [name for name in CATEGORICAL_COLS if name in _available_cols]
num_cols = [name for name in NUMERICAL_COLS if name in _available_cols]

print(f"num_cols: {len(num_cols)}")
print(f"cat_cols: {len(cat_cols)}")


num_cols: 75
cat_cols: 8


## 3. Preprocessing (XGBoost Native Categorical)


In [84]:
# Work on copies so the raw split frames stay untouched.
X_train_xgb = X_train.copy()
X_test_xgb = X_test.copy()

for col in cat_cols:
    # Stringify first (as the original cell did), then let pandas derive the
    # category set from the TRAINING split only.
    X_train_xgb[col] = X_train_xgb[col].astype(str).astype("category")

    # Cast the test split with the dtype learned from training instead of
    # calling astype("category") on it independently. Independent casts give
    # each split its own category list and therefore its own integer codes,
    # so a label present in only one split would shift the codes and the model
    # could read test categories as different ones. Test labels that were
    # never seen in training become NaN here, i.e. a missing value.
    train_cat_dtype = X_train_xgb[col].dtype
    X_test_xgb[col] = X_test_xgb[col].astype(str).astype(train_cat_dtype)


## 4. XGBoost Classifier Model


In [85]:
import xgboost as xgb

# Hyper-parameters gathered in one mapping so they can be inspected or logged
# before the estimator is built.
_xgb_params = {
    # Boosting schedule
    "n_estimators": 2000,
    "learning_rate": 0.03,
    # Tree shape and row/column sampling
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    # Learning task and the metric reported on any eval_set
    "objective": "binary:logistic",
    "eval_metric": "aucpr",
    # Histogram tree builder with pandas "category" columns accepted as-is
    "tree_method": "hist",
    "enable_categorical": True,
    # Regularisation
    "reg_lambda": 1.0,
    "reg_alpha": 0.0,
    # Reproducibility
    "random_state": RANDOM_STATE,
}

xgb_model = xgb.XGBClassifier(**_xgb_params)


## 5. Train Model


In [86]:
# The hold-out split is passed as eval_set. The estimator is built without
# early_stopping_rounds, so this does not cut training short.
_eval_pairs = [(X_test_xgb, y_test)]

# Kept as the cell's last expression so the fitted estimator is displayed.
xgb_model.fit(X_train_xgb, y_train, eval_set=_eval_pairs, verbose=False)


0,1,2
,"objective  objective: typing.Union[str, xgboost.sklearn._SklObjWProto, typing.Callable[[typing.Any, typing.Any], typing.Tuple[numpy.ndarray, numpy.ndarray]], NoneType] Specify the learning task and the corresponding learning objective or a custom objective function to be used. For custom objective, see :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more information, along with the end note for function signatures.",'binary:logistic'
,"base_score  base_score: typing.Union[float, typing.List[float], NoneType] The initial prediction score of all instances, global bias.",
,booster,
,"callbacks  callbacks: typing.Optional[typing.List[xgboost.callback.TrainingCallback]] List of callback functions that are applied at end of each iteration. It is possible to use predefined callbacks by using :ref:`Callback API `. .. note::  States in callback are not preserved during training, which means callback  objects can not be reused for multiple training sessions without  reinitialization or deepcopy. .. code-block:: python  for params in parameters_grid:  # be sure to (re)initialize the callbacks before each run  callbacks = [xgb.callback.LearningRateScheduler(custom_rates)]  reg = xgboost.XGBRegressor(**params, callbacks=callbacks)  reg.fit(X, y)",
,colsample_bylevel  colsample_bylevel: typing.Optional[float] Subsample ratio of columns for each level.,
,colsample_bynode  colsample_bynode: typing.Optional[float] Subsample ratio of columns for each split.,
,colsample_bytree  colsample_bytree: typing.Optional[float] Subsample ratio of columns when constructing each tree.,0.8
,"device  device: typing.Optional[str] .. versionadded:: 2.0.0 Device ordinal, available options are `cpu`, `cuda`, and `gpu`.",
,"early_stopping_rounds  early_stopping_rounds: typing.Optional[int] .. versionadded:: 1.6.0 - Activates early stopping. Validation metric needs to improve at least once in  every **early_stopping_rounds** round(s) to continue training. Requires at  least one item in **eval_set** in :py:meth:`fit`. - If early stopping occurs, the model will have two additional attributes:  :py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the  :py:meth:`predict` and :py:meth:`apply` methods to determine the optimal  number of trees during inference. If users want to access the full model  (including trees built after early stopping), they can specify the  `iteration_range` in these inference methods. In addition, other utilities  like model plotting can also use the entire model. - If you prefer to discard the trees after `best_iteration`, consider using the  callback function :py:class:`xgboost.callback.EarlyStopping`. - If there's more than one item in **eval_set**, the last entry will be used for  early stopping. If there's more than one metric in **eval_metric**, the last  metric will be used for early stopping.",
,enable_categorical  enable_categorical: bool See the same parameter of :py:class:`DMatrix` for details.,True


## 6. Test Evaluation


In [87]:
import numpy as np
import pandas as pd
from sklearn.metrics import (
    average_precision_score,
    recall_score,
    accuracy_score,
    confusion_matrix,
    classification_report,
)

# Positive-class (churn) probability and the model's hard label predictions
# for the hold-out split.
y_proba = xgb_model.predict_proba(X_test_xgb)[:, 1]
y_pred = xgb_model.predict(X_test_xgb)

# PR-AUC is computed from the probabilities; recall and accuracy from the
# hard labels.
_pr_auc = average_precision_score(y_test, y_proba)
_churn_recall = recall_score(y_test, y_pred)
_accuracy = accuracy_score(y_test, y_pred)

print(f"PR-AUC: {_pr_auc:.4f}")
print(f"Recall (Churn): {_churn_recall:.4f}")
print(f"Accuracy: {_accuracy:.4f}")


PR-AUC: 0.9360
Recall (Churn): 0.8469
Accuracy: 0.9751


## 7. Confusion Matrix


In [88]:
# Rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_test, y_pred)

_actual_labels = ["Actual 0", "Actual 1"]
_predicted_labels = ["Pred 0", "Pred 1"]
pd.DataFrame(cm, index=_actual_labels, columns=_predicted_labels)


Unnamed: 0,Pred 0,Pred 1
Actual 0,154107,1797
Actual 1,2494,13796


## 8. Classification Report


In [89]:
# Per-class precision / recall / F1 on the hold-out split, four decimals.
_report_text = classification_report(y_test, y_pred, digits=4)
print(_report_text)


              precision    recall  f1-score   support

           0     0.9841    0.9885    0.9863    155904
           1     0.8848    0.8469    0.8654     16290

    accuracy                         0.9751    172194
   macro avg     0.9344    0.9177    0.9258    172194
weighted avg     0.9747    0.9751    0.9748    172194



## 9. Feature Importance (Gain)


In [90]:
# Pair each training column with the importance score reported by the fitted
# estimator.
# NOTE(review): importance_type is not set on the estimator, so the score type
# is whatever the library defaults to. The section title says "Gain"; confirm.
feature_names = X_train_xgb.columns
importances = xgb_model.feature_importances_

_importance_table = pd.DataFrame(
    {"feature": feature_names, "importance": importances}
)
# Most important features first.
imp_xgb_df = _importance_table.sort_values(by="importance", ascending=False)

imp_xgb_df


Unnamed: 0,feature,importance
6,is_auto_renew_last,0.364567
5,has_ever_cancelled,0.229721
82,payment_count_last_90d,0.082512
75,last_plan_days,0.071337
80,subscription_months_est,0.051099
81,payment_count_last_30d,0.030167
76,total_payment_count,0.020505
74,days_since_last_cancel,0.019381
73,days_since_last_payment,0.012221
3,last_payment_method,0.010772


In [91]:
# imp_xgb_df.to_csv(
#     "../data/model_df/xgb_feature_importance.csv",
#     index=False
# )
