In [266]:
import pandas as pd
import numpy as np
from src.process_bank_churn import preprocess_new_data, preprocess_data, split_features_target, split_train_test
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import lightgbm as lgb

In [267]:
raw_df = pd.read_csv("bank-customer-churn-prediction-dlu-course-c-2/train.csv")

In [268]:
cat_cols = raw_df.select_dtypes('object').columns
raw_df[cat_cols] = raw_df[cat_cols].astype('category')

X_cols = raw_df.drop(columns=['id', 'CustomerId', 'Exited']).columns
y_col = raw_df[['Exited']].columns[0]

train_df, test_df, _, __ = split_train_test(raw_df, raw_df[y_col])
(train_inputs, train_targets), (val_inputs, val_targets) = split_features_target(train_df, X_cols, y_col), split_features_target(test_df, X_cols, y_col)

In [269]:
# preproc_data = preprocess_data(raw_df, 'Exited', ["id", "CustomerId"])

# train_inputs = preproc_data["train_X"]
# train_targets = preproc_data["train_y"]
# val_inputs = preproc_data["test_X"]
# val_targets = preproc_data["test_y"]

In [270]:
best = {
    'colsample_bytree': 0.82,
    'learning_rate': 0.05,
    'max_depth': 1,
    'min_child_weight': 9,
    'n_estimators': 400,
    'num_leaves': 101,
    'reg_alpha': 0.51,
    'reg_lambda': 0.8,
    'subsample': 0.76
}

# LightGBM model with best hyperparameters
lgb_model = lgb.LGBMClassifier(
    n_estimators=best['n_estimators'],
    learning_rate=best['learning_rate'],
    max_depth=best['max_depth'],
    min_child_weight=best['min_child_weight'],
    num_leaves=best['num_leaves'],
    subsample=best['subsample'],
    random_state=42
)

lgb_model.fit(train_inputs, train_targets)

train_auc = roc_auc_score(train_targets, lgb_model.predict_proba(train_inputs)[:, 1])
test_auc = roc_auc_score(val_targets, lgb_model.predict_proba(val_inputs)[:, 1])

print(f"Train AUC: {train_auc:.4f}")
print(f"Test AUC: {test_auc:.4f}")

[LightGBM] [Info] Number of positive: 2442, number of negative: 9558
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000690 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1316
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.203500 -> initscore=-1.364561
[LightGBM] [Info] Start training from score -1.364561
Train AUC: 0.9375
Test AUC: 0.9329


In [271]:
best = {
    'alpha': 0.7,
    'colsample_bytree': 0.6,
    'gamma': 0.05,
    'learning_rate': 0.035,
    'max_depth': 6,
    'min_child_weight': 2,
    'n_estimators': 150,
    'subsample': 0.72
}

# XGBoost model with parameters from best dict
xgb_model = xgb.XGBClassifier(
    tree_method="hist",
    eval_metric="auc",
    missing=np.nan,
    enable_categorical=True,
    random_state=42,
    n_estimators=best['n_estimators'],
    learning_rate=best['learning_rate'],
    max_depth=best['max_depth'],
    subsample=best['subsample'],
    colsample_bytree=best['colsample_bytree'],
    min_child_weight=best['min_child_weight'],
    gamma=best['gamma'],
    alpha=best['alpha']
)

xgb_model.fit(train_inputs, train_targets)

train_auc = roc_auc_score(train_targets, xgb_model.predict_proba(train_inputs)[:, 1])
test_auc = roc_auc_score(val_targets, xgb_model.predict_proba(val_inputs)[:, 1])

print(f"Train AUC: {train_auc:.4f}")
print(f"Test AUC: {test_auc:.4f}")

Train AUC: 0.9601
Test AUC: 0.9402


In [272]:
test_raw_df = pd.read_csv("bank-customer-churn-prediction-dlu-course-c-2/test.csv")

ids = test_raw_df["id"].values

test_raw_df = test_raw_df.drop(['id', 'CustomerId'], axis=1)
test_raw_df[cat_cols] = test_raw_df[cat_cols].astype('category')

prediction_probs = xgb_model.predict_proba(test_raw_df)[:, 1]

# Формування submission.csv
sample_raw_df = pd.DataFrame({'id': ids})
sample_raw_df['Exited'] = prediction_probs
sample_raw_df.to_csv("bank-customer-churn-prediction-dlu-course-c-2/submission_log_reg.csv", index=False)