In [1]:
import pandas as pd
from src.process_bank_churn import preprocess_new_data, preprocess_data
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import lightgbm as lgb

In [2]:
raw_df = pd.read_csv("bank-customer-churn-prediction-dlu-course-c-2/train.csv")

In [3]:
preproc_data = preprocess_data(raw_df)

train_inputs = preproc_data["train_X"]
train_targets = preproc_data["train_y"]
val_inputs = preproc_data["test_X"]
val_targets = preproc_data["test_y"]

In [4]:
# XGBoost model
xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)

# LightGBM model
lgb_model = lgb.LGBMClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
    min_child_samples=5,
    subsample=0.7,
)

# Gradient Boosting model
gb_model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
    min_samples_split=5,
    subsample=0.7
)

In [5]:
gb_model.fit(train_inputs, train_targets)

train_auc = roc_auc_score(train_targets, gb_model.predict_proba(train_inputs)[:, 1])
test_auc = roc_auc_score(val_targets, gb_model.predict_proba(val_inputs)[:, 1])

print(f"Train AUC: {train_auc:.4f}")
print(f"Test AUC: {test_auc:.4f}")

Train AUC: 0.9469
Test AUC: 0.9394


In [6]:
xgb_model.fit(train_inputs, train_targets)

train_auc = roc_auc_score(train_targets, xgb_model.predict_proba(train_inputs)[:, 1])
test_auc = roc_auc_score(val_targets, xgb_model.predict_proba(val_inputs)[:, 1])

print(f"Train AUC: {train_auc:.4f}")
print(f"Test AUC: {test_auc:.4f}")

Train AUC: 0.9445
Test AUC: 0.9406


In [7]:
lgb_model.fit(train_inputs, train_targets)

train_auc = roc_auc_score(train_targets, lgb_model.predict_proba(train_inputs)[:, 1])
test_auc = roc_auc_score(val_targets, lgb_model.predict_proba(val_inputs)[:, 1])

print(f"Train AUC: {train_auc:.4f}")
print(f"Test AUC: {test_auc:.4f}")

[LightGBM] [Info] Number of positive: 2289, number of negative: 8961
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000635 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1173
[LightGBM] [Info] Number of data points in the train set: 11250, number of used features: 140
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.203467 -> initscore=-1.364767
[LightGBM] [Info] Start training from score -1.364767
Train AUC: 0.9470
Test AUC: 0.9398


In [8]:
test_raw_df = pd.read_csv("bank-customer-churn-prediction-dlu-course-c-2/test.csv")

test_X = preprocess_new_data(test_raw_df, preproc_data['encoder'], preproc_data['scaler'])
prediction_probs = xgb_model.predict_proba(test_X)[:, 1]

# Формування submission.csv
sample_raw_df = pd.DataFrame({'id': test_raw_df["id"].values})
sample_raw_df['Exited'] = prediction_probs
sample_raw_df.to_csv("bank-customer-churn-prediction-dlu-course-c-2/submission_log_reg.csv", index=False)