In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import (train_test_split, StratifiedKFold)
from category_encoders import TargetEncoder
from sklearn.metrics import roc_auc_score
from sklearn.base import clone

import xgboost as xgb
from xgboost import XGBClassifier

import cupy as cp

ModuleNotFoundError: No module named 'category_encoders'

In [None]:
SEED = 44
N_SPLITS = 10

train_path = "train.csv"
test_path = "test.csv"
sub_path = "sample_submission.csv"
original_path = "/kaggle/input/bank-marketing-dataset-full/bank-full.csv"

In [3]:
def add_features(dataset: pd.DataFrame) -> pd.DataFrame:
    df = dataset.copy()
    
    df["arcsinh_balance"] = np.arcsinh(df["balance"])
    df["arcsinh_duration"] = np.arcsinh(df["duration"])
    df["balance/age"] = df["balance"] * df["age"]
    df["arcsinh_balance/age"] = df["arcsinh_balance"] * df["age"]
    
    df["day_sin"] = np.sin(2 * np.pi * df["day"] / 31)
    df["day_cos"] = np.cos(2 * np.pi * df["day"] / 31)
    
    month_map = {'jan':1, 'feb':2, 'mar':3, 'apr':4, 'may':5, 'jun':6,
                 'jul':7, 'aug':8, 'sep':9, 'oct':10, 'nov':11, 'dec':12}
    df["month_num"] = df["month"].map(month_map).astype("int64")
    df = df.drop("month", axis=1)
    df = df.rename(columns={"month_num": "month"})
    
    df["month_sin"] = np.sin(2 * np.pi * df["month"] / 12)
    df["month_cos"] = np.cos(2 * np.pi * df["month"] / 12)
    
    return df

def cross_val_skf_and_pred(model,
                           X_train: cp.array,
                           y_train: cp.array,
                           X_test: cp.array,                  
                           n_splits: int=None,
                           shuffle: bool=True):
    
    skf = StratifiedKFold(n_splits=n_splits, shuffle=shuffle, random_state=SEED)
    oof_train = np.zeros(len(X_train))
    oof_test = np.zeros(len(X_test))
    scores = []

    X_train_cpu = cp.asnumpy(X_train)
    y_train_cpu = cp.asnumpy(y_train)
    
    for i, (tr_idx, vl_idx) in enumerate(skf.split(X_train_cpu, y_train_cpu), 1):
        X_tr, X_vl = X_train[tr_idx], X_train[vl_idx]
        y_tr, y_vl = y_train[tr_idx], y_train[vl_idx]
        
        model.fit(X_tr,
                y_tr,
                eval_set=[(X_vl, y_vl)],
                verbose=False)
        
        best_iter = model.best_iteration + 1
        y_probs = model.predict_proba(X_vl, iteration_range=(0, best_iter))[:, 1]
        oof_train[vl_idx] = y_probs
        oof_test += model.predict_proba(X_test, iteration_range=(0, best_iter))[:, 1] / n_splits
        
        auc = roc_auc_score(y_vl.get(), y_probs)
        scores.append(auc)
        
        print(f"Fold {i}/{n_splits} | Roc Auc: {auc}")
        
    return {"scores": scores,
            "train": oof_train,
            "test": oof_test}

In [4]:
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
submmission = pd.read_csv(sub_path)
original = pd.read_csv(original_path, sep=";")

print("\n-----Train data-----")
display(train.head())
train.info()
print(train.isna().sum())
display(train.describe())
print("\n-----Original data-----")
display(original.head())
original.info()
print(original.isna().sum())
display(original.describe())
print("\n-----Test data-----")
display(test.head())
test.info()
print(test.isna().sum())
display(test.describe())



-----Train data-----


Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,0,42,technician,married,secondary,no,7,no,no,cellular,25,aug,117,3,-1,0,unknown,0
1,1,38,blue-collar,married,secondary,no,514,no,no,unknown,18,jun,185,1,-1,0,unknown,0
2,2,36,blue-collar,married,secondary,no,602,yes,no,unknown,14,may,111,2,-1,0,unknown,0
3,3,27,student,single,secondary,no,34,yes,no,unknown,28,may,10,2,-1,0,unknown,0
4,4,26,technician,married,secondary,no,889,yes,no,cellular,3,feb,902,1,-1,0,unknown,1


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 18 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   id         750000 non-null  int64 
 1   age        750000 non-null  int64 
 2   job        750000 non-null  object
 3   marital    750000 non-null  object
 4   education  750000 non-null  object
 5   default    750000 non-null  object
 6   balance    750000 non-null  int64 
 7   housing    750000 non-null  object
 8   loan       750000 non-null  object
 9   contact    750000 non-null  object
 10  day        750000 non-null  int64 
 11  month      750000 non-null  object
 12  duration   750000 non-null  int64 
 13  campaign   750000 non-null  int64 
 14  pdays      750000 non-null  int64 
 15  previous   750000 non-null  int64 
 16  poutcome   750000 non-null  object
 17  y          750000 non-null  int64 
dtypes: int64(9), object(9)
memory usage: 103.0+ MB
id           0
age          0
job    

Unnamed: 0,id,age,balance,day,duration,campaign,pdays,previous,y
count,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0
mean,374999.5,40.926395,1204.067397,16.117209,256.229144,2.577008,22.412733,0.298545,0.120651
std,216506.495284,10.098829,2836.096759,8.250832,272.555662,2.718514,77.319998,1.335926,0.325721
min,0.0,18.0,-8019.0,1.0,1.0,1.0,-1.0,0.0,0.0
25%,187499.75,33.0,0.0,9.0,91.0,1.0,-1.0,0.0,0.0
50%,374999.5,39.0,634.0,17.0,133.0,2.0,-1.0,0.0,0.0
75%,562499.25,48.0,1390.0,21.0,361.0,3.0,-1.0,0.0,0.0
max,749999.0,95.0,99717.0,31.0,4918.0,63.0,871.0,200.0,1.0



-----Original data-----


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB
age          0
job          0
marital      0
education    0
default      0
balance      0
housing  

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,1362.272058,15.806419,258.16308,2.763841,40.197828,0.580323
std,10.618762,3044.765829,8.322476,257.527812,3.098021,100.128746,2.303441
min,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0
25%,33.0,72.0,8.0,103.0,1.0,-1.0,0.0
50%,39.0,448.0,16.0,180.0,2.0,-1.0,0.0
75%,48.0,1428.0,21.0,319.0,3.0,-1.0,0.0
max,95.0,102127.0,31.0,4918.0,63.0,871.0,275.0



-----Test data-----


Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,750000,32,blue-collar,married,secondary,no,1397,yes,no,unknown,21,may,224,1,-1,0,unknown
1,750001,44,management,married,tertiary,no,23,yes,no,cellular,3,apr,586,2,-1,0,unknown
2,750002,36,self-employed,married,primary,no,46,yes,yes,cellular,13,may,111,2,-1,0,unknown
3,750003,58,blue-collar,married,secondary,no,-1380,yes,yes,unknown,29,may,125,1,-1,0,unknown
4,750004,28,technician,single,secondary,no,1950,yes,no,cellular,22,jul,181,1,-1,0,unknown


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 17 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   id         250000 non-null  int64 
 1   age        250000 non-null  int64 
 2   job        250000 non-null  object
 3   marital    250000 non-null  object
 4   education  250000 non-null  object
 5   default    250000 non-null  object
 6   balance    250000 non-null  int64 
 7   housing    250000 non-null  object
 8   loan       250000 non-null  object
 9   contact    250000 non-null  object
 10  day        250000 non-null  int64 
 11  month      250000 non-null  object
 12  duration   250000 non-null  int64 
 13  campaign   250000 non-null  int64 
 14  pdays      250000 non-null  int64 
 15  previous   250000 non-null  int64 
 16  poutcome   250000 non-null  object
dtypes: int64(8), object(9)
memory usage: 32.4+ MB
id           0
age          0
job          0
marital      0
education    0
def

Unnamed: 0,id,age,balance,day,duration,campaign,pdays,previous
count,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0
mean,874999.5,40.932332,1197.426352,16.116068,255.34226,2.573548,22.280028,0.303728
std,72168.927986,10.081613,2741.520699,8.258509,271.404326,2.709661,76.915879,1.384574
min,750000.0,18.0,-8019.0,1.0,3.0,1.0,-1.0,0.0
25%,812499.75,33.0,0.0,9.0,91.0,1.0,-1.0,0.0
50%,874999.5,39.0,631.0,17.0,133.0,2.0,-1.0,0.0
75%,937499.25,48.0,1389.0,21.0,353.0,3.0,-1.0,0.0
max,999999.0,95.0,98517.0,31.0,4918.0,58.0,871.0,150.0


In [5]:
original["y"] = original["y"].map({"no": 0, "yes": 1})
all_data = pd.concat([train, original, test], ignore_index=True)

all_data = add_features(all_data)

all_y = all_data["y"]
all_data = all_data.drop(["id", "y"], axis=1)

num_features = all_data.select_dtypes(include=np.number).columns.to_list()
cat_features = all_data.select_dtypes("object").columns.to_list()

train_and_orig = all_data.iloc[:len(train)+len(original)]
y = all_y[:len(train)+len(original)]
X_test = all_data.iloc[-len(test):]

preprocer = ColumnTransformer([
    ("num", "passthrough", num_features),
    ("cat", TargetEncoder(), cat_features)
])

X_train = preprocer.fit_transform(train_and_orig, y)
y_train = np.array(y)
X_test = preprocer.transform(X_test)

X_train.shape, y_train.shape, X_test.shape

((795211, 24), (795211,), (250000, 24))

In [6]:
X_train_gpu = cp.array(X_train, dtype=cp.float32)
y_train_gpu = cp.array(y_train, dtype=cp.float32)
X_test_gpu = cp.array(X_test, dtype=cp.float32)

In [7]:
# hyperparameters (selected Optuna)
xgb_params = {"objective": "binary:logistic",
               "eval_metric": "auc",
               "n_estimators": 1421,
               "max_bin": 11619,
               "max_depth": 9,
               "learning_rate": 0.060542156678920725,
               "subsample": 0.8860705042275745,
               "colsample_bytree": 0.5890358175215191,
               "reg_alpha": 0.004559752117634602,
               "reg_lambda": 0.004194941299345613,
               "min_child_weight": 5,
               "gamma": 0.2226553985484643,
               "scale_pos_weight": 1.2631169451535507,
               "grow_policy" : "lossguide",
               "tree_method": "hist",
               "early_stopping_rounds": 100,
               "device": "cuda",
               "seed": SEED}

In [None]:
model = XGBClassifier(**xgb_params)

results = cross_val_skf_and_pred(model,
                                 X_train_gpu,
                                 y_train_gpu,
                                 X_test_gpu,
                                 N_SPLITS)

print(f"Average roc auc: {np.mean(results['scores'])}")

Fold 1/10 | Roc Auc: 0.9722464246491905


In [None]:
feature_importance = pd.DataFrame({"feature": preprocer.get_feature_names_out(),
                                   "importance": model.feature_importances_})
feature_importance = feature_importance.sort_values(by="importance", ascending=False).iloc[:20]

sns.barplot(x=feature_importance["importance"], y=feature_importance["feature"])

In [None]:
submmission["y"] = results["test"]
submmission.to_csv("xgb.csv", index=False)
submmission.head()