# Import

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Label column (Indonesian for "subscribes to a deposit").
target = 'berlangganan_deposito'

# Read both CSV files first, then strip the customer identifier from each
# frame so it cannot be used as a feature.
train, test = (
    pd.read_csv(path) for path in ('training_dataset.csv', 'validation_set.csv')
)
train, test = (
    frame.drop(columns=['customer_number']) for frame in (train, test)
)

In [None]:
# Print column dtypes, non-null counts and memory usage of the training frame.
train.info()

In [None]:
# Summary statistics, transposed so that each described column becomes one row.
train.describe().T

In [None]:
# Index of every column stored with the pandas `object` dtype.
obj_columns = train.select_dtypes(include=['object']).columns

# For each of those columns, print its frequency table followed by the
# number of distinct values it holds.
for col in obj_columns:
    series = train[col]
    print(f"\n{col}:")
    print(series.value_counts())
    print(f"Total unique values: {series.nunique()}")

# Baseline Model

## Split to train and validation

In [None]:
# Stratified 80/20 split on the label column.
# NOTE: `train` is rebound to the 80% part, so re-running this cell splits the
# already-reduced frame again.
train, val = train_test_split(
    train,
    stratify=train[target],
    test_size=0.2,
    random_state=42,
)

# Separate the label from the features for both partitions.
y_train, y_val = train[target], val[target]
X_train, X_val = train.drop(columns=[target]), val.drop(columns=[target])

## Label-encode categorical columns for LightGBM and XGBoost

In [None]:
# Integer-encode the object columns for LightGBM and XGBoost.
#
# One encoder is fitted per column and kept in `label_encoders`, keyed by
# column name, so the exact same mapping can later be applied to other frames
# (e.g. `label_encoders[col].transform(test[col])`). Refitting a single shared
# encoder would leave only the last column's mapping available.
label_encoders = {}

X_train_le = X_train.copy()
X_val_le = X_val.copy()

for col in obj_columns:
    # `le` stays bound (to the most recent column's encoder) for any later
    # cell that still refers to it.
    le = LabelEncoder()
    # Fit on the training split only; `transform` raises ValueError if the
    # validation split holds a category never seen during fitting.
    X_train_le[col] = le.fit_transform(X_train[col])
    X_val_le[col] = le.transform(X_val[col])
    label_encoders[col] = le

## Modelling

In [None]:

# CatBoost is given the raw object columns by name through `cat_features`.
ctb_params = dict(
    task_type='GPU',  # train on the GPU
    devices='0',  # GPU device id; adjust to the local setup
    cat_features=obj_columns.tolist(),
    eval_metric='AUC',
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    random_seed=42,
    verbose=100,
)
ctb = CatBoostClassifier(**ctb_params)

# LightGBM baseline.
lgb_params = dict(
    device='gpu',
    gpu_platform_id=0,  # adjust to the local GPU setup
    gpu_device_id=0,  # adjust to the local GPU setup
    n_estimators=1000,
    learning_rate=0.1,
    num_leaves=31,
    random_state=42,
)
lgb = LGBMClassifier(**lgb_params)

# XGBoost baseline: `device='cuda'` is what selects the GPU, while
# `tree_method='hist'` picks the histogram-based tree builder.
xgb_params = dict(
    device='cuda',
    tree_method='hist',
    eval_metric='auc',
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
)
xgb = XGBClassifier(**xgb_params)

## Fit

In [None]:
# Validation pairs: raw features for CatBoost, label-encoded for the others.
raw_eval = (X_val, y_val)
encoded_eval = (X_val_le, y_val)

# CatBoost: early stopping with a patience of 100 rounds on the validation
# pair, logging every 100 iterations.
ctb.fit(
    X_train,
    y_train,
    eval_set=raw_eval,
    verbose=100,
    early_stopping_rounds=100,
)

# LightGBM: the encoded object columns are declared as categorical and AUC is
# evaluated on the validation pair.
# NOTE(review): no early stopping is requested in this call or in the XGBoost
# one below, unlike the CatBoost fit above.
lgb.fit(
    X_train_le,
    y_train,
    eval_metric='auc',
    categorical_feature=obj_columns.tolist(),
    eval_set=encoded_eval,
)

# XGBoost: evaluated on the same encoded validation pair, logging every 100
# rounds.
xgb.fit(
    X_train_le,
    y_train,
    verbose=100,
    eval_set=[encoded_eval],
)




In [None]:
# One figure with one ROC line per model.
def plot_roc_curves(models, X_val, X_val_le, y_val):
    """Draw the validation ROC curve of every model on one shared figure.

    Args:
        models: Iterable of fitted classifiers exposing ``predict_proba``.
        X_val: Validation features with the raw categorical columns; used for
            CatBoost models.
        X_val_le: Label-encoded validation features; used for every other
            model.
        y_val: True labels of the validation rows.
    """
    plt.figure(figsize=(10, 6))

    for model in models:
        # Choose the feature frame from the model's type instead of comparing
        # against the notebook-global `ctb`, which is rebound whenever a new
        # CatBoost model is created.
        features = X_val if isinstance(model, CatBoostClassifier) else X_val_le
        # Column 1 holds the predicted probability of the positive class.
        y_proba = model.predict_proba(features)[:, 1]
        fpr, tpr, _ = roc_curve(y_val, y_proba)
        roc_auc = roc_auc_score(y_val, y_proba)
        plt.plot(fpr, tpr, label=f'{model.__class__.__name__} (AUC = {roc_auc:.2f})')

    # Diagonal reference line of a classifier that guesses at random.
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guessing')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves')
    plt.legend()
    plt.grid()
    plt.show()

# Compare the three fitted models on the validation split.
plot_roc_curves([ctb, lgb, xgb], X_val, X_val_le, y_val)

def plot_precision_recall_curves(models, X_val, X_val_le, y_val):
    """Draw the validation precision-recall curve of every model on one figure.

    Args:
        models: Iterable of fitted classifiers exposing ``predict_proba``.
        X_val: Validation features with the raw categorical columns; used for
            CatBoost models.
        X_val_le: Label-encoded validation features; used for every other
            model.
        y_val: True labels of the validation rows.
    """
    # Imported here so the function does not rely on the notebook's import cell.
    from sklearn.metrics import auc

    plt.figure(figsize=(10, 6))

    for model in models:
        # Choose the feature frame from the model's type instead of comparing
        # against the notebook-global `ctb`, which is rebound whenever a new
        # CatBoost model is created.
        features = X_val if isinstance(model, CatBoostClassifier) else X_val_le
        # Column 1 holds the predicted probability of the positive class.
        y_proba = model.predict_proba(features)[:, 1]
        precision, recall, _ = precision_recall_curve(y_val, y_proba)
        # `precision_recall_curve` returns recall in decreasing order, so a
        # plain trapezoid integration over it comes out negative; `auc`
        # accepts a monotonically decreasing x and returns the positive area.
        auc_pr = auc(recall, precision)
        plt.plot(recall, precision, label=f'{model.__class__.__name__} (AUC PR = {auc_pr:.2f})')

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curves')
    plt.legend()
    plt.grid()
    plt.show()

# Precision-recall comparison of the same three models on the validation split.
plot_precision_recall_curves([ctb, lgb, xgb], X_val, X_val_le, y_val)

# Weighted Model

## Calculate class weights and scale_pos_weight

In [None]:
# Number of training rows per label value.
classcount = y_train.value_counts()
n_negative, n_positive = classcount[0], classcount[1]

# Class-weight mapping for CatBoost and LightGBM: class 1 keeps weight 1 and
# class 0 is scaled by the positive/negative count ratio.
classweights = {0: n_positive / n_negative, 1: 1.0}

# XGBoost expresses the same relative weighting as one multiplier applied to
# the positive class.
scaleposweight = n_negative / n_positive

print(f"Class Weights: {classweights}")
print(f"Scale Pos Weight: {scaleposweight}")

## Modelling

In [None]:

# Same three boosters as the baseline, now with class re-weighting.

# CatBoost takes the raw object columns by name and the class-weight mapping.
ctb_params = dict(
    task_type='GPU',  # train on the GPU
    devices='0',  # GPU device id; adjust to the local setup
    cat_features=obj_columns.tolist(),
    class_weights=classweights,
    eval_metric='AUC',
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    random_seed=42,
    verbose=100,
)
ctb = CatBoostClassifier(**ctb_params)

# LightGBM receives the same mapping through `class_weight`.
lgb_params = dict(
    device='gpu',
    gpu_platform_id=0,  # adjust to the local GPU setup
    gpu_device_id=0,  # adjust to the local GPU setup
    class_weight=classweights,
    n_estimators=1000,
    learning_rate=0.1,
    num_leaves=31,
    random_state=42,
)
lgb = LGBMClassifier(**lgb_params)

# XGBoost is weighted through `scale_pos_weight`. `device='cuda'` is what
# selects the GPU, while `tree_method='hist'` picks the histogram-based tree
# builder.
xgb_params = dict(
    device='cuda',
    tree_method='hist',
    eval_metric='auc',
    scale_pos_weight=scaleposweight,
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
)
xgb = XGBClassifier(**xgb_params)

In [None]:
# Validation pairs: raw features for CatBoost, label-encoded for the others.
raw_eval = (X_val, y_val)
encoded_eval = (X_val_le, y_val)

# CatBoost: early stopping with a patience of 100 rounds on the validation
# pair, logging every 100 iterations.
ctb.fit(
    X_train,
    y_train,
    eval_set=raw_eval,
    verbose=100,
    early_stopping_rounds=100,
)

# LightGBM: the encoded object columns are declared as categorical and AUC is
# evaluated on the validation pair.
# NOTE(review): no early stopping is requested in this call or in the XGBoost
# one below, unlike the CatBoost fit above.
lgb.fit(
    X_train_le,
    y_train,
    eval_metric='auc',
    categorical_feature=obj_columns.tolist(),
    eval_set=encoded_eval,
)

# XGBoost: evaluated on the same encoded validation pair, logging every 100
# rounds.
xgb.fit(
    X_train_le,
    y_train,
    verbose=100,
    eval_set=[encoded_eval],
)




In [None]:
# One figure with one ROC line per model.
def plot_roc_curves(models, X_val, X_val_le, y_val):
    """Draw the validation ROC curve of every model on one shared figure.

    Args:
        models: Iterable of fitted classifiers exposing ``predict_proba``.
        X_val: Validation features with the raw categorical columns; used for
            CatBoost models.
        X_val_le: Label-encoded validation features; used for every other
            model.
        y_val: True labels of the validation rows.
    """
    plt.figure(figsize=(10, 6))

    for model in models:
        # Choose the feature frame from the model's type instead of comparing
        # against the notebook-global `ctb`, which is rebound whenever a new
        # CatBoost model is created.
        features = X_val if isinstance(model, CatBoostClassifier) else X_val_le
        # Column 1 holds the predicted probability of the positive class.
        y_proba = model.predict_proba(features)[:, 1]
        fpr, tpr, _ = roc_curve(y_val, y_proba)
        roc_auc = roc_auc_score(y_val, y_proba)
        plt.plot(fpr, tpr, label=f'{model.__class__.__name__} (AUC = {roc_auc:.2f})')

    # Diagonal reference line of a classifier that guesses at random.
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guessing')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves')
    plt.legend()
    plt.grid()
    plt.show()

# ROC comparison of the class-weighted models on the validation split.
plot_roc_curves([ctb, lgb, xgb], X_val, X_val_le, y_val)

def plot_precision_recall_curves(models, X_val, X_val_le, y_val):
    """Draw the validation precision-recall curve of every model on one figure.

    Args:
        models: Iterable of fitted classifiers exposing ``predict_proba``.
        X_val: Validation features with the raw categorical columns; used for
            CatBoost models.
        X_val_le: Label-encoded validation features; used for every other
            model.
        y_val: True labels of the validation rows.
    """
    # Imported here so the function does not rely on the notebook's import cell.
    from sklearn.metrics import auc

    plt.figure(figsize=(10, 6))

    for model in models:
        # Choose the feature frame from the model's type instead of comparing
        # against the notebook-global `ctb`, which is rebound whenever a new
        # CatBoost model is created.
        features = X_val if isinstance(model, CatBoostClassifier) else X_val_le
        # Column 1 holds the predicted probability of the positive class.
        y_proba = model.predict_proba(features)[:, 1]
        precision, recall, _ = precision_recall_curve(y_val, y_proba)
        # `precision_recall_curve` returns recall in decreasing order, so a
        # plain trapezoid integration over it comes out negative; `auc`
        # accepts a monotonically decreasing x and returns the positive area.
        auc_pr = auc(recall, precision)
        plt.plot(recall, precision, label=f'{model.__class__.__name__} (AUC PR = {auc_pr:.2f})')

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curves')
    plt.legend()
    plt.grid()
    plt.show()

# Precision-recall comparison of the class-weighted models on the validation split.
plot_precision_recall_curves([ctb, lgb, xgb], X_val, X_val_le, y_val)