In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

SEED = 42

import matplotlib.pyplot as plt
%matplotlib inline

# Read data & remove cols

In [None]:
target = "Attrition_Flag"  # Column the models are trained to predict
ids = ["CLIENTNUM"]  # Identifier columns: dropped so the models cannot memorise them

train = pd.read_csv("/kaggle/input/credit-card-customers/BankChurners.csv", low_memory=True)
# Remove every "Naive_..." column together with the identifier columns in one pass.
naive_bayes_cols = [name for name in train.columns if name.startswith("Naive_")]
train.drop(columns=naive_bayes_cols + ids, inplace=True)

train.head()

# Build tailored split

The validation set is used for early stopping.
The test set must stay untouched so that it gives an unbiased performance estimate (to compare with the validation score).

In [None]:
# Build a "Set" column assigning every row to "train", "valid" or "test".
# The cell is skipped when the column already exists.
if "Set" not in train.columns:
    print("Building tailored column")
    labels = train[target].values

    # First split: hold out 10% of all rows as the test set (stratified on the target).
    # The returned arrays are row positions in the full frame.
    train_valid_index, test_index = next(
        StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=SEED).split(
            range(labels.shape[0]), labels
        )
    )
    # Second split: carve 10% of the remaining rows out as the validation set.
    # split() returns positions *within* `train_valid_index`, not row positions
    # of the full frame, so they must be mapped back before being used.
    train_pos, valid_pos = next(
        StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=SEED).split(
            train_valid_index, labels[train_valid_index]
        )
    )
    train_index = train_valid_index[train_pos]
    valid_index = train_valid_index[valid_pos]

    # Fill a plain array positionally and assign it once; this avoids chained
    # assignment (`train["Set"][...] = ...`), which is unreliable in pandas.
    set_labels = np.full(labels.shape[0], "train", dtype=object)
    set_labels[valid_index] = "valid"
    set_labels[test_index] = "test"
    train["Set"] = set_labels

In [None]:
# Index labels of the rows belonging to each part of the split.
train_indices = train.index[(train["Set"] == "train").to_numpy()]
valid_indices = train.index[(train["Set"] == "valid").to_numpy()]
test_indices = train.index[(train["Set"] == "test").to_numpy()]

In [None]:
# Turn the target into integer category codes, then remove the target and the
# split marker from the feature frame so neither can be used as a feature.
y = (
    train[target]
    .astype("category")
    .cat.codes
)
train.drop([target, "Set"], axis=1, inplace=True)

In [None]:
# Identify categorical columns + label encode

In [None]:
# Positions (within train.columns) of the columns that get label-encoded below;
# later passed to LightGBM as `categorical_feature`.
cat_idxs = []

In [None]:
# Replace every object-typed column by its integer category codes and remember
# the column's position so it can be declared categorical to the model.
for i, col in enumerate(train.columns):
    column = train[col]
    if column.dtype != "object":
        continue
    train[col] = column.astype("category").cat.codes
    cat_idxs.append(i)

In [None]:
# No NaN values, so no imputation is needed

# Simple LGBM + performance

In [None]:
# Gradient-boosted baseline on all features. n_estimators is only an upper
# bound: training stops once the validation metric has not improved for 20 rounds.
# NOTE(review): `early_stopping_rounds` / `verbose` as fit() keywords were removed
# in LightGBM 4.0 (replaced by callbacks) - confirm the installed version.
all_features = train.values
clf = LGBMClassifier(
    num_leaves=7,
    importance_type="gain",
    n_estimators=20000,
    random_state=SEED,
)
clf.fit(
    X=all_features[train_indices],
    y=y[train_indices],
    eval_set=[(all_features[valid_indices], y[valid_indices])],
    categorical_feature=cat_idxs,
    early_stopping_rounds=20,
    verbose=10,
)

In [None]:
# Validation ROC AUC of the full-feature LightGBM model.
# ROC AUC needs a continuous score, so use the predicted probability of
# class 1; hard labels from predict() collapse the curve to a single point.
roc_auc_score(
    y_true=y[valid_indices],
    y_score=clf.predict_proba(train.values[valid_indices])[:, 1],
)

In [None]:
# Test ROC AUC of the full-feature LightGBM model.
# ROC AUC needs a continuous score, so use the predicted probability of
# class 1; hard labels from predict() collapse the curve to a single point.
roc_auc_score(
    y_true=y[test_indices],
    y_score=clf.predict_proba(train.values[test_indices])[:, 1],
)

In [None]:
# Scale the model's feature importances so that they sum to 1.
total_importance = clf.feature_importances_.sum()
importances = clf.feature_importances_ / total_importance

In [None]:
def explain_plot(importances, columns):
    """Draw a horizontal bar chart of feature importances.

    Bars are ordered by absolute importance, smallest at the bottom.

    Parameters
    ----------
    importances : array-like of shape (n_features,)
        One importance value per feature.
    columns : array-like of shape (n_features,)
        Feature names, aligned with ``importances``. Any sequence is accepted
        (``pd.Index``, list, tuple, ndarray).

    Raises
    ------
    ValueError
        If ``importances`` and ``columns`` do not have the same length, which
        would otherwise silently attach the wrong names to the bars.
    """
    # Convert up front so plain lists/tuples support the fancy indexing below.
    importances = np.asarray(importances)
    columns = np.asarray(columns)
    if importances.shape[0] != columns.shape[0]:
        raise ValueError(
            "importances and columns must have the same length, got "
            "{} and {}".format(importances.shape[0], columns.shape[0])
        )

    selection = np.argsort(np.absolute(importances))
    performance = importances[selection]
    y_pos = np.arange(performance.shape[0])

    plt.barh(y_pos, performance, align="center", alpha=0.5)
    plt.yticks(y_pos, columns[selection])
    plt.title("Feature importance")

    plt.show()

In [None]:
# Importance chart for the model trained on every feature.
explain_plot(importances=importances, columns=train.columns)

# Feature selection (keep the features covering 99% of total gain importance)

In [None]:
# Feature positions ordered from the largest to the smallest absolute importance.
indexes = (-np.absolute(importances)).argsort()
indexes

In [None]:
# Keep the top-ranked features until their cumulative normalised importance
# first reaches 0.99 (the feature that crosses the threshold is included).
cumulative_importance = np.cumsum(importances[indexes])
reached = np.flatnonzero(cumulative_importance >= 0.99)
# If the threshold is never reached (e.g. NaN or empty importances), keep every
# feature instead of indexing past the end of `indexes`.
n_keep = int(reached[0]) + 1 if reached.size else len(indexes)
to_keep = indexes[:n_keep].tolist()
to_keep

In [None]:
# Number of features discarded by the selection.
train.shape[1] - len(to_keep)

In [None]:
# Positions, within the reduced feature matrix (ordered like `to_keep`), of the
# columns that were label-encoded.
new_cat_idxs = np.flatnonzero(np.isin(to_keep, cat_idxs)).tolist()
new_cat_idxs

In [None]:
# Same LightGBM configuration, trained only on the selected features.
# NOTE(review): `early_stopping_rounds` / `verbose` as fit() keywords were removed
# in LightGBM 4.0 (replaced by callbacks) - confirm the installed version.
selected_features = train.values[:, to_keep]
clf_selected = LGBMClassifier(
    num_leaves=7,
    importance_type="gain",
    n_estimators=20000,
    random_state=SEED,
)
clf_selected.fit(
    X=selected_features[train_indices],
    y=y[train_indices],
    eval_set=[(selected_features[valid_indices], y[valid_indices])],
    categorical_feature=new_cat_idxs,
    early_stopping_rounds=20,
    verbose=10,
)

In [None]:
# Validation ROC AUC of the reduced-feature LightGBM model.
# ROC AUC needs a continuous score, so use the predicted probability of
# class 1; hard labels from predict() collapse the curve to a single point.
model_auc = roc_auc_score(
    y_true=y[valid_indices],
    y_score=clf_selected.predict_proba(train.values[valid_indices][:, to_keep])[:, 1],
)
model_auc

In [None]:
# Test ROC AUC of the reduced-feature LightGBM model.
# ROC AUC needs a continuous score, so use the predicted probability of
# class 1; hard labels from predict() collapse the curve to a single point.
model_auc = roc_auc_score(
    y_true=y[test_indices],
    y_score=clf_selected.predict_proba(train.values[test_indices][:, to_keep])[:, 1],
)
model_auc

In [None]:
# Scale the reduced model's feature importances so that they sum to 1.
# Note: this rebinds `importances`, replacing the full-model values.
total_importance_selected = clf_selected.feature_importances_.sum()
importances = clf_selected.feature_importances_ / total_importance_selected

In [None]:
# Importance chart for the model trained on the selected features only.
explain_plot(importances=importances, columns=train.columns[to_keep])

# Trying decision tree

Results seem too good; it looks like this is not real data.
A single decision tree already gives good results.

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Shallow decision tree on all features, as a simple interpretable baseline.
# Uses the notebook-wide SEED instead of a separately hard-coded 42 so that
# changing the seed in one place affects every model.
tree = DecisionTreeClassifier(random_state=SEED, max_depth=5)
tree.fit(train.values[train_indices], y.values[train_indices])

In [None]:
# Validation ROC AUC of the decision tree.
# ROC AUC needs a continuous score, so use the predicted probability of
# class 1; hard labels from predict() collapse the curve to a single point.
model_auc = roc_auc_score(
    y_true=y[valid_indices],
    y_score=tree.predict_proba(train.values[valid_indices])[:, 1],
)
model_auc

In [None]:
# Test ROC AUC of the decision tree.
# ROC AUC needs a continuous score, so use the predicted probability of
# class 1; hard labels from predict() collapse the curve to a single point.
model_auc = roc_auc_score(
    y_true=y[test_indices],
    y_score=tree.predict_proba(train.values[test_indices])[:, 1],
)
model_auc

In [None]:
from sklearn.tree import plot_tree

# Draw the fitted tree, labelling split nodes with the feature names.
plot_tree(decision_tree=tree, feature_names=list(train.columns))
plt.show()

In [None]:
# Walk the fitted tree's low-level arrays to compute each node's depth and
# whether it is a leaf, then print the structure as indented text.
fitted_tree = tree.tree_
n_nodes = fitted_tree.node_count
children_left = fitted_tree.children_left
children_right = fitted_tree.children_right
feature = fitted_tree.feature
threshold = fitted_tree.threshold

node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
is_leaves = np.zeros(shape=n_nodes, dtype=bool)

# Depth-first traversal with an explicit stack of (node id, depth) pairs,
# starting from the root (id 0, depth 0). Each node is popped exactly once.
stack = [(0, 0)]
while stack:
    node_id, depth = stack.pop()
    node_depth[node_id] = depth

    left_child = children_left[node_id]
    right_child = children_right[node_id]
    # A node whose left and right children are identical has no split: it is a leaf.
    if left_child == right_child:
        is_leaves[node_id] = True
        continue
    # Otherwise queue both children one level deeper.
    stack.extend([(left_child, depth + 1), (right_child, depth + 1)])

print("The binary tree structure has {n} nodes and has "
      "the following tree structure:\n".format(n=n_nodes))
for i in range(n_nodes):
    indent = node_depth[i] * "\t"  # one tab per depth level
    if not is_leaves[i]:
        print("{space}node={node} is a split node: "
              "go to node {left} if X[:, {feature}] <= {threshold} "
              "else to node {right}.".format(
                  space=indent,
                  node=i,
                  left=children_left[i],
                  feature=feature[i],
                  threshold=threshold[i],
                  right=children_right[i]))
    else:
        print("{space}node={node} is a leaf node.".format(
            space=indent, node=i))