In [None]:
import pandas as pd
import numpy as np
import shap
from sklearn import set_config
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
path = "https://raw.githubusercontent.com/sharmaroshan/Churn-Modelling-Dataset/master/Churn_Modelling.csv"

In [None]:
# set config

set_config(transform_output="pandas")

In [None]:
# load the data

df = pd.read_csv(path)

df.head()

In [None]:
# shape of the data

df.shape

In [None]:
# check for missing values

df.isna().sum()

In [None]:
# check for duplicates

df.duplicated().sum()

# Exploratory Data Analysis (EDA)

In [None]:
df

In [None]:
# select the first three columns by position; they are only collected here,
# the actual removal from `df` happens in a separate step
# NOTE(review): presumably these are identifier-style columns with no
# predictive value -- confirm against the CSV header

columns_to_drop = df.columns[0:3]

columns_to_drop

In [None]:
# drop the columns
# NOTE: this mutates `df` in place, so the cell is not idempotent -- running it
# a second time raises a KeyError because the columns are already gone

df.drop(columns=columns_to_drop, inplace=True)

In [None]:
df

In [None]:
# lowercase the column names

df.rename(columns=str.lower, inplace=True)

In [None]:
df.columns

In [None]:
# separate the target vector (y) from the feature matrix (X)

y = df.loc[:, "exited"]

X = df.drop(columns="exited")

In [None]:
X

In [None]:
y

In [None]:
# datatype

df.dtypes

In [None]:
# list of columns

numerical_cols = ["creditscore", "age", "balance", "estimatedsalary"]

categorical_cols = ["gender", "geography"]

In [None]:
numerical_cols

In [None]:
# features of X that are listed in neither numerical_cols nor categorical_cols
remainder_cols = X.columns[~X.columns.isin(numerical_cols + categorical_cols)].tolist()

remainder_cols

In [None]:
df[numerical_cols]

In [None]:
df[categorical_cols]

In [None]:
df[remainder_cols]

In [None]:
from matplotlib.gridspec import GridSpec

In [None]:
def plots_for_numerical_columns(column_name, bins, kde=True):
    """Draw a histogram and a boxplot of one numerical feature, stacked vertically.

    Both panels read from the module-level feature frame ``X`` so that they
    always describe the same data.

    Parameters
    ----------
    column_name : str
        Name of the column of ``X`` to plot.
    bins : int or str
        Bin specification forwarded unchanged to ``sns.histplot``.
    kde : bool, default True
        Whether to overlay a kernel density estimate on the histogram.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    fig = plt.figure(figsize=(8, 6))
    # a single grid column is enough: each panel spans the full figure width
    grid = GridSpec(nrows=2, ncols=1, height_ratios=[0.6, 0.4])

    # attach the axes to this figure explicitly instead of relying on the
    # implicit "current figure" of the pyplot state machine
    ax_hist = fig.add_subplot(grid[0, 0])
    ax_box = fig.add_subplot(grid[1, 0])

    # top panel: distribution of the values
    sns.histplot(data=X, x=column_name, bins=bins, kde=kde, ax=ax_hist)
    ax_hist.set_title(f"Distribution of {column_name}")

    # bottom panel: spread and outliers of the same column
    sns.boxplot(data=X, x=column_name, ax=ax_box)

    fig.tight_layout()
    plt.show()

In [None]:
# histogram bin setting for each numerical column, paired by position

col_bins = dict(zip(numerical_cols, [50, 15, "auto", "auto"]))

col_bins

In [None]:
for num_col in numerical_cols:
    print(f"{num_col.title()}", end="\n\n")
    plots_for_numerical_columns(column_name=num_col, bins=col_bins[num_col])
    print("\n\n")

In [None]:
# customers whose balance is exactly zero

X[X["balance"] == 0]

In [None]:
X["balance"].eq(0)

In [None]:
# new col for 0 bal customers

X["iszerobal"] = X["balance"].eq(0).astype(int)

In [None]:
X

In [None]:
def plots_for_categorical_cols(column_name):
    """Summarise one categorical column of ``X`` and draw its count plot.

    Displays the relative frequency of every category, prints the number of
    distinct values together with the values themselves, and renders a
    seaborn count plot of the column.
    """
    column = X[column_name]

    # share of rows falling into each category
    display(column.value_counts(normalize=True))

    n_unique = column.nunique()
    unique_values = column.unique()
    print(f"There are {n_unique} unique values in the column {column_name} which are {unique_values}")

    # absolute counts per category
    sns.countplot(data=X, x=column_name)
    plt.show()

In [None]:
for cat_col in categorical_cols:
    print(f"{cat_col.title()}")
    plots_for_categorical_cols(cat_col)
    print("\n\n")

In [None]:
X

In [None]:
# distribution of target

y.value_counts(normalize=True)

# Feature Engineering (FE) and Model Training

In [None]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [None]:
# split the data

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=30,stratify=y)

In [None]:
print(f"The shape of training data is {X_train.shape}")
print(f"The shape of test data is {X_test.shape}")

In [None]:
# column transformer

# rescales each numerical column with min-max scaling
scaler = MinMaxScaler()

# one-hot encodes the categorical columns; categories not seen during fit are
# ignored at transform time, and the output is dense rather than sparse
encoder = OneHotEncoder(handle_unknown="ignore",sparse_output=False)

# columns not listed in either transformer are passed through untouched, and
# output column names are kept without the transformer-name prefix
preprocessor = ColumnTransformer(transformers=[
    ("scaler",scaler,numerical_cols),
    ("encoder",encoder,categorical_cols)
], remainder="passthrough",verbose_feature_names_out=False)

In [None]:
preprocessor

In [None]:
# apply the transformation
# the preprocessor is fitted on the training split only and then reused on the
# test split
# NOTE: X_train / X_test are overwritten with their transformed versions, so
# this cell must not be re-run without first re-running the train/test split

X_train = preprocessor.fit_transform(X_train)

X_test = preprocessor.transform(X_test)

In [None]:
X_train

In [None]:
X_test

**Model Building**

In [None]:
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
# define models

# XGBoost has no `class_weight` option (the keyword is passed through and
# ignored), so class imbalance is handled with `scale_pos_weight`, set to the
# negative/positive ratio of the training labels
xgb_pos_weight = float((y_train == 0).sum() / (y_train == 1).sum())

rf = RandomForestClassifier(class_weight="balanced", random_state=42)

xgb = XGBClassifier(scale_pos_weight=xgb_pos_weight, random_state=42)

lgbm = LGBMClassifier(class_weight="balanced", random_state=42)

In [None]:
# build a voting classifier

model = VotingClassifier(estimators=[
    ("rf",rf),
    ("xgb",xgb),
    ("lgbm",lgbm)
], voting="soft",n_jobs=-1)

model

In [None]:
# fit the model on the dataset

model.fit(X_train, y_train)

In [None]:
# get the predictions

y_pred = model.predict(X_test)

In [None]:
model.predict_proba(X_test.iloc[0:5,:])

In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

**Hyperparameter (HP) Tuning**

In [None]:
!pip install optuna

In [None]:
import optuna
from sklearn.model_selection import cross_val_score, StratifiedKFold

In [None]:
model

In [None]:
model.estimators

In [None]:
def objective(trial: optuna.Trial):
    """Optuna objective: mean cross-validated F1 of the soft-voting ensemble.

    Samples one hyperparameter configuration for the random forest, XGBoost
    and LightGBM members, builds the voting classifier with it, and scores it
    with shuffled, stratified 5-fold cross-validation on the module-level
    ``X_train`` / ``y_train``.

    Parameters
    ----------
    trial : optuna.Trial
        Trial object used to draw the hyperparameter values.

    Returns
    -------
    float
        Mean F1 score across the five folds (to be maximised).
    """
    # keys follow the `<estimator name>__<parameter>` convention so the whole
    # dict can be handed to VotingClassifier.set_params in one call
    params_dict = {
        "rf__n_estimators": trial.suggest_int("rf__n_estimators", 50, 200),
        "rf__max_depth": trial.suggest_int("rf__max_depth", 5, 12),
        "rf__max_samples": trial.suggest_float("rf__max_samples", 0.5, 1.0),
        "xgb__learning_rate": trial.suggest_float("xgb__learning_rate", 0.01, 0.5),
        "xgb__max_depth": trial.suggest_int("xgb__max_depth", 5, 15),
        "xgb__n_estimators": trial.suggest_int("xgb__n_estimators", 30, 200),
        "xgb__subsample": trial.suggest_float("xgb__subsample", 0.5, 1.0),
        "xgb__colsample_bynode": trial.suggest_float("xgb__colsample_bynode", 0.7, 1.0),
        "xgb__reg_lambda": trial.suggest_float("xgb__reg_lambda", 0.1, 10.0),
        "lgbm__max_depth": trial.suggest_int("lgbm__max_depth", 5, 15),
        "lgbm__n_estimators": trial.suggest_int("lgbm__n_estimators", 30, 200),
        "lgbm__learning_rate": trial.suggest_float("lgbm__learning_rate", 0.01, 0.5),
        "lgbm__subsample": trial.suggest_float("lgbm__subsample", 0.5, 1.0),
        "lgbm__reg_lambda": trial.suggest_float("lgbm__reg_lambda", 0.1, 10.0)
    }

    # define models
    rf = RandomForestClassifier(class_weight="balanced", random_state=42)

    # XGBoost has no `class_weight` option (the keyword is passed through and
    # ignored); imbalance is handled with `scale_pos_weight`, set to the
    # negative/positive ratio of the training labels
    xgb_pos_weight = float((y_train == 0).sum() / (y_train == 1).sum())
    xgb = XGBClassifier(scale_pos_weight=xgb_pos_weight, random_state=42)

    # LightGBM applies `subsample` only when `subsample_freq` is positive;
    # without it the tuned `lgbm__subsample` value would have no effect
    lgbm = LGBMClassifier(class_weight="balanced", subsample_freq=1, random_state=42)

    # build a voting classifier
    model = VotingClassifier(estimators=[
        ("rf", rf),
        ("xgb", xgb),
        ("lgbm", lgbm)
    ], voting="soft", n_jobs=-1)

    # pass parameters to model
    model.set_params(**params_dict)

    # cross validation (fixed shuffle seed so every trial sees the same folds)
    scores = cross_val_score(estimator=model,
                             X=X_train, y=y_train,
                             scoring="f1",
                             cv=StratifiedKFold(n_splits=5, random_state=10, shuffle=True),
                             n_jobs=-1)

    # mean f1
    return float(np.mean(scores))

In [None]:
# define my study

# a seeded sampler plus sequential trials keeps the search reproducible; each
# trial already parallelises its own cross-validation, so running several
# trials at once on top of that would only oversubscribe the CPU
study = optuna.create_study(direction="maximize",
                            sampler=optuna.samplers.TPESampler(seed=42))

study.optimize(func=objective,n_trials=20,n_jobs=1,show_progress_bar=True)

In [None]:
# best parameters

study.best_params

In [None]:
best_params = study.best_params

In [None]:
# best score
study.best_value


In [None]:
# define models
rf = RandomForestClassifier(class_weight="balanced", random_state=42)

# XGBoost has no `class_weight` option (the keyword is passed through and
# ignored); imbalance is handled with `scale_pos_weight`, set to the
# negative/positive ratio of the training labels
xgb_pos_weight = float((y_train == 0).sum() / (y_train == 1).sum())
xgb = XGBClassifier(scale_pos_weight=xgb_pos_weight, random_state=42)

# LightGBM applies `subsample` only when `subsample_freq` is positive;
# without it the tuned `lgbm__subsample` value would have no effect
lgbm = LGBMClassifier(class_weight="balanced", subsample_freq=1, random_state=42)

# build a voting classifier
model = VotingClassifier(estimators=[
    ("rf",rf),
    ("xgb",xgb),
    ("lgbm",lgbm)
], voting="soft",n_jobs=-1)

# pass the best params to model
model.set_params(**best_params)

model

In [None]:
# train the model

model.fit(X_train,y_train)

In [None]:
# get the predictions
y_pred = model.predict(X_test)

In [None]:
model.predict_proba(X_test.iloc[0:3, :])

In [None]:
# classification report

print(classification_report(y_test,y_pred))

# SHAP

In [None]:
model

In [None]:
model.predict_proba(X_test.iloc[0:3])[:,1]

In [None]:
def prediction_fn(X):
    """Return, for each row of ``X``, the second column of ``model.predict_proba`` (index 1)."""
    return model.predict_proba(X)[:, 1]

In [None]:
# model agnostic explainer
# the model is handed over as a plain prediction function, so SHAP treats it
# as a black box; X_train is supplied as the masker (background data), and the
# identity link leaves the function's output untransformed

explainer = shap.Explainer(model=prediction_fn,
                           masker=X_train,
                           link=shap.links.identity)

In [None]:
X_train.shape[1]

In [None]:
# calculate the shap values

shap_values = explainer(X_test.iloc[[1],:])

shap_values

In [None]:
shap_values.shape

In [None]:
model.predict_proba(X_test.iloc[[1],:])[:,1]

In [None]:
shap.plots.waterfall(shap_values[0])

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
# sample 500 rows from test data (seeded so the global plots are reproducible)

test_data_sample = X_test.sample(500, random_state=42)

In [None]:
test_data_sample.shape

In [None]:
# calculate shap values for the sampled subset of the test data

shap_values = explainer(test_data_sample)

In [None]:
shap_values.shape

In [None]:
shap_values

**Global Plots**

In [None]:
# bar plot

shap.plots.bar(shap_values)

In [None]:
# summary plot

shap.plots.beeswarm(shap_values)

In [None]:
# violin plot

shap.plots.violin(shap_values)

In [None]:
# heatmap plot

shap.plots.heatmap(shap_values)

**Local Plots**

In [None]:
# pick one test-set row whose true label is 1 (seeded so the local plots
# always explain the same row)
test_case = X_test[y_test == 1].sample(1, random_state=42)

# explain that single row; [0] unwraps the one-row result
row_shap_values = explainer(test_case)[0]

In [None]:
row_shap_values.shape

In [None]:
df.loc[test_case.index]

In [None]:
# waterfall plot

shap.plots.waterfall(row_shap_values, max_display=5)

In [None]:
# bar plot--> local

shap.plots.bar(row_shap_values, max_display=5)

In [None]:
# row force plot

shap.initjs()

shap.plots.force(row_shap_values)

In [None]:
row_shap_values

In [None]:
# decision plot

shap.plots.decision(shap_values=row_shap_values.values, base_value=row_shap_values.base_values,
                    feature_names=X_test.columns.tolist())

**Batch Data Plots**

In [None]:
# sample of test data (seeded so the batch plot is reproducible)
sample_batch_data = X_test.sample(3, random_state=42)

# shap_values
batch_shap_values = explainer(sample_batch_data)

In [None]:
batch_shap_values.shape

In [None]:
# decision plot

shap.plots.decision(shap_values=batch_shap_values.values, base_value=batch_shap_values.base_values[0],
                    feature_names=X_test.columns.tolist())

# LIME

In [None]:
# test case

test_case_lime = test_case.values.flatten()

In [None]:
!pip install lime

In [None]:
from lime.lime_tabular import LimeTabularExplainer

In [None]:
feature_names = preprocessor.get_feature_names_out().tolist()

In [None]:
preprocessor.get_feature_names_out()

In [None]:
# list of index numbers of categorical features in data
# derived from the transformed training frame instead of hard-coded positions:
# a column counts as categorical when it takes at most two distinct values in
# X_train (one-hot indicator columns and binary flags)

categorical_features = [
    position
    for position, column in enumerate(X_train.columns)
    if X_train[column].nunique() <= 2
]

In [None]:
# create the explainer
# LIME builds its explanations from randomly perturbed samples, so the seed is
# fixed to make the explanation repeatable across runs

lime_explainer = LimeTabularExplainer(
    training_data=X_train.values,
    mode="classification",
    feature_names=feature_names,
    categorical_features=categorical_features,
    random_state=42
)

In [None]:
# create an explanation

lime_explanation = lime_explainer.explain_instance(data_row=test_case_lime,
                                                   predict_fn=model.predict_proba)

In [None]:
# view my explanations --> as a list

lime_explanation.as_list()

In [None]:
from IPython.core.display import HTML

In [None]:
display(HTML(lime_explanation.as_html()))