In [None]:
!pip install nb_black
%load_ext nb_black

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Recursively list every file under the Kaggle input directory, one full path per line.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk("/kaggle/input")
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training table, using the `id` column as the row index, and preview it.
train_csv_path = "/kaggle/input/health-insurance-cross-sell-prediction/train.csv"
df = pd.read_csv(train_csv_path, index_col="id")
df.head()

In [None]:
# Show the distinct values of every object-dtype column so their encoding can be decided.
obj_cols = df.select_dtypes(include="object").columns
for column_name, column_values in df[obj_cols].items():
    print(column_name, column_values.unique())

In [None]:
# Vehicle_Age has a natural order, so encode it as an ordered categorical
# (youngest to oldest) instead of a plain unordered category.
vehicle_age_dtype = pd.CategoricalDtype(
    categories=["< 1 Year", "1-2 Year", "> 2 Years"], ordered=True
)
df["Vehicle_Age"] = df["Vehicle_Age"].astype(vehicle_age_dtype)

In [None]:
# Convert every remaining object-dtype column to the pandas `category` dtype.
obj_cols = df.select_dtypes(include="object").columns
for column_name in obj_cols:
    df[column_name] = df[column_name].astype("category")

In [None]:
# Summarise column dtypes and non-null counts after the dtype conversions on `df`.
df.info()

In [None]:
# Separate the target column (`Response`) from the feature matrix.
target_column = "Response"
X = df.drop(columns=[target_column])
y = df[target_column]

In [None]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.nonparametric.smoothers_lowess import lowess


In [None]:
# Hold out a validation split (seeded so the split is reproducible).
Xt, Xv, yt, yv = train_test_split(X, y, random_state=0)
dt = lgb.Dataset(Xt, yt)
# free_raw_data=False keeps the raw validation data on the Dataset.
# NOTE(review): presumably needed because `dv` is reused as the validation set
# for more than one training Dataset (`ds` and `dt`) — confirm.
dv = lgb.Dataset(Xv, yv, free_raw_data=False)

# Small training subsample used for the learning-rate search.
# The rows are drawn once and the labels are looked up by index, instead of
# sampling X and y separately and relying on identical seeds to keep them aligned.
# Assumes the `id` index is unique — TODO confirm.
n_search_rows = min(10000, len(Xt))  # do not request more rows than exist
Xs = Xt.sample(n_search_rows, random_state=0)
ds = lgb.Dataset(Xs, yt.loc[Xs.index])

In [None]:
def loguniform(low=0, high=1, size=None, base=10):
    """Draw samples whose logarithm (in `base`) is uniform on [low, high).

    Args:
        low: Lower bound of the exponent.
        high: Upper bound of the exponent.
        size: Output shape passed to `np.random.uniform`; None gives a scalar.
        base: Base that is raised to the sampled exponent.

    Returns:
        `base ** u` where `u` is drawn from `np.random.uniform(low, high, size)`.
    """
    exponents = np.random.uniform(low, high, size)
    return np.power(base, exponents)


# Accumulator for the learning-rate search: one sampled eta and its validation
# score per trial. The search loop only appends to these lists, so re-running the
# search without re-running this line adds trials to the existing results.
best_etas = {"eta": [], "score": []}

In [None]:
metric = "binary_logloss"
maximize = False  # set True for metrics where larger is better (e.g. AUC)
# `objective` was referenced in the params below without ever being defined,
# which raises NameError on a fresh kernel. Every other training cell in this
# notebook uses "binary".
objective = "binary"

# Random search over the learning rate: sample eta log-uniformly in [1e-4, 1),
# train on the small subsample `ds` with early stopping, and record the best
# validation score for each sampled eta.
for _ in range(60):
    eta = loguniform(-4, 0)
    best_etas["eta"].append(eta)
    model = lgb.train(
        {"objective": objective, "metric": metric, "eta": eta},
        ds,
        num_boost_round=10000,
        valid_sets=[ds, dv],
        valid_names=["training", "valid"],
        early_stopping_rounds=50,
        verbose_eval=False,
    )
    best_etas["score"].append(model.best_score["valid"][metric])

best_eta_df = pd.DataFrame.from_dict(best_etas)
# Smooth score as a function of eta; column 0 is used as eta and column 1 as the
# smoothed score below.
lowess_data = lowess(best_eta_df["score"], best_eta_df["eta"],)

# use log scale as it's easier to observe the whole graph
plt.xscale("log")
rounded_data = lowess_data.copy()
rounded_data[:, 1] = rounded_data[:, 1].round(4)
rounded_data = rounded_data[::-1]  # reverse to find first best

# maximize or minimize metric
# e.g. binary loss needs minimizing, whereas AUC requires maximizing
best = np.argmax if maximize else np.argmin
good_eta = rounded_data[best(rounded_data[:, 1]), 0]

# plot relationship between learning rate and performance, with an eta selected just before diminishing returns
# NOTE(review): `:4f` is a width-4 spec (prints 6 decimals), not `.4f`. It is kept
# as-is because small learning rates need the extra digits.
print(f"Good learning rate: {good_eta:4f}")
plt.axvline(good_eta, color="orange")
plt.title("Smoothed relationship between learning rate and metric.")
plt.xlabel("learning rate")
plt.ylabel(metric)
# x/y are passed by keyword: positional data vectors are rejected by seaborn >= 0.12.
sns.lineplot(x=lowess_data[:, 0], y=lowess_data[:, 1])

In [None]:
# Baseline model: train on the full training split with the learning rate picked
# by the search, stopping early once the validation metric stops improving.
baseline_params = {"objective": "binary", "metric": metric, "eta": good_eta}
model = lgb.train(
    params=baseline_params,
    train_set=dt,
    num_boost_round=10000,
    valid_sets=[dt, dv],
    valid_names=["training", "valid"],
    early_stopping_rounds=50,
    verbose_eval=100,
)

In [None]:
# Report pairs of features whose absolute Kendall correlation exceeds `threshold`.
threshold = 0.75
# Select numeric/bool columns explicitly: the frame also holds category columns,
# which newer pandas no longer drops silently inside `corr`.
numeric_Xt = Xt.select_dtypes(include=["number", "bool"])
corr = numeric_Xt.corr(method="kendall")
# Keep only the strict upper triangle so each pair appears once and the diagonal
# (self-correlation) is excluded. The builtin `bool` replaces the removed `np.bool` alias.
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
upper = upper.stack()
high_upper = upper[(abs(upper) > threshold)]
abs_high_upper = abs(high_upper).sort_values(ascending=False)
pairs = abs_high_upper.index.to_list()
print(f"Correlated features: {pairs if len(pairs) > 0 else None}")

In [None]:
# Backward elimination: starting from the least important feature (by gain),
# drop features one at a time and retrain, stopping as soon as the validation
# score gets worse.
sorted_features = [
    feature
    for _, feature in sorted(
        zip(model.feature_importance(importance_type="gain"), model.feature_name()),
        reverse=False,
    )
]

best_score = model.best_score["valid"][metric]
print(f"starting score: {best_score:.4f}")
drop_unimportant_features = []
# Stop before the last feature: a model cannot be trained on zero columns.
for feature in sorted_features[:-1]:
    drop_unimportant_features.append(feature)
    X_train, X_valid, y_train, y_valid = train_test_split(
        X.drop(drop_unimportant_features, axis=1), y, random_state=0
    )
    # Loop-local names, so the full-feature `dt`/`dv` datasets used by other
    # cells are not overwritten with reduced-feature ones.
    drop_dt = lgb.Dataset(X_train, y_train)
    drop_dv = lgb.Dataset(X_valid, y_valid)
    drop_model = lgb.train(
        {"objective": "binary", "metric": metric, "eta": good_eta},
        drop_dt,
        valid_sets=[drop_dt, drop_dv],
        valid_names=["training", "valid"],
        num_boost_round=10000,
        early_stopping_rounds=50,
        verbose_eval=False,
    )
    score = drop_model.best_score["valid"][metric]
    # Respect the metric direction chosen via `maximize` instead of assuming
    # that a lower score is always better.
    worsened = score < best_score if maximize else score > best_score
    if worsened:
        del drop_unimportant_features[-1]  # remove from drop list
        print(f"Dropping {feature} worsened score to {score:.4f}.")
        break
    else:
        best_score = score
print(f"ending score: {best_score:.4f}")
print(
    f"dropped features: {drop_unimportant_features if len(drop_unimportant_features) > 0 else None}"
)

In [None]:
# Import the Optuna LightGBM integration under its own alias so that `lgb` keeps
# referring to plain lightgbm; `lgb_tuner.train` is called exactly like `lgb.train`.
import optuna.integration.lightgbm as lgb_tuner

params = {
    "objective": "binary",
    "metric": metric,
    "verbosity": -1,
    "boosting_type": "gbdt",
    "eta": good_eta,
}

# Fresh full-feature datasets for the tuning run.
dt = lgb.Dataset(Xt, yt)
dv = lgb.Dataset(Xv, yv)


model = lgb_tuner.train(
    params,
    dt,
    valid_sets=[dt, dv],
    valid_names=["training", "valid"],
    num_boost_round=10000,
    verbose_eval=False,
    early_stopping_rounds=50,
)

score = model.best_score["valid"][metric]

# Parameters of the model returned by the tuner, reused for the final fit.
best_params = model.params
print("Best params:", best_params)
print(f"  {metric} = {score}")
print("  Params: ")
for key, value in best_params.items():
    print("    {}: {}".format(key, value))

In [None]:
# Pin the learning rate in the tuned parameter set to the value chosen by the eta search.
best_params["eta"] = good_eta

In [None]:
import lightgbm as lgb

In [None]:
# Final fit with the tuned parameters on the training split; early stopping is
# driven by the validation set and progress is logged every 100 rounds.
final_fit_options = dict(
    num_boost_round=10000,
    valid_sets=[dt, dv],
    valid_names=["training", "valid"],
    early_stopping_rounds=50,
    verbose_eval=100,
)
model = lgb.train(best_params, dt, **final_fit_options)

In [None]:
# Plot gain-based feature importance for the current `model`, without grid lines.
lgb.plot_importance(model, grid=False, importance_type="gain")

In [None]:
import shap

In [None]:
# Fixed-seed subsample of 10,000 validation rows; this is the data the SHAP values
# are computed on and plotted against.
Xvs = Xv.sample(10000, random_state=0)

In [None]:
# Build a tree explainer for the trained booster and compute SHAP values for the
# sampled validation rows.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(Xvs)

In [None]:
# Features ordered by ascending gain importance of the final model.
sorted_features = [
    feature
    for _, feature in sorted(
        zip(model.feature_importance(importance_type="gain"), model.feature_name()),
        reverse=False,
    )
]

# Pick the SHAP values of the positive class without assuming one return format.
# NOTE(review): the shape returned by TreeExplainer.shap_values for a binary
# LightGBM model appears to differ between SHAP versions (list of per-class
# arrays, a single 2-D array, or a 3-D array with a trailing class axis) —
# confirm against the installed version.
if isinstance(shap_values, list):
    positive_class_shap = shap_values[1]
elif np.ndim(shap_values) == 3:
    positive_class_shap = shap_values[:, :, 1]
else:
    positive_class_shap = shap_values

# One dependence plot per feature, most important first.
for name in reversed(sorted_features):
    shap.dependence_plot(name, positive_class_shap, Xvs, display_features=Xvs)

In [None]:
from sklearn.metrics import roc_auc_score

# Score the validation set with the early-stopped best iteration and report ROC AUC.
ypred = model.predict(Xv, num_iteration=model.best_iteration)
# `.4f` (four decimals) matches the other score prints in this notebook; the
# previous `:4f` was a minimum-width spec and printed six decimals.
print(f"AUC: {roc_auc_score(yv, ypred):.4f}")