In [None]:
!pip install nb_black
%load_ext nb_black

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Lazily walk the Kaggle input tree and echo the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk("/kaggle/input")
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
import lightgbm as lgb
import seaborn as sns
from category_encoders import OrdinalEncoder
from sklearn.model_selection import train_test_split
from statsmodels.nonparametric.smoothers_lowess import lowess
import matplotlib.pyplot as plt

In [None]:
# Read the Concrete_Data_Yeh.csv file from the Kaggle input directory into a DataFrame.
data_path = "/kaggle/input/yeh-concret-data/Concrete_Data_Yeh.csv"
df = pd.read_csv(data_path)

In [None]:
# Print a summary of the loaded frame (columns, dtypes, non-null counts, memory usage).
df.info()

In [None]:
# Separate the `csMPa` target column from the remaining predictor columns.
target_column = "csMPa"
X = df.drop(columns=[target_column])
y = df[target_column]

In [None]:
# Plot the distribution of the target as a density-normalised histogram with a
# KDE overlay. `sns.distplot` is deprecated; this is the closest equivalent call
# recommended by seaborn for migrating away from it.
sns.histplot(y, kde=True, stat="density", kde_kws=dict(cut=3))

In [None]:
# Hold out a validation set and wrap both parts as LightGBM datasets.
# The split is seeded so that every score computed later in the notebook is
# reproducible across kernel restarts.
Xt, Xv, yt, yv = train_test_split(X, y, random_state=0)
dt = lgb.Dataset(Xt, yt)
dv = lgb.Dataset(Xv, yv)

In [None]:
# Accumulator for the learning-rate search: sampled eta values and their validation RMSE.
best_etas = dict(eta=[], score=[])

In [None]:
# Random search over the learning rate (eta): train one early-stopped model per
# sampled eta, smooth validation RMSE against eta with LOWESS, and pick the eta
# at the minimum of the smoothed curve.
# NOTE(review): the `early_stopping_rounds` / `verbose_eval` keyword arguments
# were removed in LightGBM 4.0; on newer versions pass callbacks instead.

# Seeded generator so the sampled etas (and therefore best_eta) are reproducible.
rng = np.random.default_rng(0)
# Start from empty lists so re-running this cell does not append to trials
# left over from a previous execution.
best_etas = {"eta": [], "score": []}

for _ in range(100):
    eta = rng.uniform(0.03, 0.1)
    best_etas["eta"].append(eta)
    model = lgb.train(
        {"objective": "regression", "metric": "rmse", "eta": eta},
        dt,
        num_boost_round=10000,
        valid_sets=[dt, dv],
        valid_names=["training", "valid"],
        early_stopping_rounds=50,
        verbose_eval=False,
    )
    best_etas["score"].append(model.best_score["valid"]["rmse"])

best_eta_df = pd.DataFrame.from_dict(best_etas)
# lowess returns rows of (eta, smoothed score), sorted by eta.
lowess_data = lowess(best_eta_df["score"], best_eta_df["eta"])
# x/y are passed by keyword: recent seaborn releases reject positional x/y.
ax = sns.lineplot(x=lowess_data[:, 0], y=lowess_data[:, 1])
ax.set(xlabel="eta", ylabel="validation RMSE (LOWESS-smoothed)")
best_eta = lowess_data[lowess_data[:, 1].argmin(), 0]
ax.axvline(best_eta, color="red")

In [None]:
# Fit the baseline model at the selected learning rate, logging metrics every
# 100 rounds and stopping once validation RMSE stalls for 50 rounds.
baseline_params = {"objective": "regression", "metric": "rmse", "eta": best_eta}
model = lgb.train(
    baseline_params,
    dt,
    valid_names=["training", "valid"],
    valid_sets=[dt, dv],
    num_boost_round=10000,
    early_stopping_rounds=50,
    verbose_eval=100,
)

In [None]:
# Report pairs of training features whose Kendall rank correlation exceeds the
# threshold in absolute value, strongest first.
threshold = 0.75
corr = Xt.corr(method="kendall")
# Keep only the strict upper triangle so each unordered pair appears exactly
# once and self-pairs are excluded. Slicing the stacked series with [::2] is not
# a valid de-duplication: (a, b) and its mirror (b, a) are generally not
# adjacent, so that slice can keep both mirrors and drop a genuine pair.
upper_triangle = np.triu(np.ones(corr.shape, dtype=bool), k=1)
# dropna() removes the masked lower triangle regardless of how this pandas
# version's stack() treats missing values.
corr = corr.where(upper_triangle).stack().dropna()
high_corr = corr[corr.abs() > threshold]
abs_high_corr = high_corr.abs()
pairs = abs_high_corr.sort_values(ascending=False).index.to_list()
print(f"Correlated features: {pairs if len(pairs) > 0 else None}")

In [None]:
# Rank features from least to most important according to the trained model.
# Feature names are read from the booster itself rather than from the global
# `dt`: `dt` is rebound by other cells, and zip() would silently truncate or
# mis-pair names if the dataset no longer matched the model.
sorted_features = [
    feature
    for _, feature in sorted(
        zip(model.feature_importance(), model.feature_name()), reverse=False
    )
]

In [None]:
# Backward feature elimination: starting from the baseline validation RMSE,
# cumulatively drop features in the order given by `sorted_features` (assumed
# to be least-important first) and stop at the first drop that worsens the score.
# NOTE(review): the `early_stopping_rounds` / `verbose_eval` keyword arguments
# were removed in LightGBM 4.0; on newer versions pass callbacks instead.
best_score = model.best_score["valid"]["rmse"]
print(f"starting score: {best_score:.4f}")
unimportant_features = []
# The last feature is never a drop candidate: a model cannot be trained on
# zero columns.
for feature in sorted_features[:-1]:
    unimportant_features.append(feature)
    # Reuse the exact train/validation rows behind `best_score`; re-splitting
    # X would compare scores measured on different validation sets.
    X_train = Xt.drop(columns=unimportant_features)
    X_valid = Xv.drop(columns=unimportant_features)
    # Loop-local dataset names so the notebook-wide `dt` / `dv` stay intact.
    drop_dt = lgb.Dataset(X_train, yt)
    drop_dv = lgb.Dataset(X_valid, yv)
    drop_model = lgb.train(
        {"objective": "regression", "metric": "rmse", "eta": best_eta},
        drop_dt,
        valid_sets=[drop_dt, drop_dv],
        valid_names=["training", "valid"],
        num_boost_round=10000,
        early_stopping_rounds=50,
        verbose_eval=False,
    )
    score = drop_model.best_score["valid"]["rmse"]
    if score > best_score:
        unimportant_features.pop()  # this feature is needed: take it off the drop list
        print(f"Dropping {feature} worsened score to {score:.4f}.")
        break
    best_score = score
print(f"ending score: {best_score:.4f}")
print(
    f"dropped features: {unimportant_features if len(unimportant_features) > 0 else None}"
)

In [None]:
# Tune LightGBM hyperparameters with Optuna's LightGBM integration, whose
# `train` is called like `lightgbm.train`.
# The integration is imported under its own alias so the notebook-wide `lgb`
# keeps pointing at plain lightgbm; rebinding `lgb` here would make every other
# `lgb.train` call depend on which cells had already been executed.
import optuna.integration.lightgbm as lgb_tuner

params = {
    "objective": "regression",
    "metric": "rmse",
    "verbosity": -1,
    "boosting_type": "gbdt",
    "eta": best_eta,
}

# Fresh datasets built from the original train/validation split.
dt = lgb.Dataset(Xt, yt)
dv = lgb.Dataset(Xv, yv)


model = lgb_tuner.train(
    params,
    dt,
    valid_sets=[dt, dv],
    valid_names=["training", "valid"],
    num_boost_round=10000,
    verbose_eval=False,
    early_stopping_rounds=50,
)

score = model.best_score["valid"]["rmse"]

# Parameters carried by the booster returned from the tuner.
best_params = model.params
print("Best params:", best_params)
print(f"  rmse = {score}")
print("  Params: ")
for key, value in best_params.items():
    print(f"    {key}: {value}")

In [None]:
import lightgbm as lgb

In [None]:
# Refit on the training set using the tuned parameters, reporting metrics every
# 100 rounds and stopping after 50 rounds without validation improvement.
refit_options = dict(
    num_boost_round=10000,
    valid_sets=[dt, dv],
    valid_names=["training", "valid"],
    early_stopping_rounds=50,
    verbose_eval=100,
)
model = lgb.train(best_params, dt, **refit_options)

In [None]:
import shap

In [None]:
# Build a SHAP tree explainer for the trained model and compute SHAP values for
# every row of the full feature frame X (training and validation rows alike).
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)

In [None]:
# Rank features from most to least important according to the trained model.
# Feature names are read from the booster itself rather than from the global
# `dt`: `dt` is rebound by other cells, and zip() would silently truncate or
# mis-pair names if the dataset no longer matched the model.
sorted_features = [
    feature
    for _, feature in sorted(
        zip(model.feature_importance(), model.feature_name()), reverse=True
    )
]

In [None]:
# Draw one SHAP dependence plot per feature, in the order given by
# sorted_features, to show how each feature's value relates to its SHAP value.
for name in sorted_features:
    shap.dependence_plot(name, shap_values, X)

In [None]:
from sklearn.metrics import r2_score

In [None]:
# Score the model on the held-out validation rows, predicting with the
# early-stopped best iteration, and display the R2 rounded to three decimals.
valid_predictions = model.predict(Xv, num_iteration=model.best_iteration)
valid_r2 = r2_score(yv, valid_predictions)
f"R2: {valid_r2:.3f}"