In [None]:
import os
import shutil
import warnings
warnings.filterwarnings('ignore')

while not os.path.isfile("README.md"):
    %cd ..

import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import compute_sample_weight
import shap
import lightgbm as lgb
from lib.preprocess import get_data, Preprocess

pd.set_option('display.max_colwidth', None)
BASE_PATH = os.path.abspath("")
FILE_NAME = "lgbm_main"

In [None]:
X_train, y_train, X_test = get_data()

In [None]:
def mean_f1score(preds:np.ndarray,eval_data: lgb.Dataset):
    y_true = eval_data.get_label()
    weight = eval_data.get_weight()
    preds = preds.reshape(len(np.unique(y_true)), -1)
    preds = preds.argmax(axis = 0)
    f1 = f1_score(y_true,preds,average='macro',sample_weight=weight)
    return 'f1',f1,True

In [None]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

params = {
    'objective': 'multiclass',
    'metric': None,
    'num_class': 2,
    'seed': 42,
    "boosting_type": "gbdt",
}

In [11]:
best_score = 0 
for fold, (trn_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
    # split
    print(f"Fold {fold + 1}")
    X_trn = X_train.iloc[trn_idx].copy()
    y_trn = y_train.iloc[trn_idx].copy()
    X_val = X_train.iloc[val_idx].copy()
    y_val = y_train.iloc[val_idx].copy()

    # preprocess
    ppr = Preprocess()
    X_trn = ppr.fit_transform(X_trn, y_trn)
    X_val = ppr.transform(X_val)

    # train
    train_set = lgb.Dataset(X_trn, y_trn)# ,weight=compute_sample_weight(class_weight='balanced',y=y_trn))
    val_set = lgb.Dataset(X_val, y_val, reference=train_set)
    model = lgb.train(
        params,
        train_set,
        num_boost_round=10000,
        valid_sets=[val_set],
        valid_names=["val"],
        feval=mean_f1score,
        callbacks=[
            # lgb.early_stopping(500,)
        ],
    )

    # inference
    preds = model.predict(X_val)
    preds = preds.reshape(len(np.unique(y_val)), -1)
    preds = preds.argmax(axis = 0)
    print("0, 1: ",sum(preds==0), sum(preds==1))
    score = f1_score(y_val, preds, average='macro')
    if score > best_score:
        best_score = score
        best_xval = X_val
        best_yval = y_val
        best_ppr = ppr
        best_model = model
    print(f"Score: {score}")
    print("-" * 100)

[1761]	val's multi_logloss: 0.367162	val's f1: 0.636748
[1762]	val's multi_logloss: 0.367163	val's f1: 0.636748
[1763]	val's multi_logloss: 0.367234	val's f1: 0.636748
[1764]	val's multi_logloss: 0.367333	val's f1: 0.636748
[1765]	val's multi_logloss: 0.367472	val's f1: 0.636748
[1766]	val's multi_logloss: 0.3675	val's f1: 0.636748
[1767]	val's multi_logloss: 0.367569	val's f1: 0.636913
[1768]	val's multi_logloss: 0.367646	val's f1: 0.63762
[1769]	val's multi_logloss: 0.367757	val's f1: 0.63762
[1770]	val's multi_logloss: 0.36787	val's f1: 0.63762
[1771]	val's multi_logloss: 0.367962	val's f1: 0.636913
[1772]	val's multi_logloss: 0.368052	val's f1: 0.636913
[1773]	val's multi_logloss: 0.368189	val's f1: 0.636913
[1774]	val's multi_logloss: 0.368262	val's f1: 0.636913
[1775]	val's multi_logloss: 0.368345	val's f1: 0.638325
[1776]	val's multi_logloss: 0.368414	val's f1: 0.63762
[1777]	val's multi_logloss: 0.368472	val's f1: 0.63762
[1778]	val's multi_logloss: 0.368454	val's f1: 0.637786


KeyboardInterrupt: 

In [None]:
# calibration
print(f"Best CV score before calibration: {best_score}")
zero_ratio = sum(best_yval==0) / len(best_yval)
n_zero = int(zero_ratio * len(best_yval))
y_prob = best_model.predict(best_xval)
y_prob = y_prob.reshape(len(np.unique(best_yval)), -1)
y_prob = pd.DataFrame(y_prob.T, columns=["0", "1"], index=best_xval.index)
y_pred = pd.DataFrame([1]*len(best_xval), columns=["pred"], index=best_xval.index)
y_pred.loc[y_prob.sort_values("0").index[:n_zero]] = 0
score = f1_score(best_yval, y_pred, average='macro')
print(f"Best CV score after calibration: {score}")

In [None]:
# submission
n_zero = int(zero_ratio * len(X_test))
X_test = best_ppr.transform(X_test)
probs = best_model.predict(X_test)
probs = probs.reshape(len(np.unique(y_train)), -1)
probs = pd.DataFrame(probs.T, columns=["0", "1"], index=X_test.index)
preds = pd.DataFrame([1]*len(X_test), columns=["pred"], index=X_test.index)
preds.loc[probs.sort_values("0").index[:n_zero]] = 0

In [None]:
result_dir = f"results/{FILE_NAME}"
if os.path.exists(result_dir):
    idx = 1
    result_dir += f"_v{idx}"
    while os.path.exists(result_dir):
        idx += 1
        result_dir = result_dir.split('_v')[0]
        result_dir += f"_v{idx}"
        
os.makedirs(result_dir, exist_ok=False)
shutil.copytree("lib", os.path.join(result_dir, "lib"))
shutil.copyfile(f"notebooks/{FILE_NAME}.ipynb", os.path.join(result_dir, f"{FILE_NAME}.ipynb"))
preds.to_csv(os.path.join(result_dir, f"submission.csv"), index=True, header=False)

In [None]:
shap_values = shap.TreeExplainer(best_model).shap_values(best_xval)
shap.summary_plot(shap_values, best_xval, plot_type="bar", show=True)

In [None]:
%pip install plyer
from plyer import notification

notification.notify(
    title="From Python",
    message="Executed Successfully",
    app_name='Python',
    app_icon=os.path.join(BASE_PATH,'lib/notification.ico'),
)

LB: 0.4969328