In [204]:
import numpy as np
import pandas as pd
import os
import lightgbm as lgb
from sklearn.model_selection import train_test_split, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import f1_score
from lib.preprocess import Preprocess, data_augmentation
from sklearn.manifold import TSNE
from sklearn.utils import compute_sample_weight
import shap
import optuna
from functools import partial

In [223]:
SEED = 314
datasrc = "data/official/"
data = pd.read_csv(os.path.join(datasrc, "train.csv"), index_col=0)
x_test = pd.read_csv(os.path.join(datasrc, "test.csv"), index_col=0)
train, valid = train_test_split(data, test_size=0.2, random_state=42, stratify=data["health"])

In [224]:
x_train = train.drop("health", axis=1)
y_train = train["health"]
x_valid = valid.drop("health", axis=1)
y_valid = valid["health"]

In [207]:
ignore_columns = [
    "nta_name",
    "boro_ct",
    "spc_latin",
]
# get object columns
object_columns = [col for col in x_train.select_dtypes(include=["object"]).columns.tolist() if col not in ignore_columns]
config = {
    "object_columns": object_columns,
    "is_target_encode": False,
}

In [208]:
def mean_f1score(preds:np.ndarray,eval_data: lgb.Dataset):
    y_true = eval_data.get_label()
    weight = eval_data.get_weight()
    preds = preds.reshape(len(np.unique(y_true)), -1)
    preds = preds.argmax(axis = 0)
    f1 = f1_score(y_true,preds,average='macro',sample_weight=weight)
    return 'f1',f1,True

In [209]:
preprocess = Preprocess(config)
skfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)
noisy_idx = list()

In [210]:
for train_index, valid_index in skfold.split(x_train, y_train):
    x_train_fold = x_train.iloc[train_index]
    y_train_fold = y_train.iloc[train_index]
    x_valid_fold = x_train.iloc[valid_index]
    y_valid_fold = y_train.iloc[valid_index]
    x_train_fold = preprocess.fit_transform(x_train_fold, y_train_fold)
    x_valid_fold = preprocess.transform(x_valid_fold)
    lgb_train = lgb.Dataset(x_train_fold, y_train_fold, weight=compute_sample_weight("balanced", y_train_fold))
    params = {
        'objective': 'multiclass',
        'metric': None,
        'num_class': 3,
        'seed': 42,
        'num_threads': -1,
    }
    model = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_train],
        valid_names=['train'],
        num_boost_round=1000,
        # early_stopping_rounds=100,
        # verbose_eval=100,
        feval=mean_f1score,
        callbacks=[
            lgb.early_stopping(stopping_rounds=100, verbose=False),
            # lgb.reset_parameter(learning_rate=lambda current_round: 0.01 * 0.995 ** current_round),
        ]
    )
    y_pred = model.predict(x_valid_fold)
    y_pred = pd.Series(y_pred.argmax(axis=1), index=y_valid_fold.index)
    print(f1_score(y_valid_fold, y_pred, average='macro'))
    # noisy_idx+=list(set(y_valid_fold.loc[y_valid_fold==0].index)&set(y_pred.loc[y_pred==1].index))
    # noisy_idx+=list(set(y_valid_fold.loc[y_valid_fold==0].index)&set(y_pred.loc[y_pred==2].index))
    noisy_idx+=list(set(y_valid_fold.loc[y_valid_fold==1].index)&set(y_pred.loc[y_pred==0].index))
    noisy_idx+=list(set(y_valid_fold.loc[y_valid_fold==1].index)&set(y_pred.loc[y_pred==2].index))
    # noisy_idx+=list(set(y_valid_fold.loc[y_valid_fold==2].index)&set(y_pred.loc[y_pred==0].index))
    # noisy_idx+=list(set(y_valid_fold.loc[y_valid_fold==2].index)&set(y_pred.loc[y_pred==1].index))

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1347
[LightGBM] [Info] Number of data points in the train set: 14388, number of used features: 35
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
0.34467962629744325
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1353
[LightGBM] [Info] Number of data points in the train set: 14388, number of used features: 35
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
0.32988295877746426
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info

In [211]:
# 1->2のデータは害悪なので、それを除いて再度学習
len(noisy_idx)

1484

In [212]:
train = train.drop(noisy_idx)
x_train = train.drop("health", axis=1)
y_train = train["health"]

In [213]:
def kfold_cv(params:dict,x_train:pd.DataFrame,y_train:pd.Series,preprocess:Preprocess):
    skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)
    #この関数内ではuserwarningを非表示にする
    import warnings
    warnings.simplefilter('ignore')

    #損失を記録するリスト
    losses = []
    #各foldで学習
    for train_index,test_index in skf.split(x_train,y_train):
        #train,testのデータを作成
        X_train_fold,X_test_fold = x_train.iloc[train_index],x_train.iloc[test_index]
        Y_train_fold,Y_test_fold = y_train.iloc[train_index],y_train.iloc[test_index]

        #前処理
        X_train_fold = preprocess.fit_transform(X_train_fold, Y_train_fold)
        X_test_fold = preprocess.transform(X_test_fold)

        X_train_fold = pd.DataFrame(X_train_fold,columns=preprocess.get_feature_names_out())
        X_test_fold = pd.DataFrame(X_test_fold,columns=preprocess.get_feature_names_out())


        cat_cols = X_train_fold.filter(like='cat__').columns.tolist()
        #カテゴリー変数をカテゴリー型のデータに変換
        for col in cat_cols:
            X_train_fold[col] = X_train_fold[col].astype('category')
            X_test_fold[col] = pd.Categorical(X_test_fold[col],categories=X_train_fold[col].cat.categories)


        #dataset,add competition weight using sklearn
        trainset = lgb.Dataset(X_train_fold,label=Y_train_fold,weight=compute_sample_weight(class_weight='balanced',y=Y_train_fold))
        # trainset = lgb.Dataset(X_train_fold,label=Y_train_fold)
        testset = lgb.Dataset(X_test_fold,label=Y_test_fold,reference=trainset)

        #モデルを作成
        model = lgb.train(params,trainset,num_boost_round=10000,valid_sets=[trainset,testset],valid_names=['train','test'],callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)],feval=mean_f1score)
        #testデータで予測
        preds = model.predict(X_test_fold,num_iteration=model.best_iteration)
        preds = np.argmax(preds,axis=1)
        #f1スコアを計算
        f1 = f1_score(Y_test_fold,preds,average='macro')
        #f1スコアを記録
        losses.append(f1)
    #f1スコアの平均を返す
    return np.mean(losses)

In [214]:
def objective(x,y,preprocess,trial:optuna.trial):
    # ハイパーパラメータの探索範囲
    params = {
        'objective': 'multiclass',
        'metric': "None",
        'num_class': 3,
        'seed': 42,
        # search space from https://github.com/optuna/optuna-examples/blob/main/lightgbm/lightgbm_integration.py
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }
    # 交差検証のF1スコアの平均値を返す
    return kfold_cv(params,x,y,preprocess)

In [215]:
objective_fixed = partial(objective,x_train,y_train,preprocess)
study = optuna.create_study(direction='maximize')
study.optimize(
    objective_fixed,
    n_trials=300,
    )

[32m[I 2024-01-06 12:31:51,836][0m A new study created in memory with name: no-name-bc499166-d1c2-4e85-906e-0265149389e4[0m
[32m[I 2024-01-06 12:32:25,441][0m Trial 0 finished with value: 0.3883946164288831 and parameters: {'lambda_l1': 7.278796815409793e-05, 'lambda_l2': 0.0004812898903339017, 'num_leaves': 100, 'feature_fraction': 0.6047489481407352, 'bagging_fraction': 0.9294483514739568, 'bagging_freq': 3, 'min_child_samples': 23}. Best is trial 0 with value: 0.3883946164288831.[0m
[32m[I 2024-01-06 12:33:06,348][0m Trial 1 finished with value: 0.3955519320837259 and parameters: {'lambda_l1': 0.010056757172005099, 'lambda_l2': 0.0023122302347957173, 'num_leaves': 97, 'feature_fraction': 0.483038600836708, 'bagging_fraction': 0.9022549698999205, 'bagging_freq': 6, 'min_child_samples': 59}. Best is trial 1 with value: 0.3955519320837259.[0m
[32m[I 2024-01-06 12:33:38,457][0m Trial 2 finished with value: 0.3910343850191886 and parameters: {'lambda_l1': 1.5176652123971455e-0

In [225]:
x_train, x_eval, y_train, y_eval = train_test_split(x_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

x_train = pd.DataFrame(preprocess.fit_transform(x_train, y_train), index=x_train.index, columns=preprocess.get_feature_names_out())
x_eval = pd.DataFrame(preprocess.transform(x_eval), index=x_eval.index, columns=preprocess.get_feature_names_out())
x_valid = pd.DataFrame(preprocess.transform(x_valid), index=x_valid.index, columns=preprocess.get_feature_names_out())
x_test = pd.DataFrame(preprocess.transform(x_test), index=x_test.index, columns=preprocess.get_feature_names_out())
# %%
cat_cols = x_train.filter(like='cat__').columns.tolist()
#カテゴリー変数をカテゴリー型のデータに変換
for col in cat_cols:
    x_train[col] = x_train[col].astype('category')
    x_eval[col] = pd.Categorical(x_eval[col],categories=x_train[col].cat.categories)
    x_valid[col] = pd.Categorical(x_valid[col],categories=x_train[col].cat.categories)
    x_test[col] = pd.Categorical(x_test[col],categories=x_train[col].cat.categories)

In [226]:
params = study.best_params
params.update({
            'objective': 'multiclass',
            'metric': "None",
            'num_class': 3,
            'verbosity': -1,
            'boosting_type': 'gbdt',
            'seed': 42
            })

In [230]:
lgb_train = lgb.Dataset(x_train, y_train, weight=compute_sample_weight("balanced", y_train))
lgb_eval = lgb.Dataset(x_eval, y_eval, reference=lgb_train)#", weight=compute_sample_weight("balanced", y_eval))
model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_eval],
    valid_names=['evaluate'],
    num_boost_round=1000,
    # early_stopping_rounds=100,
    # verbose_eval=100,
    feval=mean_f1score,
    callbacks=[
        lgb.early_stopping(stopping_rounds=100, verbose=False),
        # lgb.reset_parameter(learning_rate=lambda current_round: 0.01 * 0.995 ** current_round),
    ]
)

In [231]:
y_pred = model.predict(x_valid)
y_pred = pd.Series(y_pred.argmax(axis=1), index=y_valid.index)
print(f1_score(y_valid, y_pred, average='macro'))

0.35910787510880504


In [232]:
y_pred.value_counts(normalize=True)

1    0.724543
0    0.229672
2    0.045784
dtype: float64

In [233]:
submit_name = "metha_lgbm"
predict = pd.Series(model.predict(x_test).argmax(axis=1), index=x_test.index)
predict.to_csv(f"submission/{submit_name}_submission.csv",  header=False)