In [None]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yaml
from datetime import datetime

# 自作モジュールの読み込み
sys.path.append(os.path.abspath('..'))
from configs.config import *
from src.runner import Runner
from src.model_LGBM import model_LGBM
from src.util import Logger, Util

In [None]:
# Set up the project logger.
# DIR_LOG is not defined in this notebook; presumably it comes from the
# star import of configs.config — confirm there.
logger = Logger(path=DIR_LOG)

def get_run_name(model_type):
    """Build a run name from the model type and the current local time.

    Args:
        model_type(str): Prefix identifying the model, e.g. "lgbm".
    Returns:
        str: "<model_type>_<YYYYmmddHHMM>", stamped at call time
            (minute resolution).
    """
    stamp = datetime.now().strftime("%Y%m%d%H%M")
    return model_type + "_" + stamp

# LGBM

In [None]:
# Free-text memo describing this run; it is passed to Runner further down.
memo = "LightGBM 最終調整"

# Build the run name (model type + current timestamp).
run_name = get_run_name(model_type="lgbm")
# To reuse the outputs of an earlier run, pin its name instead:
# run_name = 'lgbm_202508021646'
run_name

In [None]:
# Feature tables to left-join onto the key table, in join order, each paired
# with the column(s) it is joined on.
feature_joins = [
    ('Target', ('社員番号', 'category')),
    ('CategoryFeature', ('category',)),
    ('CareerFeature', ('社員番号',)),
    ('DxFeature', ('社員番号',)),
    ('DxCategoryEmbeddingFeature', ('社員番号',)),
    ('DxNameEmbeddingFeature', ('社員番号',)),
    ('HrFeature', ('社員番号',)),
    ('HrCategoryEmbeddingFeature', ('社員番号',)),
    ('HrNameEmbeddingFeature', ('社員番号',)),
    ('PositionHistoryFeature', ('社員番号',)),
    ('UdemyActivityFeature', ('社員番号',)),
    ('UdemyTimeseriesFeature', ('社員番号',)),
    ('UdemyTitleEmbedding', ('社員番号',)),
    ('UdemyIDEmbedding', ('社員番号',)),
    ('OvertimeWorkByMonthFeature', ('社員番号',)),
    ('OvertimeWorkByMonthTimeseriesFeature', ('社員番号',)),
    ('UdemyCategorySimilarityFeature', ('社員番号', 'category')),
    ('UdemyTitleSimilarityFeature', ('社員番号', 'category')),
    ('DxSimilarityFeature', ('社員番号', 'category')),
    ('HrSimilarityFeature', ('社員番号', 'category')),
]

# Start from the key table and attach every feature table with a left join.
# The result is built under a temporary name so df_all is only (re)bound
# once the whole chain has succeeded.
merged = Util.load_feature('Key')
for feature_name, join_keys in feature_joins:
    merged = merged.merge(
        Util.load_feature(feature_name), how='left', on=list(join_keys)
    )
df_all = merged

In [None]:
# Split into train / test: rows with a target value are training data,
# rows whose target is null are the test set.
has_label = df_all['target'].notnull()
df_train = df_all[has_label]
df_test = df_all[~has_label]
print(f'train shape: {df_train.shape}')
print(f'test shape: {df_test.shape}')

In [None]:
# Class balance of the labelled data: number of negatives per positive
# (useful e.g. when deciding on an imbalance setting such as scale_pos_weight).
# Both counts must come from df_train. df_test holds only rows whose target
# is null, so `!= 1` is True for every test row and counting there would
# simply give len(df_test) instead of the number of negatives.
pos = (df_train['target'] == 1).sum()
neg = (df_train['target'] != 1).sum()
print(neg / pos)

In [None]:
def after_predict_process(df_pred, target_col):
    """Hook applied to the prediction frame after prediction.

    Currently a no-op: the input frame is returned unchanged and
    target_col is not used.

    Args:
        df_pred(pd.DataFrame): Prediction data [key_cols, predicted value]
        target_col(str): Name of the predicted-value column
    Returns:
        df_pred(pd.DataFrame): Prediction data [key_cols, predicted value]
    """
    return df_pred

def after_split_process(tr, va):
    """Hook applied after the dataset has been split into train / validation.

    Currently a no-op: both frames are returned unchanged.

    Args:
        tr(pd.DataFrame): Training data
        va(pd.DataFrame): Validation data
    Returns:
        tr(pd.DataFrame): Training data
        va(pd.DataFrame): Validation data
    """
    return tr, va

# Parameters for the LightGBM run. The dict mixes run-level settings read by
# the project code with LightGBM core parameters.
# NOTE(review): how the project wrapper splits these keys is not visible in
# this notebook — confirm in src/model_LGBM.py / src/runner.py.
model_params_lgb = {
    #### run params
    "key_cols": KEY_COL,                    # unique key column(s)
    "target_col": TARGET_COL,              # target variable (0 or 1)
    "remove_cols": [],
    "tune": [True, 30],              # whether to run parameter tuning; NOTE(review): 30 is presumably the number of trials — confirm
    #### model train params
    "num_boost_round": 10000,
    "early_stopping_rounds": 100,
    "verbose": -1,                  # NOTE(review): LightGBM treats "verbose" as an alias of "verbosity" (set below) — check whether the wrapper consumes it separately
    "period": 100,
    "log_level": 'error',
    "verbosity": -1,
    #### model core params (for binary classification)
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',     # 'auc' or 'binary_logloss'
    # "is_unbalance": True,         # imbalanced-data handling
    # "scale_pos_weight": 10,       # imbalanced-data handling
    "max_depth": 15,               # maximum tree depth (default=-1 = no limit)
    "num_leaves": 6,               # number of leaves (default=31; kept small to curb overfitting)
    "feature_fraction": 0.9,       # column sampling ratio during training (default=1.0)
    "bagging_freq": 1,             # bagging frequency (default=0; 0 disables bagging)
    "learning_rate": 0.01,         # learning rate (default=0.1; smaller is more stable but slower)
    "bagging_fraction": 0.8,       # row sampling ratio during training (default=1.0)
    "random_state": 42,            # random seed (default=None)
    "colsample_bytree": 0.9,       # per-tree column subsampling (alias of feature_fraction above; default=1.0)
    "colsample_bynode": 0.6,       # per-node column subsampling (default=1.0)
    "lambda_l1": 3.5,              # L1 regularization (default=0.0)
    "lambda_l2": 1.5,              # L2 regularization (default=0.0)
    "min_data_in_leaf": 50,        # minimum number of samples per leaf (default=20)
    "device": "cpu"                # device ("gpu"/"cpu", default="cpu")
}
# Hook functions handed to Runner; both are currently no-ops.
run_setting = {
    "after_predict_process": after_predict_process,
    'after_split_process': after_split_process,
}
# Cross-validation settings handed to Runner.
# NOTE(review): a target column plus a group column suggests a stratified
# group split, but the splitter itself is not visible here — confirm in Runner.
cv_setting = {
    "target_col": TARGET_COL,
    "group_col": "社員番号",  # column used for grouping
    "n_splits": 4,  # number of splits
    "shuffle": True,  # whether to shuffle
    "random_state": 42,  # random seed
}

In [None]:
# Development helper: reload the project modules so that edits made on disk
# take effect without restarting the kernel.
import importlib
from src import runner
from src import model_LGBM
from src import model
from configs import config

# Reload dependencies before the modules that import from them.
# importlib.reload() does not rebind names that another module obtained via
# `from X import ...`, so a dependent module must be reloaded AFTER its
# dependency to pick up the fresh objects.
# NOTE(review): the order below assumes config <- model <- model_LGBM <- runner;
# confirm against the imports in src/.
importlib.reload(config)
importlib.reload(model)
importlib.reload(model_LGBM)
importlib.reload(runner)

# Re-bind the notebook-level names to the freshly reloaded objects.
# Note that `model_LGBM` is rebound here from the module to the class.
from src.model import Model
from src.runner import Runner
from src.model_LGBM import model_LGBM
from configs.config import *

In [None]:
# Runner for the logloss-metric LightGBM model.
# Arguments are positional: run name, model class, model parameters,
# train frame, test frame, run hooks, CV settings, logger, memo.
ml_runner = Runner(
    run_name,
    model_LGBM,
    model_params_lgb,
    df_train,
    df_test,
    run_setting,
    cv_setting,
    logger,
    memo,
)

In [None]:
# ml_runner.tune_params(30)

In [None]:
# Presumably trains one model per CV fold — implemented in src/runner.py, confirm there.
ml_runner.run_train_cv()

In [None]:
# Presumably computes the CV evaluation metric — implemented in src/runner.py, confirm there.
ml_runner.run_metric_cv()

In [None]:
# Presumably predicts on the test data with the CV models; the submission cell
# later reads "te_pred.pkl" from ml_runner.out_dir_name — confirm it is written here.
ml_runner.run_predict_cv()

In [None]:
# Presumably plots feature importance across the CV models — implemented in src/runner.py, confirm there.
ml_runner.plot_feature_importance_cv()

In [None]:
# Inspect the train / validation split sizes.
# NOTE(review): the argument 0 is presumably the fold index — confirm in Runner.
tr, va = ml_runner.create_train_valid_dateset(0)
print(f"train shape: {tr.shape}")
print(f"valid shape: {va.shape}")

# LGBM AUC

In [None]:
# Build the run name for the AUC-metric model.
run_name_auc = get_run_name(model_type="lgbm_auc")
# To reuse the outputs of an earlier run, pin its name instead:
# run_name_auc = 'lgbm_auc_<YYYYmmddHHMM>'
run_name_auc

In [None]:
# Same parameters as the logloss model, but evaluated with AUC.
# This is a shallow copy: nested values (e.g. the "tune" list) are shared
# with model_params_lgb.
model_params_lgb_auc = {**model_params_lgb, "metric": "auc"}

In [None]:
# Runner for the AUC-metric LightGBM model. It uses the same data, hooks,
# CV settings, logger and memo as ml_runner; only the run name and the
# parameter dict differ.
ml_runner_auc = Runner(
    run_name_auc,
    model_LGBM,
    model_params_lgb_auc,
    df_train,
    df_test,
    run_setting,
    cv_setting,
    logger,
    memo,
)

In [None]:
# Presumably trains one AUC-metric model per CV fold — implemented in src/runner.py, confirm there.
ml_runner_auc.run_train_cv()

In [None]:
# Presumably predicts on the test data with the AUC-metric CV models; the ensemble
# cell later reads "te_pred.pkl" from ml_runner_auc.out_dir_name — confirm it is written here.
ml_runner_auc.run_predict_cv()

In [None]:
# Presumably computes the CV evaluation metric for the AUC-metric model — confirm in src/runner.py.
ml_runner_auc.run_metric_cv()

# Submissionの作成

In [None]:
# Select which runner's predictions become the single-model submission.
# NOTE: this rebinds the name `runner`, which the reload cell binds to the
# src.runner module (`from src import runner`); re-running that cell
# rebinds it to the module again.
runner = ml_runner

In [None]:
# Key columns shared by the prediction tables and the submission template.
submit_keys = ["社員番号", "category"]
prep_test_path = os.path.join(DIR_INTERIM, "df_prep_test.pkl")

# --- Submission from the selected runner alone ---------------------------
te_pred_path = os.path.join(runner.out_dir_name, "te_pred.pkl")
df_te_pred = pd.read_pickle(te_pred_path)
df_prep_test = pd.read_pickle(prep_test_path)
# Left join onto df_prep_test so every row of it is kept.
df_pred = df_prep_test.merge(df_te_pred, on=submit_keys, how="left")
df_submit = df_pred[['target']]
# The file name spelling ("submition") is kept so existing paths stay valid.
path_submit = os.path.join(DIR_SUBMISSIONS, f"{runner.run_name}_submition.csv")
df_submit.to_csv(path_submit, header=True, index=False)
print(path_submit)

# --- Ensemble of ml_runner and ml_runner_auc (simple average) ------------
df_te_pred_lgbm = pd.read_pickle(os.path.join(ml_runner.out_dir_name, "te_pred.pkl"))
df_te_pred_lgbm_auc = pd.read_pickle(os.path.join(ml_runner_auc.out_dir_name, "te_pred.pkl"))
df_te_pred = df_te_pred_lgbm.merge(
    df_te_pred_lgbm_auc,
    on=['社員番号', 'category'],
    how='left',
    suffixes=('_lgbm', '_lgbm_auc'),
)
pred_sum = df_te_pred['target_lgbm'] + df_te_pred['target_lgbm_auc']
df_te_pred['target'] = pred_sum / 2
df_prep_test = pd.read_pickle(prep_test_path)
df_pred = df_prep_test.merge(df_te_pred, on=submit_keys, how="left")
df_submit = df_pred[['target']]
# The file name spelling ("ansamble_submition") is kept so existing paths stay valid.
path_submit = os.path.join(DIR_SUBMISSIONS, f"{ml_runner.run_name}_ansamble_submition.csv")
df_submit.to_csv(path_submit, header=True, index=False)
print(path_submit)
# Read the written file back and display it as the cell output.
pd.read_csv(path_submit)

# モデル分析

In [None]:
# Summary statistics of the submitted predictions.
# df_submit is whatever the submission cell assigned last; run top to bottom
# that is the ensemble submission.
df_submit['target'].describe()


In [None]:
# Distribution of the predicted values (histogram with 50 bins).
df_submit['target'].hist(bins=50)

In [None]:
# Validation score overall and per category for ml_runner.
# NOTE(review): the names say AUC, but the value is whatever
# ml_runner.metric computes — confirm in Runner.
df_va_pred = pd.read_pickle(os.path.join(ml_runner.out_dir_name, "va_pred.pkl"))
# Attach the true labels; the overlapping 'target' columns become
# 'target_pred' (prediction) and 'target_true' (label).
df_va_target = df_va_pred.merge(
    Util.load_feature('Target'),
    on=['社員番号', 'category'],
    how='left',
    suffixes=('_pred', '_true'),
)

auc_scores = {
    'all': ml_runner.metric(df_va_target['target_true'], df_va_target['target_pred']),
}
for cat_value in df_va_target['category'].unique():
    cat_rows = df_va_target[df_va_target['category'] == cat_value]
    auc_scores[cat_value] = ml_runner.metric(cat_rows['target_true'], cat_rows['target_pred'])

# One row per key ('all' first, then each category), single column 'AUC'.
pd.DataFrame(data=auc_scores, index=[0]).T.rename(columns={0: 'AUC'})

In [None]:
# Validation score overall and per category for the simple-average ensemble
# of ml_runner and ml_runner_auc.
# NOTE(review): the names say AUC, but the value is whatever
# ml_runner.metric computes — confirm in Runner.
va_pred_logloss = pd.read_pickle(os.path.join(ml_runner.out_dir_name, "va_pred.pkl"))
df_va_pred_auc = pd.read_pickle(os.path.join(ml_runner_auc.out_dir_name, "va_pred.pkl"))
df_va_pred = va_pred_logloss.merge(
    df_va_pred_auc,
    on=['社員番号', 'category'],
    how='left',
    suffixes=('_logloss', '_pred_auc'),
)
# Average the two models' validation predictions.
va_pred_sum = df_va_pred['target_logloss'] + df_va_pred['target_pred_auc']
df_va_pred['target'] = va_pred_sum / 2
# Attach the true labels; the averaged 'target' becomes 'target_pred' and the
# label becomes 'target_true' ('target_pred_auc' is the AUC model's own column).
df_va_target = df_va_pred.merge(
    Util.load_feature('Target'),
    on=['社員番号', 'category'],
    how='left',
    suffixes=('_pred', '_true'),
)

auc_scores = {
    'all': ml_runner.metric(df_va_target['target_true'], df_va_target['target_pred']),
}
for cat_value in df_va_target['category'].unique():
    cat_rows = df_va_target[df_va_target['category'] == cat_value]
    auc_scores[cat_value] = ml_runner.metric(cat_rows['target_true'], cat_rows['target_pred'])

# One row per key ('all' first, then each category), single column 'AUC'.
pd.DataFrame(data=auc_scores, index=[0]).T.rename(columns={0: 'AUC'})

In [None]:
# Record both run names and the overall score in the result log.
# auc_scores is whatever cell assigned it last; run top to bottom that is
# the ensemble cell, so the logged 'all' value is the ensemble score.
logger.result(f"run_name: {ml_runner.run_name}, {ml_runner_auc.run_name}")
logger.result(f"all: {auc_scores['all']}")