In [None]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yaml
from datetime import datetime

# 自作モジュールの読み込み
sys.path.append(os.path.abspath('..'))
from configs.config import *
from src.runner import Runner
from src.model_LGBM import model_LGBM
from src.util import Logger, Util

In [None]:
# Logger setup: the notebook-wide logger is constructed with DIR_LOG as its path.
logger = Logger(path=DIR_LOG)

def get_run_name(model_type):
    """Build a run name from a model type and the current time.

    Args:
        model_type(str): Prefix that identifies the model / experiment.
    Returns:
        str: ``model_type``, an underscore, and the current local time
            formatted as ``%Y%m%d%H%M`` (minute resolution).
    """
    timestamp = datetime.now().strftime("%Y%m%d%H%M")
    return "_".join((model_type, timestamp))

# LGBM ユニバース分割

In [None]:
import importlib
from src import runner
from src import model_LGBM
from src import model
from configs import config

# Reload the project modules so that edits made on disk are picked up without
# restarting the kernel. importlib.reload() re-executes a module but does not
# rebind names that *other* modules already imported from it, so dependencies
# must be reloaded before the modules that import them. The previous order
# (runner first, config last) could leave runner holding stale objects.
# NOTE(review): the order config -> model -> model_LGBM -> runner assumes that
# model_LGBM builds on src.model and that runner uses both -- confirm against
# the import statements inside those modules.
importlib.reload(config)
importlib.reload(model)
importlib.reload(model_LGBM)
importlib.reload(runner)

# Re-import the public names so the notebook namespace points at the reloaded
# objects. The third import also rebinds `model_LGBM` from the module (bound by
# `from src import model_LGBM` above) back to the object exported by it.
from src.model import Model
from src.runner import Runner
from src.model_LGBM import model_LGBM
from configs.config import *

In [None]:
# Feature tables to left-join onto the key table, in join order, together with
# the columns each one is joined on.
feature_joins = [
    ('Target', ['社員番号', 'category']),
    ('CategoryFeature', ['category']),
    ('CareerFeature', ['社員番号']),
    ('DxFeature', ['社員番号']),
    ('HrFeature', ['社員番号']),
    ('PositionHistoryFeature', ['社員番号']),
    ('UdemyActivityFeature', ['社員番号']),
    ('UdemyTimeseriesFeature', ['社員番号']),
    ('UdemyTitleEmbedding', ['社員番号']),
    # ('UdemyIDEmbedding', ['社員番号']),
    # ('OvertimeWorkByMonthFeature', ['社員番号']),
    ('OvertimeWorkByMonthTimeseriesFeature', ['社員番号']),
    ('UdemyCategorySimilarityFeature', ['社員番号', 'category']),
    ('UdemyTitleSimilarityFeature', ['社員番号', 'category']),
]

# Start from the key table and attach each feature table in turn; every join
# is a left join, so the rows of the key table drive the result.
df_all = Util.load_feature('Key')
for feature_name, join_keys in feature_joins:
    df_all = df_all.merge(Util.load_feature(feature_name), how='left', on=join_keys)

In [None]:
# train/test: rows whose 'target' is present go to train, the rest to test.
has_target = df_all['target'].notnull()
df_train = df_all[has_target]
df_test = df_all[~has_target]
print(f'train shape: {df_train.shape}')
print(f'test shape: {df_test.shape}')

In [None]:
# Set run_name. get_run_name() appends the current time, so re-running this
# cell produces a new name.
run_name = get_run_name(model_type="lgbm_unv")
# Alternative: pin a fixed run name (presumably to revisit an earlier run).
# run_name = 'lgbm_unv_202507311522'
run_name

In [None]:
def after_predict_process(df_pred, target_col):
    """Post-prediction hook; currently a pass-through.

    Args:
        df_pred(pd.DataFrame): Prediction data [key_cols, predicted value].
        target_col(str): Name of the prediction column (not used here).
    Returns:
        df_pred(pd.DataFrame): The very object that was passed in, unchanged.
    """
    return df_pred

def after_split_process(tr, va):
    """Post-split hook; currently a pass-through.

    Args:
        tr(pd.DataFrame): Training data.
        va(pd.DataFrame): Validation data.
    Returns:
        tr(pd.DataFrame): The training data that was passed in, unchanged.
        va(pd.DataFrame): The validation data that was passed in, unchanged.
    """
    return tr, va

# Parameter dict passed to every Runner together with model_LGBM.
# NOTE(review): some keys are LightGBM aliases of one another
# ("verbose"/"verbosity", "feature_fraction"/"colsample_bytree"). Whether the
# model wrapper consumes any of them itself is not visible from this notebook,
# so confirm before pruning.
# NOTE(review): per the LightGBM docs bagging only takes effect when
# bagging_fraction < 1.0, so with bagging_fraction=1.0 the bagging_freq=1
# setting presumably has no effect -- confirm this is intended.
model_params_lgb = {
    #### run params
    "key_cols": KEY_COL,                    # unique key column(s)
    "target_col": TARGET_COL,              # target variable (0 or 1)
    "remove_cols": [],
    #### model train params
    "num_boost_round": 5000,
    "early_stopping_rounds": 100,
    "verbose": -1,
    "period": 100,
    "log_level": 'error',
    "verbosity": -1,
    #### model core params (for binary classification)
    # "scale_pos_weight": 10,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',     # 'auc' or 'binary_logloss'
    # "is_unbalance": True,         # handling for imbalanced data
    # "scale_pos_weight": 10,       # handling for imbalanced data
    "max_depth": 15,               # maximum tree depth (default=-1 = no limit)
    "num_leaves": 6,               # number of leaves (default=31; set low to curb overfitting)
    "feature_fraction": 0.9,       # column sampling ratio during training (default=1.0)
    "bagging_freq": 1,             # bagging frequency (default=0; 0 disables bagging)
    "learning_rate": 0.01,         # learning rate (default=0.1; smaller is more stable but slower)
    "bagging_fraction": 1.0,       # row sampling ratio during training (default=1.0)
    "random_state": 42,            # random seed (default=None)
    "colsample_bytree": 0.9,       # per-tree column subsampling (alias of feature_fraction; default=1.0)
    "colsample_bynode": 0.6,       # per-node column subsampling (default=1.0)
    "lambda_l1": 3.5,              # L1 regularization (default=0.0)
    "lambda_l2": 1.5,              # L2 regularization (default=0.0)
    "min_data_in_leaf": 20,        # minimum number of samples required in a leaf (default=20)
    "device": "cpu"                # device selection ("gpu"/"cpu"; default="cpu")
}

# Settings passed to every Runner: run-level switches/hooks and the CV split options.
run_setting = {
    'calc_shap': False,     # whether to compute SHAP values
    # Parameter tuning; documented options: lgb_hopt, xgb_hopt, nn_hopt, False.
    # NOTE(review): True is not one of the documented options -- confirm how Runner interprets it.
    "tune_params": True,
    "after_predict_process": after_predict_process,
    'after_split_process': after_split_process,  
}
cv_setting = {
    "target_col": TARGET_COL,
    "group_col": "社員番号",  # column used for grouping
    "n_splits": 3,  # number of splits
    "shuffle": True,  # whether to shuffle
    "random_state": 42,  # random seed
}

In [None]:
memo = "LightGBM categoryでユニバース分割して学習"

# Run-name suffix -> value of the 'category' column that defines the universe.
# Each category literal now lives in exactly one place; the previous six
# copy-pasted constructors repeated it for the train and the test filter, so a
# typo in one of them would silently train and predict on different universes.
universe_categories = {
    'content': 'コンテンツ・サービス・デザイン',
    'corp': 'コーポレート管理部門/技術・データ・BPR',
    'product': 'プロダクトマネジメント',
    'marketing': 'マーケティング',
    'dev': '事業企画・開発・研究',
    'sales': '営業',
}

# One Runner per universe, constructed in the same order as before. Each one
# receives only the rows of df_train / df_test whose 'category' matches, and
# all of them share the same params, settings, logger and memo objects.
ml_runners = {
    suffix: Runner(
        f'{run_name}_{suffix}', model_LGBM, model_params_lgb,
        df_train[df_train['category'] == category],
        df_test[df_test['category'] == category],
        run_setting, cv_setting, logger, memo,
    )
    for suffix, category in universe_categories.items()
}

# Individual names referenced by the cells further down the notebook.
ml_runner_content = ml_runners['content']
ml_runner_corp = ml_runners['corp']
ml_runner_product = ml_runners['product']
ml_runner_marketing = ml_runners['marketing']
ml_runner_dev = ml_runners['dev']
ml_runner_sales = ml_runners['sales']

In [None]:
# ml_runner.tune_params(30)

In [None]:
# Call run_train_cv() on each of the six per-category runners, one after another.
ml_runner_content.run_train_cv()
ml_runner_corp.run_train_cv()
ml_runner_product.run_train_cv()
ml_runner_marketing.run_train_cv()
ml_runner_dev.run_train_cv()
ml_runner_sales.run_train_cv()

In [None]:
# Call run_metric_cv() on each of the six per-category runners, one after another.
# Being the last expression, only the sales runner's return value (if any) is displayed.
ml_runner_content.run_metric_cv()
ml_runner_corp.run_metric_cv()
ml_runner_product.run_metric_cv()
ml_runner_marketing.run_metric_cv()
ml_runner_dev.run_metric_cv()
ml_runner_sales.run_metric_cv()


In [None]:
# Call run_predict_cv() on each of the six per-category runners, one after another.
# NOTE(review): a later cell reads te_pred.pkl from each runner's out_dir_name;
# presumably this step writes it -- confirm in Runner.
ml_runner_content.run_predict_cv()
ml_runner_corp.run_predict_cv()
ml_runner_product.run_predict_cv()  
ml_runner_marketing.run_predict_cv()
ml_runner_dev.run_predict_cv()
ml_runner_sales.run_predict_cv()

In [None]:
# ml_runner_content.plot_feature_importance_cv()
# ml_runner_corp.plot_feature_importance_cv()
# ml_runner_product.plot_feature_importance_cv()
# ml_runner_marketing.plot_feature_importance_cv()
# ml_runner_dev.plot_feature_importance_cv()
# ml_runner_sales.plot_feature_importance_cv()

In [None]:
# Fetch the train / validation frames from the content runner for argument 0
# (presumably the fold index -- confirm in Runner) and print their shapes.
tr, va = ml_runner_content.create_train_valid_dateset(0)
print(f"train shape: {tr.shape}")
print(f"valid shape: {va.shape}")

# Submissionの作成

In [None]:
def _load_te_pred(ml_runner):
    """Read ``te_pred.pkl`` from the given runner's ``out_dir_name`` directory."""
    return pd.read_pickle(os.path.join(ml_runner.out_dir_name, "te_pred.pkl"))


# Per-category test predictions, loaded in the same order the runners were built.
df_te_pred_content = _load_te_pred(ml_runner_content)
df_te_pred_corp = _load_te_pred(ml_runner_corp)
df_te_pred_product = _load_te_pred(ml_runner_product)
df_te_pred_marketing = _load_te_pred(ml_runner_marketing)
df_te_pred_dev = _load_te_pred(ml_runner_dev)
df_te_pred_sales = _load_te_pred(ml_runner_sales)

# Stack them row-wise into one frame with a fresh 0..n-1 index.
df_te_pred = pd.concat(
    [
        df_te_pred_content,
        df_te_pred_corp,
        df_te_pred_product,
        df_te_pred_marketing,
        df_te_pred_dev,
        df_te_pred_sales,
    ],
    axis=0,
    ignore_index=True,
)

In [None]:
# Load the preprocessed test frame and left-join the stacked predictions onto
# it by employee number and category.
df_prep_test = pd.read_pickle(os.path.join(DIR_INTERIM, "df_prep_test.pkl"))
df_pred = df_prep_test.merge(df_te_pred, how="left", on=["社員番号", "category"])
# The submission keeps only the 'target' column.
df_submit = df_pred[['target']]

In [None]:
# Write the submission CSV (with header, without index), print its path, and
# read it back so the cell displays what was actually written.
submit_filename = f"{run_name}_submition.csv"
path_submit = os.path.join(DIR_SUBMISSIONS, submit_filename)
df_submit.to_csv(path_submit, index=False, header=True)
print(path_submit)
pd.read_csv(path_submit)

In [None]:
# Summary statistics of the submission's 'target' column.
df_submit['target'].describe()


In [None]:
# Histogram of the submission's 'target' column.
df_submit['target'].hist()

In [None]:
def _load_va_pred(ml_runner):
    """Read ``va_pred.pkl`` from the given runner's ``out_dir_name`` directory."""
    return pd.read_pickle(os.path.join(ml_runner.out_dir_name, "va_pred.pkl"))


# Per-category validation predictions, loaded in the same order the runners were built.
df_va_pred_content = _load_va_pred(ml_runner_content)
df_va_pred_corp = _load_va_pred(ml_runner_corp)
df_va_pred_product = _load_va_pred(ml_runner_product)
df_va_pred_marketing = _load_va_pred(ml_runner_marketing)
df_va_pred_dev = _load_va_pred(ml_runner_dev)
df_va_pred_sales = _load_va_pred(ml_runner_sales)

# Stack them row-wise into one frame with a fresh 0..n-1 index.
df_va_pred = pd.concat(
    [
        df_va_pred_content,
        df_va_pred_corp,
        df_va_pred_product,
        df_va_pred_marketing,
        df_va_pred_dev,
        df_va_pred_sales,
    ],
    axis=0,
    ignore_index=True,
)

In [None]:
# Left-join the Target feature onto the validation predictions by employee
# number and category. Columns present on both sides get the suffix '_pred'
# (prediction side) or '_true' (Target side).
target = Util.load_feature('Target')
df_va_pred_target = df_va_pred.merge(
    target,
    how='left',
    on=['社員番号', 'category'],
    suffixes=('_pred', '_true'),
)

In [None]:
# Score the validation predictions overall and per category, using the content
# runner's metric function for every subset.
# NOTE(review): the 'AUC' label assumes Runner.metric computes AUC -- confirm.
score_fn = ml_runner_content.metric
auc_scores = {
    'all': score_fn(df_va_pred_target['target_true'], df_va_pred_target['target_pred']),
}
for category in df_va_pred_target['category'].unique():
    df_category = df_va_pred_target[df_va_pred_target['category'] == category]
    auc_scores[category] = score_fn(df_category['target_true'], df_category['target_pred'])

# One row per key ('all' first, then categories in order of appearance), one column 'AUC'.
pd.Series(auc_scores, name='AUC').to_frame()