# ベースライン

## 必要なライブラリのインポート

In [1]:
import datetime as dt

# データの取り扱いに関するライブラリ
import numpy as np # 高速計算
import pandas as pd # 表データの扱い

# 可視化に関するライブラリ
import matplotlib.pyplot as plt
import seaborn as sns
import japanize_matplotlib

# 前処理に関するライブラリ
from sklearn.preprocessing import StandardScaler

# モデル学習に関するライブラリ
import optuna
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings('ignore')
optuna.logging.set_verbosity(optuna.logging.ERROR)

In [2]:
today = dt.datetime.today().strftime('%Y%m%d')
submit_ver = '1'

## ファイルの読み込み

In [3]:
# 自身がファイルを格納したディレクトリを指定
root_dir = '../src/'
train_file_path = root_dir + 'input/train.csv'
test_file_path = root_dir + 'input/test.csv'
submit_file_path = root_dir + 'input/submit_sample.csv'
output_path = root_dir + 'output/'
intermediate_path = root_dir + '中間ファイル/'

In [4]:
train_df = pd.read_csv(train_file_path)
test_df = pd.read_csv(test_file_path)

In [5]:
target_cols = 'y'

## 前処理

#### 欠損値補完

In [6]:
# NOTE: TEを行うのでunknownはカテゴリとして残す
# # unknownのフラグ化
for df in [train_df, test_df]:
#     df['job_unknown'] = (df['job'] == 'unknown').astype('int8')
#     df.loc[df['job'] == 'unknown', 'job'] = np.nan

#     df['education_unknown'] = (df['education'] == 'unknown').astype('int8')
#     df.loc[df['education'] == 'unknown', 'education'] = np.nan

    df['contact_unknown'] = (df['contact'] == 'unknown').astype('int8')
    df.loc[df['contact'] == 'unknown', 'contact'] = np.nan

#     df['poutcome_unknown'] = (df['poutcome'] == 'unknown').astype('int8')
#     df.loc[df['poutcome'] == 'unknown', 'poutcome'] = np.nan


#### 分布の修正

In [7]:
# クリッピング
clip_prefix = '_clip'

for df in [train_df, test_df]:
    upper = df['balance'].quantile(0.99)
    df['balance' + clip_prefix] = df['balance'].clip(upper=upper)
    df['campaign' + clip_prefix] = df['campaign'].clip(upper=10)
    df['previous' + clip_prefix] = df['previous'].clip(upper=5)

    df['campaign_over_10'] = (df['campaign']>10).astype('int8')
    df['previous_over_5'] = (df['previous']>5).astype('int8')

In [8]:
# log変換
log_prefix = '_log'

for df in [train_df, test_df]:
    df['duration' + log_prefix] = np.log1p(df['duration'])

#### エンコーディング

In [9]:
# monthを数値型にする
month_map = {
    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4,
    'may': 5, 'jun': 6, 'jul': 7, 'aug': 8,
    'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
}

for df in [train_df, test_df]:
    df['month'] = df['month'].map(month_map)

In [10]:
binary_cols = ['default', 'housing', 'loan']

for df in [train_df, test_df]:
    df[binary_cols] = (
        df[binary_cols]
        .replace({'yes': 1, 'no': 0})
        .astype('int8')
    )

In [11]:
# contactはunknown以外は記録ありとして集約
for df in [train_df, test_df]:
    df['contact_flg'] = (df['contact'] != 'unknown').astype('int8')

In [12]:
# 新規フラグ
for df in [train_df, test_df]:
    df['new_flg'] = (df['pdays']==-1).astype('int8')

#### ageのビン化

In [13]:
bins = [0, 25, 35, 45, 55, 65, 100]
labels = ['<25', '25-34', '35-44', '45-54', '55-64', '65+']

for df in [train_df, test_df]:
    df['age_bin'] = pd.cut(df['age'], bins=bins, labels=labels, right=False)

#### jobの集約

In [14]:
job_groups = {
    'is_non_worker': ['student', 'retired'],
    'is_blue_collar': ['blue-collar'],
    'is_white_collar': ['management', 'admin.', 'technician', 'services'],
    'is_service_worker': ['services'],
    'is_unemployed': ['unemployed'],
    'is_self_employed': ['entrepreneur', 'self-employed'],
    'is_housemaid': ['housemaid'],
}

for df in [train_df, test_df]:
    for col_name, jobs in job_groups.items():
        df[col_name] = df['job'].isin(jobs).astype('int8')

## FE

#### monthの交絡因子

In [15]:
confounder_cols = ['new_flg', 'campaign_clip', 'contact_unknown']
month_col = 'month'
months = range(1, 13)

for df in [train_df, test_df]:
    month_dummies = pd.get_dummies(df[month_col], prefix='month')
    month_dummies = month_dummies.reindex(
        columns=[f'month_{m}' for m in months],
        fill_value=0
    ).astype('int8')

    for c in confounder_cols:
        for m in month_dummies.columns:
            df[f'{m}_x_{c}'] = month_dummies[m] * df[c]

#### 営業要因のディスカウント特徴量

In [16]:
for df in [train_df, test_df]:
    # 1) poutcome が other / unknown のフラグ
    df['poutcome_other_or_unknown'] = df['poutcome'].isin(['other', 'unknown']).astype('int8')

    # 2) 交互作用（営業の雑さ × しつこさ / 新規 / 過去の接触）
    df['contact_unknown_x_campaign_clip'] = (df['contact_unknown'] * df['campaign_clip']).astype('int16')
    df['contact_unknown_x_new_flg']       = (df['contact_unknown'] * df['new_flg']).astype('int8')

    # 交互作用（雑な営業x前回のしつこさ）
    df['poutcome_other_or_unknown_x_previous_clip'] = (df['poutcome_other_or_unknown'] * df['previous_clip']).astype('int16')


#### 顧客要因のディスカウント特徴量

In [17]:
for df in [train_df, test_df]:
    df['is_hard_customer'] = (df['campaign_clip'] >= 4).astype('int8')
    df['hard_stage'] = np.select(
        [
            df['campaign_clip'] <= 2,
            df['campaign_clip'].between(3, 4),
            df['campaign_clip'] >= 5
        ],
        [0, 1, 2]
    ).astype('int8')

#### poutcome が悪いが duration が極端に長い

In [18]:
for df in [train_df, test_df]:
    df['duration_x_campaign'] = (
        df['duration_log'] * df['campaign_clip']
    )
    df['duration_x_hard'] = (
        df['duration_log'] * df['is_hard_customer']
    )

#### month 的に不利な月での成約

In [19]:
for df in [train_df, test_df]:
    df['month_x_duration'] = (
        df['month'] * df['duration_log']
    )
    # month × poutcomeは交差検証の中で

#### duration が長いが結局 NG

In [20]:
for df in [train_df, test_df]:
    df['duration_x_contact_unknown'] = (
        df['duration_log'] * df['contact_unknown']
    )
    # duration_log × poutcomeは交差検証の中で

## モデルの学習

#### パラメータの設定

In [21]:
base_lgb_params = {
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.02,
    'num_leaves': 64,
    'max_depth': -1,
    'min_data_in_leaf': 50,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'lambda_l1': 1.0,
    'lambda_l2': 1.0,
    'verbosity': -1
}

# base_lgb_params = {
#     'objective': 'binary',
#     'metric': 'auc',
#     'learning_rate': 0.02,

#     'num_leaves': 128,
#     'max_depth': -1,
#     'min_data_in_leaf': 20,

#     'feature_fraction': 0.85,
#     'bagging_fraction': 0.85,
#     'bagging_freq': 1,

#     'lambda_l1': 1.0,
#     'lambda_l2': 1.0,

    # ここは好みだが、入れておいて良い
    # 'class_weight': 'balanced',

#     'verbosity': -1,
# }


#### 特徴量の選択

In [None]:
fe_cols = [
    # 1. 顧客の潜在的な口座開設意欲（静的要因）
    # 1-1. 年齢・ライフステージ
    'age', 'age_bin',
    'marital',
    'education', # 'education_unknown',

    # 1-2. 経済的余力・信用
    'balance_clip',
    'default',
    'housing',
    'loan',

    # 1-3. 職業による安定性
    'job', # 'job_unknown',
    'is_non_worker',
    'is_blue_collar',
    'is_white_collar',
    'is_service_worker',
    'is_unemployed',
    'is_self_employed',
    'is_housemaid',

    # 2. マーケティング接触の質（動的要因）
    # 2-1. 接触チャネル
    'contact_flg', # 'contact_unknown',
    
    # 2-2. 接触時間
    'duration_log',

    # 2-3. 接触回数
    'campaign_clip', 'campaign_over_10',

    # 2-4. 新規客
    'new_flg',
    
    # 2-5. 雑な営業
    'contact_unknown_x_campaign_clip',
    'contact_unknown_x_new_flg',
    'poutcome_other_or_unknown_x_previous_clip',

    # 2-6. 硬化した顧客
    'is_hard_customer', 'hard_stage',

    # 3. 過去のキャンペーンの履歴・学習効果
    # 3-1. 過去接触有無
    'previous_clip', 'previous_over_5',

    # 3-2. 前回の結果
    'poutcome', # 'poutcome_unknown', 'poutcome_other_or_unknown',

    # 3-3. 前回からの経過日数
    'pdays',

    # 4. タイミング要因（季節性）
    'month',

    # 交絡因子
    'month_1_x_new_flg',
    'month_2_x_new_flg',
    'month_3_x_new_flg',
    'month_4_x_new_flg',
    'month_5_x_new_flg',
    'month_6_x_new_flg',
    'month_7_x_new_flg',
    'month_8_x_new_flg',
    'month_9_x_new_flg',
    'month_10_x_new_flg',
    'month_11_x_new_flg',
    'month_12_x_new_flg',

    'month_1_x_campaign_clip',
    'month_2_x_campaign_clip',
    'month_3_x_campaign_clip',
    'month_4_x_campaign_clip',
    'month_5_x_campaign_clip',
    'month_6_x_campaign_clip',
    'month_7_x_campaign_clip',
    'month_8_x_campaign_clip',
    'month_9_x_campaign_clip',
    'month_10_x_campaign_clip',
    'month_11_x_campaign_clip',
    'month_12_x_campaign_clip',

    'month_1_x_contact_unknown',
    'month_2_x_contact_unknown',
    'month_3_x_contact_unknown',
    'month_4_x_contact_unknown',
    'month_5_x_contact_unknown',
    'month_6_x_contact_unknown',
    'month_7_x_contact_unknown',
    'month_8_x_contact_unknown',
    'month_9_x_contact_unknown',
    'month_10_x_contact_unknown',
    'month_11_x_contact_unknown',
    'month_12_x_contact_unknown',

    # 偽陰性対策
    'duration_x_hard',

    # 偽陽性対策
]

In [23]:
cat_cols = [
    'marital',
    'education',
    'job',
    'poutcome'
]

# for c in cat_cols:
#     train_df[c] = train_df[c].astype('category')
#     test_df[c]  = test_df[c].astype('category')

In [24]:
X = train_df[fe_cols]
y = train_df['y']

#### 交差検証

In [25]:
n_splits = 20
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1234)

oof_pred = np.zeros(len(X))
aucs = []
fi_list = []

In [26]:
alpha = 30

for fold, (train_idx, valid_idx) in enumerate(kf.split(X, y)):
    X_tr_raw = X.iloc[train_idx].copy()
    X_val_raw = X.iloc[valid_idx].copy()
    y_tr = y.iloc[train_idx]
    y_val = y.iloc[valid_idx]

    X_tr = X_tr_raw.copy()
    X_val = X_val_raw.copy()

    global_mean = y_tr.mean()

    for col in cat_cols:
        # train fold内で sum/count を計算
        stats = (
            pd.DataFrame({'key': X_tr_raw[col], 'y': y_tr})
            .groupby('key')['y']
            .agg(['sum', 'count'])
        )

        # スムージングした TE を作成
        te_map = (stats['sum'] + alpha * global_mean) / (stats['count'] + alpha)

        # 適用（未出現カテゴリは global_mean）
        X_tr[col] = X_tr_raw[col].map(te_map).fillna(global_mean)
        X_val[col] = X_val_raw[col].map(te_map).fillna(global_mean)
    
    X_tr['month_x_poutcome'] = (
        X_tr['month'] * X_tr['poutcome']
    )
    X_val['month_x_poutcome'] = (
        X_val['month'] * X_val['poutcome']
    )

    model = lgb.LGBMClassifier(**base_lgb_params)
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        eval_metric='auc',
        callbacks=[lgb.early_stopping(50, verbose=False)],
    )

    y_val_pred = model.predict_proba(X_val)[:, 1]
    oof_pred[valid_idx] = y_val_pred

    auc = roc_auc_score(y_val, y_val_pred)
    aucs.append(auc)
    print(f'fold {fold:02d} | AUC = {auc:.5f}')

    # ============
    # feature importance を保存
    # ============
    # sklearn API なら feature_importances_ は split 重要度がデフォルト
    # gain を使うなら booster から取る
    booster = model.booster_
    fi_fold = pd.Series(
        booster.feature_importance(importance_type='gain'),
        index=booster.feature_name(),
        name=f'fold_{fold:02d}'
    )
    fi_list.append(fi_fold)

oof_auc = roc_auc_score(y, oof_pred)
print(f'\nOOF AUC = {oof_auc:.5f}')
print(f'CV mean = {np.mean(aucs):.5f}, std = {np.std(aucs):.5f}')


fold 00 | AUC = 0.91614
fold 01 | AUC = 0.92579
fold 02 | AUC = 0.93156
fold 03 | AUC = 0.93295
fold 04 | AUC = 0.92834
fold 05 | AUC = 0.93557
fold 06 | AUC = 0.92829
fold 07 | AUC = 0.91685
fold 08 | AUC = 0.92840
fold 09 | AUC = 0.91950
fold 10 | AUC = 0.93345
fold 11 | AUC = 0.93233
fold 12 | AUC = 0.94637
fold 13 | AUC = 0.91267
fold 14 | AUC = 0.92828
fold 15 | AUC = 0.91972
fold 16 | AUC = 0.91337
fold 17 | AUC = 0.92299
fold 18 | AUC = 0.92382
fold 19 | AUC = 0.92669

OOF AUC = 0.92553
CV mean = 0.92615, std = 0.00811


In [27]:
# 通常のTE
# OOF AUC = 0.91765
# CV mean = 0.92520, std = 0.00796

# hard_stage + duration_x_hard + month_x_poutcome NOTE:採用
# OOF AUC = 0.92553
# CV mean = 0.92615, std = 0.00811

#### モデルの予測傾向の確認

In [29]:
fi_mat = pd.concat(fi_list, axis=1).fillna(0.0)

fi_df = pd.DataFrame({
    'fi_mean': round(fi_mat.mean(axis=1)),
    'fi_std': round(fi_mat.std(axis=1)),
}).sort_values('fi_mean', ascending=False).reset_index()

fi_df.to_csv(intermediate_path + '特徴量重要度.csv', index=False)

In [30]:
oof_df = train_df.copy()
oof_df['oof_pred'] = oof_pred

oof_df.to_csv(intermediate_path + '交差検証の予測結果.csv', index=False)

## 最終モデル学習

#### ターゲットエンコーディング

In [31]:
def target_encoding_full(X, y, test, cat_cols, alpha=30):
    X_te = X.copy()
    test_te = test.copy()
    global_mean = y.mean()

    for col in cat_cols:
        stats = (
            pd.DataFrame({col: X[col], 'y': y})
            .groupby(col)['y']
            .agg(['sum', 'count'])
        )

        te_map = (stats['sum'] + alpha * global_mean) / (stats['count'] + alpha)

        X_te[col] = X[col].map(te_map).fillna(global_mean)
        test_te[col] = test[col].map(te_map).fillna(global_mean)

    return X_te, test_te

In [32]:
# 学習データ全てに対してtarget encodingを実施して、それをtest_dfに適用させる
X_te, X_test_te = target_encoding_full(
    X,
    y,
    test_df,
    cat_cols
)

In [38]:
X_te['month_x_poutcome'] = (
        X_te['month'] * X_te['poutcome']
)
X_test_te['month_x_poutcome'] = (
    X_test_te['month'] * X_test_te['poutcome']
)

#### モデルの学習

In [39]:
model_base = lgb.LGBMClassifier(**base_lgb_params)
model_base.fit(
    X_te, y,
    # categorical_feature=cat_cols
)

## 予測データの予測

In [40]:
# test 側も new / existing に分ける
X_test_te = X_test_te[fe_cols + ['month_x_poutcome']]

In [41]:
# 予測
test_df['pred'] = model_base.predict_proba(X_test_te)[:, 1]

In [42]:
# 問題なく予測できているか一部を出力
print(test_df[['id', 'pred']].head(3))

   id      pred
0   1  0.543244
1   2  0.461759
2   3  0.020007


In [48]:
test_df.to_csv(intermediate_path + 'testデータの予測結果.csv', index=False)

## 提出用ファイルの作成・出力

In [43]:
submit_df = pd.read_csv(submit_file_path, header=None)
submit_df.columns = ['id', 'pred']

In [44]:
submit_df.head(3)

Unnamed: 0,id,pred
0,1,0.236
1,2,0.128
2,3,0.903


In [45]:
# prd の列に予測値を代入
submit_df['pred'] = test_df['pred']

# 出力確認
print(submit_df.head())
print(submit_df.shape)

   id      pred
0   1  0.543244
1   2  0.461759
2   3  0.020007
3   4  0.019597
4   5  0.094043
(18083, 2)


In [46]:
# 提出用のファイルを出力
submit_df.to_csv(
    root_dir + f'{output_path}submit_{today}_v{submit_ver}.csv',
    index=False,
    header=False
)