# 作成中。。。

# 引用元: https://www.kaggle.com/code/tgwstr/baselinelgbm001

# Pipeline

## Library

In [None]:
import os
import json
import warnings
import shutil
import logging
import joblib
import random
import datetime
import pytz
import sys
import re

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.notebook import tqdm
from tqdm.auto import tqdm
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, mean_squared_error

import tensorflow as tf
import tensorflow.keras.layers as L
import tensorflow.keras.models as M
from tensorflow.keras import backend as K
from lightgbm import LGBMRegressor
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OrdinalEncoder
from tensorflow.keras.utils import plot_model

from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping

import pickle
import glob

# import shap
import xgboost
from scipy.stats import spearmanr
from sklearn.ensemble import (
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
)

import lightgbm as lgb
# from lightgbm import log_evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import seaborn as sns

warnings.filterwarnings("ignore")

## Config

## 作成・編集途中です。

In [None]:
# True when the `google.colab` module is already loaded; used below to choose
# between the Google Colab and the Kaggle setup branches.
COLAB = "google.colab" in sys.modules

In [None]:
class Config:
    """Notebook-wide settings, read as class attributes (e.g. ``Config.seed``).

    The class body runs once at definition time and branches on the
    module-level ``COLAB`` flag.
    """

    # Notebook title, used as the experiment name.
    if COLAB:
        from requests import get
        # Query the sessions endpoint for the notebook name and keep the part
        # before the first '.'.
        name = get('http://172.28.0.2:9000/api/sessions').json()[0]['name'].split('.')[0]
    else:
        name = "baseline_lgbm001"  # on Kaggle, fill this in by hand

    # Choose between inference only and training + inference.
    only_inference = False
    if only_inference:
        task = 'infer'
    else:
        task = 'train'

    # Cross-validation settings.
    n_fold = 5
    trn_fold = list(range(n_fold))

    seed = 42

    target_col = "target"  # ⚠️ update for each competition
    categ_feats = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
    # debug = False

    # lgb params
    @staticmethod
    def get_lgb_params() -> dict:
        """Return a fresh dict of LightGBM training parameters.

        Declared as a static method so it works both as
        ``Config.get_lgb_params()`` and on a ``Config`` instance (the
        undecorated version raised TypeError on instances).
        """
        lgb_params = {
            'objective': 'binary',
            'metric': 'binary_logloss',
            'learning_rate': 0.05,
            'num_leaves': 64,
            'force_col_wise': True,
            'bagging_freq': 1,
            'seed': 2112,
            'verbosity': 0,
            'first_metric_only': True,
            'bin_construct_sample_cnt': 100000000,
            'feature_pre_filter': False,
            'bagging_fraction': 0.9,
            'feature_fraction': 0.2,
            'lambda_l1': 0.1,
            'lambda_l2': 0.1,
            'min_data_in_leaf': 1000,
            'path_smooth': 10,
            'max_bin': 255,
            }
        return lgb_params

    # Archive to extract.
    # zip_file = 'foursquare-location-matching.zip'  # ⚠️ update for each competition

    # Colab Env
    upload_from_colab = True
    api_path = "/content/drive/MyDrive/kaggle/kaggle.json"
    drive_path = "/content/drive/MyDrive/kaggle/AmericanExpress"  # ⚠️ update for each competition

    # Kaggle Env
    kaggle_input_path = "../input/expression-chinchilla"  # ⚠️ update for each competition
    kaggle_dataset_path = None

## Utils

In [None]:
# log を txtファイルに出力させるためのクラス
class Logger:
    """Write timestamped INFO messages to both the console and a log file.

    Reference: https://github.com/ghmagazine/kagglebook/blob/master/ch04-model-interface/code/util.py
    """

    def __init__(self, path, log_title='Experiment'):
        """Attach console and file handlers to the logger named after ``path``.

        Args:
            path: Directory that receives ``<log_title>.log``; it is also used
                as the name of the underlying ``logging`` logger.
            log_title: Base name of the log file.
        """
        self.general_logger = logging.getLogger(path)
        # Build the handlers only when the logger has none yet. Constructing a
        # FileHandler opens the file, so creating it before this check leaked
        # an open handle (and possibly a stray log file) on every re-run.
        if not self.general_logger.handlers:
            self.general_logger.addHandler(logging.StreamHandler())
            self.general_logger.addHandler(
                logging.FileHandler(os.path.join(path, f'{log_title}.log'))
            )
            self.general_logger.setLevel(logging.INFO)

    def info(self, message):
        """Log ``message`` at INFO level, prefixed with the current time."""
        self.general_logger.info(f'[{self.now_string()}] - {message}')

    @staticmethod
    def now_string():
        """Return the current Asia/Tokyo time as ``YYYY-MM-DD HH:MM:SS``."""
        return str(datetime.datetime.now(pytz.timezone('Asia/Tokyo')).strftime('%Y-%m-%d %H:%M:%S'))

In [None]:
# シード固定用関数
def seed_everything(seed=42):
    """Seed the Python, NumPy and TensorFlow RNGs and set PYTHONHASHSEED.

    Reference: https://qiita.com/kaggle_grandmaster-arai-san/items/d59b2fb7142ec7e270a5
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for set_seed in (random.seed, np.random.seed, tf.random.set_seed):
        set_seed(seed)

In [None]:
def get_feats_cols(train_df: pd.DataFrame, *drop_cols) -> list:
    """Return the column names of ``train_df`` except those in ``drop_cols``.

    ``train_df`` itself is left unchanged.
    """
    excluded = list(drop_cols)
    remaining = train_df.drop(columns=excluded)
    return list(remaining.columns)

## SetUp

### 環境ごとのセットアップ

In [None]:
# ------------------------------- Google Colab environment -------------------------------
if COLAB:
    print("-------------------------------This environment is Google Colab-------------------------------")

    # `get` is imported only inside the Config class body, so it is not a
    # module-level name; import it here for the EXP fallback below.
    from requests import get

    # mount Google Drive (only when /content/drive does not exist yet)
    from google.colab import drive
    if not os.path.isdir("/content/drive"):
        drive.mount('/content/drive')

    # make the my-modules folder importable
    sys.path.append('/content/drive/MyDrive/Colab Notebooks/my-modules')

    # use kaggle api (need kaggle token)
    # Read the token inside a context manager so the file handle is closed.
    with open(Config.api_path, 'r') as f:
        json_data = json.load(f)
    os.environ["KAGGLE_USERNAME"] = json_data["username"]
    os.environ["KAGGLE_KEY"] = json_data["key"]

    # set dirs
    DRIVE = Config.drive_path
    EXP = (Config.name if Config.name is not None
           else get("http://172.28.0.2:9000/api/sessions").json()[0]["name"][:-6])
    INPUT = os.path.join(DRIVE, "INPUT")
    FEATURES = os.path.join(INPUT, "FEATURES")
    OUTPUT = os.path.join(DRIVE, "OUTPUT")
    SUBMISSION = os.path.join(DRIVE, "SUBMISSION")
    OUTPUT_EXP = os.path.join(OUTPUT, EXP)
    EXP_MODEL = os.path.join(OUTPUT_EXP, "MODEL")
    EXP_FIG = os.path.join(OUTPUT_EXP, "FIG")
    EXP_PREDS = os.path.join(OUTPUT_EXP, "PREDS")

    # make dirs (OUTPUT_EXP is created implicitly as the parent of EXP_MODEL)
    for d in [INPUT, FEATURES, SUBMISSION, EXP_MODEL, EXP_FIG, EXP_PREDS]:
        os.makedirs(d, exist_ok=True)

    # if not os.path.isfile(os.path.join(INPUT, Config.zip_file)):
    #     # download dataset
    #     # install kaggle
    #     # run the following to fix the access permission
    #     ! chmod 600 /root/.kaggle/kaggle.json
    #     ! pip install kaggle
    #     ! kaggle competitions download -c foursquare-location-matching -p $INPUT  # ⚠️ update for each competition
    #     # extract the ZIP file downloaded above
    #     ! apt-get install p7zip-full -y
    #     ! 7za x os.path.join(INPUT, Config.zip_file)
    # else:
    #     print('DS for competition has been already installed.')

    # utils
    logger = Logger(OUTPUT_EXP)


# ------------------------------- Kaggle environment -------------------------------
else:
    print("-------------------------------This environment is Kaggle Kernel-------------------------------")

    # set dirs
    INPUT = Config.kaggle_input_path  # ⚠️ update for each competition
    EXP, OUTPUT, SUBMISSION = "./", "./", "./"
    EXP_MODEL = os.path.join(EXP, "model")
    EXP_FIG = os.path.join(EXP, "fig")
    EXP_PREDS = os.path.join(EXP, "preds")

    # copy models / predictions from an attached Kaggle dataset, if configured
    if Config.kaggle_dataset_path is not None:
        KD_MODEL = os.path.join(Config.kaggle_dataset_path, "model")
        KD_EXP_PREDS = os.path.join(Config.kaggle_dataset_path, "preds")
        shutil.copytree(KD_MODEL, EXP_MODEL)
        shutil.copytree(KD_EXP_PREDS, EXP_PREDS)

    # make dirs
    for d in [EXP_MODEL, EXP_FIG, EXP_PREDS]:
        os.makedirs(d, exist_ok=True)

    # utils
    logger = Logger(EXP)

# utils
warnings.filterwarnings("ignore")
sns.set(style='whitegrid')
seed_everything(seed=Config.seed)

## コンペ説明

レストランでの食事やコンサートのチケット購入など、現代の生活では日々の買い物にクレジットカードの利便性が欠かせません。
クレジットカードがあれば、多額の現金を持ち歩く必要がなく、購入代金の全額をカード会社に立て替えてもらい、後から時間をかけて支払うこともできます。
しかし、カード発行会社は、私たちが利用した金額をきちんと返済するかどうかを、どうやって見極めているのでしょうか？
この問題は複雑で、多くの解決策がありますが、このコンペティションでは、さらに多くの改善策が検討されています。

貸し倒れ予測は、消費者金融ビジネスのリスク管理の中心的存在です。
貸し倒れを予測することで、貸し出しの決定を最適化し、より良い顧客体験と健全なビジネス経済を実現することができます。
現在のモデルは、リスク管理を支援するために存在しています。
しかし、現在使用されているモデルを凌駕する、より優れたモデルを作成することは可能です。

アメリカン・エキスプレスは、世界的に統合された決済企業です。
世界最大の決済カード発行会社である同社は、生活を豊かにし、ビジネスの成功をもたらす商品、洞察、体験へのアクセスを顧客に提供しています。

このコンペティションでは、機械学習のスキルを応用して、クレジット・デフォルトを予測します。
具体的には、産業規模のデータセットを活用し、現在の本番（プロダクション）モデルに挑戦する機械学習モデルを構築していただきます。
トレーニング、検証、テストの各データセットには、時系列行動データおよび匿名化された顧客プロファイル情報が含まれます。
特徴量の作成から、モデル内でのデータの有機的な利用まで、最も強力なモデルを作るためのあらゆる手法を自由に探求することができます。

成功すれば、クレジットカードの審査が通りやすくなり、カード会員にとってより良い顧客体験の創造に貢献できます。
優れたソリューションは、世界最大のクレジットカード発行会社が使用しているクレジットデフォルト予測モデルに挑戦し、
賞金やアメリカン・エキスプレスとの面接の機会、そしてやりがいのある新しいキャリアを獲得する可能性があります。

## Load Data

In [None]:
# Load the train/test tables and the sample submission from INPUT.
train = pd.read_parquet(os.path.join(INPUT, 'train_agg.parquet'))
test = pd.read_parquet(os.path.join(INPUT, 'test_agg.parquet'))
sample_submission = pd.read_csv(os.path.join(INPUT, 'sample_submission.csv'))  # should this be parquet too?

# Keep the train customer ids in their own variable.
customer_ID = train['customer_ID']

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Every column except the id and the target is used as a model feature.
drop_cols = ['customer_ID', 'target']
feats_cols = get_feats_cols(train, *drop_cols)

## Train Models

### CV split

In [None]:
from sklearn.model_selection import StratifiedKFold

# Stratified K-fold on the target; every row gets the id of the fold in which
# it is used for validation (-1 for folds not listed in Config.trn_fold).
skf = StratifiedKFold(n_splits=Config.n_fold,
                      shuffle=True,
                      random_state=Config.seed)
skf_split = list(skf.split(X=train,
                           y=train[Config.target_col]))

# skf.split yields *positional* indices. Fill a plain array by position and
# assign it as a column, instead of passing the positions to `train.loc`,
# which would treat them as index labels and mislabel rows whenever the index
# is not a default RangeIndex.
fold_ids = np.full(len(train), -1, dtype=np.int64)
for i_fold, (_, valid_idx) in enumerate(skf_split):
    if i_fold in Config.trn_fold:
        fold_ids[valid_idx] = i_fold
train["fold"] = fold_ids

### Calculate functions

In [None]:
def compute_recall_at4(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Return the share of positives captured in the top 4% of weighted rows.

    Rows are sorted by descending prediction. With 0/1 labels each negative
    row weighs 20 and each positive row weighs 1; the top 4% is taken on the
    cumulative normalised weight.

    Args:
        y_true: Labels (array-like, e.g. a NumPy array or pandas Series).
        y_pred: Predicted scores, same length as ``y_true``.

    Returns:
        Sum of the labels inside the 4% cut divided by the sum of all labels.
    """
    # Work on plain arrays: indexing a pandas Series with the positional
    # output of argsort would be a label lookup, not a positional one.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # count of positives
    n_pos = y_true.sum()

    # desc sorting by prediction values
    indices = np.argsort(y_pred)[::-1]
    target = y_true[indices]

    # filter the top 4% by cumulative row weights
    weight = 20.0 - target * 19.0
    cum_norm_weight = (weight / weight.sum()).cumsum()
    four_pct_mask = cum_norm_weight <= 0.04

    # default rate captured at 4%
    return target[four_pct_mask].sum() / n_pos

def compute_normalized_gini(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Return the weighted Gini coefficient divided by its maximum value.

    Rows are sorted by descending prediction; with 0/1 labels each negative
    row weighs 20 and each positive row weighs 1.

    Args:
        y_true: Labels (array-like, e.g. a NumPy array or pandas Series).
        y_pred: Predicted scores, same length as ``y_true``.

    Returns:
        ``gini / gini_max`` as computed below.
    """
    # Work on plain arrays: indexing a pandas Series with the positional
    # output of argsort would be a label lookup, not a positional one.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # count of positives and negatives
    n_pos = y_true.sum()
    n_neg = y_true.shape[0] - n_pos

    # sorting desc by prediction values
    indices = np.argsort(y_pred)[::-1]
    target = y_true[indices]

    # weighted gini coefficient
    weight = 20.0 - target * 19.0
    cum_norm_weight = (weight / weight.sum()).cumsum()

    lorentz = (target / n_pos).cumsum()
    gini = ((lorentz - cum_norm_weight) * weight).sum()

    # max weighted gini coefficient
    gini_max = 10 * n_neg * (1 - 19 / (n_pos + 20 * n_neg))

    # normalized weighted gini coefficient
    return gini / gini_max
    
def compute_amex_metric(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Return the mean of the normalised weighted Gini and the recall at 4%.

    Rows are sorted by descending prediction; with 0/1 labels each negative
    row weighs 20 and each positive row weighs 1.

    Args:
        y_true: Labels (array-like, e.g. a NumPy array or pandas Series).
        y_pred: Predicted scores, same length as ``y_true``.

    Returns:
        ``0.5 * (g + d)``, where ``g`` is the normalised weighted Gini and
        ``d`` the share of positives captured in the top 4% of weighted rows.
    """
    # Work on plain arrays: indexing a pandas Series with the positional
    # output of argsort would be a label lookup, not a positional one.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # count of positives and negatives
    n_pos = y_true.sum()
    n_neg = y_true.shape[0] - n_pos

    # sorting desc by prediction values
    indices = np.argsort(y_pred)[::-1]
    target = y_true[indices]

    # filter the top 4% by cumulative row weights
    weight = 20.0 - target * 19.0
    cum_norm_weight = (weight / weight.sum()).cumsum()
    four_pct_filter = cum_norm_weight <= 0.04

    # default rate captured at 4%
    d = target[four_pct_filter].sum() / n_pos

    # weighted gini coefficient
    lorentz = (target / n_pos).cumsum()
    gini = ((lorentz - cum_norm_weight) * weight).sum()

    # max weighted gini coefficient
    gini_max = 10 * n_neg * (1 - 19 / (n_pos + 20 * n_neg))

    # normalized weighted gini coefficient
    g = gini / gini_max

    return 0.5 * (g + d)

In [None]:
# metrics in lgbm format

def metric_recall_at4(y_pred: np.ndarray, data: lgb.Dataset):
    """LightGBM ``feval`` wrapper around ``compute_recall_at4``.

    Returns the tuple ``(name, value, is_higher_better)``.
    """
    score = compute_recall_at4(data.get_label(), y_pred)
    return 'recall_at4', score, True

def metric_normalized_gini(y_pred: np.ndarray, data: lgb.Dataset):
    """LightGBM ``feval`` wrapper around ``compute_normalized_gini``.

    Returns the tuple ``(name, value, is_higher_better)``.
    """
    score = compute_normalized_gini(data.get_label(), y_pred)
    return 'norm_gini', score, True

def metric_amex(y_pred: np.ndarray, data: lgb.Dataset):
    """LightGBM ``feval`` wrapper around ``compute_amex_metric``.

    Returns the tuple ``(name, value, is_higher_better)``.
    """
    score = compute_amex_metric(data.get_label(), y_pred)
    return 'amex_metric', score, True

### LGBM

In [None]:
def make_lgb_ds(X, y):
    """Wrap features ``X`` and labels ``y`` in a ``lgb.Dataset``.

    Feature names are detected automatically and the raw data is kept
    (``free_raw_data=False``).
    """
    # categorical_feature=Config.categ_feats is currently left disabled.
    dataset = lgb.Dataset(X,
                          label=y,
                          feature_name='auto',
                          free_raw_data=False)
    return dataset

In [None]:
def train_lgbm(df, folds=Config.n_fold, params=None):
    """Train (or reload) one LightGBM booster per CV fold.

    For each fold id in ``range(folds)`` the rows with ``df.fold == fold`` are
    the validation set and all other rows the training set. A trained booster
    is pickled under ``EXP_MODEL``; when that file already exists the booster
    is loaded from it instead of being retrained.

    Args:
        df: Frame holding the ``feats_cols`` columns, ``Config.target_col``
            and a ``fold`` column.
        folds: Number of folds to iterate over.
        params: LightGBM parameters; ``None`` means
            ``Config.get_lgb_params()``.

    Returns:
        List with one booster per fold, in fold order.
    """
    # Resolve the default at call time, not once at definition time.
    if params is None:
        params = Config.get_lgb_params()

    models = []

    for fold in tqdm(range(folds)):
        model_name = f"{Config.name}-seed{Config.seed}-fold{fold}"
        model_path = os.path.join(EXP_MODEL, model_name)

        # Reuse a booster saved by an earlier run. The original only printed a
        # message here, so the returned list was missing these folds.
        if os.path.isfile(model_path):
            # NOTE: pickle must only be used on files written by this notebook.
            with open(model_path, 'rb') as f:
                models.append(pickle.load(f))
            print(f'fold_{fold}: loaded saved model, training skipped.')
            continue

        # inputs / targets for the train and valid parts of this fold
        is_valid = df.fold == fold
        X_train = df.loc[~is_valid, feats_cols]
        y_train = df.loc[~is_valid, Config.target_col]
        X_valid = df.loc[is_valid, feats_cols]
        y_valid = df.loc[is_valid, Config.target_col]

        train_ds = make_lgb_ds(X_train, y_train)
        valid_ds = make_lgb_ds(X_valid, y_valid)

        # Early stopping is configured through the callback only: the
        # `early_stopping_rounds` keyword duplicated it with a different value
        # and is no longer accepted by lgb.train in LightGBM 4.
        model = lgb.train(params=params,
                          train_set=train_ds,
                          valid_sets=[train_ds, valid_ds],
                          feval=[metric_amex,
                                 metric_recall_at4,
                                 metric_normalized_gini],
                          num_boost_round=3000,  # upper bound on boosting rounds
                          callbacks=[lgb.log_evaluation(period=50),
                                     lgb.early_stopping(50)]
                          )

        # save the booster of this fold as a pickle file
        with open(model_path, 'wb') as f:
            pickle.dump(model, f)
        print(f"{model_name} has been saved.")

        models.append(model)

        # Out-of-fold prediction and score. Labels are passed as a plain array
        # because the metric indexes them by position.
        oof_preds = model.predict(X_valid)
        oof_score = compute_amex_metric(y_valid.to_numpy(), oof_preds)

        # y_train is a Series, so report its name (it has no `.columns`).
        logger.info(
            f"model_name:{Config.name}-seed:{Config.seed}-fold:{fold}"
            f"\n-X_cols:{X_train.columns.values}-y:{y_train.name}"
            f" >>>>> Score(AmexMetric)={oof_score}"
        )
        print(f'fold_{fold} has finished.')
        print('-----------------------------')

    return models

In [None]:
%%time
# Train one LightGBM model per CV fold on the training table.
lgbm_models = train_lgbm(train, 
                         folds=Config.n_fold, 
                         params=Config.get_lgb_params())

In [None]:
import lightgbm as lgbm

def fit_lgbm(X, y, params=None, es_rounds=20, seed=42, N_SPLITS=5, 
             n_class=None, model_dir=None, folds=None):
    """Fit (or load) one ``LGBMClassifier`` per fold and collect OOF probabilities.

    Args:
        X: Feature DataFrame.
        y: Target Series aligned row-by-row with ``X``.
        params: Keyword arguments for ``LGBMClassifier``; ``None`` means none.
        es_rounds: Early-stopping patience in boosting rounds.
        seed: Unused; kept so existing callers keep working.
        N_SPLITS: Number of folds; fold ids ``0 .. N_SPLITS - 1`` are visited.
        n_class: Number of classes (width of the OOF array); ``None`` means
            the number of distinct values in ``y``.
        model_dir: When given, ``<model_dir>/lgbm_fold<i>.pkl`` is loaded
            instead of training.
        folds: Array-like with the validation fold id of every row.

    Returns:
        ``(oof, models)``: the ``(len(y), n_class)`` out-of-fold probability
        array and the list of per-fold models.

    Raises:
        ValueError: If ``folds`` is not provided.
    """
    if folds is None:
        raise ValueError("folds must provide a fold id for every row of X")
    folds = np.asarray(folds)
    params = dict(params or {})
    if n_class is None:
        n_class = len(np.unique(y))

    models = []
    oof = np.zeros((len(y), n_class), dtype=np.float64)

    # Iterate over the requested number of splits (the original ignored
    # N_SPLITS and read Config.n_fold instead).
    for i in tqdm(range(N_SPLITS)):
        print(f"== fold {i} ==")
        trn_idx = (folds != i)
        val_idx = (folds == i)
        X_train, y_train = X.iloc[trn_idx], y.iloc[trn_idx]
        X_valid, y_valid = X.iloc[val_idx], y.iloc[val_idx]

        if model_dir is None:
            model = lgbm.LGBMClassifier(**params)
            # Early stopping and logging go through callbacks; the
            # `early_stopping_rounds` / `verbose` fit keywords are no longer
            # accepted by LightGBM 4.
            model.fit(
                X_train, y_train,
                eval_set=[(X_valid, y_valid)],
                eval_metric='logloss',
                callbacks=[lgbm.early_stopping(es_rounds),
                           lgbm.log_evaluation(period=50)],
            )
        else:
            # NOTE: pickle must only be used on files written by this notebook.
            with open(f'{model_dir}/lgbm_fold{i}.pkl', 'rb') as f:
                model = pickle.load(f)

        pred = model.predict_proba(X_valid)
        oof[val_idx] = pred
        models.append(model)

        # Write the fold model to the working directory, closing the file.
        with open(f'lgbm_fold{i}.pkl', 'wb') as f:
            pickle.dump(model, f)
        print()

    cv = (oof.argmax(axis=-1) == np.asarray(y)).mean()
    print(f"CV-accuracy: {cv}")

    return oof, models

In [None]:
def inference_lgbm(models, feat_df):
    """Return the element-wise mean of ``predict_proba(feat_df)`` over ``models``."""
    per_model = []
    for model in models:
        per_model.append(model.predict_proba(feat_df))
    return np.array(per_model).mean(axis=0)

In [None]:
# Parameters for the sklearn-style LGBMClassifier used by fit_lgbm.
params = {
    # "logloss" is a metric name, not a LightGBM objective; the target
    # is binary.
    'objective': "binary",
    'learning_rate': 0.2,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'random_state': 42,

    'max_depth': 7,
    'num_leaves': 35,
    'n_estimators': 1000000,  # effectively unbounded; early stopping decides
    "colsample_bytree": 0.9,
}

# Train on the feature columns only (not the id / fold columns), with two
# classes, and take the fold ids from the `fold` column created in the CV
# split cell. The original call had an empty `n_class=` and an undefined
# `y_train`.
oof, models = fit_lgbm(train[feats_cols], train[Config.target_col],
                       params=params, n_class=2,
                       N_SPLITS=Config.n_fold, folds=train["fold"].values)

### Run - logs the AmEx metric for each fold

In [None]:
%%time
lgb_params = Config.get_lgb_params()
# Train on `train`: `prices` is not defined at this point of the notebook.
lgbm_models = train_lgbm(train, folds=Config.n_fold, params=lgb_params)

# Make Predictions & Submit

In [None]:
def pred_to_submission(pred):
    """Turn a prediction frame into a ranked submission frame.

    Rows are ranked by descending ``Prediction`` (rank 0 is the highest
    value) and returned sorted by ascending ``SecuritiesCode``.

    Args:
        pred: DataFrame with ``Date``, ``SecuritiesCode`` and ``Prediction``
            columns. It is not modified.

    Returns:
        New DataFrame with the columns ``Date``, ``SecuritiesCode``, ``Rank``.
    """
    ranked = pred.sort_values(by="Prediction", ascending=False)
    # Size the ranks from the data instead of a hard-coded 2000 rows, and
    # assign with [] so the column is created even when it does not exist yet.
    ranked["Rank"] = np.arange(len(ranked))
    ranked = ranked.sort_values(by="SecuritiesCode", ascending=True)
    return ranked[["Date", "SecuritiesCode", "Rank"]]

In [None]:
# Submission loop built on the `jpx_tokyo_market_prediction` API; runs only
# outside Colab.
# NOTE(review): this cell looks like a leftover from a JPX stock notebook.
# `enc` is not defined anywhere in the visible part of this file, and the
# feature columns differ from those the models above are trained on; confirm
# before running.
if not COLAB:
    import jpx_tokyo_market_prediction as JTMP
    env = JTMP.make_env()
    iter_test = env.iter_test()

    # Each iteration yields one batch of frames plus the prediction template.
    for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:
        prices["SecuritiesCode"] = enc.fit_transform(prices[["SecuritiesCode"]])

        X_test = prices[["SecuritiesCode", "Open", "High", "Low", "Close"]]
        # Average the predictions of all fold models.
        lgbm_preds = list()
        for model in lgbm_models:
            lgbm_preds.append( model.predict(X_test) )
        lgbm_preds = np.mean(lgbm_preds, axis=0)

        sample_prediction["Prediction"] = lgbm_preds
        submission = pred_to_submission(sample_prediction)

        # Hand the ranked frame for this batch back to the environment.
        env.predict(submission)

In [None]:
# NOTE(review): `prices` is only assigned inside the submission loop of the
# previous cell; confirm this cell is still needed.
X_test = prices[["SecuritiesCode", "Open", "High", "Low", "Close"]]
# One prediction array per fold model (the original assignment was left
# incomplete, which is a syntax error).
y_pred = [model.predict(X_test) for model in lgbm_models]
for fold_pred in y_pred:
    display(fold_pred)

## Others

In [None]:
# folderのディレクトリ構造可視化ツール
import pathlib
import glob
import os

def tree(path, layer=0, is_last=False, indent_current='　'):
    """Print the directory structure below ``path`` as a tree.

    Directories are printed as ``<name>``, files by their plain name. Entries
    of each directory are listed in sorted order, so the output is the same
    on every run. Names starting with a dot are skipped, because ``*`` in
    ``glob`` does not match them.

    Args:
        path: Directory to print; a relative path is resolved first.
        layer: Recursion depth (0 for the root call).
        is_last: Whether this directory is the last entry of its parent.
        indent_current: Indentation prefix for this directory's line.
    """
    root = pathlib.Path(path)
    if not root.is_absolute():
        root = root.resolve()
    # Going through Path drops a trailing separator, which otherwise made the
    # '/'-split based name lookup return an empty string.
    path = str(root)

    # print the current directory
    current = root.name
    if layer == 0:
        print('<'+current+'>')
    else:
        branch = '└' if is_last else '├'
        print('{indent}{branch}<{dirname}>'.format(indent=indent_current, branch=branch, dirname=current))

    # Collect the entries one level down. The directory part is escaped so
    # that characters like '[' in its name are not read as glob patterns.
    pattern = os.path.join(glob.escape(path), '*')
    paths = sorted(p for p in glob.glob(pattern) if os.path.isdir(p) or os.path.isfile(p))
    last_index = len(paths) - 1

    # The child indentation is the same for every entry of this directory.
    indent_lower = indent_current
    if layer != 0:
        indent_lower += '　　' if is_last else '│　'

    # print the entries, recursing into sub-directories
    for i, p in enumerate(paths):
        entry_is_last = i == last_index
        if os.path.isfile(p):
            branch = '└' if entry_is_last else '├'
            print('{indent}{branch}{filename}'.format(indent=indent_lower, branch=branch, filename=os.path.basename(p)))
        elif os.path.isdir(p):
            tree(p, layer=layer+1, is_last=entry_is_last, indent_current=indent_lower)

In [None]:
# Print the Drive folder tree.
# NOTE(review): this path points at a JPXTokyoStock folder, while
# Config.drive_path is .../kaggle/AmericanExpress; confirm which is intended.
tree('/content/drive/MyDrive/kaggle/JPXTokyoStock')