## 学習用

In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, log_loss, mean_squared_error
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
data_all = load_breast_cancer(as_frame=True)
data = data_all["data"]
label = data_all["target"]

In [3]:
train_x, test_x, train_y, test_y = train_test_split(data, label, train_size=0.8)

### 基底クラス

In [4]:
import sys
sys.path.append("../code/models/")
from model import MetaModel

In [5]:
help(MetaModel.__init__)

Help on function __init__ in module model:

__init__(self, name: str, params: dict) -> None
    コンストラクタ
    
    Parameters
    -----
    name : str
        モデルを管理する識別子
    params : dict
        ハイパーパラメータ



In [6]:
help(MetaModel.train)

Help on function train in module model:

train(self, tr_x: pandas.core.frame.DataFrame, tr_y: pandas.core.frame.DataFrame, va_x: pandas.core.frame.DataFrame = None, va_y: pandas.core.frame.DataFrame = None)
    モデルの学習を行う関数
    
    Parameters 
    -----
    tr_x : pd.DataFrame
        学習データの特徴量
    tr_y : pd.DataFrame
        学習データの目的変数
    va_x : pd.DataFrame
        検証データの特徴量
    va_y : pd.DataFrame
        検証データの目的変数



### lightGBM

In [9]:
from __future__ import annotations
import pathlib

import numpy as np
import pandas as pd
import lightgbm as lgb

from model import MetaModel
from utils import Util

class ModelLgb(MetaModel):
    
    """
    LightGBMのラッパークラス

    Attributes
    -----
    name : str
        モデルの識別子
    params : dict
        モデルのハイパーパラメータ
    metric_log : dict
        学習過程を保存する辞書
    """

    def __init__(self, name:str, params:dict):
        """
        初期化処理
        
        Paramters
        -----
        name: str
            モデル名
        params: dict
            モデルの学習に用いるパラメーター
        """
        super().__init__(f"lgb-{name}", params)
        
        self.metric_log = {}
        self.model = None
    
    def train(self, tr_x:pd.DataFrame, tr_y:pd.DataFrame, va_x:pd.DataFrame=None, va_y:pd.DataFrame=None) -> None:
        """
        モデルの学習を行う。
        
        Parameters
        -----
        tr_x, tr_y : pd.DataFrame
            学習用データの特徴量とラベル
        va_x, va_y : pd.DataFrame
            検証用データの特徴量とラベル
            
        Note
        -----
        paramsからモデルのパラメータとしてpopされる
        * num_boost_round 
        * verbose_eval 
        * early_stopping_rounds
        * categorical_feature 
        """
        #初期化
        self.metric_log = {}
        
        if va_x is None and va_y is None: #validationなし
            validate = False
        elif va_x is not None and va_y is not None: #validationあり
            validate = True
        else: #エラー
            raise ValueError("Both va_x and va_y must be None or not None.")
        
        #データの作成
        lgb_tr = lgb.Dataset(tr_x, label=tr_y)
        if validate:
            lgb_va = lgb.Dataset(va_x, label=va_y)
        
        #=====ハイパーパラメータ
        params = dict(self.params)
        #train関数の引数
        num_boost_round = params.pop("num_boost_round")
        verbose_eval = params.pop("verbose_eval") if params.get("verbose_eval") is not None else 10
        early_stopping_rounds= params.pop("early_stopping_rounds") if params.get("early_stopping_rounds") is not None else None
        categorical_feature = params.pop("categorical_feature") if params.get("categorical_feature") is not None else "auto"

        #モデルの訓練
        if validate:
            valid_sets=[lgb_tr, lgb_va]
            valid_names=['train', 'valid']
            
            self.model = lgb.train(
                params
                , train_set=lgb_tr
                , num_boost_round=num_boost_round
                , valid_sets=valid_sets
                , valid_names=valid_names
                , early_stopping_rounds=early_stopping_rounds
                , verbose_eval=verbose_eval
                , evals_result=self.metric_log
            )
            
        else:
            #validationデータの追加
            valid_sets=[lgb_tr]
            valid_names=['train']
            
            self.model = lgb.train(
                params
                , train_set=lgb_tr
                , num_boost_round=num_boost_round
                , valid_sets=valid_sets
                , valid_names=valid_names
                , verbose_eval=verbose_eval
                , evals_result=self.metric_log
            )
        
    def predict(self, test_x: pd.DataFrame) -> np.ndarray:
        """
        予測を行う
        
        Parameters
        -----
        test_x : pd.DataFrame
            予測する特徴量データ
            
        Returns
        predict_val : np.ndarray
            モデルによる予測値
        """
        predict_val = self.model.predict(test_x, num_iteration=self.model.best_iteration)
        
        return predict_val
    
    def save_model(self) -> None:
        """
        モデルを保存する
        
        Note
        -----
        best_iterationを残すためにpkl形式で保存
        """
        model_path = pathlib.Path(f"../model/{self.name}.model")
        #pickleで保存
        Util.dump(self, model_path)
    
    @classmethod
    def load_model(cls, path:pathlib.Path) -> __class__:
        """
        モデルを読み込む
        
        Parameters
        -----
        path : pathlib.Path or str
            path of model file
            
        Returns
        -----
        model : ModelLgb
            指定されたモデルファイルを読み込んだもの
        """
        cls = Util.load(path)
        return cls
   
    def feature_importance(self, importance_type:str="gain") -> pd.DataFrame:
        """
        モデルのfeature importanceを返却する

        Parameters
        -----
        importance_type : str, optional(default="gain")
            importanceの種類
        
        Returns
        -----
        df_importance :pd.DataFrame
            importanceの値
        """
        feature_name = self.model.feature_name()
        feature_importance = self.model.feature_importance(importance_type=importance_type, iteration=self.model.best_iteration)

        #DataFrameに変換
        df_importance = pd.DataFrame(
            [feature_name, feature_importance]
            , index=["feature_name", importance_type]
        ).T
        
        return df_importance

In [7]:
from model_lgb import ModelLgb

In [16]:
default_params = {
    "boosting_type":"gbdt"
    , "objective":"binary"
    , "metric":"binary_logloss"
    , "max_depth":4
    , "num_leaves":15
    , "colsample_bytree":1.0 
    , "feature_fraction_bynode":1.0
    , "subsample":0.9 
    , "min_child_weight":1 
    , "lambda_l1":0 #reg_alpha(L1)
    , "lambda_l2":1 #reg_lambda(L2)
    , "learning_rate":0.1
    , "verbosity":-1
    , "verbose_eval":0
}

training_params = {
    "num_boost_round":100
    , "early_stopping_rounds":5
    , "verbose_eval":10
}

params = default_params.copy()
params.update(training_params)

In [17]:
lgb_clf = ModelLgb("test_lgbm", params)

In [18]:
lgb_clf.train(train_x, train_y, test_x, test_y)

Training until validation scores don't improve for 5 rounds
[10]	train's binary_logloss: 0.238768	valid's binary_logloss: 0.294341
[20]	train's binary_logloss: 0.117775	valid's binary_logloss: 0.189338
[30]	train's binary_logloss: 0.0655705	valid's binary_logloss: 0.152034
[40]	train's binary_logloss: 0.0406951	valid's binary_logloss: 0.14537
Early stopping, best iteration is:
[41]	train's binary_logloss: 0.0389491	valid's binary_logloss: 0.143528


In [30]:
from __future__ import annotations
import pathlib
from collections import defaultdict

import numpy as np
import pandas as pd
import xgboost as xgb

from model import MetaModel
from utils import Util

class ModelXgb(MetaModel):
    
    """
    XGBoostのラッパークラス

    Attributes
    -----
    name : str
        モデルの識別子
    params : dict
        モデルのハイパーパラメータ
    metric_log : dict
        学習過程を保存する辞書
    """

    def __init__(self, name:str, params:dict) -> None:
        """
        初期化処理
        
        Paramters
        -----
        name: str
            モデル名
        params: dict
            モデルの学習に用いるパラメーター
        """
        super().__init__(f"lgb-{name}", params)
        
        self.metric_log = {}
        self.model = None
    
    def train(self, tr_x:pd.DataFrame, tr_y:pd.DataFrame, va_x:pd.DataFrame=None, va_y:pd.DataFrame=None) -> None:
        """
        モデルの学習を行う。
        
        Parameters
        -----
        tr_x, tr_y : pd.DataFrame
            学習用データの特徴量とラベル
        va_x, va_y : pd.DataFrame
            検証用データの特徴量とラベル
            
        Note
        -----
        paramsからモデルのパラメータとしてpopされる
        * num_boost_round 
        * verbose_eval 
        * early_stopping_rounds
        * categorical_feature 
        """
        #初期化
        self.metric_log = {}
        
        if va_x is None and va_y is None: #validationなし
            validate = False
        elif va_x is not None and va_y is not None: #validationあり
            validate = True
        else: #エラー
            raise ValueError("Both va_x and va_y must be None or not None.")
        
        #データの作成
        dtrain = xgb.DMatrix(tr_x, label=tr_y)
        if validate:
            dvalid = xgb.DMatrix(va_x, label=va_y)
        
        #=====ハイパーパラメータ
        params = dict(self.params)
        #train関数の引数
        num_boost_round = params.pop("num_boost_round")
        verbose_eval = params.pop("verbose_eval") if params.get("verbose_eval") is not None else 10
        early_stopping_rounds= params.pop("early_stopping_rounds") if params.get("early_stopping_rounds") is not None else None

        #モデルの訓練
        if validate:
            evals = [(dtrain, "train"), (dvalid, "valid")]
            
            self.model = xgb.train(
                params
                , dtrain
                , num_boost_round=num_boost_round
                , evals=evals
                , early_stopping_rounds=early_stopping_rounds
                , verbose_eval = verbose_eval
                , evals_result=self.metric_log
            )
            
        else:
            evals = [(dtrain, "train")]
            
            self.model = xgb.train(
                params
                , dtrain
                , num_boost_round=num_boost_round
                , evals=evals
                , verbose_eval = verbose_eval
                , evals_result=self.metric_log
            )
        
    def predict(self, test_x: pd.DataFrame) -> np.ndarray:
        """
        予測を行う
        
        Parameters
        -----
        test_x : pd.DataFrame
            予測する特徴量データ
            
        Returns
        predict_val : np.ndarray
            モデルによる予測値
        """
        dtest = xgb.DMatrix(test_x)
        predict_val = self.model.predict(dtest, ntree_limit=self.model.best_iteration)
        
        return predict_val
    
    def save_model(self) -> None:
        """
        モデルを保存する
        
        Note
        -----
        best_iterationを残すためにpkl形式で保存
        """
        model_path = pathlib.Path(f"../model/{self.name}.model")
        #pickleで保存
        Util.dump(self, model_path)
    
    @classmethod
    def load_model(cls, path:pathlib.Path) -> __class__:
        """
        モデルを読み込む
        
        Parameters
        -----
        path : pathlib.Path or str
            path of model file
            
        Returns
        -----
        model : ModelXgb
            指定されたモデルファイルを読み込んだもの
        """
        cls = Util.load(path)
        return cls
   
    def feature_importance(self, importance_type:str="gain") -> pd.DataFrame:
        """
        モデルのfeature importanceを返却する

        Parameters
        -----
        importance_type : str, optional(default="gain")
            importanceの種類
        
        Returns
        -----
        df_importance :pd.DataFrame
            importanceの値
        """
        f_importance_map = defaultdict(int)
        f_importance_map.update(self.model.get_score(importance_type=importance_type))

        #DataFrameに変換
        df_importance = pd.DataFrame(
            [[col, f_importance_map[col]] for col in self.model.feature_names]
            , columns=["feature_name", importance_type]
        )
        return df_importance



In [34]:
default_params = {
    "boosting_type":"gbdt"
    , "objective":"binary:logistic"
    , "metric":"logloss"
    , "eta":0.1
    , "max_depth":4
    , "min_child_weight":1
    , "colsample_bytree":1.0 
    , "colsample_bylevel":0.3
    , "subsample":0.9 
    , "gamma":0
    , "lambda":1 #reg_lambda(L2)
    , "alpha":0 #reg_alpha(L1)
    , "verbosity":0
}

training_params = {
    #"num_boost_round":100
    ,"early_stopping_rounds":5
    , "verbose_eval":10
}

params = default_params.copy()
params.update(training_params)

SyntaxError: invalid syntax (<ipython-input-34-1da2e2e26174>, line 19)

In [32]:
xgb_clf = ModelXgb("test_xgb", params)

In [33]:
xgb_clf.train(train_x, train_y, test_x, test_y)

[0]	train-logloss:0.61280	valid-logloss:0.61592
[10]	train-logloss:0.22426	valid-logloss:0.26849
[20]	train-logloss:0.10508	valid-logloss:0.17805
[30]	train-logloss:0.05847	valid-logloss:0.14732
[40]	train-logloss:0.03777	valid-logloss:0.13806
[50]	train-logloss:0.02723	valid-logloss:0.13571
[51]	train-logloss:0.02645	valid-logloss:0.13522
