<a href="https://colab.research.google.com/github/satogen/datascience_tips/blob/main/models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import seaborn as sns
import numpy as np
import pandas as pd
 
# データ可視化ライブラリ
import matplotlib.pyplot as plt
%matplotlib inline 
 
 
# LightGBM
import lightgbm as lgb
 
# Scikit-learn（評価算出）
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split


# データの読み込み

In [2]:
multiclass_df = sns.load_dataset('iris')
multiclass_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [16]:
print(multiclass_df["species"].unique())
print(multiclass_df["species"].nunique())

['setosa' 'versicolor' 'virginica']
3


In [17]:
le = preprocessing.LabelEncoder()
multiclass_df["target"] = le.fit_transform(multiclass_df["species"])

In [18]:
train, test = train_test_split(multiclass_df, test_size=0.3, random_state=0)
print(train.shape)
print(test.shape)

(105, 6)
(45, 6)


In [19]:
train_X = train.drop(["target","species"], axis=1)
test_X = test.drop(["target","species"], axis=1)
train_y = train["target"]
test_y = test["target"]

In [20]:
train_y.nunique()

3

## クロスバリデーション

In [21]:
FOLD = 5
kf = KFold(n_splits=FOLD, shuffle=True, random_state=42)

In [58]:
# 学習データに対する「目的変数を知らない」予測値と、テストデータに対する予測値を返す関数
def predict_cv(model, train_x, train_y, test_x, cv, metrics=None):
    preds = []
    preds_test = []
    va_idxes = []

    # クロスバリデーションで学習・予測を行い、予測値とインデックスを保存する
    for i, (tr_idx, va_idx) in enumerate(cv.split(train_x)):
      # データの作成
        tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
        tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
        # モデルの学習
        model.fit(tr_x, tr_y, va_x, va_y)

        # 予測結果の格納
        pred = model.predict(va_x)
        preds.append(pred)

        # 精度検証
        if metrics:
          metrics(va_y, pred)

        # テストデータに対する予測
        pred_test = model.predict(test_x)
        preds_test.append(pred_test)
        va_idxes.append(va_idx)

    # バリデーションデータに対する予測値を連結し、その後元の順序に並べ直す
    va_idxes = np.concatenate(va_idxes)
    preds = np.concatenate(preds, axis=0)
    order = np.argsort(va_idxes)
    pred_train = preds[order]

    # テストデータに対する予測値の平均をとる
    preds_test = np.mean(preds_test, axis=0)

    return pred_train, preds_test


def metrics_log_loss(y, y_pred):
   # 評価関数　タスクによって変更
   score = log_loss(y, y_pred)
   print(f'Metrics: {score}')

## LightGBM

各ハイパーパラメータの調整方法は、こちらを参考
- LightGBM徹底入門  5. LightGBMのハイパーパラメータ
https://www.codexa.net/lightgbm-beginner/

In [26]:
# lightGBM
class ModelLightGBM:

    def __init__(self, params, round, verbose=-1):
        self.model = None
        self.params = params
        self.num_round = round
        self.verbose = verbose

    def fit(self, X_train, y_train, X_valid, y_valid):
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_eval = lgb.Dataset(X_valid, y_valid)

        # モデルの学習
        self.model = lgb.train(
            self.params,
            lgb_train,
            valid_sets=lgb_eval,
            num_boost_round=self.num_round,
            verbose_eval=self.verbose
        )

    def predict(self, x):
        pred = self.model.predict(x)
        return pred

In [59]:
NUM_ROUND = 100
VERBOSE_EVAL = -1

# objective 
## 下記ドキュメントのobjectiveに対応タスクと対応Lossの記述あり
## https://lightgbm.readthedocs.io/en/latest/Parameters.html

params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': train_y.nunique(),
    'verbose': -1,
    'early_stopping_round':10

}

model_1a = ModelLightGBM(params, NUM_ROUND)

pred_train_1a, pred_test_1a = predict_cv(model_1a, train_X, train_y, test_X, kf, metrics_log_loss)



Training until validation scores don't improve for 10 rounds.
Early stopping, best iteration is:
[39]	valid_0's multi_logloss: 0.25806
Metrics: 0.2580598411678388




Training until validation scores don't improve for 10 rounds.
Early stopping, best iteration is:
[50]	valid_0's multi_logloss: 0.175532
Metrics: 0.17553174699769022




Training until validation scores don't improve for 10 rounds.
Did not meet early stopping. Best iteration is:
[100]	valid_0's multi_logloss: 0.0372936
Metrics: 0.03729356884586127
Training until validation scores don't improve for 10 rounds.




Did not meet early stopping. Best iteration is:
[100]	valid_0's multi_logloss: 0.00948126
Metrics: 0.009481257835771584
Training until validation scores don't improve for 10 rounds.
Early stopping, best iteration is:
[47]	valid_0's multi_logloss: 0.19775
Metrics: 0.19774960680057024




## CatBoost

- 初心者がCatBoostで分類・回帰できるようになるまで
https://ryucoding.com/programming/catboost-beginner

In [36]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.0.6-cp37-none-manylinux1_x86_64.whl (76.6 MB)
[K     |████████████████████████████████| 76.6 MB 1.2 MB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.6


In [47]:
from catboost import CatBoost, CatBoostRegressor, CatBoostClassifier
from catboost import Pool

# CatBoost
class ModelCatBoost:

    def __init__(self, params, category_features = None):
        self.params = params
        self.model = CatBoost(self.params)
        self.category_features = category_features

    def fit(self, X_train, y_train, X_valid, y_valid):
        if self.category_features:
          ctb_train = Pool(X_train, label=y_train,cat_features=self.category_features)  
          ctb_eval  = Pool(X_valid, label=y_valid,cat_features=self.category_features)  
        else:
          ctb_train = Pool(X_train, label=y_train)  
          ctb_eval  = Pool(X_valid, label=y_valid)  

        self.model.fit(ctb_train)

    def predict(self, x):
        pred = self.model.predict(x)
        return pred

In [60]:
# 指定できるハイパーパラメータはドキュメントから確認
# https://catboost.ai/en/docs/concepts/python-reference_catboost

# loss_functionで学習タスクが決定
# 指定できるタスクはこちら
# https://catboost.ai/en/docs/concepts/loss-functions

params = {
    'loss_function': 'MultiClass',
    'iterations': NUM_ROUND,   
    'verbose': 0, # catboostでは学習経過を出したくない場合は0を指定
     'early_stopping_rounds':10
}

model_1a = ModelCatBoost(params)

pred_train_1a, pred_test_1a = predict_cv(model_1a, train_X, train_y, test_X, kf, metrics_log_loss)

Metrics: 4.934110913558671
Metrics: 1.644703637852891
Metrics: 0.0630850283920817
Metrics: 2.1094237467877994e-15
Metrics: 5.080107250820421


## XGBoost

In [51]:
import xgboost as xgb

In [54]:
# xgboostによるモデル
class ModelXgboost:

    def __init__(self, params, num_round=1000):
        self.model = None
        self.params = params
        self.num_round = num_round

    def fit(self, tr_x, tr_y, va_x, va_y):
        # params = {'objective': 'binary:logistic', 'silent': 1, 'random_state': 71,
        #           'eval_metric': 'logloss'}
        dtrain = xgb.DMatrix(tr_x, label=tr_y)
        dvalid = xgb.DMatrix(va_x, label=va_y)

        # xgb.trainのパラメータ一覧
        # https://xgboost.readthedocs.io/en/stable/python/python_api.html#module-xgboost.training
        watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
        self.model = xgb.train(self.params, dtrain, self.num_round, evals=watchlist)

    def predict(self, x):
        data = xgb.DMatrix(x)
        # 確率を出したい時は、predict_proba
        # https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.XGBClassifier.predict_proba
        pred = self.model.predict(data)
        return pred

In [None]:
# 指定できるハイパーパラメータはドキュメントから確認
# https://xgboost.readthedocs.io/en/stable/parameter.html

# objectiveで学習タスクが決定
# 指定できるタスクはこちら
# https://xgboost.readthedocs.io/en/stable/parameter.html#learning-task-parameters

params = {'objective': 'multi:softmax', 
          'num_class': train_y.nunique(),
          'verbosity': 0, 'random_state': 71}

model_1a = ModelXgboost(params)

pred_train_1a, pred_test_1a = predict_cv(model_1a, train_X, train_y, test_X, kf)