In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
# データ準備
df = sns.load_dataset('titanic')

# 欠損値削除
df.dropna(inplace=True)

# X,y生成
X = df.loc[:, (df.columns!='survived') & (df.columns!='alive')]
y = df['survived']

# ラベルエンコーディング
# .set_output(transform='pandas')：　oeがnumpyで返るのをpandasにする
from sklearn.preprocessing import OrdinalEncoder
oe = OrdinalEncoder()
cat_cols = X.select_dtypes(exclude=np.number).columns.to_list()
oe.set_output(transform='pandas')
X[cat_cols] = oe.fit_transform(X[cat_cols])

# 学習/テストデータ分割
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[cat_cols] = oe.fit_transform(X[cat_cols])


In [3]:
from sklearn.model_selection import KFold
cv = KFold(n_splits=5, shuffle=True, random_state=0)

In [4]:
for train_idx, test_idx in cv.split(X_train):
    print(train_idx, test_idx)

[  0   1   3   4   5   6   9  11  12  13  14  15  17  18  19  20  21  23
  25  26  27  28  29  31  32  33  34  35  36  37  38  39  41  42  43  44
  45  46  47  49  50  52  53  54  55  56  57  58  60  61  62  63  64  65
  67  68  69  70  72  74  75  76  77  79  80  81  82  83  84  86  87  88
  90  93  94  96  97  99 102 103 104 106 107 108 109 110 111 112 113 114
 115 116 117 118 119 120 121 122 123 124 125] [  2   7   8  10  16  22  24  30  40  48  51  59  66  71  73  78  85  89
  91  92  95  98 100 101 105 126]
[  0   1   2   4   5   7   8   9  10  12  14  15  16  17  19  20  21  22
  23  24  25  28  29  30  31  32  34  35  36  37  38  39  40  41  42  44
  46  47  48  49  51  53  55  56  57  58  59  61  64  65  66  67  69  70
  71  72  73  74  76  77  78  79  80  81  82  83  85  86  87  88  89  90
  91  92  93  95  97  98  99 100 101 102 103 105 106 108 109 111 112 113
 114 115 116 117 118 119 120 122 123 125 126] [  3   6  11  13  18  26  27  33  43  45  50  52  54  60  62  63  68  7

# Section6 stacking

- StackingClasifierCVクラス
    - sklearn.ensemble.StackingClassifierクラスは1層目のモデルの学習の時にCrossValidationを使っていない
    - そのため、CrossValidationに対応するクラスを新しくスクラッチで作る
    - 設計要件
        - 2値分類のみ対応
        - estimator引数: 1層目のモデルのリスト(['model_name', model), ...]
        - final_estimator引数: 2層目のsklearnのモデルインスタンス
        - cv引数: sklearnのcvオブジェクト
        - .fit()および、.predict_probaメソッドを実装

In [11]:
class StackingClassifierCV():
    
    def __init__(self, estimators, final_estimator, cv):
        self.estimators = estimators # [('rf', RandomForestClassifier()), ('knn', KNeighborsCalssifier()), (,), ..]
        self.final_estimator = final_estimator
        self.cv = cv
        
    def fit(self, X, y):
        pred_features = {}
        # 1層目のモデル学習
        for model_name, model in self.estimators:
            preds = []
            new_y = []
            
            for train_idx, val_idx in self.cv.split(X):
                X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
                y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
                model.fit(X_train, y_train)
                pred = model.predict_proba(X_val)[:, 1].tolist()
                preds += pred
                # cv.splitによりXの順番が変わっているので，それに合わせて新しくyを作成する
                new_y += y_val.tolist()
            model.fit(X, y)
            pred_features[model_name] = preds
        
        # 2層目のモデル学習
        new_X = pd.DataFrame(pred_features)
        self.final_estimator.fit(new_X, new_y)
    
    def predict_proba(self, X):
        # 1層目のモデルで特徴量(予測値)生成
        pred_features = {}
        for model_name, model in self.estimators:
            pred = model.predict_proba(X)[:, 1]
            pred_features[model_name] = pred
        
        new_X = pd.DataFrame(pred_features)
        final_pred = self.final_estimator.predict_proba(new_X)
        return final_pred
        

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
cv = KFold(n_splits=5, shuffle=True, random_state=0)
final_estimator = LogisticRegression()
my_stacking = StackingClassifierCV(estimators=[('rf', RandomForestClassifier()), ('knn', KNeighborsClassifier(n_neighbors=5))], 
                                   final_estimator=final_estimator, 
                                   cv=cv)

my_stacking.fit(X_train, y_train)
my_stacking.predict_proba(X_test)

array([[0.44515855, 0.55484145],
       [0.63171789, 0.36828211],
       [0.30593782, 0.69406218],
       [0.20691563, 0.79308437],
       [0.46214456, 0.53785544],
       [0.51421666, 0.48578334],
       [0.56963174, 0.43036826],
       [0.25777583, 0.74222417],
       [0.21484872, 0.78515128],
       [0.17009214, 0.82990786],
       [0.57473886, 0.42526114],
       [0.16634065, 0.83365935],
       [0.21889727, 0.78110273],
       [0.17009214, 0.82990786],
       [0.33181466, 0.66818534],
       [0.52165375, 0.47834625],
       [0.23031403, 0.76968597],
       [0.21484872, 0.78515128],
       [0.23189974, 0.76810026],
       [0.43195548, 0.56804452],
       [0.24050024, 0.75949976],
       [0.37466421, 0.62533579],
       [0.20303108, 0.79696892],
       [0.19495845, 0.80504155],
       [0.26121329, 0.73878671],
       [0.49782919, 0.50217081],
       [0.28364946, 0.71635054],
       [0.25324134, 0.74675866],
       [0.42538882, 0.57461118],
       [0.40800848, 0.59199152],
       [0.

In [13]:
# 一層目のモデル
estimators=[('rf', RandomForestClassifier()), ('knn', KNeighborsClassifier()), ('logistic', LogisticRegression())]
# 二層目のモデル
final_estimator = LogisticRegression()
cv = KFold(n_splits=5, shuffle=True, random_state=0)
stacking_cv = StackingClassifierCV(estimators=estimators,
                                   final_estimator=final_estimator,
                                   cv=cv)
stacking_cv.fit(X_train, y_train)
y_pred_stacking_cv = stacking_cv.predict_proba(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt