In [148]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so any convergence or deprecation
# warnings raised by the models below will be hidden as well — confirm that
# this is intended.
warnings.filterwarnings('ignore')

In [149]:
import numpy as np
import pandas as pd

In [150]:
from xgboost import XGBClassifier
from lightgbm.sklearn import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier

In [151]:
# Load the feature matrix and the label vector from the two .npy files
# (presumably the pre-processed Titanic training data, judging by the file names).
X, y = (
    np.load(npy_path)
    for npy_path in ("./tatanic_X_train.npy", "./tatanic_y_train.npy")
)

In [152]:
# Shared seed so that every stochastic learner gives the same result on a
# fresh "Restart Kernel -> Run All".
RANDOM_STATE = 42

# Five heterogeneous base learners used for the stacking experiment below.
# Gradient-boosted trees (XGBoost / LightGBM): shallow trees, high learning rate.
estimator1 = XGBClassifier(max_depth=3, learning_rate=0.5, n_estimators=50, n_jobs=-1, random_state=RANDOM_STATE)
estimator2 = LGBMClassifier(max_depth=2, learning_rate=0.5, n_estimators=50, n_jobs=-1, random_state=RANDOM_STATE)
# Bagged shallow trees.
estimator3 = RandomForestClassifier(n_estimators=500, max_depth=3, n_jobs=-1, random_state=RANDOM_STATE)
# Linear SVM; note that it exposes no predict_proba, only predict / decision_function.
estimator4 = LinearSVC(random_state=RANDOM_STATE)
# Deep learning: multi-layer perceptron with three hidden layers.
estimator5 = MLPClassifier(hidden_layer_sizes=(512, 256, 32), random_state=RANDOM_STATE)

In [153]:
# Keep the base learners in one list so they can be fitted and scored in loops.
# The order matters: later cells index into this list by position.
base_estimators = [
    estimator1,
    estimator2,
    estimator3,
    estimator4,
    estimator5,
]

In [154]:
from sklearn.model_selection import train_test_split

In [155]:
# Split the dataset: 60% to fit the base learners, 40% held out.
# The held-out part is what the later cells turn into meta-level features.
# A fixed random_state makes the split (and every score derived from it)
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((533, 27), (356, 27), (533,), (356,))

In [156]:
# Train every base learner on the training portion of the split.
for estimator in base_estimators:
    estimator.fit(X_train, y_train)

In [157]:
# Class-probability estimates of the first base estimator on the held-out
# split: one row per sample, one column per class.
# NOTE(review): this displays the whole array; slicing it (e.g. `[:5]`) would
# keep the notebook output short.
base_estimators[0].predict_proba(X_test)

array([[2.85332203e-01, 7.14667797e-01],
       [8.97660255e-02, 9.10233974e-01],
       [8.17856789e-01, 1.82143226e-01],
       [9.35947180e-01, 6.40528426e-02],
       [9.68898475e-01, 3.11015267e-02],
       [8.44986141e-01, 1.55013844e-01],
       [9.69114244e-01, 3.08857746e-02],
       [5.95970035e-01, 4.04029936e-01],
       [9.17286754e-01, 8.27132240e-02],
       [3.60165477e-01, 6.39834523e-01],
       [9.73206580e-01, 2.67933942e-02],
       [6.01920545e-01, 3.98079455e-01],
       [9.11339760e-01, 8.86602476e-02],
       [8.17659140e-01, 1.82340845e-01],
       [9.94729459e-01, 5.27052581e-03],
       [4.66391921e-01, 5.33608079e-01],
       [7.98139751e-01, 2.01860249e-01],
       [4.43748236e-02, 9.55625176e-01],
       [9.35947180e-01, 6.40528426e-02],
       [9.85735893e-01, 1.42640891e-02],
       [9.89826560e-01, 1.01734409e-02],
       [5.63326478e-02, 9.43667352e-01],
       [5.80734015e-03, 9.94192660e-01],
       [8.47778797e-01, 1.52221233e-01],
       [8.687052

In [158]:
# Meta-features, variant 1: the predicted label of each base learner on the
# held-out split becomes one feature column (rows = samples, columns = models).
base_label_predictions = [model.predict(X_test) for model in base_estimators]
meta_train_set = np.array(base_label_predictions).T

In [159]:
meta_train_set  # each model's predictions form one feature column

array([[1., 1., 0., 1., 1.],
       [1., 1., 1., 1., 1.],
       [0., 0., 0., 0., 0.],
       ...,
       [0., 1., 0., 1., 1.],
       [0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1.]])

In [160]:
from sklearn.model_selection import cross_val_score

In [161]:
# Use each model type in turn as the meta-learner: 5-fold cross-validated
# accuracy on the label-prediction features, with the held-out labels as target.
for estimator in base_estimators:
    fold_scores = cross_val_score(estimator, meta_train_set, y_test, scoring="accuracy", cv=5)
    result = fold_scores.mean()
    print(result)

0.8255689693717863
0.8452503912363067
0.8339816677844846
0.8452906326849987
0.8313223787167449


In [162]:
# Meta-features, variant 2: use predicted probabilities instead of hard labels.
# Column 1 of predict_proba (the second class) of each base learner on the
# held-out split becomes one feature column.
# Estimators without predict_proba (LinearSVC here) are skipped by capability
# check rather than by hard-coded list positions, so reordering or extending
# base_estimators cannot silently select the wrong models.
proba_estimators = [model for model in base_estimators if hasattr(model, "predict_proba")]
meta_train_set2 = np.array([model.predict_proba(X_test)[:, 1] for model in proba_estimators]).T

In [163]:
# Sanity check: one row per held-out sample, one column per estimator that
# contributed a probability.
meta_train_set2.shape

(356, 4)

In [164]:
# Same meta-learner comparison as before, but on the probability-based
# meta-features: mean 5-fold accuracy per model type.
for estimator in base_estimators:
    cv_accuracies = cross_val_score(
        estimator,
        meta_train_set2,
        y_test,
        scoring="accuracy",
        cv=5,
    )
    result = cv_accuracies.mean()
    print(result)

0.7976738207019898
0.8060473954840152
0.833943661971831
0.8339425441538116
0.8368388106416276


In [165]:
# Append the extracted probability columns to the original held-out features
# to build a wider training matrix, then try predicting on that.
new_X_test = np.hstack((X_test, meta_train_set2))
new_X_test.shape

(356, 31)

In [166]:
# Mean 5-fold accuracy of each model type on the augmented matrix
# (original features plus probability meta-features).
for estimator in base_estimators:
    augmented_scores = cross_val_score(estimator, new_X_test, y_test, scoring="accuracy", cv=5)
    result = augmented_scores.mean()
    print(result)

0.8088240554437738
0.8228705566733735
0.8480695282807957
0.8537402190923318
0.8592957746478873
