In [1]:
import numpy as np
import os

In [2]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

#### 데이터 스플릿 해야 아래쪽에서 호출해서 사용할 수 있음!

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=500, noise=0.30, random_state=17)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=17)


In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

log_clf = LogisticRegression(solver="liblinear", random_state=17)
rnd_clf = RandomForestClassifier(n_estimators=10, random_state=17)
svm_clf = SVC(gamma="auto", random_state=17)

voting_clf = VotingClassifier(estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)], voting='hard')

In [15]:
from sklearn.metrics import accuracy_score

for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.8
RandomForestClassifier 0.864
SVC 0.848
VotingClassifier 0.84


#### 배깅과 페이스팅

#### 배깅!

In [16]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(random_state=17), 
    n_estimators=500,
    max_samples=100, 
    bootstrap=True, 
    n_jobs=-1, # 훈련과 예측에 사용할 cpu 코어수를 지정. -1이면 모든 코어 사용.
    random_state=17
)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

#### 랜덤 포레스트

#### RandomForestClassifier 사용

In [18]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1, random_state=17)
rnd_clf.fit(X_train, y_train)

y_pred_rf = rnd_clf.predict(X_test)

#### BaggingClassifier 사용

In [19]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(splitter="random", max_leaf_nodes=16, random_state=42),
    n_estimators=500, max_samples=1.0, bootstrap=True, n_jobs=-1, random_state=42)


#### 아다부스트

In [20]:
from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200,
    algorithm="SAMME.R", learning_rate=0.5, random_state=17)

ada_clf.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=0.5, n_estimators=200, random_state=17)

#### 그래디언트 부스팅

#### DecisionTreeRegressor를 훈련세트에 학습

In [22]:
from sklearn.tree import DecisionTreeRegressor

tree_reg1 = DecisionTreeRegressor(max_depth=2, random_state=17)
tree_reg1.fit(X, y)


DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=17, splitter='best')

#### 첫번째 예측기에서 생긴 잔여 오차에 두번째 DecisionTreeRegressor를 학습


In [24]:
y2 = y - tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth=2, random_state=17)
tree_reg2.fit(X, y2)


DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=17, splitter='best')

#### 두번째 예측기가 만든 잔여 오차에 세번째 회귀모형을 훈련


In [25]:
y3 = y2 - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2, random_state=17)
tree_reg3.fit(X, y3)

DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=17, splitter='best')

#### 위에서 만든 모든 트리의 예측을 더해서 새로운 샘플에 대한 예측 생성

In [29]:
X = np.random.rand(100, 1) - 0.5


In [30]:
X

array([[ 0.19871389],
       [ 0.4451319 ],
       [ 0.47078428],
       [ 0.21209839],
       [-0.09621507],
       [ 0.20085806],
       [-0.12902413],
       [-0.17876548],
       [-0.38144876],
       [-0.24331437],
       [-0.15910826],
       [-0.41891445],
       [ 0.48171003],
       [ 0.2451375 ],
       [ 0.42136146],
       [ 0.33520922],
       [ 0.05155667],
       [-0.0513994 ],
       [ 0.3960739 ],
       [ 0.1535026 ],
       [-0.10859481],
       [-0.2353918 ],
       [-0.028911  ],
       [-0.00484537],
       [ 0.44524438],
       [ 0.00576999],
       [ 0.1717017 ],
       [-0.29730957],
       [-0.07427828],
       [-0.34786088],
       [-0.40990521],
       [ 0.11112362],
       [ 0.43857651],
       [-0.46490636],
       [ 0.42767332],
       [ 0.10809301],
       [ 0.46755414],
       [-0.14814398],
       [-0.33163015],
       [ 0.17917822],
       [-0.18403225],
       [ 0.49845487],
       [ 0.08494556],
       [-0.06031211],
       [-0.40722134],
       [-0

In [27]:
X_new = np.array([[0.8]])


In [28]:
y_pred = sum(tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3))

ValueError: Number of features of the model must match the input. Model n_features is 2 and input n_features is 1 