In [None]:
import pandas as pd
from sklearn import svm, metrics
from sklearn.model_selection import train_test_split

## Bagging
- 單一分類器：https://scikit-learn.org/stable/modules/classes.html#module-sklearn.ensemble
- 多種分類器：https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html#sklearn.ensemble.VotingClassifier

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
# Bagging ensemble that uses k-NN (k=3) as the weak learner.
# The base estimator is passed positionally on purpose: its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, whereas the first positional slot works on every version.
# NOTE(review): train_data / test_data / train_label / test_label are created
# by the train_test_split call in the XGBoost cell further down, so that split
# has to be executed before this cell on a fresh kernel.
bcknn = BaggingClassifier(KNeighborsClassifier(n_neighbors=3),
                          n_estimators=50,bootstrap=True,bootstrap_features=True,
                          max_samples=0.80,max_features=0.80)
bcknn.fit(train_data, train_label)
pred = bcknn.predict(test_data)
# Accuracy of the hold-out predictions; reused below instead of predicting twice.
ac_score = metrics.accuracy_score(test_label, pred)
print("訓練準確度",bcknn.score(train_data,train_label),
      "測試準確度",ac_score)

## Random Forest
- https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier
- 極端隨機樹（Extra Trees）：https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html#sklearn.ensemble.ExtraTreesClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest: 50 trees, entropy as the split criterion.
rfc = RandomForestClassifier(criterion='entropy', n_estimators=50)
rfc.fit(train_data, train_label)

# Hold-out accuracy comes from explicit predictions; training accuracy from score().
pred = rfc.predict(test_data)
ac_score = metrics.accuracy_score(test_label, pred)
train_accuracy = rfc.score(train_data, train_label)

print("Training Random Forest Accurancy : ", train_accuracy)
print("Test Random Forest Accurancy : ", ac_score)

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
# Extra-trees ensemble: 50 estimators, entropy criterion; fit() returns the
# fitted estimator, so construction and training are chained.
etc = ExtraTreesClassifier(criterion='entropy', n_estimators=50).fit(train_data, train_label)

# Only the hold-out accuracy is reported for this model.
pred = etc.predict(test_data)
ac_score = metrics.accuracy_score(test_label, pred)
print("Extra Trees Accurancy : ", ac_score)

## Boosting
- Adaboost: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html#sklearn.ensemble.AdaBoostClassifier
- Gradient Boost: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html#sklearn.ensemble.GradientBoostingClassifier

### Ada Boosting

In [None]:
# Similar in spirit to a random forest, but instead of fully grown trees the
# ensemble is made of stumps (a single split node with two leaves).
from sklearn.ensemble import AdaBoostClassifier

# 200 boosting rounds; fit() returns the fitted estimator itself.
abc = AdaBoostClassifier(n_estimators=200).fit(train_data, train_label)

pred = abc.predict(test_data)
ac_score = metrics.accuracy_score(test_label, pred)
train_accuracy = abc.score(train_data, train_label)

print("Training AdaBoosting Accurancy : ", train_accuracy)
print("Test AdaBoosting Accurancy : ", ac_score)

### GBDT

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting classifier with 200 estimators.
gbc = GradientBoostingClassifier(n_estimators=200)
gbc.fit(train_data, train_label)

# Report accuracy on the training split and on the hold-out split.
train_accuracy = gbc.score(train_data, train_label)
pred = gbc.predict(test_data)
ac_score = metrics.accuracy_score(test_label, pred)

print("Training Gradient Boosting Accurancy : ", train_accuracy)
print("Test Gradient Boosting Accurancy : ", ac_score)

### XGBoost

In [None]:
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler

# xgboost 1.6.1 requires the target y to be label-encoded first.
le = LabelEncoder()
y_le = le.fit_transform(y)

# Three alternative scalings of the same feature matrix X.
# NOTE(review): StandardScaler and MinMaxScaler are fitted on the full X before
# the split, so test-set statistics leak into the training features; consider
# fitting them on the training split only.
Xs = StandardScaler().fit_transform(X)
Xn = Normalizer().fit_transform(X)
Xm = MinMaxScaler().fit_transform(X)

# Every split uses the same test_size and random_state, so y_train / y_test
# are simply re-assigned to the same partition of y_le each time.
Xs_train,Xs_test,y_train,y_test = train_test_split(Xs,y_le,test_size=0.1,random_state=123)
Xn_train,Xn_test,y_train,y_test = train_test_split(Xn,y_le,test_size=0.1,random_state=123)
# Fixed: split the min-max scaled matrix Xm here (Xn was split a second time by
# mistake, which made Xm_train / Xm_test copies of the Normalizer data).
Xm_train,Xm_test,y_train,y_test = train_test_split(Xm,y_le,test_size=0.1,random_state=123)
# Unscaled split with the original labels; the other model cells in this
# notebook read train_data / test_data / train_label / test_label from here.
train_data, test_data, train_label, test_label = train_test_split(X, y,test_size=0.1,random_state=123)

# XGBoost with default hyper-parameters, trained on the standardised features.
xgbc = XGBClassifier()
xgbc.fit(Xs_train,y_train)
print("訓練準確度",xgbc.score(Xs_train,y_train),"測試準確度",xgbc.score(Xs_test,y_test))

## Voting

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Base models. `est` collects the bare estimators (a later cell appends to it);
# `ests` is the list actually handed to the VotingClassifier, with the
# scale-sensitive models wrapped in a StandardScaler pipeline.
est = []
m1 = LogisticRegression(max_iter=3000)
est.append(('Logistic',m1))
m2 = DecisionTreeClassifier()
est.append(('DT',m2))
m3 = SVC()
est.append(('SVC',m3))
ests = [('Logistic', make_pipeline(StandardScaler(),(m1))),
        ('DT', m2),
        ('SVC',make_pipeline(StandardScaler(),(m3)))]

# Baseline: hold-out accuracy of each model on its own (each behind a scaler).
s1=make_pipeline(StandardScaler(),(m1)).fit(train_data,train_label).score(test_data,test_label)
s2=make_pipeline(StandardScaler(),(m2)).fit(train_data,train_label).score(test_data,test_label)
s3=make_pipeline(StandardScaler(),(m3)).fit(train_data,train_label).score(test_data,test_label)
print('----------單一演算法---------')
print('Logisic準確率:{}'.format(s1))
print('DT準確率:{}'.format(s2))
print('SVC準確率:{}'.format(s3))

# Majority vote over the individual classifiers' predictions.
vc = VotingClassifier(ests)
vc.fit(train_data,train_label)
pred = vc.predict(test_data)
# Fixed: the score used to be stored in `sc_score` while `ac_score` was printed,
# so the line below reported a stale accuracy left over from an earlier cell.
ac_score = metrics.accuracy_score(test_label,pred)
print("Voting ensemble Accurancy : ",ac_score)

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Second candidate set built from ensemble models; each one is also appended
# to the running `est` list, and `ests` is rebound to these three.
# The k-NN base estimator is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in
# 1.4); the first positional slot is accepted by every version.
m1 = BaggingClassifier(KNeighborsClassifier(n_neighbors=3),
                       n_estimators=100,bootstrap=True,bootstrap_features=True,
                       max_samples=0.80,max_features=0.80)
est.append(('Bagging_Knn',m1))
m2 = RandomForestClassifier(n_estimators=100,criterion='entropy')
est.append(('RF',m2))
m3 = GradientBoostingClassifier(n_estimators=100)
est.append(('GBDT',m3))
ests = [('Bagging_Knn', m1),('RandomForest', m2),('GBDT',m3)]

## Stacking

In [None]:
from sklearn import model_selection
from sklearn.ensemble import StackingClassifier #0.22
from sklearn.naive_bayes import GaussianNB 

# Level-0 learners: SVC and Gaussian naive Bayes each sit behind a
# StandardScaler; the random forest is given the raw features.
clf1, clf2, clf3 = svm.SVC(), RandomForestClassifier(n_estimators=100), GaussianNB()
ests = [
    ('SVC', make_pipeline(StandardScaler(), clf1)),
    ('rf', clf2),
    ('gNB', make_pipeline(StandardScaler(), clf3)),
]

# Logistic regression is the final estimator that combines the base learners.
lr = LogisticRegression(max_iter=1000)
sclf = StackingClassifier(estimators=ests, final_estimator=lr)

# Train, then leave the hold-out accuracy as the cell's displayed value.
sclf.fit(train_data, train_label)
sclf.score(test_data, test_label)

## Grid Search

In [None]:
#grid search範例
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

# Synthetic binary classification problem: 1000 samples, 10 features of which
# 3 are informative, generated with a fixed seed.
# NOTE(review): this rebinds the notebook-level X and y that the XGBoost cell
# reads, so re-running earlier cells afterwards will use this synthetic data.
X, y = make_classification(n_samples=1000,
                           n_features=10,
                           n_informative=3,
                           n_redundant=0,
                           n_repeated=0,
                           n_classes=2,
                           random_state=0,
                           shuffle=False)

# Estimator whose hyper-parameters are searched; the values given here are
# overridden by whatever the grid supplies for the same parameter.
rfc = RandomForestClassifier(n_jobs=-1,max_features= 'sqrt' ,n_estimators=50, oob_score = True)

# Search space. 'auto' was dropped from max_features: for classifiers it was an
# alias of 'sqrt', it was deprecated in scikit-learn 1.1 and removed in 1.3,
# where it makes every fit with that value fail.
param_grid = {'n_estimators': [10, 100, 200, 300, 400],
              'max_features': ['sqrt', 'log2'],
              'max_samples': [0.7,0.8,0.9]}

# Exhaustive search over the grid with 5-fold cross-validation.
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(X, y)

print(CV_rfc.best_params_)
