In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

In [96]:
#IRIS 데이터 로드
iris = sns.load_dataset('iris')
X = iris.iloc[:,:4]#학습데이터
y = iris.iloc[:,-1] #타겟

In [97]:
def to_dummy(x):
    if x == 'setosa':
        return 0
    elif x=='versicolor':
        return 1
    else:
        return 2
y=y.map(to_dummy)

## split and scaling

In [99]:
#train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 48)

In [100]:
scal = StandardScaler()
X_train = scal.fit_transform(X_train)
X_test = scal.transform(X_test)

In [101]:
# sklearn 
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB     
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

## Logistic, DecisionTree, Kneighbors

In [102]:
lr = LogisticRegression()
lr.fit(X_train,y_train)
pred = lr.predict(X_test)
print('test acurracy : {0:4f}'.format(accuracy_score(y_test, pred)))

test acurracy : 0.900000


In [103]:
dt = DecisionTreeClassifier(random_state = 318)
dt.fit(X_train, y_train)
pred = dt.predict(X_test)
print('test acurracy : {0:4f}'.format(accuracy_score(y_test, pred)))

test acurracy : 0.900000


In [104]:
knn = KNeighborsClassifier(n_neighbors=9)
knn.fit(X_train, y_train)
pred = knn.predict(X_test)
print('test acurracy : {0:4f}'.format(accuracy_score(y_test, pred)))

test acurracy : 0.933333


## Voting

In [105]:
#voting
lr = LogisticRegression()
dt = DecisionTreeClassifier(random_state = 318)
knn = KNeighborsClassifier(n_neighbors=9)

vo_hard = VotingClassifier(estimators=[('lr',lr),('dt',dt),('knn',knn)])
vo_soft = VotingClassifier(estimators=[('lr',lr),('dt',dt),('knn',knn)], voting='soft')

# VotingClassifier 학습/예측/평가
vo_hard.fit(X_train,y_train)
pred_h = vo_hard.predict(X_test)
print('Hard Voting 분류기 정확도: {0:4f}'.format(accuracy_score(y_test, pred_h)))

vo_soft.fit(X_train,y_train)
pred_s = vo_soft.predict(X_test)
print('Soft Voting 분류기 정확도: {0:4f}'.format(accuracy_score(y_test, pred_s)))

Hard Voting 분류기 정확도: 0.933333
Soft Voting 분류기 정확도: 0.900000


## Random Forest

In [106]:
#Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [107]:
rf = RandomForestClassifier(n_estimators=100, max_features=4,
                            n_jobs=-1, oob_score = True)

In [108]:
#cv
cross_val_score(rf,X,y,cv = 5).mean()

0.9666666666666668

In [109]:
# Grid search
params ={
    "n_estimators" : [10, 20, 30, 50, 100],
    "max_features" : [1,2,3,4,5,6,7, 10, 15, 20, 25]
    }

grid = GridSearchCV(estimator = rf, param_grid = params, cv=5, n_jobs=-1)
grid = grid.fit(X, y)

In [110]:
# best score & params
print(grid.best_score_)
print(grid.best_params_)

# oob score
print(grid.best_estimator_.oob_score_)

0.9666666666666668
{'max_features': 1, 'n_estimators': 30}
0.94


## Stacking

In [111]:
#stacking
from xgboost import XGBClassifier 
from vecstack import stacking
from lightgbm import LGBMClassifier

In [112]:
# 개별 모델들
dt = DecisionTreeClassifier(random_state = 318)
rf = RandomForestClassifier(n_estimators=100, max_features=4,
                           n_jobs = -1, oob_score = True)
lr = LogisticRegression()

# 최종 모델
xgb = XGBClassifier(n_estimators=1000, max_depth=4, learning_rate=0.5, nthread=7)

dt.fit(X_train,y_train)
rf.fit(X_train,y_train)
lr.fit(X_train,y_train)

LogisticRegression()

In [113]:
dt_pred = dt.predict(X_test)
rf_pred = rf.predict(X_test)
lr_pred = lr.predict(X_test)

print("dt : {0:.4f}, rf : {1:.4f}, lr : {2:.4f}".format( accuracy_score(y_test, dt_pred),accuracy_score(y_test, rf_pred),accuracy_score(y_test, lr_pred)))

dt : 0.9000, rf : 0.9000, lr : 0.9000


In [114]:
new_data = np.array([dt_pred, rf_pred, lr_pred])
new_data = np.transpose(new_data)
new_data.shape

(30, 3)

In [115]:
xgb.fit(new_data, y_test)
xgb_pred = xgb.predict(new_data)
# 예측값으로 실제값을 예측한다.

print('정확도 : {0:.4f}'.format(accuracy_score(y_test, xgb_pred)))

정확도 : 0.8667
