### 모형 결합(model combining)

#### 앙상블 방법론(ensemble methods)
- 예측 성능을 향상시키기 위하여 하나의 모형이 아닌 복수의 모형을 결합하는 방법
- 단일 모형을 사용하는 것보다 계산량이 증가하지만 성능이 향상될 수 있음(과적합 방지 등)

#### 취합(aggregation)과 부스팅(boosting)
- 취합: 사용할 모형의 집합을 처음부터 고정
    - 다수결(Majority Voting), 배깅(Bagging), 랜덤포레스트(Random Forest)
- 부스팅: 사용할 모형을 점진적으로 늘려가는 방법
    - 에이다부스트(AdaBoost), 그레디언트 부스트(Gradient Boost)

#### 다수결 방법
- Hard Voting: 단순 투표, 가장 많이 나온 결과를 채택(디폴트)
- Soft Voting: 가중치 투표, 개별 모형의 조건부 확률들을 합한 것들 중 가장 큰 것을 채택

In [2]:
# Load the credit-card transaction CSV file (original, un-resampled data).

import pandas as pd
df = pd.read_csv("Data/creditcard.csv")
# Display the first rows as a quick look at the loaded columns.
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
# Feature columns: every column except the first one and the last one.
train_cols = df.columns[1:-1]
print(train_cols)
X = df[train_cols] # independent variables (features)
y = df["Class"]
# Show how many rows fall into each Class value.
y.value_counts()

Index(['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11',
       'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21',
       'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount'],
      dtype='object')


Class
0    284315
1       492
Name: count, dtype: int64

In [5]:
# Undersampling: resample X / y with RandomUnderSampler (fixed seed).
from imblearn.under_sampling import RandomUnderSampler

sampler = RandomUnderSampler(random_state=0)
X_sample, y_sample = sampler.fit_resample(X, y)

# Re-wrap the resampled features and label as DataFrames and join them
# column-wise so the class counts can be inspected on a single frame.
X_samp = pd.DataFrame(X_sample, columns=train_cols)
y_samp = pd.DataFrame(y_sample, columns=['Class'])
df2 = pd.concat([X_samp, y_samp], axis=1)
df2["Class"].value_counts()

Class
0    492
1    492
Name: count, dtype: int64

In [6]:
# Rebind X / y to the undersampled frames so the following cells use them.
X = X_samp[train_cols] # independent variables (features)
y = y_samp["Class"]

from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; random_state fixes the shuffle.
# NOTE(review): the split is taken from the already-undersampled data, so the
# held-out rows come from the resampled set rather than the original class ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
# Three base classifiers that are later combined by voting.
model1 = LogisticRegression(random_state=1, max_iter=1000)
model2 = DecisionTreeClassifier(random_state=1)
model3 = KNeighborsClassifier(n_neighbors=2)
# VotingClassifier parameters:
#     estimators: the individual models, passed as a list or as a named parameter
#     voting: {hard, soft} selects hard voting or soft voting. Default is hard
# The combined model merges logistic regression, a decision tree and KNN by voting.
# estimators=[(alias, model)]

In [12]:
import numpy as np

# Convert every split to a plain NumPy array before fitting.
X_train, y_train = np.array(X_train), np.array(y_train)
X_test, y_test = np.array(X_test), np.array(y_test)

# Soft-voting ensemble built from the three base models, each under an alias.
voting_members = [('lr', model1), ('tree', model2), ('knn', model3)]
ensemble = VotingClassifier(estimators=voting_members, voting='soft')

# Fit each base model and then the ensemble, reporting train and test accuracy.
for model in (model1, model2, model3, ensemble):
    print(model)
    model.fit(X_train, y_train)
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)
    print("학습용:", train_score)
    print("검증용:", test_score)
    print()

# In general, the voting model performs better than the individual models.

LogisticRegression(max_iter=1000, random_state=1)
학습용: 0.9529860228716646
검증용: 0.9289340101522843

DecisionTreeClassifier(random_state=1)
학습용: 1.0
검증용: 0.9289340101522843

KNeighborsClassifier(n_neighbors=2)
학습용: 0.9428208386277002
검증용: 0.9289340101522843

VotingClassifier(estimators=[('lr',
                              LogisticRegression(max_iter=1000,
                                                 random_state=1)),
                             ('tree', DecisionTreeClassifier(random_state=1)),
                             ('knn', KNeighborsClassifier(n_neighbors=2))],
                 voting='soft')
학습용: 1.0
검증용: 0.9441624365482234



## 배깅
- 같은 모형을 사용하지만, 학습 데이터에서 중복을 허용하여(부트스트랩) 뽑은 서로 다른 샘플로 각각 학습시켜 서로 다른 결과를 출력하는 다수의 모형을 결합하는 방법

In [13]:
import pandas as pd
# Load the customer data set; this rebinds df, replacing the credit-card frame.
df = pd.read_csv("Data/customer.csv")
# Display the first rows as a quick look at the loaded columns.
df.head()

Unnamed: 0,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,Eve Mins,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn
0,0,1,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,0
1,0,1,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,0
2,0,0,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,0
3,1,0,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,0
4,1,0,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,0


In [14]:
# Count the rows for each Churn value to see the class balance.
df["Churn"].value_counts()

Churn
0    2850
1     483
Name: count, dtype: int64

In [15]:

# Feature columns: the first 16 columns of df.
# NOTE(review): 16 is hard-coded; presumably it covers every column before
# "Churn" - confirm against the CSV header.
train_cols = df.columns[0:16]
X = df[train_cols] # independent variables (features)
y = df["Churn"]
# Print the container types and a preview of the features and the label.
print(type(X), type(y))
print(X.head())
print(y.head())

<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'>
   Int'l Plan  VMail Plan  VMail Message  Day Mins  Day Calls  Day Charge  \
0           0           1             25     265.1        110       45.07   
1           0           1             26     161.6        123       27.47   
2           0           0              0     243.4        114       41.38   
3           1           0              0     299.4         71       50.90   
4           1           0              0     166.7        113       28.34   

   Eve Mins  Eve Calls  Eve Charge  Night Mins  Night Calls  Night Charge  \
0     197.4         99       16.78       244.7           91         11.01   
1     195.5        103       16.62       254.4          103         11.45   
2     121.2        110       10.30       162.6          104          7.32   
3      61.9         88        5.26       196.9           89          8.86   
4     148.3        122       12.61       186.9          121          8.41   



In [16]:
# Undersampling: resample X / y with RandomUnderSampler (fixed seed).
from imblearn.under_sampling import RandomUnderSampler

sampler = RandomUnderSampler(random_state=0)
X_sample, y_sample = sampler.fit_resample(X, y)

# Re-wrap the resampled features and label as DataFrames and join them
# column-wise so the class counts can be inspected on a single frame.
X_samp = pd.DataFrame(X_sample, columns=train_cols)
y_samp = pd.DataFrame(y_sample, columns=['Churn'])
df_samp = pd.concat([X_samp, y_samp], axis=1)
df_samp["Churn"].value_counts()

Churn
0    483
1    483
Name: count, dtype: int64

In [17]:
# Rebind X / y to the undersampled frame so the following cells use it.
X = df_samp[train_cols] # independent variables (features)
y = df_samp["Churn"]

from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; random_state fixes the shuffle.
# NOTE(review): the split is taken from the already-undersampled data, so the
# held-out rows come from the resampled set rather than the original class ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

In [18]:
# BaggingClassifier: class for combining models by bagging
#     estimator: the base model (called base_estimator in older scikit-learn
#                releases; passed positionally below, so either name works)
#     n_estimators: number of models. Default 10
#     bootstrap: whether data samples are drawn with replacement. Default True
#     max_samples: number or fraction of the data samples to draw. Default 1.0
#     bootstrap_features: whether feature dimensions are drawn with replacement. Default False
#     max_features: number or fraction of the feature dimensions to draw. Default 1.0

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# model1 / model2 are single-model baselines; model3-5 wrap a tree, KNN and SVC in bagging.
model1=DecisionTreeClassifier(random_state=0)
model2=SVC(kernel='linear')
model3=BaggingClassifier(DecisionTreeClassifier(), n_estimators=100, random_state=0)
model4=BaggingClassifier(KNeighborsClassifier(n_neighbors=2), n_estimators=10, random_state=0)
model5=BaggingClassifier(SVC(), n_estimators=10, random_state=0)

In [19]:
# Fit every candidate and print its accuracy on the training split,
# then on the held-out split, followed by a blank separator line.
for model in (model1, model2, model3, model4, model5):
    print(model)
    model.fit(X_train, y_train)
    for features, labels in ((X_train, y_train), (X_test, y_test)):
        print(model.score(features, labels))
    print()

DecisionTreeClassifier(random_state=0)
1.0
0.845360824742268

SVC(kernel='linear')
0.7707253886010362
0.7422680412371134

BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=100,
                  random_state=0)
1.0
0.8917525773195877

BaggingClassifier(estimator=KNeighborsClassifier(n_neighbors=2), random_state=0)
0.8963730569948186
0.6288659793814433

BaggingClassifier(estimator=SVC(), random_state=0)
0.6904145077720207
0.6701030927835051

