# Day 5 Workshop

# 0. 문제정의
* 0-1. 문제(태스크) 정의: 
    - Human activity 데이터로 Human activity를 분류한다. ('Activity' 열이 타겟레이블)
    - Pipeline, GridSearchCV활용할 것
* 0-2. 평가지표설정:
    - 분류 정확도, 목표치는 90%

In [25]:
# 모듈 임포트 및 설정
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC, SVC

pd.set_option("max_colwidth",None)

# 1. 데이터수집

In [2]:
train = pd.read_csv('datasets/human_activity_train.csv')
test = pd.read_csv('datasets/human_activity_test.csv')

train.head()

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",subject,Activity
0,0.288585,-0.020294,-0.132905,-0.995279,-0.983111,-0.913526,-0.995112,-0.983185,-0.923527,-0.934724,...,-0.710304,-0.112754,0.0304,-0.464761,-0.018446,-0.841247,0.179941,-0.058627,1,STANDING
1,0.278419,-0.016411,-0.12352,-0.998245,-0.9753,-0.960322,-0.998807,-0.974914,-0.957686,-0.943068,...,-0.861499,0.053477,-0.007435,-0.732626,0.703511,-0.844788,0.180289,-0.054317,1,STANDING
2,0.279653,-0.019467,-0.113462,-0.99538,-0.967187,-0.978944,-0.99652,-0.963668,-0.977469,-0.938692,...,-0.760104,-0.118559,0.177899,0.100699,0.808529,-0.848933,0.180637,-0.049118,1,STANDING
3,0.279174,-0.026201,-0.123283,-0.996091,-0.983403,-0.990675,-0.997099,-0.98275,-0.989302,-0.938692,...,-0.482845,-0.036788,-0.012892,0.640011,-0.485366,-0.848649,0.181935,-0.047663,1,STANDING
4,0.276629,-0.01657,-0.115362,-0.998139,-0.980817,-0.990482,-0.998321,-0.979672,-0.990441,-0.942469,...,-0.699205,0.12332,0.122542,0.693578,-0.615971,-0.847865,0.185151,-0.043892,1,STANDING


# 2. 데이터 탐색

* Day3 워크샵에서와 동일하므로 describe(), isnull(), plotting 등의 과정은 생략함.
* 'subject'이전까지는 수치 데이터이고,'subject'열은 피실험자 ID를 의미하는 범주형임. 훈련셋과 시험셋에서 겹치는 피실험자 ID가 없음에 유의.
* 'Activity' 열이 타겟레이블임.

# 3-1. 데이터 준비

In [3]:
X_train = train.loc[:, :'subject']
X_test = test.loc[:, :'subject']
y_train = train['Activity']
y_test = test['Activity']

# 3-2. 데이터 전처리 (파이프라인 이용)

In [4]:
class DataFrameSelector():
    def __init__(self, attribs):
        self.attribs = attribs
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X.loc[:, self.attribs] ### 의미있는 변환을 하는 부분
    
    def fit_transform(self, X, y=None):
        return self.transform(X, y)

In [5]:
# 이상치 처리 해주는 사용자 정의 변환기 by Sulhee Baek

class MyOutlierTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        self.q1 = np.quantile(X, 0.25, axis=0)        
        self.q3 = np.quantile(X, 0.75, axis=0)
#         print(self.q1.shape, self.q3.shape)
        self.iqr = self.q3 - self.q1
        self.upper = self.q3 + 1.5*self.iqr
        self.lower = self.q1 - 1.5*self.iqr
        return self
    
    def transform(self, X, y=None):
        X_ = np.where(X > self.upper, self.upper, X)
        X_ =  np.where(X_ < self.lower, self.lower, X_)
        self.X = X_
        return self.X

In [6]:
# 사용자 정의 한 MyOutlinerTransformer() 클래스 기능 확인한다.
# fit()과 transform() 메소드를 호출하여 Pipeline에 적용 가능함을 확인

out = MyOutlierTransformer()
out.fit(train.iloc[:,:5])
out_transformed = out.transform(train.iloc[:,:5])
print(type(out_transformed), out_transformed.shape)

<class 'numpy.ndarray'> (7352, 5)


수치형과 범주형 속성을 각각 전처리 후 합집합으로 만드는 FeatureUnion을 활용하여 full_pipeline을 구성함.

In [8]:
num_columns = train.columns[:-2] # 수치형 속성 목록
cat_columns = ['subject']        # 범주형 속성 목록

num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_columns)), # 수치속성 선택
        ('imputer', KNNImputer(n_neighbors=7)),       # 누락값 처리
        ('outlier', MyOutlierTransformer()), # 이상치 처리
        ('scaler', StandardScaler()),       # 특성 스케일링
])

cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_columns)),
        ('one_hot_encoder', OneHotEncoder(sparse=False, handle_unknown='ignore')),
])

full_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline),
])

full_pipeline.fit(X_train)
X_train_prep = full_pipeline.transform(X_train)
X_test_prep = full_pipeline.transform(X_test)

LabelEncoder()를 이용하여 타겟레이블을 수치형으로 변환

In [9]:
# LabelEncoder()를 이용하여 타겟레이블을 수치형으로 변환, 1D -> 1D
print(y_train[:5]) # 타겟레이블이 문자열임을 확인

le = LabelEncoder()
le.fit(y_train)
y_train_prep = le.transform(y_train)
y_test_prep = le.transform(y_test)

print(y_train_prep[:5]) # 타겟레이블이 수치형으로 변환됨을 확인

0    STANDING
1    STANDING
2    STANDING
3    STANDING
4    STANDING
Name: Activity, dtype: object
[2 2 2 2 2]


# 4. 머신러닝 알고리즘으로 모델 학습

## 4-0. 템플릿 파이프라인 준비: prep 후 clf

In [13]:
pipeClassifier = Pipeline(steps=[
            ('prep', full_pipeline), 
            ('clf', LogisticRegression(multi_class='multinomial', solver='lgfgs')),
])

## 4-1. Gaussian NB 알고리즘 [참고]
* priors 파라미터를 이용 사전 확률을 계산 후 입력하는 예제

In [14]:
prior_probabilities = (train['Activity'].value_counts() / train['Activity'].value_counts().sum()).values
print('따로 계산한 사전 확률: ', prior_probabilities)

param_grid = [
    {'prep__num_pipeline__outlier':[None, MyOutlierTransformer()],
        'prep__num_pipeline__scaler':[None],
     'clf':[GaussianNB()],
     'clf__priors':[prior_probabilities, None]},
]
grid_search = GridSearchCV(pipeClassifier, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

따로 계산한 사전 확률:  [0.1913765  0.18688792 0.17491839 0.16675734 0.14594668 0.13411317]


In [16]:
pd.DataFrame(grid_search.cv_results_, columns=['mean_test_score', 'rank_test_score', 'params']).\
sort_values(by='rank_test_score')

Unnamed: 0,mean_test_score,rank_test_score,params
2,0.536603,1,"{'clf': GaussianNB(), 'clf__priors': None, 'prep__num_pipeline__outlier': None, 'prep__num_pipeline__scaler': None}"
0,0.536331,2,"{'clf': GaussianNB(), 'clf__priors': [0.1913764961915125, 0.1868879216539717, 0.1749183895538629, 0.16675734494015235, 0.1459466811751904, 0.13411316648531013], 'prep__num_pipeline__outlier': None, 'prep__num_pipeline__scaler': None}"
1,0.480554,3,"{'clf': GaussianNB(), 'clf__priors': [0.1913764961915125, 0.1868879216539717, 0.1749183895538629, 0.16675734494015235, 0.1459466811751904, 0.13411316648531013], 'prep__num_pipeline__outlier': MyOutlierTransformer(), 'prep__num_pipeline__scaler': None}"
3,0.480418,4,"{'clf': GaussianNB(), 'clf__priors': None, 'prep__num_pipeline__outlier': MyOutlierTransformer(), 'prep__num_pipeline__scaler': None}"


In [17]:
print('최적 파라미터\n', grid_search.best_params_)
print('최고 지표\n', grid_search.best_score_)

최적 파라미터
 {'clf': GaussianNB(), 'clf__priors': None, 'prep__num_pipeline__outlier': None, 'prep__num_pipeline__scaler': None}
최고 지표
 0.5366028940468097


* Gaussian NB 알고리즘을 적용해 평가한 test set의 분류 정확도는 0.54이고 사전확률을 지정하지 않고, 이상치 처리를 하지 않고, 특성스케일링도 하지 않은 경우임.

## 4-2. 특성스케일링이 필요없는 분류기 중 정확도가 높은 알고리즘 찾기

In [20]:
param_grid = [
    {'prep__num_pipeline__scaler':[None], # 특성스케일링 안함
     'clf':[GaussianNB(), 
            DecisionTreeClassifier(), 
            RandomForestClassifier(),]},
]
grid_search = GridSearchCV(pipeClassifier, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

In [21]:
pd.DataFrame(grid_search.cv_results_, columns=['mean_test_score', 'rank_test_score', 'params']).\
sort_values(by='rank_test_score')

Unnamed: 0,mean_test_score,rank_test_score,params
2,0.913768,1,"{'clf': RandomForestClassifier(), 'prep__num_pipeline__scaler': None}"
1,0.851207,2,"{'clf': DecisionTreeClassifier(), 'prep__num_pipeline__scaler': None}"
0,0.480418,3,"{'clf': GaussianNB(), 'prep__num_pipeline__scaler': None}"


In [22]:
print('최적 파라미터\n', grid_search.best_params_)
print('최고 지표\n', grid_search.best_score_)

최적 파라미터
 {'clf': RandomForestClassifier(), 'prep__num_pipeline__scaler': None}
최고 지표
 0.9137677640736784


* 랜덤포레스트 분류기의 평균 정확도가 0.91로 가장 높음

## 4-3. 특성스케일링 적용한 분류기 중 정확도가 높은 알고리즘

In [28]:
param_grid = [
    {'prep__num_pipeline__scaler':[StandardScaler()], # Z점수 표준화 적용
     'clf':[KNeighborsClassifier(), 
            LogisticRegression(tol=0.01, max_iter=500), 
            LinearSVC(max_iter=500),
           SVC()]},
]
grid_search = GridSearchCV(pipeClassifier, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [29]:
pd.DataFrame(grid_search.cv_results_, columns=['mean_test_score', 'rank_test_score', 'params']).\
sort_values(by='rank_test_score')

Unnamed: 0,mean_test_score,rank_test_score,params
2,0.934717,1,"{'clf': LinearSVC(max_iter=500), 'prep__num_pipeline__scaler': StandardScaler()}"
3,0.930499,2,"{'clf': SVC(), 'prep__num_pipeline__scaler': StandardScaler()}"
1,0.929412,3,"{'clf': LogisticRegression(max_iter=500, tol=0.01), 'prep__num_pipeline__scaler': StandardScaler()}"
0,0.869697,4,"{'clf': KNeighborsClassifier(), 'prep__num_pipeline__scaler': StandardScaler()}"


In [30]:
print('최적 파라미터\n', grid_search.best_params_)
print('최고 지표\n', grid_search.best_score_)

최적 파라미터
 {'clf': LinearSVC(max_iter=500), 'prep__num_pipeline__scaler': StandardScaler()}
최고 지표
 0.9347172778016712


* LinearSVC, SVC, LogisticRegression 알고리즘이 목표치보다 정확도가 높음.

## 4-4. 서포트벡터머신 최적파라미터 탐색

* 정확도가 높기로 유명한 SVC를 이용하여 커널 적용 시 결과 비교
* 시간 절약을 위해 교차검증 폴드 개수를 3으로 줄임

In [31]:
param_grid = [
    {'prep__num_pipeline__scaler':[StandardScaler()], # Z점수 표준화 적용
     'clf':[SVC()],
     'clf__kernel':['linear', 'poly', 'rbf', 'sigmoid']},
]
grid_search = GridSearchCV(pipeClassifier, param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)

In [33]:
pd.DataFrame(grid_search.cv_results_, columns=['mean_test_score', 'rank_test_score', 'params']).\
sort_values(by='rank_test_score')

Unnamed: 0,mean_test_score,rank_test_score,params
0,0.936344,1,"{'clf': SVC(kernel='linear'), 'clf__kernel': 'linear', 'prep__num_pipeline__scaler': StandardScaler()}"
2,0.92383,2,"{'clf': SVC(kernel='linear'), 'clf__kernel': 'rbf', 'prep__num_pipeline__scaler': StandardScaler()}"
1,0.911724,3,"{'clf': SVC(kernel='linear'), 'clf__kernel': 'poly', 'prep__num_pipeline__scaler': StandardScaler()}"
3,0.830932,4,"{'clf': SVC(kernel='linear'), 'clf__kernel': 'sigmoid', 'prep__num_pipeline__scaler': StandardScaler()}"


In [34]:
print('최적 파라미터\n', grid_search.best_params_)
print('최고 지표\n', grid_search.best_score_)

최적 파라미터
 {'clf': SVC(kernel='linear'), 'clf__kernel': 'linear', 'prep__num_pipeline__scaler': StandardScaler()}
최고 지표
 0.9363439606768861


* SVC 알고리즘에서는 커널로 linearr, rbf, poly를 지정한 경우 모두 목표치를 만족함.

## 최종 적용 [중요***]

In [36]:
pipeClassifier = Pipeline(steps=[
            ('prep', full_pipeline), 
            ('clf', SVC(kernel='linear')),
])

pipeClassifier.fit(X_train, y_train)

교차검증으로 시험셋과 훈련셋의 평균정확도 비교

In [52]:
from sklearn.model_selection import cross_val_score

accuracies = cross_val_score(pipeClassifier, X_test, y_test, cv=5, scoring='accuracy')
print('시험셋 교차검증 정확도:', accuracies)
print('시험셋 교차검증 평균 정확도:', accuracies.mean())

accuracies = cross_val_score(pipeClassifier, X_train, y_train, cv=5, scoring='accuracy')
print('훈련셋 교차검증 정확도:', accuracies)
print('훈련셋 교차검증 평균 정확도:', accuracies.mean())

시험셋 교차검증 정확도: [0.97118644 0.81864407 0.93548387 0.92529711 0.91341256]
시험셋 교차검증 평균 정확도: 0.9128048113723345
훈련셋 교차검증 정확도: [0.9292998  0.90754589 0.96258503 0.92108844 0.95578231]
훈련셋 교차검증 평균 정확도: 0.9352602931043255


# 4. 고찰

* 다음 알고리즘 중에서 Activity를 잘 분류할 수 있는 모델을 조사하였다.
    - 특성스케일링 필요 없는 분류기:
        - GaussianNB
        - DecisionTreeClassifier
        - RandomForestClassifier
    - 특성스케일링 필요한 분류기:
        - KNeighborsClassifier
        - LogisticRegression
        - LinearSVC, 
        - SVC
* 대략적인 알고리즘을 선택하기 위한 탐색에서 목표치보다 높은 정확도를 보인 SVC 알고리즘의 상세한 파라미터 튜닝을 하였다.
    - SVC(kernel='linear'), 특성스케일링으로 Z점수표준화를 적용한 경우에서의 정확도가 0.91으로써 목표치를 만족면서 평가지표가 가장 높았다. 
    - 선형커널을 적용함으로써 수치형 속성이 많은 데이터의 분류를 잘 수행한 것으로 판단된다.

* 최종 선택된 알고리즘으로 시험셋과 훈련셋으로 교차검증 평균 정확도를 구하여 비교한 결과 약간의 과적합화 경향이 보인다. 
* 그리드탐색에서 좋은 결과를 보였던 랜덤포레스트 분류기를 사용하면 과적합을 규제하면서 목표 정확도를 만족할 것으로 예상된다.