# 교재 p.200 사용자 행동 인식 데이터

In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [3]:
# Load the feature-name table: whitespace-separated, no header row, two columns
# named (column_index, column_name).
# The separator is a raw string so the regex escape `\s` is not parsed as an
# (invalid) Python string escape, which newer interpreters warn about.
feature_name_df = pd.read_csv('../../data/human_activity/features.txt',sep=r'\s+',
                             header=None,names=['column_index','column_name'])
# The second column holds the feature names; extract it as a plain list.
feature_name = feature_name_df.iloc[:,1].values.tolist()
print('전체 피처명에서 10개만 추출:',feature_name[:10])

전체 피처명에서 10개만 추출: ['tBodyAcc-mean()-X', 'tBodyAcc-mean()-Y', 'tBodyAcc-mean()-Z', 'tBodyAcc-std()-X', 'tBodyAcc-std()-Y', 'tBodyAcc-std()-Z', 'tBodyAcc-mad()-X', 'tBodyAcc-mad()-Y', 'tBodyAcc-mad()-Z', 'tBodyAcc-max()-X']


In [6]:
# Check for duplicated feature names: count the rows behind each column_name.
feature_dup_df = feature_name_df.groupby('column_name').count()
# Select the names that occur more than once a single time, then reuse that
# selection for both the printed count and the displayed preview.
duplicated_only = feature_dup_df.loc[feature_dup_df['column_index'] > 1]
print(duplicated_only.count())
duplicated_only.head()

# A total of 42 feature names are duplicated.

column_index    42
dtype: int64


Unnamed: 0_level_0,column_index
column_name,Unnamed: 1_level_1
"fBodyAcc-bandsEnergy()-1,16",3
"fBodyAcc-bandsEnergy()-1,24",3
"fBodyAcc-bandsEnergy()-1,8",3
"fBodyAcc-bandsEnergy()-17,24",3
"fBodyAcc-bandsEnergy()-17,32",3


In [7]:
# Handle duplicated feature names
def get_new_feature_name_df(old_featre_name_df):
    """Return a copy of the feature-name table with duplicate names made unique.

    The first occurrence of a name is kept unchanged; the k-th repeat
    (k >= 1) is renamed to ``"<name>_<k>"``.

    Args:
        old_featre_name_df: DataFrame with a ``column_name`` column (in this
            file it also carries ``column_index``). The parameter name's
            spelling is kept as-is for backward compatibility. The input is
            not modified.

    Returns:
        A new DataFrame with a fresh 0..n-1 index, the input's columns in
        their original order (with ``column_name`` de-duplicated), followed
        by ``dup_cnt``: how many earlier rows share the same original name.
    """
    # cumcount() numbers the rows inside each name group 0, 1, 2, ... in row order.
    dup_cnt = old_featre_name_df.groupby('column_name').cumcount().reset_index(drop=True)

    # reset_index(drop=True) returns a new frame, so the caller's data is untouched.
    new_feature_name_df = old_featre_name_df.reset_index(drop=True)
    new_feature_name_df['dup_cnt'] = dup_cnt

    # Vectorized rename: keep the name where dup_cnt == 0, otherwise append
    # "_<dup_cnt>". This replaces a row-wise apply that indexed each row by
    # position (x[0], x[1]), which pandas deprecates for label-indexed Series.
    names = new_feature_name_df['column_name']
    new_feature_name_df['column_name'] = names.where(
        dup_cnt == 0, names + '_' + dup_cnt.astype(str)
    )
    return new_feature_name_df

In [10]:
def get_human_dataset(data_dir='../../data/human_activity'):
    """Load the human-activity train/test features and labels as DataFrames.

    Feature names are read from ``features.txt`` and passed through
    ``get_new_feature_name_df()`` so that duplicated names become unique
    before they are used as DataFrame column names.

    Args:
        data_dir: Directory that contains ``features.txt`` plus the
            ``train/`` and ``test/`` sub-directories. Defaults to the
            location this notebook originally hard-coded.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``. The X frames use the
        de-duplicated feature names as columns; the y frames have a single
        ``action`` column.
    """
    # Raw string: `\s` is a regex escape, not a valid Python string escape.
    whitespace = r'\s+'

    feature_name_df = pd.read_csv('{}/features.txt'.format(data_dir), sep=whitespace,
                                  header=None, names=['column_index', 'column_name'])
    # Fix duplicated feature names by calling get_new_feature_name_df()
    new_feature_name_df = get_new_feature_name_df(feature_name_df)
    # Convert to a list so the names can be assigned as DataFrame column names
    feature_name = new_feature_name_df.iloc[:, 1].values.tolist()

    # Load the training and test feature datasets. header=None is spelled out
    # to match the label reads below; pandas already behaves this way when
    # `names` is passed explicitly.
    X_train = pd.read_csv('{}/train/X_train.txt'.format(data_dir), sep=whitespace,
                          header=None, names=feature_name)
    X_test = pd.read_csv('{}/test/X_test.txt'.format(data_dir), sep=whitespace,
                         header=None, names=feature_name)

    # Load the training and test label data
    y_train = pd.read_csv('{}/train/y_train.txt'.format(data_dir), sep=whitespace,
                          header=None, names=['action'])
    y_test = pd.read_csv('{}/test/y_test.txt'.format(data_dir), sep=whitespace,
                         header=None, names=['action'])
    return X_train, X_test, y_train, y_test

In [11]:
# Load the train/test feature and label DataFrames via get_human_dataset().
X_train, X_test, y_train, y_test = get_human_dataset()

In [12]:
# Print a header line, then a summary of the training feature DataFrame.
print('## 학습 피처 데이터셋 info')
X_train.info()

## 학습 피처 데이터셋 info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7352 entries, 0 to 7351
Columns: 561 entries, tBodyAcc-mean()-X to angle(Z,gravityMean)
dtypes: float64(561)
memory usage: 31.5 MB


In [14]:
# Print a header line, then show how many training samples carry each label value.
print('## 레이블의 클래스')
y_train.value_counts()

## 레이블의 클래스


action
6         1407
5         1374
4         1286
1         1226
2         1073
3          986
dtype: int64

# DecisionTreeClassifier

In [24]:
# Baseline: decision tree with default hyperparameters and a fixed random_state.
dt_clf = DecisionTreeClassifier(random_state=156)
# Train the model
dt_clf.fit(X_train,y_train)
# Predict on the test features
pred = dt_clf.predict(X_test)
# Evaluate performance
accuracy = accuracy_score(y_test,pred)
print('결정 트리 예측 정확도:',np.round(accuracy,4))
print('결정 트리의 default 하이퍼 파라미터:\n',dt_clf.get_params())

결정 트리 예측 정확도: 0.8548
결정 트리의 default 하이퍼 파라미터:
 {'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': 156, 'splitter': 'best'}


## Hyper Parameter tuning

In [33]:
# Hyperparameter grid for the search: several max_depth values, one fixed min_samples_split.
parameters = {
    'max_depth':[6,8,10,12,16,20,24],
    'min_samples_split':[16]
}

In [34]:
# Grid search over `parameters` using dt_clf, 5-fold cross-validation, accuracy scoring.
grid_cv = GridSearchCV(dt_clf,param_grid=parameters,cv=5,scoring='accuracy',verbose=1)
grid_cv.fit(X_train,y_train)
# Report the best mean cross-validation accuracy and the parameters that achieved it.
print('GridSearchCV의 최고 평균 정확도 수치: {:.4f}'.format(grid_cv.best_score_))
print('GridSearchCV의 최적 하이퍼 파라미터:',grid_cv.best_params_)

Fitting 5 folds for each of 7 candidates, totalling 35 fits
GridSearchCV의 최고 평균 정확도 수치: 0.8548794147162603
GridSearchCV의 최적 하이퍼 파라미터: {'max_depth': 8, 'min_samples_split': 16}


In [37]:
# Convert the GridSearchCV results into a DataFrame
cv_results_df = pd.DataFrame(grid_cv.cv_results_)
# Show the mean cross-validation score for each max_depth that was tried.
cv_results_df[['param_max_depth','mean_test_score']]

Unnamed: 0,param_max_depth,mean_test_score
0,6,0.847662
1,8,0.854879
2,10,0.852705
3,12,0.845768
4,16,0.847127
5,20,0.848624
6,24,0.848624


In [38]:
# Also evaluate prediction performance on the test dataset
max_depth = [6,8,10,12,16,20,24]
for depth in max_depth:
    # Only max_depth varies; min_samples_split and random_state stay fixed.
    dt_clf2 = DecisionTreeClassifier(max_depth=depth,min_samples_split=16,random_state=156)
    dt_clf2.fit(X_train,y_train)
    pred = dt_clf2.predict(X_test)
    accuracy = accuracy_score(y_test,pred)
    print("max_depth = {} , 정확도 : {:.4f}".format(depth,accuracy))
    
# Best: max_depth = 8, accuracy: 0.8717

max_depth = 6 , 정확도 : 0.8551
max_depth = 8 , 정확도 : 0.8717
max_depth = 10 , 정확도 : 0.8599
max_depth = 12 , 정확도 : 0.8571
max_depth = 16 , 정확도 : 0.8599
max_depth = 20 , 정확도 : 0.8565
max_depth = 24 , 정확도 : 0.8565
