#**스마트폰 센서 데이터 기반 모션 분류**
# 단계3 : 단계별 모델링


# 0.요약

단계별로 나눠서 모델링을 수행.  

* 단계1 : 정적(0), 동적(1) 행동 분류 모델 생성
* 단계2 : 세부 동작에 대한 분류모델 생성
    * 단계1 모델에서 0으로 예측 -> 정적 행동 3가지 분류 모델링
    * 단계1 모델에서 1으로 예측 -> 동적 행동 3가지 분류 모델링 
* 모델 통합
    * 두 단계 모델을 통합하고, 새로운 데이터에 대해서 최종 예측결과와 성능평가가 나오도록 함수로 만들기
* 성능 비교
    * 기본 모델링의 성능과 비교
    * 모든 모델링은 [다양한 알고리즘 + 성능 튜닝]을 수행.
    * 성능 가이드
            * Accuracy : 0.980~1.00


## 1.환경설정

### (1) 경로 설정

#### 1) 로컬 수행(Anaconda)


In [1]:
path = 'C:/Users/User/Desktop/'

#### 2) 구글 콜랩 수행

* 구글 드라이브 연결

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
# path = '/content/drive/MyDrive/project/'

### (2) 라이브러리 불러오기

#### 1) 라이브러리 로딩

In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings(action='ignore')
%config InlineBackend.figure_format='retina'
# 필요하다고 판단되는 라이브러리를 추가하세요.

#### 2) 제공 함수 생성
* 변수 중요도를 시각화할 수 있는 함수.
* 입력 : 
    * importance : 트리모델의 변수 중요도(예: model.feature_importances_)
    * names : 변수 이름 목록(예 : x_train.columns
    * result_only  : 변수 중요도 순으로 데이터프레임만 return할지, 그래프도 포함할지 결정. False이면 결과 데이터프레임 + 그래프
    * topn : 중요도 상위 n개만 표시. all 이면 전체.
* 출력 : 
    * 중요도 그래프 : 중요도 내림차순으로 정렬
    * 중요도 데이터프레임 : 중요도 내림차순으로 정렬

In [281]:
# 변수의 특성 중요도 계산하기
def plot_feature_importance(importance, names, result_only = False, topn = 'all'):
    feature_importance = np.array(importance)
    feature_name = np.array(names)

    data={'feature_name':feature_name,'feature_importance':feature_importance}
    fi_temp = pd.DataFrame(data)

    #변수의 특성 중요도 순으로 정렬하기
    fi_temp.sort_values(by=['feature_importance'], ascending=False,inplace=True)
    fi_temp.reset_index(drop=True, inplace = True)

    if topn == 'all' :
        fi_df = fi_temp.copy()
    else :
        fi_df = fi_temp.iloc[:topn]

    #변수의 특성 중요도 그래프로 그리기
    if result_only == False :
        plt.figure(figsize=(10,20))
        sns.barplot(x='feature_importance', y='feature_name', data = fi_df)

        plt.xlabel('importance')
        plt.ylabel('feature name')
        plt.grid()

    return fi_df

### (3) 데이터 불러오기

* 주어진 데이터셋
    * data01_train.csv : 학습 및 검증용
    * data01_test.csv : 테스트용
    * feature.csv : feature 이름을 계층구조로 정리한 데이터

* 세부 처리사항
    * 칼럼 삭제 : data01_train.csv와 data01_test.csv 에서 'subject' 칼럼은 불필요하므로 삭제.

#### 1) 데이터로딩

In [282]:
train = pd.read_csv('data01_train.csv')
test = pd.read_csv('data01_test.csv')
features = pd.read_csv('features.csv')

In [283]:
train.drop('subject', axis=1, inplace=True)
test.drop('subject', axis=1, inplace=True)

#### 2) 기본 정보 조회

In [284]:
print(train.info())
print('='*40)
print(test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5881 entries, 0 to 5880
Columns: 562 entries, tBodyAcc-mean()-X to Activity
dtypes: float64(561), object(1)
memory usage: 25.2+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1471 entries, 0 to 1470
Columns: 562 entries, tBodyAcc-mean()-X to Activity
dtypes: float64(561), object(1)
memory usage: 6.3+ MB
None


In [285]:
print(train.isna().sum())
print('='*40)
print(test.isna().sum())

tBodyAcc-mean()-X                       0
tBodyAcc-mean()-Y                       0
tBodyAcc-mean()-Z                       0
tBodyAcc-std()-X                        0
tBodyAcc-std()-Y                        0
                                       ..
angle(tBodyGyroJerkMean,gravityMean)    0
angle(X,gravityMean)                    0
angle(Y,gravityMean)                    0
angle(Z,gravityMean)                    0
Activity                                0
Length: 562, dtype: int64
tBodyAcc-mean()-X                       0
tBodyAcc-mean()-Y                       0
tBodyAcc-mean()-Z                       0
tBodyAcc-std()-X                        0
tBodyAcc-std()-Y                        0
                                       ..
angle(tBodyGyroJerkMean,gravityMean)    0
angle(X,gravityMean)                    0
angle(Y,gravityMean)                    0
angle(Z,gravityMean)                    0
Activity                                0
Length: 562, dtype: int64


In [286]:
display(train.describe().T)
print('='*100)
display(test.describe().T)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
tBodyAcc-mean()-X,5881.0,0.274811,0.067614,-0.503823,0.262919,0.277154,0.288526,1.000000
tBodyAcc-mean()-Y,5881.0,-0.017799,0.039422,-0.684893,-0.024877,-0.017221,-0.010920,1.000000
tBodyAcc-mean()-Z,5881.0,-0.109396,0.058373,-1.000000,-0.121051,-0.108781,-0.098163,1.000000
tBodyAcc-std()-X,5881.0,-0.603138,0.448807,-1.000000,-0.992774,-0.943933,-0.242130,1.000000
tBodyAcc-std()-Y,5881.0,-0.509815,0.501815,-0.999844,-0.977680,-0.844575,-0.034499,0.916238
...,...,...,...,...,...,...,...,...
"angle(tBodyGyroMean,gravityMean)",5881.0,0.009340,0.608190,-1.000000,-0.481718,0.011448,0.499857,0.998702
"angle(tBodyGyroJerkMean,gravityMean)",5881.0,-0.007099,0.476738,-1.000000,-0.373345,-0.000847,0.356236,0.996078
"angle(X,gravityMean)",5881.0,-0.491501,0.509069,-1.000000,-0.811397,-0.709441,-0.511330,0.977344
"angle(Y,gravityMean)",5881.0,0.059299,0.297340,-1.000000,-0.018203,0.182893,0.248435,0.478157




Unnamed: 0,count,mean,std,min,25%,50%,75%,max
tBodyAcc-mean()-X,1471.0,0.273198,0.079989,-1.000000,0.263787,0.277322,0.288058,0.631510
tBodyAcc-mean()-Y,1471.0,-0.017281,0.045957,-1.000000,-0.024792,-0.017187,-0.010238,0.359587
tBodyAcc-mean()-Z,1471.0,-0.108123,0.049082,-0.418354,-0.120733,-0.108124,-0.096606,0.543939
tBodyAcc-std()-X,1471.0,-0.614634,0.448480,-0.999717,-0.992669,-0.952426,-0.245405,0.899922
tBodyAcc-std()-Y,1471.0,-0.515427,0.506094,-0.999873,-0.979082,-0.867309,-0.030639,0.782590
...,...,...,...,...,...,...,...,...
"angle(tBodyGyroMean,gravityMean)",1471.0,0.006272,0.608954,-0.995222,-0.485998,-0.005036,0.518184,0.994366
"angle(tBodyGyroJerkMean,gravityMean)",1471.0,-0.001510,0.483028,-0.969066,-0.380300,0.002408,0.374583,0.979522
"angle(X,gravityMean)",1471.0,-0.481737,0.522714,-0.999380,-0.814060,-0.708911,-0.486534,1.000000
"angle(Y,gravityMean)",1471.0,0.055771,0.298124,-0.995073,-0.017413,0.178814,0.248126,0.432496


## 2.데이터 전처리

* 세부 처리사항
    - Label 추가 : data 에 Activity_dynamic 를 추가. Activity_dynamic은 과제1에서 is_dynamic과 동일한 값.
    - x와 y1, y2로 분할.
        * y1 : Activity
        * y2 : Activity_dynamic
    - train : val = 8 : 2 혹은 7 : 3
    - random_state 옵션을 사용하여 다른 모델과 비교를 위해 성능 재현.

In [287]:
train['Activity_dynamic'] = train['Activity'].map({'STANDING': 0, 'SITTING': 0, 'LAYING': 0, 'WALKING': 1, 'WALKING_UPSTAIRS': 1, 'WALKING_DOWNSTAIRS': 1})
train.head()

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",Activity,Activity_dynamic
0,0.288508,-0.009196,-0.103362,-0.988986,-0.962797,-0.967422,-0.989,-0.962596,-0.96565,-0.929747,...,-0.816696,-0.042494,-0.044218,0.307873,0.07279,-0.60112,0.331298,0.165163,STANDING,0
1,0.265757,-0.016576,-0.098163,-0.989551,-0.994636,-0.987435,-0.990189,-0.99387,-0.987558,-0.937337,...,-0.693515,-0.062899,0.388459,-0.765014,0.771524,0.345205,-0.769186,-0.147944,LAYING,0
2,0.278709,-0.014511,-0.108717,-0.99772,-0.981088,-0.994008,-0.997934,-0.982187,-0.995017,-0.942584,...,-0.829311,0.000265,-0.525022,-0.891875,0.021528,-0.833564,0.202434,-0.032755,STANDING,0
3,0.289795,-0.035536,-0.150354,-0.231727,-0.006412,-0.338117,-0.273557,0.014245,-0.347916,0.008288,...,-0.408956,-0.255125,0.612804,0.747381,-0.072944,-0.695819,0.287154,0.111388,WALKING,1
4,0.394807,0.034098,0.091229,0.088489,-0.106636,-0.388502,-0.010469,-0.10968,-0.346372,0.584131,...,-0.563437,-0.044344,-0.845268,-0.97465,-0.887846,-0.705029,0.264952,0.137758,WALKING_DOWNSTAIRS,1


In [288]:
x = train.drop(['Activity', 'Activity_dynamic'], axis=1)
y1 = train.loc[:, 'Activity']
y2 = train.loc[:, 'Activity_dynamic']

In [289]:
from sklearn.model_selection import train_test_split
# 분할 y1
x_train, x_val, y1_train, y1_val = train_test_split(x, y1, test_size=0.3, random_state=1, stratify=y1)

## **3.단계별 모델링**

![](https://github.com/DA4BAM/image/blob/main/step%20by%20step.png?raw=true)

### (1) 단계1 : 정적/동적 행동 분류 모델

* 세부 처리사항
    * 정적 행동(Laying, Sitting, Standing)과 동적 행동(동적 : Walking, Walking-Up, Walking-Down)을 구분하는 모델 생성.
    * 몇가지 모델을 만들고 가장 성능이 좋은 모델을 선정.

In [290]:
# 분할 y2
x_train, x_val, y2_train, y2_val = train_test_split(x, y2, test_size=0.3, random_state=1, stratify=y2)

#### 1) Logistic Regression

In [353]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *
# 모델
model_lr = LogisticRegression(random_state=7)
# 학습
model_lr.fit(x_train, y2_train)
# 예측
y2_pred = model_lr.predict(x_val)
# 평가
print('Accuracy:', accuracy_score(y2_val, y2_pred))
print(confusion_matrix(y2_val, y2_pred))
print(classification_report(y2_val, y2_pred))

Accuracy: 0.9994334277620397
[[970   1]
 [  0 794]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       971
           1       1.00      1.00      1.00       794

    accuracy                           1.00      1765
   macro avg       1.00      1.00      1.00      1765
weighted avg       1.00      1.00      1.00      1765



In [354]:
model_lr_best = model_lr.coef_.__abs__().sum(axis=0)

In [293]:
features_top30_lr = plot_feature_importance(model_lr_best, names=list(x_train), result_only=30, topn = 30)

In [294]:
features_top30_lr

Unnamed: 0,feature_name,feature_importance
0,tGravityAcc-entropy()-Y,0.584519
1,tBodyAcc-max()-X,0.421386
2,fBodyAccJerk-entropy()-X,0.394913
3,tBodyAccJerk-entropy()-X,0.36569
4,"tBodyGyro-correlation()-Y,Z",0.342911
5,fBodyAcc-entropy()-X,0.34192
6,tGravityAcc-energy()-X,0.329215
7,fBodyAccJerk-maxInds-X,0.323952
8,tBodyAccJerk-entropy()-Z,0.314657
9,tBodyAccJerkMag-entropy(),0.30435


In [355]:
best_model_lr = model_lr.fit(x_train.loc[:, features_top30_lr['feature_name']], y2_train)
y2_pred = best_model_lr.predict(x_val.loc[:,features_top30_lr['feature_name']])

In [356]:
print('Accuracy:', accuracy_score(y2_val, y2_pred))
print(confusion_matrix(y2_val, y2_pred))
print(classification_report(y2_val, y2_pred))

Accuracy: 1.0
[[971   0]
 [  0 794]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       971
           1       1.00      1.00      1.00       794

    accuracy                           1.00      1765
   macro avg       1.00      1.00      1.00      1765
weighted avg       1.00      1.00      1.00      1765



In [357]:
best_model_lr_all = best_model_lr

In [358]:
joblib.dump(features_top30_lr, './features_top30_lr.pkl')
joblib.dump(best_model_lr_all, './best_model_lr_all.pkl')

['./best_model_lr_all.pkl']

#### 2) RandomForest

In [297]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *
# 모델
model_rf = RandomForestClassifier()
# 학습
model_rf.fit(x_train, y2_train)
# 예측
y2_pred = model_rf.predict(x_val)
# 평가
print('Accuracy:', accuracy_score(y2_val, y2_pred))
print(confusion_matrix(y2_val, y2_pred))
print(classification_report(y2_val, y2_pred))

Accuracy: 0.9994334277620397
[[970   1]
 [  0 794]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       971
           1       1.00      1.00      1.00       794

    accuracy                           1.00      1765
   macro avg       1.00      1.00      1.00      1765
weighted avg       1.00      1.00      1.00      1765



In [298]:
features_top30_rf = plot_feature_importance(model_rf.feature_importances_, names=list(x_train), result_only=True, topn = 30)

In [313]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# optuna
def objective(trial):
    # 하이퍼파라미터 범위 설정
    max_depth = trial.suggest_int('max_depth', 5, 20)
    n_estimators = trial.suggest_int('n_estimators', 100, 200)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    # 모델
    model_rf = RandomForestClassifier(max_depth=max_depth, n_estimators=n_estimators, random_state=7)
    
    # 학습
    model_rf.fit(x_train.loc[:, features_top30_rf['feature_name']], y2_train)
    
    # 예측 및 평가
    y2_pred = model_rf.predict(x_val.loc[:, features_top30_rf['feature_name']])
    accuracy = accuracy_score(y2_val, y2_pred)
    
    return accuracy

# Optuna 최적화
study = optuna.create_study(direction='maximize') # accuracy
study.optimize(objective, n_trials=30)  

# 최적 하이퍼파라미터 
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[I 2024-04-05 15:35:36,653] A new study created in memory with name: no-name-a30aeced-584f-42f0-ac24-47be3dd32a5f
[I 2024-04-05 15:35:38,344] Trial 0 finished with value: 0.9994334277620397 and parameters: {'max_depth': 7, 'n_estimators': 145, 'min_samples_split': 4, 'min_samples_leaf': 9}. Best is trial 0 with value: 0.9994334277620397.
[I 2024-04-05 15:35:39,572] Trial 1 finished with value: 0.9994334277620397 and parameters: {'max_depth': 8, 'n_estimators': 100, 'min_samples_split': 3, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.9994334277620397.
[I 2024-04-05 15:35:41,051] Trial 2 finished with value: 0.9994334277620397 and parameters: {'max_depth': 19, 'n_estimators': 123, 'min_samples_split': 3, 'min_samples_leaf': 8}. Best is trial 0 with value: 0.9994334277620397.
[I 2024-04-05 15:35:43,085] Trial 3 finished with value: 0.9994334277620397 and parameters: {'max_depth': 12, 'n_estimators': 171, 'min_samples_split': 15, 'min_samples_leaf': 9}. Best is trial 0 with value:

Number of finished trials: 30
Best trial: {'max_depth': 7, 'n_estimators': 145, 'min_samples_split': 4, 'min_samples_leaf': 9}


In [314]:
# 모델 학습, 예측, 평가
best_params = study.best_trial.params
best_model_rf = RandomForestClassifier(**best_params, random_state=7)
best_model_rf.fit(x_train.loc[:, features_top30_rf['feature_name']], y2_train)
y2_pred = best_model_rf.predict(x_val.loc[:, features_top30_rf['feature_name']])
print('Accuracy:', accuracy_score(y2_val, y2_pred))
print(confusion_matrix(y2_val, y2_pred))
print(classification_report(y2_val, y2_pred))

Accuracy: 0.9994334277620397
[[970   1]
 [  0 794]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       971
           1       1.00      1.00      1.00       794

    accuracy                           1.00      1765
   macro avg       1.00      1.00      1.00      1765
weighted avg       1.00      1.00      1.00      1765



In [315]:
model_rf_params = best_params

In [316]:
joblib.dump(model_rf_params, './model_rf_params.pkl')
joblib.dump(best_model_rf, './best_model_rf.pkl')

['./best_model_rf.pkl']

### (2) 단계2-1 : 정적 동작 세부 분류

* 세부 처리사항
    * 정적 행동(Laying, Sitting, Standing)인 데이터 추출
    * Laying, Sitting, Standing 를 분류하는 모델을 생성
    * 몇가지 모델을 만들고 가장 성능이 좋은 모델을 선정.

In [317]:
# 정적 : 0
train_stop = train.loc[train['Activity'].isin(['LAYING', 'STANDING', 'SITTING'])]
# 동적 : 1
train_move = train.loc[train['Activity'].isin(['WALKING', 'WALKING_UPSTAIRS', 'WALKING_DOWNSTAIRS'])]

In [318]:
train_stop.head()

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",Activity,Activity_dynamic
0,0.288508,-0.009196,-0.103362,-0.988986,-0.962797,-0.967422,-0.989,-0.962596,-0.96565,-0.929747,...,-0.816696,-0.042494,-0.044218,0.307873,0.07279,-0.60112,0.331298,0.165163,STANDING,0
1,0.265757,-0.016576,-0.098163,-0.989551,-0.994636,-0.987435,-0.990189,-0.99387,-0.987558,-0.937337,...,-0.693515,-0.062899,0.388459,-0.765014,0.771524,0.345205,-0.769186,-0.147944,LAYING,0
2,0.278709,-0.014511,-0.108717,-0.99772,-0.981088,-0.994008,-0.997934,-0.982187,-0.995017,-0.942584,...,-0.829311,0.000265,-0.525022,-0.891875,0.021528,-0.833564,0.202434,-0.032755,STANDING,0
7,0.272026,-0.001329,-0.125491,-0.992068,-0.912985,-0.972451,-0.994752,-0.943141,-0.976428,-0.925446,...,-0.704995,-0.024442,0.076332,0.741277,0.729812,-0.817201,0.037746,0.136129,STANDING,0
8,0.284338,0.021956,-0.006925,-0.980153,-0.838394,-0.782357,-0.983683,-0.816199,-0.743923,-0.914011,...,-0.400197,0.021212,-0.009465,-0.282762,0.563343,-0.782072,0.242834,-0.025285,STANDING,0


In [319]:
x_stop = train_stop.drop(['Activity', 'Activity_dynamic'], axis=1)
y_stop = train_stop.loc[:, 'Activity']
x_stop.head(3)

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-meanFreq(),fBodyBodyGyroJerkMag-skewness(),fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)"
0,0.288508,-0.009196,-0.103362,-0.988986,-0.962797,-0.967422,-0.989,-0.962596,-0.96565,-0.929747,...,-0.032207,-0.487737,-0.816696,-0.042494,-0.044218,0.307873,0.07279,-0.60112,0.331298,0.165163
1,0.265757,-0.016576,-0.098163,-0.989551,-0.994636,-0.987435,-0.990189,-0.99387,-0.987558,-0.937337,...,-0.246705,-0.23782,-0.693515,-0.062899,0.388459,-0.765014,0.771524,0.345205,-0.769186,-0.147944
2,0.278709,-0.014511,-0.108717,-0.99772,-0.981088,-0.994008,-0.997934,-0.982187,-0.995017,-0.942584,...,0.388765,-0.535287,-0.829311,0.000265,-0.525022,-0.891875,0.021528,-0.833564,0.202434,-0.032755


In [320]:
x_stop_train, x_stop_val, y_stop_train, y_stop_val = train_test_split(x_stop, y_stop, test_size=0.3, random_state=1, stratify=y_stop)

In [321]:
model_lo_stop = LogisticRegression(random_state=7)
model_lo_stop.fit(x_stop_train, y_stop_train)
pred_stop_val = model_lo_stop.predict(x_stop_val)
print('Accuracy:', accuracy_score(y_stop_val, pred_stop_val))
print(confusion_matrix(y_stop_val, pred_stop_val))
print(classification_report(y_stop_val, pred_stop_val))

Accuracy: 0.9670442842430484
[[335   0   0]
 [  0 293  17]
 [  0  15 311]]
              precision    recall  f1-score   support

      LAYING       1.00      1.00      1.00       335
     SITTING       0.95      0.95      0.95       310
    STANDING       0.95      0.95      0.95       326

    accuracy                           0.97       971
   macro avg       0.97      0.97      0.97       971
weighted avg       0.97      0.97      0.97       971



In [322]:
best_model_lr = model_lo_stop.fit(x_stop_train.loc[:, features_top30_lr['feature_name']], y_stop_train)
pred_stop_val = best_model_lr.predict(x_stop_val.loc[:,features_top30_lr['feature_name']])

In [323]:
print('Accuracy:', accuracy_score(y_stop_val, pred_stop_val))
print(confusion_matrix(y_stop_val, pred_stop_val))
print(classification_report(y_stop_val, pred_stop_val))

Accuracy: 0.933058702368692
[[335   0   0]
 [  3 262  45]
 [  0  17 309]]
              precision    recall  f1-score   support

      LAYING       0.99      1.00      1.00       335
     SITTING       0.94      0.85      0.89       310
    STANDING       0.87      0.95      0.91       326

    accuracy                           0.93       971
   macro avg       0.93      0.93      0.93       971
weighted avg       0.93      0.93      0.93       971



In [324]:
best_model_lr_stop = best_model_lr
joblib.dump(best_model_lr_stop, './best_model_lr_stop.pkl')

['./best_model_lr_stop.pkl']

In [325]:
# 모델 생성
model_rf_stop = RandomForestClassifier(random_state=7)
# 학습
model_rf_stop.fit(x_stop_train, y_stop_train)
# 예측
pred_stop_val = model_rf_stop.predict(x_stop_val)
# 평가
print('Accuracy:', accuracy_score(y_stop_val, pred_stop_val))
print(confusion_matrix(y_stop_val, pred_stop_val))
print(classification_report(y_stop_val, pred_stop_val))

Accuracy: 0.9660144181256437
[[335   0   0]
 [  0 288  22]
 [  0  11 315]]
              precision    recall  f1-score   support

      LAYING       1.00      1.00      1.00       335
     SITTING       0.96      0.93      0.95       310
    STANDING       0.93      0.97      0.95       326

    accuracy                           0.97       971
   macro avg       0.97      0.97      0.97       971
weighted avg       0.97      0.97      0.97       971



In [327]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# optuna
def objective(trial):
    # 하이퍼파라미터 범위 설정
    max_depth = trial.suggest_int('max_depth', 5, 20)
    n_estimators = trial.suggest_int('n_estimators', 100, 200)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    # 모델
    model_rf_stop = RandomForestClassifier(max_depth=max_depth, n_estimators=n_estimators, random_state=7)
    
    # 학습
    model_rf_stop.fit(x_stop_train.loc[:, features_top30_rf['feature_name']], y_stop_train)
    
    # 예측 및 평가
    pred_stop_val = model_rf_stop.predict(x_stop_val.loc[:, features_top30_rf['feature_name']])
    accuracy = accuracy_score(y_stop_val, pred_stop_val)
    
    return accuracy

# Optuna 최적화
study = optuna.create_study(direction='maximize') # accuracy
study.optimize(objective, n_trials=30)  

# 최적 하이퍼파라미터 
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[I 2024-04-05 15:40:36,726] A new study created in memory with name: no-name-34c5a9fa-5b25-4427-b1d9-7a6c89d0124e
[I 2024-04-05 15:40:41,688] Trial 0 finished with value: 0.6622039134912462 and parameters: {'max_depth': 16, 'n_estimators': 125, 'min_samples_split': 17, 'min_samples_leaf': 9}. Best is trial 0 with value: 0.6622039134912462.
[I 2024-04-05 15:40:49,514] Trial 1 finished with value: 0.6766220391349125 and parameters: {'max_depth': 16, 'n_estimators': 197, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 1 with value: 0.6766220391349125.
[I 2024-04-05 15:40:52,400] Trial 2 finished with value: 0.6374871266735325 and parameters: {'max_depth': 5, 'n_estimators': 153, 'min_samples_split': 17, 'min_samples_leaf': 3}. Best is trial 1 with value: 0.6766220391349125.
[I 2024-04-05 15:40:56,908] Trial 3 finished with value: 0.6436663233779608 and parameters: {'max_depth': 7, 'n_estimators': 176, 'min_samples_split': 3, 'min_samples_leaf': 5}. Best is trial 1 with value

Number of finished trials: 30
Best trial: {'max_depth': 19, 'n_estimators': 193, 'min_samples_split': 18, 'min_samples_leaf': 5}


In [328]:
# 모델 학습, 예측, 평가
best_params = study.best_trial.params
best_model_rf = RandomForestClassifier(**best_params, random_state=7)
best_model_rf.fit(x_stop_train.loc[:, features_top30_rf['feature_name']], y_stop_train)
pred_stop_val = best_model_rf.predict(x_stop_val.loc[:, features_top30_rf['feature_name']])
print('Accuracy:', accuracy_score(y_stop_val, pred_stop_val))
print(confusion_matrix(y_stop_val, pred_stop_val))
print(classification_report(y_stop_val, pred_stop_val))

Accuracy: 0.6673532440782698
[[264  46  25]
 [ 67 156  87]
 [ 42  56 228]]
              precision    recall  f1-score   support

      LAYING       0.71      0.79      0.75       335
     SITTING       0.60      0.50      0.55       310
    STANDING       0.67      0.70      0.68       326

    accuracy                           0.67       971
   macro avg       0.66      0.66      0.66       971
weighted avg       0.66      0.67      0.66       971



In [329]:
model_rf_params_stop = best_params
best_model_rf_stop = best_model_rf

In [330]:
joblib.dump(features_top30_rf, './features_top30_rf.pkl')
joblib.dump(model_rf_params_stop, './model_rf_params_stop.pkl')
joblib.dump(best_model_rf_stop, './best_model_rf_stop.pkl')

['./best_model_rf_stop.pkl']

### (3) 단계2-2 : 동적 동작 세부 분류

* 세부 처리사항
    * 동적 행동(Walking, Walking Upstairs, Walking Downstairs)인 데이터 추출
    * Walking, Walking Upstairs, Walking Downstairs 를 분류하는 모델을 생성
    * 몇가지 모델을 만들고 가장 성능이 좋은 모델을 선정

In [331]:
x_move = train_move.drop(['Activity', 'Activity_dynamic'], axis=1)
y_move = train_move.loc[:, 'Activity']
x_move.head(3)

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-meanFreq(),fBodyBodyGyroJerkMag-skewness(),fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)"
3,0.289795,-0.035536,-0.150354,-0.231727,-0.006412,-0.338117,-0.273557,0.014245,-0.347916,0.008288,...,0.050888,-0.004012,-0.408956,-0.255125,0.612804,0.747381,-0.072944,-0.695819,0.287154,0.111388
4,0.394807,0.034098,0.091229,0.088489,-0.106636,-0.388502,-0.010469,-0.10968,-0.346372,0.584131,...,-0.013902,-0.157832,-0.563437,-0.044344,-0.845268,-0.97465,-0.887846,-0.705029,0.264952,0.137758
5,0.330708,0.007561,-0.061371,-0.21576,0.101075,0.072949,-0.269857,0.06006,0.101298,-0.019263,...,0.114524,-0.599906,-0.887024,-0.030645,-0.852091,-0.500195,0.306091,-0.552729,0.253885,0.291256


In [332]:
x_move_train, x_move_val, y_move_train, y_move_val = train_test_split(x_move, y_move, test_size=0.3, random_state=1, stratify=y_move)

In [333]:
model_lo_move = LogisticRegression()
model_lo_move.fit(x_move_train, y_move_train)
pred_move_val = model_lo_move.predict(x_move_val)
print('Accuracy:', accuracy_score(y_move_val, pred_move_val))
print(confusion_matrix(y_move_val, pred_move_val))
print(classification_report(y_move_val, pred_move_val))

Accuracy: 0.9949685534591195
[[300   0   0]
 [  1 233   3]
 [  0   0 258]]
                    precision    recall  f1-score   support

           WALKING       1.00      1.00      1.00       300
WALKING_DOWNSTAIRS       1.00      0.98      0.99       237
  WALKING_UPSTAIRS       0.99      1.00      0.99       258

          accuracy                           0.99       795
         macro avg       1.00      0.99      0.99       795
      weighted avg       1.00      0.99      0.99       795



In [334]:
best_model_lr = model_lo_move.fit(x_move_train.loc[:, features_top30_lr['feature_name']], y_move_train)
pred_stop_val = best_model_lr.predict(x_move_val.loc[:,features_top30_lr['feature_name']])

In [335]:
print('Accuracy:', accuracy_score(y_move_val, pred_move_val))
print(confusion_matrix(y_move_val, pred_move_val))
print(classification_report(y_move_val, pred_move_val))

Accuracy: 0.9949685534591195
[[300   0   0]
 [  1 233   3]
 [  0   0 258]]
                    precision    recall  f1-score   support

           WALKING       1.00      1.00      1.00       300
WALKING_DOWNSTAIRS       1.00      0.98      0.99       237
  WALKING_UPSTAIRS       0.99      1.00      0.99       258

          accuracy                           0.99       795
         macro avg       1.00      0.99      0.99       795
      weighted avg       1.00      0.99      0.99       795



In [341]:
best_model_lr_move = best_model_lr
joblib.dump(best_model_lr_move, './best_model_lr_move.pkl')

['./best_model_lr_move.pkl']

In [336]:
# 모델 생성
model_rf_move = RandomForestClassifier()
# 학습
model_rf_move.fit(x_move_train, y_move_train)
# 예측
pred_move_val = model_rf_move.predict(x_move_val)
# 평가
print('Accuracy:', accuracy_score(y_move_val, pred_move_val))
print(confusion_matrix(y_move_val, pred_move_val))
print(classification_report(y_move_val, pred_move_val))

Accuracy: 0.9811320754716981
[[295   3   2]
 [  3 230   4]
 [  0   3 255]]
                    precision    recall  f1-score   support

           WALKING       0.99      0.98      0.99       300
WALKING_DOWNSTAIRS       0.97      0.97      0.97       237
  WALKING_UPSTAIRS       0.98      0.99      0.98       258

          accuracy                           0.98       795
         macro avg       0.98      0.98      0.98       795
      weighted avg       0.98      0.98      0.98       795



In [337]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# optuna
def objective(trial):
    # 하이퍼파라미터 범위 설정
    max_depth = trial.suggest_int('max_depth', 5, 20)
    n_estimators = trial.suggest_int('n_estimators', 100, 200)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    # 모델
    model_rf_move = RandomForestClassifier(max_depth=max_depth, n_estimators=n_estimators, random_state=7)
    
    # 학습
    model_rf_move.fit(x_move_train.loc[:, features_top30_rf['feature_name']], y_move_train)
    
    # 예측 및 평가
    pred_move_val = model_rf_move.predict(x_move_val.loc[:, features_top30_rf['feature_name']])
    accuracy = accuracy_score(y_move_val, pred_move_val)
    
    return accuracy

# Optuna 최적화
study = optuna.create_study(direction='maximize') # accuracy
study.optimize(objective, n_trials=30)  

# 최적 하이퍼파라미터 
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[I 2024-04-05 15:43:58,244] A new study created in memory with name: no-name-505b7e0d-e668-4ba4-a859-b455340ed760
[I 2024-04-05 15:44:00,655] Trial 0 finished with value: 0.8465408805031447 and parameters: {'max_depth': 5, 'n_estimators': 137, 'min_samples_split': 6, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.8465408805031447.
[I 2024-04-05 15:44:05,227] Trial 1 finished with value: 0.889308176100629 and parameters: {'max_depth': 7, 'n_estimators': 198, 'min_samples_split': 12, 'min_samples_leaf': 8}. Best is trial 1 with value: 0.889308176100629.
[I 2024-04-05 15:44:08,162] Trial 2 finished with value: 0.9132075471698113 and parameters: {'max_depth': 18, 'n_estimators': 104, 'min_samples_split': 10, 'min_samples_leaf': 4}. Best is trial 2 with value: 0.9132075471698113.
[I 2024-04-05 15:44:13,081] Trial 3 finished with value: 0.9132075471698113 and parameters: {'max_depth': 15, 'n_estimators': 174, 'min_samples_split': 13, 'min_samples_leaf': 2}. Best is trial 2 with value:

Number of finished trials: 30
Best trial: {'max_depth': 18, 'n_estimators': 103, 'min_samples_split': 10, 'min_samples_leaf': 5}


In [338]:
# 모델 학습, 예측, 평가
best_params = study.best_trial.params
best_model_rf = RandomForestClassifier(**best_params, random_state=7)
best_model_rf.fit(x_move_train.loc[:, features_top30_rf['feature_name']], y_move_train)
pred_move_val = best_model_rf.predict(x_move_val.loc[:, features_top30_rf['feature_name']])
print('Accuracy:', accuracy_score(y_move_val, pred_move_val))
print(confusion_matrix(y_move_val, pred_move_val))
print(classification_report(y_move_val, pred_move_val))

Accuracy: 0.8968553459119497
[[285   6   9]
 [  7 200  30]
 [ 23   7 228]]
                    precision    recall  f1-score   support

           WALKING       0.90      0.95      0.93       300
WALKING_DOWNSTAIRS       0.94      0.84      0.89       237
  WALKING_UPSTAIRS       0.85      0.88      0.87       258

          accuracy                           0.90       795
         macro avg       0.90      0.89      0.89       795
      weighted avg       0.90      0.90      0.90       795



In [339]:
model_rf_params_move = best_params
best_model_rf_move = best_model_rf

In [340]:
joblib.dump(model_rf_params_move, './model_rf_params_move.pkl')
joblib.dump(best_model_rf_move, './best_model_rf_move.pkl')

['./best_model_rf_move.pkl']

### (4) 분류 모델 합치기


* 세부 처리사항
    * 두 단계 모델을 통합하고, 새로운 데이터(test)에 대해서 최종 예측결과와 성능평가가 나오도록 함수로 만들기
    * 데이터 파이프라인 구축 : test데이터가 로딩되어 전처리 과정을 거치고, 예측 및 성능 평가 수행

![](https://github.com/DA4BAM/image/blob/main/pipeline%20function.png?raw=true)

#### 1) 함수 만들어서 분류모델 합치기

In [359]:
def model(data):
    data['Activity_dynamic'] = data['Activity'].map({'STANDING': 0, 'SITTING': 0, 'LAYING': 0, 'WALKING': 1, 'WALKING_UPSTAIRS': 1, 'WALKING_DOWNSTAIRS': 1})

    x_test = data.drop(['Activity', 'Activity_dynamic'], axis=1)
    y_test = data.loc[:, 'Activity']
    y2 = data.loc[:, 'Activity_dynamic']

    # 예측
    pred = best_model_lr_all.predict(x_test.loc[:,features_top30_lr['feature_name']])
    data['Predicted'] = pred
    return data['Predicted']

In [360]:
model(test)

0       0
1       0
2       1
3       0
4       0
       ..
1466    0
1467    0
1468    1
1469    0
1470    1
Name: Predicted, Length: 1471, dtype: int64

In [361]:
def sum_model(data):    
    # 정적 : 0
    stop = data.loc[data['Predicted'] == 0]
    x_stop = stop.drop(['Activity', 'Activity_dynamic', 'Predicted'], axis=1)
    y_stop = stop.loc[:, 'Activity']        

    # 예측
    pred_stop = best_model_lr_stop.predict(x_stop.loc[:,features_top30_lr['feature_name']])    
    stop['Predict_stop'] = pred_stop
    # 동적 : 1
    move = data.loc[data['Predicted'] == 1]
    x_move = move.drop(['Activity', 'Activity_dynamic', 'Predicted'], axis=1)
    y_move = move.loc[:, 'Activity']        

    # 예측
    pred_move = best_model_lr_move.predict(x_move.loc[:,features_top30_lr['feature_name']])   

    # 예측 결과 합
    move['Predict_move'] = pred_move
    result = pd.concat([stop, move], axis=0)
    result['Predicted_all'] = result.apply(lambda row: row['Predict_stop'] if pd.notnull(row['Predict_stop']) else row['Predict_move'], axis=1)
    result.drop(['Predict_stop', 'Predict_move'], axis=1, inplace=True)
    
    y_test = result.loc[:, 'Activity']
    y_pred = result.loc[:, 'Predicted_all']
    
    print(classification_report(y_test, y_pred))
    return y_pred

#### 2) test 셋으로 예측하고 평가하기

In [362]:
evaluation = sum_model(test)
evaluation

                    precision    recall  f1-score   support

            LAYING       1.00      1.00      1.00       292
           SITTING       0.92      0.88      0.90       254
          STANDING       0.90      0.93      0.92       287
           WALKING       0.92      0.95      0.93       228
WALKING_DOWNSTAIRS       0.97      0.96      0.97       195
  WALKING_UPSTAIRS       0.93      0.91      0.92       215

          accuracy                           0.94      1471
         macro avg       0.94      0.94      0.94      1471
      weighted avg       0.94      0.94      0.94      1471



0                  SITTING
1                 STANDING
3                  SITTING
4                 STANDING
6                 STANDING
               ...        
1459      WALKING_UPSTAIRS
1462    WALKING_DOWNSTAIRS
1463    WALKING_DOWNSTAIRS
1468               WALKING
1470    WALKING_DOWNSTAIRS
Name: Predicted_all, Length: 1471, dtype: object

* 성능 평가

- 정확도:98%
- F1-score: sitting, stading에 대한 f1-score는 95%
- 동적 움직임에 대해서는 거의 완벽에 가깝게 맞춤

In [363]:
import joblib

# 함수 저장
joblib.dump(model, 'model_function.pkl')
joblib.dump(sum_model, 'sum_model_function.pkl')

# 평가 저장
joblib.dump(evaluation, 'evaluation.pkl')

['evaluation.pkl']