#**스마트폰 센서 데이터 기반 모션 분류**
# 단계3 : 단계별 모델링


## 0.미션

단계별로 나눠서 모델링을 수행하고자 합니다.  

* 단계1 : 정적(0), 동적(1) 행동 분류 모델 생성
* 단계2 : 세부 동작에 대한 분류모델 생성
    * 단계1 모델에서 0으로 예측 -> 정적 행동 3가지 분류 모델링
    * 단계1 모델에서 1으로 예측 -> 동적 행동 3가지 분류 모델링 
* 모델 통합
    * 두 단계 모델을 통합하고, 새로운 데이터에 대해서 최종 예측결과와 성능평가가 나오도록 함수로 만들기
* 성능 비교
    * 기본 모델링의 성능과 비교
    * 모든 모델링은 [다양한 알고리즘 + 성능 튜닝]을 수행해야 합니다.


## 1.환경설정

### (1) 라이브러리 불러오기

* 세부 요구사항
    - 기본적으로 필요한 라이브러리를 import 하도록 코드가 작성되어 있습니다.
    - 필요하다고 판단되는 라이브러리를 추가하세요.

In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 필요하다고 판단되는 라이브러리를 추가하세요.





In [23]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### (2) 데이터 불러오기

* 주어진 데이터셋
    * data01_train.csv : 학습 및 검증용

 <br/>  

* 세부 요구사항
    - data01_train.csv 를 불러와 'data' 이름으로 저장합니다.
        - data에서 변수 subject는 삭제합니다.


In [24]:
data = pd.read_csv('/content/drive/MyDrive/aivle/5차 미니프로젝트/data01_train.csv')
data.drop('subject', axis = 1, inplace = True)

## 2.데이터 전처리

* 세부 요구사항
    - Label 추가 : data 에 Activity_dynamic 를 추가합니다. Activity_dynamic은 과제1에서 is_dynamic과 동일한 값입니다.
    - x와 y1, y2로 분할하시오.
        * y1 : Activity
        * y2 : Activity_dynamic
    - train : val = 8 : 2 혹은 7 : 3
    - random_state 옵션을 사용하여 다른 모델과 비교를 위해 성능이 재현되도록 합니다.

In [73]:
data['Activity_dynamic'] = 0
data['Activity_dynamic'] = np.where((data['Activity'] == 'STANDING') | (data['Activity'] == 'SITTING') | (data['Activity'] == 'LAYING'), 0, 1)
np.unique(data['Activity_dynamic'])

array([0, 1])

In [74]:
x = data.drop(['Activity', 'Activity_dynamic'], axis = 1)
y = data.loc[:, ['Activity', 'Activity_dynamic']]
# y2 = data.loc[:, 'Activity_dynamic']

In [75]:
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size = 0.2, random_state = 2023)
x_train.shape, x_val.shape, y_train.shape, y_val.shape

((4704, 561), (1177, 561), (4704, 2), (1177, 2))

In [76]:
y1_train = y_train.loc[:, 'Activity']
y2_train = y_train.loc[:, 'Activity_dynamic']
y1_val = y_val.loc[:, 'Activity']
y2_val = y_val.loc[:, 'Activity_dynamic']

In [98]:
np.unique(y1_train)

array(['LAYING', 'SITTING', 'STANDING', 'WALKING', 'WALKING_DOWNSTAIRS',
       'WALKING_UPSTAIRS'], dtype=object)

## **3.단계별 모델링**

![](https://github.com/DA4BAM/image/blob/main/step%20by%20step.png?raw=true)

### (1) 단계1 : 정적/동적 행동 분류 모델

* 세부 요구사항
    * 정적 행동(Laying, Sitting, Standing)과 동적 행동(동적 : Walking, Walking-Up, Walking-Down)을 구분하는 모델 생성.
    * 몇가지 모델을 만들고 가장 성능이 좋은 모델을 선정하시오.

#### 1) 알고리즘1 : 

In [None]:
# !pip install catboost

In [65]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import *

In [79]:
model = LogisticRegression()
model.fit(x_train, y2_train)
y_pred = model.predict(x_val)

print(confusion_matrix(y2_val, y_pred))
print(classification_report(y2_val, y_pred))

[[656   1]
 [  0 520]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       657
           1       1.00      1.00      1.00       520

    accuracy                           1.00      1177
   macro avg       1.00      1.00      1.00      1177
weighted avg       1.00      1.00      1.00      1177



In [87]:
non_dyna = pd.DataFrame(x_val[y_pred == 0], columns = x_val.columns)
non_dyna.shape

(656, 561)

In [86]:
dyna = pd.DataFrame(x_val[y_pred == 1], columns = x_val.columns)
dyna.shape

(521, 561)

In [85]:
x_val.shape

(1177, 561)

In [95]:
non_dyna_train = pd.DataFrame(x_train[y_train['Activity_dynamic'] == 0], columns = x_train.columns)
non_dyna_train.shape

(2577, 561)

In [96]:
dyna_train = pd.DataFrame(x_train[y_train['Activity_dynamic'] == 1], columns = x_train.columns)
dyna_train.shape

(2127, 561)

In [97]:
x_train.shape

(4704, 561)

In [77]:
def result(model):
    model.fit(x_train, y2_train)
    y_pred = model.predict(x_val)

    print(confusion_matrix(y2_val, y_pred))
    print(classification_report(y2_val, y_pred))

#### 2) 알고리즘2 : 

In [33]:
model = RandomForestClassifier()
result(model)

[[656   1]
 [  0 520]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       657
           1       1.00      1.00      1.00       520

    accuracy                           1.00      1177
   macro avg       1.00      1.00      1.00      1177
weighted avg       1.00      1.00      1.00      1177



In [36]:
model = XGBClassifier()
result(model)

[[657   0]
 [  0 520]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       657
           1       1.00      1.00      1.00       520

    accuracy                           1.00      1177
   macro avg       1.00      1.00      1.00      1177
weighted avg       1.00      1.00      1.00      1177



In [37]:
model = SVC()
result(model)

[[657   0]
 [  0 520]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       657
           1       1.00      1.00      1.00       520

    accuracy                           1.00      1177
   macro avg       1.00      1.00      1.00      1177
weighted avg       1.00      1.00      1.00      1177



In [None]:
model = CatBoostClassifier(iterations = 10, depth = 5)
result(model)

### (2) 단계2-1 : 정적 동작 세부 분류

* 세부 요구사항
    * 정적 행동(Laying, Sitting, Standing)인 데이터 추출
    * Laying, Sitting, Standing 를 분류하는 모델을 생성
    * 몇가지 모델을 만들고 가장 성능이 좋은 모델을 선정하시오.

In [41]:
data_temp = data[np.isin(data['Activity'], ['LAYING', 'SITTING', 'STANDING'])]

x = data_temp.drop(['Activity', 'Activity_dynamic'], axis = 1)
y = data_temp.loc[:, 'Activity']

In [42]:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size = 0.2, random_state = 2023)

In [43]:
def result2(model):
    model.fit(x_train, y_train)
    y_pred = model.predict(x_val)

    print(confusion_matrix(y_val, y_pred))
    print(classification_report(y_val, y_pred))

In [45]:
model = LogisticRegression(max_iter = 1000)
result2(model)

[[208   0   0]
 [  0 204   7]
 [  0  11 217]]
              precision    recall  f1-score   support

      LAYING       1.00      1.00      1.00       208
     SITTING       0.95      0.97      0.96       211
    STANDING       0.97      0.95      0.96       228

    accuracy                           0.97       647
   macro avg       0.97      0.97      0.97       647
weighted avg       0.97      0.97      0.97       647



In [46]:
model = RandomForestClassifier()
result2(model)

[[208   0   0]
 [  0 203   8]
 [  0   7 221]]
              precision    recall  f1-score   support

      LAYING       1.00      1.00      1.00       208
     SITTING       0.97      0.96      0.96       211
    STANDING       0.97      0.97      0.97       228

    accuracy                           0.98       647
   macro avg       0.98      0.98      0.98       647
weighted avg       0.98      0.98      0.98       647



In [48]:
model = SVC()
result2(model)

[[208   0   0]
 [  1 198  12]
 [  0  19 209]]
              precision    recall  f1-score   support

      LAYING       1.00      1.00      1.00       208
     SITTING       0.91      0.94      0.93       211
    STANDING       0.95      0.92      0.93       228

    accuracy                           0.95       647
   macro avg       0.95      0.95      0.95       647
weighted avg       0.95      0.95      0.95       647



In [49]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_train_le = le.fit_transform(y_train)
y_val_le = le.transform(y_val)

In [51]:
model = XGBClassifier()
model.fit(x_train, y_train_le)
y_pred = model.predict(x_val)

print(confusion_matrix(y_val_le, y_pred))
print(classification_report(y_val_le, y_pred))

[[208   0   0]
 [  0 207   4]
 [  0   3 225]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       208
           1       0.99      0.98      0.98       211
           2       0.98      0.99      0.98       228

    accuracy                           0.99       647
   macro avg       0.99      0.99      0.99       647
weighted avg       0.99      0.99      0.99       647



In [None]:
model = CatBoostClassifier(iterations = 10, depth = 5)
result2(model)

### (3) 단계2-2 : 동적 동작 세부 분류

* 세부 요구사항
    * 동적 행동(Walking, Walking Upstairs, Walking Downstairs)인 데이터 추출
    * Walking, Walking Upstairs, Walking Downstairs 를 분류하는 모델을 생성
    * 몇가지 모델을 만들고 가장 성능이 좋은 모델을 선정하시오.

In [53]:
data_temp = data[np.isin(data['Activity'], ['WALKING', 'WALKING_UPSTAIRS', 'WALKING_DOWNSTAIRS'])]

x = data_temp.drop(['Activity', 'Activity_dynamic'], axis = 1)
y = data_temp.loc[:, 'Activity']

x_train, x_val, y_train, y_val = train_test_split(x, y, test_size = 0.2, random_state = 2023)

In [59]:
model = LogisticRegression(max_iter = 1000)
result2(model)

[[180   0   1]
 [  0 164   0]
 [  1   0 184]]
                    precision    recall  f1-score   support

           WALKING       0.99      0.99      0.99       181
WALKING_DOWNSTAIRS       1.00      1.00      1.00       164
  WALKING_UPSTAIRS       0.99      0.99      0.99       185

          accuracy                           1.00       530
         macro avg       1.00      1.00      1.00       530
      weighted avg       1.00      1.00      1.00       530



In [60]:
model = RandomForestClassifier()
result2(model)

[[175   5   1]
 [  2 161   1]
 [  0   5 180]]
                    precision    recall  f1-score   support

           WALKING       0.99      0.97      0.98       181
WALKING_DOWNSTAIRS       0.94      0.98      0.96       164
  WALKING_UPSTAIRS       0.99      0.97      0.98       185

          accuracy                           0.97       530
         macro avg       0.97      0.97      0.97       530
      weighted avg       0.97      0.97      0.97       530



In [61]:
le = LabelEncoder()
y_train_le = le.fit_transform(y_train)
y_val_le = le.transform(y_val)

In [62]:
model = XGBClassifier()
model.fit(x_train, y_train_le)
y_pred = model.predict(x_val)

print(confusion_matrix(y_val_le, y_pred))
print(classification_report(y_val_le, y_pred))

[[177   3   1]
 [  0 163   1]
 [  0   2 183]]
              precision    recall  f1-score   support

           0       1.00      0.98      0.99       181
           1       0.97      0.99      0.98       164
           2       0.99      0.99      0.99       185

    accuracy                           0.99       530
   macro avg       0.99      0.99      0.99       530
weighted avg       0.99      0.99      0.99       530



In [67]:
model = CatBoostClassifier(iterations = 10, depth = 5)
result2(model)

Learning rate set to 0.5
0:	learn: 0.5835357	total: 727ms	remaining: 6.54s
1:	learn: 0.4027012	total: 1.05s	remaining: 4.21s
2:	learn: 0.3138427	total: 1.39s	remaining: 3.23s
3:	learn: 0.2429915	total: 1.71s	remaining: 2.56s
4:	learn: 0.2002614	total: 1.89s	remaining: 1.89s
5:	learn: 0.1700071	total: 2.08s	remaining: 1.39s
6:	learn: 0.1479377	total: 2.26s	remaining: 968ms
7:	learn: 0.1341510	total: 2.44s	remaining: 609ms
8:	learn: 0.1124116	total: 2.63s	remaining: 292ms
9:	learn: 0.0947465	total: 2.82s	remaining: 0us
[[174   4   3]
 [  4 157   3]
 [  0   6 179]]
                    precision    recall  f1-score   support

           WALKING       0.98      0.96      0.97       181
WALKING_DOWNSTAIRS       0.94      0.96      0.95       164
  WALKING_UPSTAIRS       0.97      0.97      0.97       185

          accuracy                           0.96       530
         macro avg       0.96      0.96      0.96       530
      weighted avg       0.96      0.96      0.96       530



### (4) 분류 모델 합치기


* 세부 요구사항
    * 두 단계 모델을 통합하고, 새로운 데이터(test)에 대해서 최종 예측결과와 성능평가가 나오도록 함수로 만들기
    * 데이터 파이프라인 구축 : test데이터가 로딩되어 전처리 과정을 거치고, 예측 및 성능 평가 수행

![](https://github.com/DA4BAM/image/blob/main/pipeline%20function.png?raw=true)

#### 1) 함수 만들기

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# from catboost import CatBoostClassifier   # catboost 설치 후 쓸 것

from sklearn.metrics import *

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [186]:
data = pd.read_csv('/content/drive/MyDrive/aivle/5차 미니프로젝트/data01_train.csv')
data.drop('subject', axis = 1, inplace = True)

data['Activity_dynamic'] = 0
data['Activity_dynamic'] = np.where((data['Activity'] == 'STANDING') | (data['Activity'] == 'SITTING') | (data['Activity'] == 'LAYING'), 0, 1)

x = data.drop(['Activity', 'Activity_dynamic'], axis = 1)
y = data.loc[:, ['Activity', 'Activity_dynamic']]

x_train, x_val, y_train, y_val = train_test_split(x, y, test_size = 0.2, random_state = 2023)
x_train.shape, x_val.shape, y_train.shape, y_val.shape

y1_train = y_train.loc[:, 'Activity']
y2_train = y_train.loc[:, 'Activity_dynamic']
y1_val = y_val.loc[:, 'Activity']
y2_val = y_val.loc[:, 'Activity_dynamic']

In [189]:
# 1단계: 동적, 정적 이진 분류 모델
model1 = LogisticRegression(max_iter = 3000)
model1.fit(x_train, y2_train)
y_pred1 = model1.predict(x_val)

# 정적인 경우의 x_val
non_x_val = pd.DataFrame(x_val[y_pred1 == 0], columns = x_val.columns)
# 동적인 경우의 x_val
dyna_x_val = pd.DataFrame(x_val[y_pred1 == 1], columns = x_val.columns)


# 2단계: 3-Class 분류 모델
# 모델 2-1: 정적인 경우
model2_1 = LogisticRegression(max_iter = 3000)
model2_1.fit(x_train, y1_train)
y_pred2_1 = model2_1.predict(non_x_val)


# 모델 2-2: 동적인 경우
model2_2 = LogisticRegression(max_iter = 3000)
model2_2.fit(x_train, y1_train)
y_pred2_2 = model2_2.predict(dyna_x_val)


# 3단계: 예측 결과 및 성능 평가 결과 출력
# 예측 값에 대한 x_val 데이터프레임의 인덱스 달아주기
dyna_x_val['Activity'] = y_pred2_2
non_x_val['Activity'] = y_pred2_1
final = pd.concat([dyna_x_val, non_x_val], axis = 0)

# 예측 값과 y_val 둘의 인덱스 정렬해주기
# 현재 예측 값의 index는 뒤죽박죽임 -> y_val을 기준으로 정렬
final_y = final.loc[:, 'Activity']
final_df = pd.DataFrame(final_y).rename(columns = {'Activity' : 'Predicted'})
temp = pd.DataFrame(y1_val)
merge_df = pd.merge(temp, final_df, left_index=True, right_index = True)

# 실제 값과 예측 값 분리
validation = merge_df.loc[:, 'Activity']
predicted = merge_df.loc[:, 'Predicted']

print(confusion_matrix(validation, predicted))
print(classification_report(validation, predicted))

[[230   0   0   0   0   0]
 [  0 203   6   0   0   0]
 [  0   6 212   0   0   0]
 [  0   0   0 180   0   0]
 [  0   0   0   0 165   0]
 [  0   0   0   2   0 173]]
                    precision    recall  f1-score   support

            LAYING       1.00      1.00      1.00       230
           SITTING       0.97      0.97      0.97       209
          STANDING       0.97      0.97      0.97       218
           WALKING       0.99      1.00      0.99       180
WALKING_DOWNSTAIRS       1.00      1.00      1.00       165
  WALKING_UPSTAIRS       1.00      0.99      0.99       175

          accuracy                           0.99      1177
         macro avg       0.99      0.99      0.99      1177
      weighted avg       0.99      0.99      0.99      1177



In [194]:
def model_eval(first, second, third):
    '''this function requires three models as parameters.
    params:
        first : model for dynamic and static classification 
        second : 3-classification model for static case
        third : 3-classification model for dynamic case
    print:
        confusion_matrix
        classification_report
    '''

    # 1단계: 동적, 정적 이진 분류 모델
    model1 = first
    model1.fit(x_train, y2_train)
    y_pred1 = model1.predict(x_val)

    # 정적인 경우의 x_val
    stat_x_val = pd.DataFrame(x_val[y_pred1 == 0], columns = x_val.columns)
    # 동적인 경우의 x_val
    dyna_x_val = pd.DataFrame(x_val[y_pred1 == 1], columns = x_val.columns)


    # 2단계: 3-Class 분류 모델
    # 모델 2-1: 정적인 경우
    model2_1 = second
    model2_1.fit(x_train, y1_train)
    y_pred2_1 = model2_1.predict(stat_x_val)


    # 모델 2-2: 동적인 경우
    model2_2 = third
    model2_2.fit(x_train, y1_train)
    y_pred2_2 = model2_2.predict(dyna_x_val)


    # 3단계: 예측 결과 및 성능 평가 결과 출력
    # 예측 값에 대한 x_val 데이터프레임의 인덱스 달아주기
    dyna_x_val['Activity'] = y_pred2_2
    stat_x_val['Activity'] = y_pred2_1
    final = pd.concat([dyna_x_val, stat_x_val], axis = 0)

    # 예측 값과 y_val 둘의 인덱스 정렬해주기
    # 현재 예측 값의 index는 뒤죽박죽임 -> y_val을 기준으로 정렬
    final_y = final.loc[:, 'Activity']
    final_df = pd.DataFrame(final_y).rename(columns = {'Activity' : 'Predicted'})
    temp = pd.DataFrame(y1_val)
    merge_df = pd.merge(temp, final_df, left_index=True, right_index = True)

    # 실제 값과 예측 값 분리
    validation = merge_df.loc[:, 'Activity']
    predicted = merge_df.loc[:, 'Predicted']

    print(confusion_matrix(validation, predicted))
    print(classification_report(validation, predicted))

In [195]:
model1 = LogisticRegression()
model2 = RandomForestClassifier()
model3 = RandomForestClassifier()

model_eval(model1, model2, model3)

[[230   0   0   0   0   0]
 [  0 195  14   0   0   0]
 [  0   5 213   0   0   0]
 [  0   0   0 176   3   1]
 [  0   0   0   2 161   2]
 [  0   0   0   0   2 173]]
                    precision    recall  f1-score   support

            LAYING       1.00      1.00      1.00       230
           SITTING       0.97      0.93      0.95       209
          STANDING       0.94      0.98      0.96       218
           WALKING       0.99      0.98      0.98       180
WALKING_DOWNSTAIRS       0.97      0.98      0.97       165
  WALKING_UPSTAIRS       0.98      0.99      0.99       175

          accuracy                           0.98      1177
         macro avg       0.98      0.98      0.98      1177
      weighted avg       0.98      0.98      0.98      1177

