### 1. 필요한 패키지 임포트

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# from catboost import CatBoostClassifier   # catboost 설치 후 쓸 것

from sklearn.metrics import *

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### 2. 데이터 불러오기

In [318]:
data = pd.read_csv('/content/drive/MyDrive/aivle/5차 미니프로젝트/train_dataset.csv')
test = pd.read_csv('/content/drive/MyDrive/aivle/5차 미니프로젝트/test_dataset.csv')

In [292]:
data['temp'] = data['Activity_dynamic']

In [293]:
data = data.drop('Activity_dynamic', axis = 1)

In [294]:
data['Activity_dynamic'] = np.where((data['Activity'] == 'STANDING') | (data['Activity'] == 'SITTING') | (data['Activity'] == 'LAYING'), 0, 1)
np.unique(data['Activity_dynamic'])

array([0, 1])

In [295]:
data['Activity'] = data['temp']
data.drop('temp', axis = 1, inplace = True)

In [296]:
x = data.drop(['Activity', 'Activity_dynamic'], axis = 1)
y = data.loc[:, ['Activity', 'Activity_dynamic']]

In [297]:
test_size = 0.2

In [298]:
# train set, valid set
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size = test_size)
x_train.shape, x_val.shape, y_train.shape, y_val.shape

((4704, 50), (1177, 50), (4704, 2), (1177, 2))

In [299]:
# Activity, Activity_dynamic
# y1 = Activity, y2 = Activity_dynamic
y1_train = y_train.loc[:, 'Activity']
y2_train = y_train.loc[:, 'Activity_dynamic']
y1_val = y_val.loc[:, 'Activity']
y2_val = y_val.loc[:, 'Activity_dynamic']

In [302]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
x_train_s = scaler.fit_transform(x_train)
x_val_s = scaler.transform(x_val)

### 3. 성능 예측
- 어제 만든 함수 활용

In [303]:
def model_eval(first, second, third):
    '''this function requires three models as parameters.
    params:
        first : model for dynamic and static classification 
        second : 3-classification model for static case
        third : 3-classification model for dynamic case
    print:
        confusion_matrix
        classification_report
    '''

    # 1단계: 동적, 정적 이진 분류 모델
    model1 = first
    model1.fit(x_train, y2_train)
    y_pred1 = model1.predict(x_val)

    # 정적인 경우의 x_val
    stat_x_val = pd.DataFrame(x_val[y_pred1 == 0], columns = x_val.columns)
    # 동적인 경우의 x_val
    dyna_x_val = pd.DataFrame(x_val[y_pred1 == 1], columns = x_val.columns)


    # 2단계: 3-Class 분류 모델
    # 모델 2-1: 정적인 경우
    model2_1 = second
    model2_1.fit(x_train, y1_train)
    y_pred2_1 = model2_1.predict(stat_x_val)


    # 모델 2-2: 동적인 경우
    model2_2 = third
    model2_2.fit(x_train, y1_train)
    y_pred2_2 = model2_2.predict(dyna_x_val)


    # 3단계: 예측 결과 및 성능 평가 결과 출력
    # 예측 값에 대한 x_val 데이터프레임의 인덱스 달아주기
    dyna_x_val['Activity'] = y_pred2_2
    stat_x_val['Activity'] = y_pred2_1
    final = pd.concat([dyna_x_val, stat_x_val], axis = 0)

    # 예측 값과 y_val 둘의 인덱스 정렬해주기
    # 현재 예측 값의 index는 뒤죽박죽임 -> y_val을 기준으로 정렬
    final_y = final.loc[:, 'Activity']
    final_df = pd.DataFrame(final_y).rename(columns = {'Activity' : 'Predicted'})
    temp = pd.DataFrame(y1_val)
    merge_df = pd.merge(temp, final_df, left_index=True, right_index = True)

    # 실제 값과 예측 값 분리
    validation = merge_df.loc[:, 'Activity']
    predicted = merge_df.loc[:, 'Predicted']

    print(confusion_matrix(validation, predicted))
    print(classification_report(validation, predicted))

In [304]:
model1 = LogisticRegression(max_iter = 3000)
model2 = XGBClassifier()
model3 = XGBClassifier()

model_eval(model1, model2, model3)

[[172  13  14   0   0   0]
 [ 12 143  11   0   0   0]
 [ 11  15 123   0   0   0]
 [  0   1   0 182  19  25]
 [  0   0   0  15 164  51]
 [  0   0   0  37  43 126]]
              precision    recall  f1-score   support

           0       0.88      0.86      0.87       199
           1       0.83      0.86      0.85       166
           2       0.83      0.83      0.83       149
           3       0.78      0.80      0.79       227
           4       0.73      0.71      0.72       230
           5       0.62      0.61      0.62       206

    accuracy                           0.77      1177
   macro avg       0.78      0.78      0.78      1177
weighted avg       0.77      0.77      0.77      1177



In [29]:
np.unique(y2_train)

array([0, 1, 2, 3, 4, 5])

### 4. 적합한 모델 찾기
- 정확도가 너무 낮다

In [41]:
model = LogisticRegression()
model.fit(x_train, y2_train)
y_pred = model.predict(x_val)

print(confusion_matrix(y2_val, y_pred))
print(classification_report(y2_val, y_pred))

[[639   2]
 [  1 535]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       641
           1       1.00      1.00      1.00       536

    accuracy                           1.00      1177
   macro avg       1.00      1.00      1.00      1177
weighted avg       1.00      1.00      1.00      1177



In [42]:
model = RandomForestClassifier()
model.fit(x_train, y2_train)
y_pred = model.predict(x_val)

print(confusion_matrix(y2_val, y_pred))
print(classification_report(y2_val, y_pred))

[[639   2]
 [  0 536]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       641
           1       1.00      1.00      1.00       536

    accuracy                           1.00      1177
   macro avg       1.00      1.00      1.00      1177
weighted avg       1.00      1.00      1.00      1177



In [43]:
model = XGBClassifier()
model.fit(x_train, y2_train)
y_pred = model.predict(x_val)

print(confusion_matrix(y2_val, y_pred))
print(classification_report(y2_val, y_pred))

[[639   2]
 [  0 536]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       641
           1       1.00      1.00      1.00       536

    accuracy                           1.00      1177
   macro avg       1.00      1.00      1.00      1177
weighted avg       1.00      1.00      1.00      1177



#### 결론
- 첫 번째 모델은 어떤 것을 써도 비슷하다

In [118]:
data_temp = data[np.isin(data['Activity'], ['LAYING', 'SITTING', 'STANDING'])].drop('Activity_dynamic', axis = 1)

x = data_temp.drop(['Activity'], axis = 1)
y = data_temp.loc[:, 'Activity']
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size = 0.2)

In [46]:
np.unique(y_train)

array(['LAYING', 'SITTING', 'STANDING'], dtype=object)

In [48]:
model = LogisticRegression(max_iter = 2500)
model.fit(x_train, y_train)
y_pred = model.predict(x_val)

print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

[[152  43  28]
 [ 45  87  76]
 [ 21  42 153]]
              precision    recall  f1-score   support

      LAYING       0.70      0.68      0.69       223
     SITTING       0.51      0.42      0.46       208
    STANDING       0.60      0.71      0.65       216

    accuracy                           0.61       647
   macro avg       0.60      0.60      0.60       647
weighted avg       0.60      0.61      0.60       647



In [59]:
model = RandomForestClassifier()
model.fit(x_train, y_train)
y_pred = model.predict(x_val)

print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

[[172  32  22]
 [ 52 104  62]
 [ 28  36 139]]
              precision    recall  f1-score   support

      LAYING       0.68      0.76      0.72       226
     SITTING       0.60      0.48      0.53       218
    STANDING       0.62      0.68      0.65       203

    accuracy                           0.64       647
   macro avg       0.64      0.64      0.64       647
weighted avg       0.64      0.64      0.64       647



In [50]:
model = SVC()
model.fit(x_train, y_train)
y_pred = model.predict(x_val)

print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

[[164  40  19]
 [ 62  76  70]
 [ 31  42 143]]
              precision    recall  f1-score   support

      LAYING       0.64      0.74      0.68       223
     SITTING       0.48      0.37      0.42       208
    STANDING       0.62      0.66      0.64       216

    accuracy                           0.59       647
   macro avg       0.58      0.59      0.58       647
weighted avg       0.58      0.59      0.58       647



In [58]:
model = DecisionTreeClassifier()
model.fit(x_train, y_train)
y_pred = model.predict(x_val)

print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

[[125  53  48]
 [ 49  91  78]
 [ 37  66 100]]
              precision    recall  f1-score   support

      LAYING       0.59      0.55      0.57       226
     SITTING       0.43      0.42      0.43       218
    STANDING       0.44      0.49      0.47       203

    accuracy                           0.49       647
   macro avg       0.49      0.49      0.49       647
weighted avg       0.49      0.49      0.49       647



In [60]:
model = KNeighborsClassifier()
model.fit(x_train, y_train)
y_pred = model.predict(x_val)

print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

[[148  49  29]
 [ 70  92  56]
 [ 53  57  93]]
              precision    recall  f1-score   support

      LAYING       0.55      0.65      0.60       226
     SITTING       0.46      0.42      0.44       218
    STANDING       0.52      0.46      0.49       203

    accuracy                           0.51       647
   macro avg       0.51      0.51      0.51       647
weighted avg       0.51      0.51      0.51       647



In [119]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_train_le = le.fit_transform(y_train)
y_val_le = le.transform(y_val)

In [120]:
model = XGBClassifier()
model.fit(x_train, y_train_le)
y_pred = model.predict(x_val)

print(confusion_matrix(y_val_le, y_pred))
print(classification_report(y_val_le, y_pred))

[[185  26  13]
 [ 45 114  61]
 [ 19  44 140]]
              precision    recall  f1-score   support

           0       0.74      0.83      0.78       224
           1       0.62      0.52      0.56       220
           2       0.65      0.69      0.67       203

    accuracy                           0.68       647
   macro avg       0.67      0.68      0.67       647
weighted avg       0.67      0.68      0.67       647



In [None]:
!pip install catboost

In [66]:
from catboost import CatBoostClassifier

In [None]:
model = CatBoostClassifier(iterations = 100, depth = 6)
model.fit(x_train, y_train)
y_pred = model.predict(x_val)

print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

#### 결론
- 모델2에서는 xgb모델이 좋다

### 4. submission 생성

In [416]:
data = pd.read_csv('/content/drive/MyDrive/aivle/5차 미니프로젝트/train_dataset.csv')
test = pd.read_csv('/content/drive/MyDrive/aivle/5차 미니프로젝트/test_dataset.csv')
data['temp'] = data['Activity_dynamic']
data = data.drop('Activity_dynamic', axis = 1)
data['Activity_dynamic'] = np.where((data['Activity'] == 'STANDING') | (data['Activity'] == 'SITTING') | (data['Activity'] == 'LAYING'), 0, 1)
data['Activity'] = data['temp']
data.drop('temp', axis = 1, inplace = True)

In [417]:
x_train = data.drop(['Activity', 'Activity_dynamic'], axis = 1)
y_train = data.loc[:, ['Activity', 'Activity_dynamic']]

In [360]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
test = scaler.transform(test)

In [361]:
test = pd.DataFrame(test)
test.shape

(1471, 50)

In [407]:
new_cols = list(range(len(x_train.columns)))
x_train.columns = new_cols
x_train.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,-0.103362,-0.094784,-0.109208,0.068584,-0.014314,0.1986,-0.073666,0.113098,-0.058315,0.35427,...,-0.999669,-0.999719,-0.996607,-0.999969,-0.985141,-1.0,-0.975435,-0.993965,-0.032207,-0.042494
1,-0.098163,-0.052478,0.464764,0.075947,0.004456,0.13031,0.401913,0.512569,0.011987,0.503466,...,-0.999818,-0.999583,-0.992681,-0.999941,-0.988982,-1.0,-0.978534,-0.994733,-0.246705,-0.062899
2,-0.108717,-0.226718,0.301165,0.07953,0.014504,0.11134,0.263556,0.56273,0.314589,0.132061,...,-0.999566,-0.999278,-0.999971,-0.999976,-0.991535,-1.0,-0.991953,-0.9922,0.388765,0.000265


In [408]:
new_cols = list(range(len(x_train.columns)))
test.columns = new_cols
test.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,-0.116683,0.059275,0.00505,0.074204,0.005466,0.22913,0.064325,0.20852,0.128505,0.178639,...,-0.999635,-0.999936,-0.998551,-0.999986,-0.99535,-1.0,-0.992586,-0.995778,0.255432,-0.018043
1,-0.118412,-0.219041,-0.255267,0.076562,0.000427,0.027814,-0.278551,-0.008195,0.044317,-0.057602,...,-0.999677,-0.999838,-0.997029,-0.999954,-0.984526,-1.0,-0.970626,-0.991971,-0.166341,-0.022456
2,-0.087303,0.552368,-0.025095,0.069317,0.25178,0.586216,-0.07961,0.096668,-0.219583,-0.469174,...,-0.907521,-0.825203,-0.983423,-0.971489,-0.169575,0.629909,-0.428692,-0.653058,0.468354,-0.362616


In [418]:
y1_train = y_train.loc[:, 'Activity']
y2_train = y_train.loc[:, 'Activity_dynamic']

In [None]:
# 1단계: 동적, 정적 이진 분류 모델
model1 = LogisticRegression()
model1.fit(x_train, y2_train)
y_pred1 = model1.predict(test)

# 정적인 경우의 test
stat_test = pd.DataFrame(test[y_pred1 == 0], columns = test.columns)
# 동적인 경우의 test
dyna_test = pd.DataFrame(test[y_pred1 == 1], columns = test.columns)


# 2단계: 3-Class 분류 모델
# 모델 2-1: 정적인 경우
model2_1 = CatBoostClassifier()
model2_1.fit(x_train, y1_train)
y_pred2_1 = model2_1.predict(stat_test)


# 모델 2-2: 동적인 경우
model2_2 = CatBoostClassifier()
model2_2.fit(x_train, y1_train)
y_pred2_2 = model2_2.predict(dyna_test)


# 3단계: 예측 결과 및 성능 평가 결과 출력
# 예측 값에 대한 test 데이터프레임의 인덱스 달아주기
dyna_test['Activity'] = y_pred2_2
stat_test['Activity'] = y_pred2_1
final = pd.concat([dyna_test, stat_test], axis = 0)

In [421]:
final = final.sort_index()
final.head(5)

Unnamed: 0,tBodyAcc-mean()-Z,"tBodyAcc-arCoeff()-X,2","tBodyAcc-arCoeff()-Y,1",tBodyAccJerk-mean()-X,tBodyAccJerk-mean()-Y,"tBodyAccJerk-arCoeff()-X,2","tBodyAccJerk-arCoeff()-Y,1","tBodyAccJerk-arCoeff()-Y,3","tBodyAccJerk-arCoeff()-Z,1","tBodyGyro-arCoeff()-Z,3",...,"fBodyAccJerk-bandsEnergy()-49,64.2",fBodyGyro-min()-Y,"fBodyGyro-bandsEnergy()-49,64.1",fBodyBodyAccJerkMag-iqr(),fBodyBodyAccJerkMag-entropy(),fBodyBodyGyroMag-mean(),fBodyBodyGyroJerkMag-min(),fBodyBodyGyroJerkMag-meanFreq(),"angle(tBodyAccMean,gravity)",Activity
0,-0.116683,0.059275,0.00505,0.074204,0.005466,0.22913,0.064325,0.20852,0.128505,0.178639,...,-0.999936,-0.998551,-0.999986,-0.99535,-1.0,-0.992586,-0.995778,0.255432,-0.018043,4
1,-0.118412,-0.219041,-0.255267,0.076562,0.000427,0.027814,-0.278551,-0.008195,0.044317,-0.057602,...,-0.999838,-0.997029,-0.999954,-0.984526,-1.0,-0.970626,-0.991971,-0.166341,-0.022456,4
2,-0.087303,0.552368,-0.025095,0.069317,0.25178,0.586216,-0.07961,0.096668,-0.219583,-0.469174,...,-0.825203,-0.983423,-0.971489,-0.169575,0.629909,-0.428692,-0.653058,0.468354,-0.362616,0
3,-0.108381,-0.097072,0.198134,0.078063,-0.003247,0.235713,0.110652,-0.056465,0.164869,0.319642,...,-0.999625,-0.998581,-0.999995,-0.992776,-1.0,-0.993341,-0.996087,0.337635,0.289548,5
4,-0.078856,0.202431,-0.546042,0.066171,0.089363,0.20581,-0.57641,-0.394336,0.111303,0.194097,...,-0.997885,-0.995305,-0.999823,-0.958406,-0.839332,-0.779465,-0.98157,-0.594792,0.010111,3


In [422]:
pred = final.loc[:, 'Activity']

In [423]:
pred_df = pd.DataFrame(pred)
pred_df.head()

Unnamed: 0,Activity
0,4
1,4
2,0
3,5
4,3


In [424]:
pred_df['ID'] = pred_df.index
pred_df.head()

Unnamed: 0,Activity,ID
0,4,0
1,4,1
2,0,2
3,5,3
4,3,4


In [425]:
pred_df.to_csv('pred13.csv', index = False)

### First try: 0.76070
- 더 나은 방안 찾기
- 1. gridsearch로 최적의 하이퍼 파라미터 찾기
- 2. 데이터 전처리 손보기 

In [111]:
from sklearn.model_selection import GridSearchCV

In [None]:
model = XGBClassifier()
model.fit(x_train, y_train_le)
y_pred = model.predict(x_val)

print(confusion_matrix(y_val_le, y_pred))
print(classification_report(y_val_le, y_pred))

In [121]:
model = XGBClassifier()

params = {
    'n_estimators': list(range(10, 31, 5)),
    'max_depth': list(range(1, 31, 5))
}

grid = GridSearchCV(model,
                    params,
                    cv = 5)

grid.fit(x_train, y_train_le)

y_pred = grid.predict(x_val)

In [122]:
print('best param: ', grid.best_params_)

best param:  {'max_depth': 6, 'n_estimators': 30}


- GridSearch시 성능 하락..
- 데이터 전처리 시도

In [None]:
data.columns

In [141]:
model1 = LogisticRegression()
model2 = XGBClassifier()
model3 = XGBClassifier()

# walking upstair f1-score: 0.58
model_eval(model1, model2, model3)

[[186  15  12   0   0   0]
 [ 13 165  10   0   0   0]
 [ 10  23 127   0   0   0]
 [  0   0   0 162   6  35]
 [  0   0   0  22 144  38]
 [  0   1   0  44  50 114]]
              precision    recall  f1-score   support

           0       0.89      0.87      0.88       213
           1       0.81      0.88      0.84       188
           2       0.85      0.79      0.82       160
           3       0.71      0.80      0.75       203
           4       0.72      0.71      0.71       204
           5       0.61      0.55      0.58       209

    accuracy                           0.76      1177
   macro avg       0.77      0.77      0.76      1177
weighted avg       0.76      0.76      0.76      1177



In [143]:
data = pd.read_csv('/content/drive/MyDrive/aivle/5차 미니프로젝트/train_dataset.csv')
data_temp = data[np.isin(data['Activity'], ['WALKING', 'WALKING_UPSTAIRS', 'WALKING_DOWNSTAIRS'])]

x = data_temp.drop(['Activity', 'Activity_dynamic'], axis = 1)
y = data_temp.loc[:, 'Activity']

x_train, x_val, y_train, y_val = train_test_split(x, y, test_size = 0.2, random_state = 2023)

In [146]:
x_train.shape

(2117, 50)

In [147]:
model = LogisticRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_val)
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

[[143  22  16]
 [ 20 130  14]
 [ 27  23 135]]
                    precision    recall  f1-score   support

           WALKING       0.75      0.79      0.77       181
WALKING_DOWNSTAIRS       0.74      0.79      0.77       164
  WALKING_UPSTAIRS       0.82      0.73      0.77       185

          accuracy                           0.77       530
         macro avg       0.77      0.77      0.77       530
      weighted avg       0.77      0.77      0.77       530



In [148]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_train_le = le.fit_transform(y_train)
y_val_le = le.transform(y_val)

In [150]:
model = XGBClassifier()
model.fit(x_train, y_train_le)
y_pred = model.predict(x_val)
print(confusion_matrix(y_val_le, y_pred))
print(classification_report(y_val_le, y_pred))

[[152  18  11]
 [ 18 130  16]
 [ 16  17 152]]
              precision    recall  f1-score   support

           0       0.82      0.84      0.83       181
           1       0.79      0.79      0.79       164
           2       0.85      0.82      0.84       185

    accuracy                           0.82       530
   macro avg       0.82      0.82      0.82       530
weighted avg       0.82      0.82      0.82       530



In [151]:
model = RandomForestClassifier()
model.fit(x_train, y_train)
y_pred = model.predict(x_val)
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

[[137  27  17]
 [ 26 117  21]
 [ 29  22 134]]
                    precision    recall  f1-score   support

           WALKING       0.71      0.76      0.73       181
WALKING_DOWNSTAIRS       0.70      0.71      0.71       164
  WALKING_UPSTAIRS       0.78      0.72      0.75       185

          accuracy                           0.73       530
         macro avg       0.73      0.73      0.73       530
      weighted avg       0.73      0.73      0.73       530



- model2-2도 xgb가 좋아보이며, 모든 train set을 사용(valid 안나눔)하여 모델링 결과 0.77498

### Third Try: 0.77498

- train set을 더 늘려보자
3,4일차에 쓴 train data에서 요소 50개 그대로 뽑아서 사용

In [162]:
data2 = pd.read_csv('/content/drive/MyDrive/aivle/5차 미니프로젝트/data01_train.csv')
data2.drop('subject', axis = 1, inplace = True)

In [None]:
x_train.columns

In [168]:
target = []
for col in x_train.columns:
    target.append(col)

In [169]:
data2_temp = data2.loc[:, target]

In [171]:
data2_temp.head(3)

Unnamed: 0,tBodyAcc-mean()-Z,"tBodyAcc-arCoeff()-X,2","tBodyAcc-arCoeff()-Y,1",tBodyAccJerk-mean()-X,tBodyAccJerk-mean()-Y,"tBodyAccJerk-arCoeff()-X,2","tBodyAccJerk-arCoeff()-Y,1","tBodyAccJerk-arCoeff()-Y,3","tBodyAccJerk-arCoeff()-Z,1","tBodyGyro-arCoeff()-Z,3",...,"fBodyAccJerk-bandsEnergy()-41,48.2","fBodyAccJerk-bandsEnergy()-49,64.2",fBodyGyro-min()-Y,"fBodyGyro-bandsEnergy()-49,64.1",fBodyBodyAccJerkMag-iqr(),fBodyBodyAccJerkMag-entropy(),fBodyBodyGyroMag-mean(),fBodyBodyGyroJerkMag-min(),fBodyBodyGyroJerkMag-meanFreq(),"angle(tBodyAccMean,gravity)"
0,-0.103362,-0.094784,-0.109208,0.068584,-0.014314,0.1986,-0.073666,0.113098,-0.058315,0.35427,...,-0.999669,-0.999719,-0.996607,-0.999969,-0.985141,-1.0,-0.975435,-0.993965,-0.032207,-0.042494
1,-0.098163,-0.052478,0.464764,0.075947,0.004456,0.13031,0.401913,0.512569,0.011987,0.503466,...,-0.999818,-0.999583,-0.992681,-0.999941,-0.988982,-1.0,-0.978534,-0.994733,-0.246705,-0.062899
2,-0.108717,-0.226718,0.301165,0.07953,0.014504,0.11134,0.263556,0.56273,0.314589,0.132061,...,-0.999566,-0.999278,-0.999971,-0.999976,-0.991535,-1.0,-0.991953,-0.9922,0.388765,0.000265


In [172]:
x_train.head(3)

Unnamed: 0,tBodyAcc-mean()-Z,"tBodyAcc-arCoeff()-X,2","tBodyAcc-arCoeff()-Y,1",tBodyAccJerk-mean()-X,tBodyAccJerk-mean()-Y,"tBodyAccJerk-arCoeff()-X,2","tBodyAccJerk-arCoeff()-Y,1","tBodyAccJerk-arCoeff()-Y,3","tBodyAccJerk-arCoeff()-Z,1","tBodyGyro-arCoeff()-Z,3",...,"fBodyAccJerk-bandsEnergy()-41,48.2","fBodyAccJerk-bandsEnergy()-49,64.2",fBodyGyro-min()-Y,"fBodyGyro-bandsEnergy()-49,64.1",fBodyBodyAccJerkMag-iqr(),fBodyBodyAccJerkMag-entropy(),fBodyBodyGyroMag-mean(),fBodyBodyGyroJerkMag-min(),fBodyBodyGyroJerkMag-meanFreq(),"angle(tBodyAccMean,gravity)"
0,-0.103362,-0.094784,-0.109208,0.068584,-0.014314,0.1986,-0.073666,0.113098,-0.058315,0.35427,...,-0.999669,-0.999719,-0.996607,-0.999969,-0.985141,-1.0,-0.975435,-0.993965,-0.032207,-0.042494
1,-0.098163,-0.052478,0.464764,0.075947,0.004456,0.13031,0.401913,0.512569,0.011987,0.503466,...,-0.999818,-0.999583,-0.992681,-0.999941,-0.988982,-1.0,-0.978534,-0.994733,-0.246705,-0.062899
2,-0.108717,-0.226718,0.301165,0.07953,0.014504,0.11134,0.263556,0.56273,0.314589,0.132061,...,-0.999566,-0.999278,-0.999971,-0.999976,-0.991535,-1.0,-0.991953,-0.9922,0.388765,0.000265


어제 데이터와 같네..?

model2-2만 돌렸을 때는 walking_downstair가 높으나
model 3개를 같이 하면 낮게 나옴 왜 그럴까?

In [287]:
data = pd.read_csv('/content/drive/MyDrive/aivle/5차 미니프로젝트/train_dataset.csv')
data['temp'] = data['Activity_dynamic']
data = data.drop('Activity_dynamic', axis = 1)
data['Activity_dynamic'] = np.where((data['Activity'] == 'STANDING') | (data['Activity'] == 'SITTING') | (data['Activity'] == 'LAYING'), 0, 1)
data['Activity'] = data['temp']
data.drop('temp', axis = 1, inplace = True)
x = data.drop(['Activity', 'Activity_dynamic'], axis = 1)
y = data.loc[:, ['Activity', 'Activity_dynamic']]
test_size = 0.2
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size = test_size)
y1_train = y_train.loc[:, 'Activity']
y2_train = y_train.loc[:, 'Activity_dynamic']
y1_val = y_val.loc[:, 'Activity']
y2_val = y_val.loc[:, 'Activity_dynamic']

In [281]:
y_pred1 = pd.read_csv('/content/pred.csv')
y_pred2 = pd.read_csv('/content/pred2.csv')
y_pred3 = pd.read_csv('/content/pred3.csv')

y_pred1.drop('ID', axis = 1, inplace = True)
y_pred2.drop('ID', axis = 1, inplace = True)
y_pred3.drop('ID', axis = 1, inplace = True)

y_pred1 = np.array(y_pred1)
y_pred2 = np.array(y_pred2)
y_pred3 = np.array(y_pred3)

ensemble_pred = np.concatenate((y_pred1.reshape(-1, 1), y_pred2.reshape(-1, 1), y_pred3.reshape(-1, 1)), axis=1)

# 다수결 투표를 통해 앙상블된 예측 결과 계산
ensemble_pred = np.apply_along_axis(lambda x: np.argmax(np.bincount(x)), axis=1, arr=ensemble_pred)

print("Ensemble Prediction:", ensemble_pred)


Ensemble Prediction: [4 4 0 ... 2 3 2]


In [283]:
final_pred = pd.DataFrame(ensemble_pred)
final_pred['ID'] = final_pred.index
final_pred.head(3)

Unnamed: 0,0,ID
0,4,0
1,4,1
2,0,2


In [284]:
pred_df.to_csv('pred4.csv', index = False)

결과 앙상블 -> 다를바 없다

In [None]:
scaler = MinMaxScaler()
x_train_s = scaler.fit_transform(x_train)
x_val_s = sclaer.transform(x_val)

In [None]:
data[data['Activity'] == 5].describe()

In [354]:
data = pd.read_csv('/content/drive/MyDrive/aivle/5차 미니프로젝트/train_dataset.csv')
data['temp'] = data['Activity_dynamic']
data = data.drop('Activity_dynamic', axis = 1)
data['Activity_dynamic'] = np.where((data['Activity'] == 'STANDING') | (data['Activity'] == 'SITTING') | (data['Activity'] == 'LAYING'), 0, 1)
data['Activity'] = data['temp']
data.drop('temp', axis = 1, inplace = True)
x = data.drop(['Activity', 'Activity_dynamic'], axis = 1)
y = data.loc[:, ['Activity', 'Activity_dynamic']]
test_size = 0.2
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size = test_size)
y1_train = y_train.loc[:, 'Activity']
y2_train = y_train.loc[:, 'Activity_dynamic']
y1_val = y_val.loc[:, 'Activity']
y2_val = y_val.loc[:, 'Activity_dynamic']

In [357]:
model = XGBClassifier()
model.fit(x_train, y1_train)
y_pred = model.predict(x_val)

print(confusion_matrix(y1_val, y_pred))
print(classification_report(y1_val, y_pred))

[[202   7  11   0   0   0]
 [ 12 142  11   0   0   0]
 [ 18  14 137   0   0   0]
 [  0   0   0 165  15  29]
 [  0   0   0  21 136  42]
 [  0   0   0  43  61 111]]
              precision    recall  f1-score   support

           0       0.87      0.92      0.89       220
           1       0.87      0.86      0.87       165
           2       0.86      0.81      0.84       169
           3       0.72      0.79      0.75       209
           4       0.64      0.68      0.66       199
           5       0.61      0.52      0.56       215

    accuracy                           0.76      1177
   macro avg       0.76      0.76      0.76      1177
weighted avg       0.76      0.76      0.76      1177

