# 데이콘 - 운동동작분류 1등 분석 (21.03.22)

## load data

In [None]:
import pandas as pd
import numpy as np
# pandas display options: show up to 600 rows and 600 columns
pd.set_option("display.max_column", 600)
pd.set_option("display.max_row", 600)
# Silence warning messages
import warnings
warnings.filterwarnings("ignore")
# Base directory that holds the competition data
path = "H:/fitness/"

train = pd.read_csv(path + "data/train_features.csv")
# Split the training features by column position: `train_acc` = columns 2-4,
# `train_gy` = columns 5 onward (presumably accelerometer / gyroscope axes -- TODO confirm)
train_acc, train_gy  = train.iloc[:, 2:5], train.iloc[:, 5:]
# Time axis for one recording: the first 600 `time` values divided by 50
# (presumably converts a 0..599 sample index to seconds at 50 Hz -- TODO confirm)
train_time = train.time[:600]/50

train_label = pd.read_csv(path + "data/train_labels.csv")
# Target column used for stratification and one-hot encoding later on
train_y = train_label.label

test = pd.read_csv(path + "data/test_features.csv")
# Template whose columns from index 1 onward are overwritten with predictions
submission = pd.read_csv(path + "data/sample_submission.csv")

## data aug

time rolling은 각 샘플의 신호를 시간축 방향으로 임의의 크기만큼 순환 이동(np.roll)시키는 데이터 증강 방법입니다.
rotation은 wearable센서의 무작위한 착용 방향을 고려한 데이터 증강 방법입니다.

permutation은 신호를 n segment로 나누어 순서를 랜덤하게 바꿔주는 방법입니다.
Permutation (Perm) is a simple way to randomly perturb the
temporal location of within-window events. To perturb the location
of the data in a single window, we first slice the data into N same-length
segments, with N ranging from 1 to 5, and randomly permute
the segments to create a new window.

https://github.com/terryum/Data-Augmentation-For-Wearable-Sensor-Data/blob/master/Example_DataAugmentation_TimeseriesData.ipynb

In [None]:
import scipy
from transforms3d.axangles import axangle2mat

# time rolling
def rolling(data):
    """Circularly shift randomly chosen samples along their first (time) axis.

    Roughly two thirds of ``data.shape[0]`` sample indices are drawn with
    ``np.random.choice`` (sampling with replacement, so an index may be drawn
    more than once and some samples stay untouched). Each drawn sample is
    rolled by a random offset in ``[0, data.shape[1])``.

    The array is modified in place and the same object is returned.
    """
    n_samples, n_steps = data.shape[0], data.shape[1]
    chosen = np.random.choice(n_samples, int(n_samples*2/3))
    for sample_idx in chosen:
        # The shift is drawn per sample, after the sample indices were drawn.
        shift = np.random.choice(n_steps)
        data[sample_idx] = np.roll(data[sample_idx], shift, axis=0)
    return data

# data aug methods
def rotation(data):
    """Rotate every row of ``data`` by one random rotation.

    A random axis (one component per column of ``data``, each uniform in
    [-1, 1)) and a random angle (uniform in [-pi, pi)) are drawn, converted to
    a rotation matrix with ``axangle2mat``, and ``data`` is right-multiplied
    by that matrix. ``axangle2mat`` expects a 3-vector axis, so ``data`` is
    expected to have three columns.
    """
    rot_axis = np.random.uniform(low=-1, high=1, size=data.shape[1])
    rot_angle = np.random.uniform(low=-np.pi, high=np.pi)
    rot_matrix = axangle2mat(rot_axis, rot_angle)
    return np.matmul(data, rot_matrix)

def permutation(data, nPerm=4, mSL=10):
    """Cut ``data`` into ``nPerm`` segments along axis 0 and shuffle their order.

    Args:
        data: 2-D array of shape (time, channels).
        nPerm: number of segments to cut the signal into (must be >= 1).
        mSL: minimum segment length; every segment ends up strictly longer
            than ``mSL`` rows.

    Returns:
        A new float array with the same shape as ``data`` whose rows are the
        segments of ``data`` concatenated in a random order.

    Raises:
        ValueError: if ``nPerm < 1`` or ``data`` has fewer than
            ``nPerm * (mSL + 1)`` rows. In the latter case no valid cut
            exists and the rejection loop below would never terminate.
    """
    n_rows = data.shape[0]
    if nPerm < 1:
        raise ValueError("nPerm must be at least 1, got {}".format(nPerm))
    if n_rows < nPerm * (mSL + 1):
        raise ValueError(
            "data has {} rows, but {} segments longer than {} rows need at least {}".format(
                n_rows, nPerm, mSL, nPerm * (mSL + 1)))

    # The segment order is drawn before the cut points; keep this order so
    # seeded runs consume the random stream exactly as before.
    order = np.random.permutation(nPerm)

    # Rejection sampling: redraw the cut points until every segment is longer than mSL.
    while True:
        bounds = np.zeros(nPerm + 1, dtype=int)
        bounds[1:-1] = np.sort(np.random.randint(mSL, n_rows - mSL, nPerm - 1))
        bounds[-1] = n_rows
        if np.min(bounds[1:] - bounds[:-1]) > mSL:
            break

    shuffled = np.zeros(data.shape)
    cursor = 0
    for seg in order:
        chunk = data[bounds[seg]:bounds[seg + 1], :]
        shuffled[cursor:cursor + len(chunk), :] = chunk
        cursor += len(chunk)
    return shuffled

# per + rot
def combine_aug(data, k, aug_P = 0):
    """Augment randomly chosen 600-row windows of ``data``; return a copy.

    ``data`` is treated as consecutive windows of 600 rows. When ``aug_P`` is
    0, about two thirds as many window indices as there are windows are drawn
    with ``np.random.choice`` (with replacement) and each drawn window is
    replaced by its augmented version:

    * ``(k + 1) % 2 == 0`` -> ``rotation``
    * ``(k + 1) % 2 == 1`` -> ``permutation``

    For any other ``aug_P`` the copy is returned unchanged. The input object
    itself is never modified.
    """
    augmented = data.copy()
    if aug_P != 0:
        return augmented

    parity = (k+1) % 2
    if parity == 0:
        transform = rotation
    elif parity == 1:
        transform = permutation
    else:
        # Non-integer k: neither branch applies, nothing is augmented.
        return augmented

    n_windows = int(data.shape[0]/600)
    n_draws = int(data.shape[0]/600*2/3)
    for w in np.random.choice(n_windows, n_draws):
        start, stop = 600*w, 600*(w+1)
        augmented[start:stop] = transform(np.array(augmented[start:stop]))
    return augmented

In [None]:
# Visualize the augmentations on the first 600 rows of train_acc

import matplotlib.pyplot as plt

# Fixed seed so the random rotation / permutation shown is reproducible
np.random.seed(10)

# Three panels side by side sharing both axes: original, rotated, permuted
f, axes = plt.subplots(1, 3, sharex=True, sharey=True)

f.set_size_inches((40, 6))
f.patch.set_facecolor("white")

axes[0].plot(train_acc[:600])
axes[0].set_title("ORIGINAL", fontsize = 20)
# rotation() receives the DataFrame slice directly; permutation() gets an ndarray
axes[1].plot(rotation(train_acc[:600]))
axes[1].set_title("ROTATION", fontsize = 20)
axes[2].plot(permutation(np.array(train_acc[:600])))
axes[2].set_title("PERMUTATION", fontsize = 20)
plt.show()

최종적으로 사용한 데이터 증강 method

짝수 epoch(0부터 시작하는 epoch 인덱스 기준)에 rolling, permutation 결합
홀수 epoch(0부터 시작하는 epoch 인덱스 기준)에 rolling, rotation 결합
LB public score 기준

rolling만 사용했을경우 0.51
rolling + rotation combine한 경우 0.41
최종적인 aug 방법을 사용한 경우 0.386

feature

In [None]:
from math import atan, sqrt
from scipy.integrate import cumtrapz

def get_mag(data):
    """Return the row-wise sum of squares of the first three columns.

    Note that no square root is taken, so this is the squared magnitude.
    """
    x, y, z = data.iloc[:, 0], data.iloc[:, 1], data.iloc[:, 2]
    return (x**2) + (y**2) + (z**2)

def get_mul(data):
    """Return the row-wise product of the first three columns."""
    x, y, z = data.iloc[:, 0], data.iloc[:, 1], data.iloc[:, 2]
    return x * y * z


##########################################################################################################################

def get_roll_pitch(data):
    """Compute roll and pitch angles (degrees) from the first three columns.

    With x, y, z the first three columns of ``data``:

    * roll  = atan(y / sqrt(x^2 + z^2))
    * pitch = atan(x / sqrt(y^2 + z^2))

    Returns a two-column DataFrame ``[roll, pitch]``.
    """
    x, y, z = data.iloc[:, 0], data.iloc[:, 1], data.iloc[:, 2]

    def _tilt_degrees(numerator, other_a, other_b):
        # Angle between one axis and the plane spanned by the other two.
        denominator = (other_a**2 + other_b**2).apply(sqrt)
        return (numerator/denominator).apply(atan)*180/np.pi

    roll = _tilt_degrees(y, x, z)
    pitch = _tilt_degrees(x, y, z)
    return pd.concat([roll, pitch], axis= 1)

##########################################################################################################################

def setting(data, data_, case = 0):
    """Replace the leading entries of each 600-row window with a wrap-around difference.

    ``data`` is expected to be ``data_.diff(lag)`` with ``lag`` 1 (``case == 0``)
    or 5 (any other ``case``). A plain ``diff`` lets the first ``lag`` rows of
    every window depend on the previous window (or be NaN at the very start).
    Those rows are overwritten so that each window is differenced against its
    own tail, as if the window were circular:

        data[start + k] = data_[start + k] - data_[start + 600 - lag + k],  k < lag

    Rows are addressed by position, so the result does not depend on the
    index labels of the Series.

    Args:
        data: Series of differences; modified in place.
        data_: Series holding the original signal, same length as ``data``.
        case: 0 selects lag 1, anything else selects lag 5.

    Returns:
        ``data`` (the same object), after patching.

    Raises:
        ValueError: if the length of ``data`` is not a multiple of 600.
    """
    lag = 1 if case == 0 else 5
    n_rows = data.shape[0]
    if n_rows % 600 != 0:
        raise ValueError(
            "expected a whole number of 600-row windows, got {} rows".format(n_rows))

    source = np.asarray(data_)
    for start in range(0, n_rows, 600):
        stop = start + 600
        # The tail is the LAST `lag` rows of the window (stop - lag .. stop - 1).
        data.iloc[start:start + lag] = source[start:start + lag] - source[stop - lag:stop]
    return data
        
def get_diff(data, case = 0):
    """Difference the first three columns of ``data`` window by window.

    Each column is differenced with lag 1 (``case == 0``) or lag 5 (any other
    ``case``), and the result is passed through ``setting`` together with the
    original column so the leading rows of every 600-row window are patched.

    Returns a three-column DataFrame, one differenced column per input column.
    """
    lag = 1 if case == 0 else 5
    patched = []
    for col in range(3):
        signal = data.iloc[:, col]
        patched.append(setting(signal.diff(lag), signal, case))
    return pd.concat(patched, axis= 1)
############################################################################################################################

def get_cumtrapz(acc):
    """Integrate the first three columns of ``acc`` once and twice per 600-row window.

    For every window and every column, the signal is integrated over the
    module-level ``train_time`` axis with ``cumtrapz(..., initial=0)``, and
    that result is integrated a second time the same way.

    Returns:
        A tuple ``(first_integral, second_integral)`` of three-column
        DataFrames with a fresh 0..n-1 index.

    NOTE(review): ``scipy.integrate.cumtrapz`` was deprecated in favour of
    ``cumulative_trapezoid`` and is no longer available in recent SciPy
    releases -- confirm against the installed SciPy version.
    """
    n_windows = int(acc.shape[0]/600)
    once = [[], [], []]
    twice = [[], [], []]
    for w in range(n_windows):
        window = acc.iloc[600*w:600*(w+1)]
        for col in range(3):
            first = cumtrapz(window.iloc[:, col], train_time, initial=0)
            second = cumtrapz(first, train_time, initial=0)
            once[col].append(pd.DataFrame(first))
            twice[col].append(pd.DataFrame(second))

    first_integral = pd.concat([pd.concat(parts) for parts in once], axis = 1).reset_index(drop=True)
    second_integral = pd.concat([pd.concat(parts) for parts in twice], axis= 1).reset_index(drop = True)
    return (first_integral, second_integral)

Make dataset

In [None]:
def train_dataset(acc_data, gy_data, i, aug_P = 0):
    """Build the training feature table for epoch ``i`` from augmented signals.

    ``acc_data`` and ``gy_data`` are each passed through ``combine_aug`` (the
    two calls draw their random windows independently), and the augmented
    frames are turned into features by ``test_dataset``. The column layout is
    therefore the same as for validation/test data:

        acc, diff(acc), roll/pitch(acc), mag/mul(acc), gy, diff(gy), mag/mul(gy)
    """
    # Augment acc first, then gy, so the random stream is consumed in a fixed order.
    aug_acc = combine_aug(acc_data, i, aug_P)
    aug_gy = combine_aug(gy_data, i, aug_P)
    return test_dataset(aug_acc, aug_gy)

def test_dataset(acc_data, gy_data):
    """Build the feature table from un-augmented signals.

    Columns, in order:
        acc, diff(acc), roll/pitch(acc), mag/mul(acc), gy, diff(gy), mag/mul(gy)

    The lag-5 differences (``get_diff(..., 1)``) and the integrated signals
    from ``get_cumtrapz`` are not part of the feature set.
    """
    def _mag_mul(frame):
        # Sum of squares and product of the three axes, side by side.
        return pd.concat([get_mag(frame), get_mul(frame)], axis= 1)

    acc_features = [acc_data, get_diff(acc_data), get_roll_pitch(acc_data), _mag_mul(acc_data)]
    gy_features = [gy_data, get_diff(gy_data), _mag_mul(gy_data)]
    return pd.concat(acc_features + gy_features, axis= 1)

Scaler
train data만 사용하여 scaler를 만들었습니다.
StandardScaler 사용

In [None]:
import sklearn
from sklearn.preprocessing import StandardScaler

# Build un-augmented features from the training data only and fit the scaler on them
data_for_scaler = test_dataset(train_acc, train_gy) # training data only
scaler = StandardScaler().fit(np.array(data_for_scaler))

# Reshape to (samples, 600, features); the model builders read shape[1] and shape[2] from this array
data_for_scaler = np.array(data_for_scaler).reshape(-1, 600, data_for_scaler.shape[1])
########################################################################################
# Same feature pipeline for the test set (columns 2-4 and 5 onward of `test`)
test_x = test_dataset(test.iloc[:, 2:5], test.iloc[:, 5:])

# Scale with the scaler fitted on training data, then reshape to (samples, 600, features)
test_X = scaler.transform(np.array(test_x)).reshape(-1, 600, test_x.shape[1])

gru layer + pooling layer + dense layer를 조합하여 선택
적은 데이터에는 gru를 사용하는 것이 accuracy와 loss사이의 trade off를 잘 조절해주는 것(개인적 견해)

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers as L

from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession

# Create a TF1-compat session whose GPU option `allow_growth` is enabled
# (asks TensorFlow to grow GPU memory use on demand instead of reserving it up front)
config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)

최적의 모델을 찾기 위해 파라미터를 수정해 가면서 모니터링한 결과 각각의 모델의 성능은 비슷하나 설명할 수 있는 부분이 살짝 다르다고 판단하여
각 모델당 2번씩 seed를 다르게 부여하여, 총 8개 결과의 평균을 냈습니다.

In [None]:
def First_model():
    """GRU(256, dropout 0.2) -> average pooling -> GRU(150) -> global average -> 61-way softmax.

    The input shape (time steps, features) is read from the module-level
    ``data_for_scaler`` array. The second GRU has no dropout.
    """
    n_steps, n_features = data_for_scaler.shape[1], data_for_scaler.shape[2]
    inputs = L.Input(shape = (n_steps, n_features))
    x = L.GRU(256, return_sequences = True, dropout = 0.2)(inputs)
    x = L.AveragePooling1D()(x)
    x = L.GRU(150, return_sequences = True)(x)
    x = L.GlobalAveragePooling1D()(x)
    outputs = L.Dense(61, activation = "softmax")(x)
    return keras.models.Model(inputs, outputs)

def Second_model():
    """GRU(256) -> [max pool || average pool] -> GRU(150) -> global average -> 61-way softmax.

    The max-pooled and average-pooled outputs of the first GRU are
    concatenated before the second GRU. Both GRUs use dropout 0.2. The input
    shape is read from the module-level ``data_for_scaler`` array.
    """
    n_steps, n_features = data_for_scaler.shape[1], data_for_scaler.shape[2]
    inputs = L.Input(shape = (n_steps, n_features))
    encoded = L.GRU(256, return_sequences = True, dropout = 0.2)(inputs)
    pooled_max = L.MaxPool1D()(encoded)
    pooled_avg = L.AveragePooling1D()(encoded)
    merged = L.Concatenate()([pooled_max, pooled_avg])
    x = L.GRU(150, return_sequences = True, dropout = 0.2)(merged)
    x = L.GlobalAveragePooling1D()(x)
    outputs = L.Dense(61, activation = "softmax")(x)
    return keras.models.Model(inputs, outputs)

def Third_model():
    """GRU(256) -> [max pool || average pool] -> GRU(256) -> global average -> 61-way softmax.

    Same layout as ``Second_model`` but the second GRU has 256 units. Both
    GRUs use dropout 0.2. The input shape is read from the module-level
    ``data_for_scaler`` array.
    """
    n_steps, n_features = data_for_scaler.shape[1], data_for_scaler.shape[2]
    inputs = L.Input(shape = (n_steps, n_features))
    encoded = L.GRU(256, return_sequences = True, dropout = 0.2)(inputs)
    pooled_max = L.MaxPool1D()(encoded)
    pooled_avg = L.AveragePooling1D()(encoded)
    merged = L.Concatenate()([pooled_max, pooled_avg])
    x = L.GRU(256, return_sequences = True, dropout = 0.2)(merged)
    x = L.GlobalAveragePooling1D()(x)
    outputs = L.Dense(61, activation = "softmax")(x)
    return keras.models.Model(inputs, outputs)

def Fourth_model():
    """GRU(256) -> average pooling -> GRU(150) -> global average -> 61-way softmax.

    Same layout as ``First_model`` except that the second GRU also uses
    dropout 0.2. The input shape is read from the module-level
    ``data_for_scaler`` array.
    """
    n_steps, n_features = data_for_scaler.shape[1], data_for_scaler.shape[2]
    inputs = L.Input(shape = (n_steps, n_features))
    x = L.GRU(256, return_sequences = True, dropout = 0.2)(inputs)
    x = L.AveragePooling1D()(x)
    x = L.GRU(150, return_sequences = True, dropout = 0.2)(x)
    x = L.GlobalAveragePooling1D()(x)
    outputs = L.Dense(61, activation = "softmax")(x)
    return keras.models.Model(inputs, outputs)

Training

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import random

def _compile_classifier(model, learning_rate):
    """Compile ``model`` with a new RMSprop optimizer at ``learning_rate``."""
    model.compile(optimizer=keras.optimizers.RMSprop(learning_rate),
                  loss='categorical_crossentropy', metrics=['accuracy'])


def train_model(model_ = None, epochs = 40, first_rlr = 15, second_rlr = 30, r_seed = 10, aug_P = 0, seed_ = 1):
    """Train one model per fold of a 10-fold stratified split and return them.

    Every epoch the training fold is re-augmented (``train_dataset`` followed
    by ``rolling``), scaled with the module-level ``scaler`` and fitted for a
    single Keras epoch. The learning rate starts at 0.003 and is lowered by
    re-compiling the model after epoch ``first_rlr`` (x0.2) and after epoch
    ``second_rlr`` (x0.4 on top of that).

    Best-epoch selection: an epoch counts as "best" when its validation loss
    is <= the lowest seen so far, or its validation accuracy is >= the highest
    seen so far. The weights of the most recent such epoch are snapshotted and
    restored into the model before it is returned. (Keeping only a reference
    to the model would return the last-epoch weights, because the same object
    keeps training.)

    Args:
        model_: zero-argument callable returning a fresh, uncompiled Keras model.
        epochs: number of single-epoch fits per fold.
        first_rlr: epoch index after which the learning rate drops the first time.
        second_rlr: epoch index after which the learning rate drops the second time.
        r_seed: random_state of the StratifiedKFold split.
        aug_P: forwarded to ``train_dataset`` / ``combine_aug``.
        seed_: seed for ``random``; also the base of the per-epoch NumPy seed
            (it is incremented once per fold).

    Returns:
        A list with one trained Keras model per fold.
    """
    result_model = []
    cnt = 0
    array_acc = np.array(train_acc).reshape(-1, 600, 3)
    array_gy = np.array(train_gy).reshape(-1, 600, 3)
    # One-hot labels are the same for every fold, so encode them once.
    onehot_y = np.array(pd.get_dummies(train_y))

    random.seed(seed_)
    tf.random.set_seed(21)

    split = StratifiedKFold(n_splits=10, shuffle = True, random_state = r_seed)
    for train_idx, valid_idx in split.split(data_for_scaler, train_y):

        train_Y, valid_Y = onehot_y[train_idx], onehot_y[valid_idx]

        # Validation features are never augmented.
        valid_ACC, valid_GY = array_acc[valid_idx].reshape(-1, 3), array_gy[valid_idx].reshape(-1, 3)
        valid_x = test_dataset(pd.DataFrame(valid_ACC), pd.DataFrame(valid_GY))
        valid_X = scaler.transform(np.array(valid_x)).reshape(-1, 600, valid_x.shape[1])

        # The raw training fold does not change between epochs (augmentation
        # works on copies), so build it once per fold.
        train_acc_df = pd.DataFrame(array_acc[train_idx].reshape(-1, 3))
        train_gy_df = pd.DataFrame(array_gy[train_idx].reshape(-1, 3))

        model = model_()
        _compile_classifier(model, 0.003)
        val_score = 0
        best_weights = None
        seed_ += 1

        for i in range(epochs):

            # Distinct, reproducible augmentation per fold and epoch.
            np.random.seed(seed_*47 + i)

            train_x = train_dataset(train_acc_df, train_gy_df, i, aug_P)
            train_X = scaler.transform(np.array(train_x)).reshape(-1, 600, train_x.shape[1])
            # rolling() shifts samples in place; train_X is rebuilt every epoch.
            train_X = rolling(train_X)

            hist = model.fit(train_X, train_Y, epochs = 1, validation_data = (valid_X, valid_Y), verbose = 0)

            train_accuracy = hist.history["accuracy"]
            new_val_score = accuracy_score(np.argmax(valid_Y, axis = 1), np.argmax(model.predict(valid_X), axis = 1))
            val_loss = hist.history["val_loss"]

            # Re-compiling swaps in a new optimizer (and so resets its state);
            # the model weights are kept. Takes effect from the next epoch.
            if i == first_rlr:
                _compile_classifier(model, 0.003*0.2)

            if i == second_rlr:
                _compile_classifier(model, 0.003*0.2*0.4)

            print("epoch {} - train_accuracy : {} - validation_loss : {} - validation_accuracy : {}".format(i,
                                                                                                            train_accuracy,
                                                                                                            val_loss,
                                                                                                            new_val_score,
                                                                                                            ))

            improved = False
            if i == 0:
                val_loss_score = val_loss[0]

            if val_loss_score >= val_loss[0]:
                val_loss_score = val_loss[0]
                improved = True
                print("####best_val####")

            if new_val_score >= val_score:
                val_score = new_val_score
                improved = True
                print("####best_acc####")

            if improved:
                # Snapshot a copy of the weights; the model object keeps training.
                best_weights = model.get_weights()

        if best_weights is not None:
            model.set_weights(best_weights)
        print("####################################################### cycle {} is done".format(cnt))
        result_model.append(model)
        cnt+=1
    return result_model


def predict_(model):
    """Average the predictions of several models on the module-level ``test_X``.

    ``model`` is an iterable of fitted models; each one's ``predict(test_X)``
    output is stacked and the element-wise mean over models is returned.
    """
    per_model = [mod.predict(test_X) for mod in model]
    return np.array(per_model).mean(axis = 0)

def save_model(models, name = '1'):
    """Save each model as ``<path>submission/last/weight/<name>-<k>.h5``.

    ``k`` counts the models from 1 in iteration order; ``path`` is the
    module-level base directory.
    """
    for cnt, model in enumerate(models, start=1):
        model.save(path + "submission/last/weight/" + name + '-{}.h5'.format(cnt))

In [None]:
# Run 1: First_model, StratifiedKFold seed 47, base seed 1 (returns one model per fold)
first_result = train_model(First_model, r_seed = 47, seed_ = 1)

In [None]:
# Run 2: Second_model, StratifiedKFold seed 47, base seed 5 (returns one model per fold)
second_result = train_model(Second_model, r_seed = 47, seed_ = 5)

In [None]:
# Run 3: First_model again, StratifiedKFold seed 32, base seed 9 (returns one model per fold)
third_result = train_model(First_model, r_seed = 32, seed_ = 9)

In [None]:
# Run 4: Second_model again, StratifiedKFold seed 32, base seed 13 (returns one model per fold)
fourth_result = train_model(Second_model, r_seed = 32, seed_ = 13)

In [None]:
# Run 5: Third_model, StratifiedKFold seed 2020, base seed 21 (returns one model per fold)
fifth_result = train_model(Third_model, r_seed = 2020, seed_ = 21)

In [None]:
# Run 6: Third_model again, StratifiedKFold seed 2020, base seed 17 (returns one model per fold)
sixth_result = train_model(Third_model, r_seed = 2020, seed_ = 17)

In [None]:
# Run 7: Fourth_model, StratifiedKFold seed 2020, base seed 21 (returns one model per fold)
seventh_result = train_model(Fourth_model, r_seed = 2020, seed_ = 21)

In [None]:
# Run 8: Fourth_model again, StratifiedKFold seed 2020, base seed 25 (returns one model per fold)
eighth_result = train_model(Fourth_model, r_seed = 2020, seed_ = 25)

In [None]:
# Fill the submission template with each run's fold-averaged prediction and
# keep an independent copy per run. `.copy()` is required: a plain
# `submissionN = submission` would bind every name to the same DataFrame, so
# all of them would end up holding the last run's predictions.
# The commented-out lines optionally persist the models and per-run CSV files.

#save_model(first_result)
submission.iloc[:, 1:] = predict_(first_result)
submission1 = submission.copy()
#submission1.to_csv(path + "submission/last/1.csv", index = False)

#save_model(second_result, "2")
submission.iloc[:, 1:] = predict_(second_result)
submission2 = submission.copy()
#submission2.to_csv(path + "submission/last/2.csv", index = False)

#save_model(third_result, "3")
submission.iloc[:, 1:] = predict_(third_result)
submission3 = submission.copy()
#submission3.to_csv(path + "submission/last/3.csv", index = False)

#save_model(fourth_result, "4")
submission.iloc[:, 1:] = predict_(fourth_result)
submission4 = submission.copy()
#submission4.to_csv(path + "submission/last/4.csv", index = False)

#save_model(fifth_result, "5")
submission.iloc[:, 1:] = predict_(fifth_result)
submission5 = submission.copy()
#submission5.to_csv(path + "submission/last/5.csv", index = False)

#save_model(sixth_result, "6")
submission.iloc[:, 1:] = predict_(sixth_result)
submission6 = submission.copy()
#submission6.to_csv(path + "submission/last/6.csv", index = False)

#save_model(seventh_result, "7")
submission.iloc[:, 1:] = predict_(seventh_result)
submission7 = submission.copy()
#submission7.to_csv(path + "submission/last/7.csv", index = False)

#save_model(eighth_result, "8")
submission.iloc[:, 1:] = predict_(eighth_result)
submission8 = submission.copy()
#submission8.to_csv(path + "submission/last/8.csv", index = False)

In [None]:
#submission1 = pd.read_csv(path + "submission/last/1.csv")
#submission2 = pd.read_csv(path + "submission/last/2.csv")
#submission3 = pd.read_csv(path + "submission/last/3.csv")
#submission4 = pd.read_csv(path + "submission/last/4.csv")
#submission5 = pd.read_csv(path + "submission/last/5.csv")
#submission6 = pd.read_csv(path + "submission/last/6.csv")
#submission7 = pd.read_csv(path + "submission/last/7.csv")
#submission8 = pd.read_csv(path + "submission/last/8.csv")

In [None]:
# Final ensemble: equal-weight (1/8 each) average of the eight per-run
# prediction tables, written into every column of `submission` from index 1 onward.
# NOTE(review): this is a real 8-way average only if submission1..submission8
# are independent DataFrames (copies, or frames re-read from CSV) -- confirm.
submission.iloc[:, 1:] = (submission1.iloc[:, 1:]/8 + submission2.iloc[:, 1:]/8 +
                         submission3.iloc[:, 1:]/8 + submission4.iloc[:, 1:]/8 +
                         submission5.iloc[:, 1:]/8 + submission6.iloc[:, 1:]/8 +
                         submission7.iloc[:, 1:]/8 + submission8.iloc[:, 1:]/8)
#submission.to_csv(path + "submission/last/95.csv", index = False)