In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split

In [2]:
# Collect the names of all CSV files in the competition data directory.
# The order is whatever os.listdir() returns; the next cell indexes into it.
file_list = [
    name
    for name in os.listdir("/works/Data/Dacon/health_data_clf/")
    if name.endswith(".csv")
]
print(file_list)

['train_labels.csv', 'test_features.csv', 'train_features.csv', 'sample_submission.csv']


In [3]:
file_path = "/works/Data/Dacon/health_data_clf/"

# os.listdir() returns entries in arbitrary order, so picking tables by their
# position in file_list can silently load the wrong file on another machine.
# Each table is therefore read by its explicit file name.
train_label = pd.read_csv(file_path + "train_labels.csv")
test_features = pd.read_csv(file_path + "test_features.csv")
train_features = pd.read_csv(file_path + "train_features.csv")
# The misspelled variable name is kept because the submission cell reads it.
sample_submissuib = pd.read_csv(file_path + "sample_submission.csv")

print(train_label.shape)
print(test_features.shape)
print(train_features.shape)

(3125, 3)
(469200, 8)
(1875000, 8)


# Preprocessing

In [4]:
def return_index(arr, num):
    """Return the positions of the `num` largest values of `arr`.

    The positions come from `arr.argsort()`, so they are ordered by ascending
    value: the last entry is the position of the maximum. When `num` is larger
    than `len(arr)` every position is returned.

    A non-positive `num` yields an empty index array. Without the guard the
    slice `[-0:]` would select *all* positions instead of none.
    """
    order = arr.argsort()
    if num <= 0:
        return order[:0]
    return order[-num:]

def prerpocessing_using_static(features_set, label_set):
    """Collapse every recording in `features_set` into one row of summary features.

    Rows of `features_set` are grouped by their `id` column. For each group the
    `acc_x`, `acc_y` and `acc_z` columns are reduced to 28 statistics, and the id
    itself is appended as the last column. Column layout of one output row:

        mean (3), mean absolute value (3), std (3), pairwise correlation (3),
        start/end direction change (1), energy mean (3), energy std (3),
        direction-change count (3), dominant raw frequencies (3),
        dominant energy frequencies (3), id (1)

    The same feature formulas are repeated in `prerpocessing_using_static_sub`;
    the two must stay in sync or train and test features stop matching.

    Parameters
    ----------
    features_set : pandas.DataFrame
        Must contain the columns `id`, `acc_x`, `acc_y` and `acc_z`.
    label_set : pandas.DataFrame
        Must contain the columns `id` and `label`. If an id occurs more than
        once, its first row is used.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        `input_set` with one 29-column row per id present in `features_set`
        (ascending id order) and `target_set` with the matching labels.

    Raises
    ------
    KeyError
        If an id of `features_set` has no row in `label_set`.
    """
    axis_columns = ["acc_x", "acc_y", "acc_z"]
    # Index the labels once; keeping the first row per id mirrors a
    # "first matching row" lookup.
    label_by_id = label_set.drop_duplicates(subset="id").set_index("id")["label"]

    input_set = []
    target_set = []

    # groupby splits the table in a single pass and yields only the ids that are
    # actually present, in ascending order. Filtering the whole table once per id
    # rescans every row for every id, and walking range(min, max + 1) breaks on
    # the first gap in the id sequence (an empty sample cannot be transformed).
    for id_idx, id_rows in features_set.groupby("id", sort=True):
        sample_features = id_rows[axis_columns].values

        # Fourier transform: keep the non-negative half of the spectrum
        # (the zero-frequency bin is included).
        strength = abs(np.fft.fft(sample_features, axis=0))
        frequency = np.fft.fftfreq(len(sample_features), 1)
        non_negative = frequency >= 0
        strength_pos = strength[non_negative]
        frequency_pos = frequency[non_negative]
        # Per axis: mean frequency of the three strongest bins.
        freq_x = frequency_pos[return_index(strength_pos[:,0],3)].mean()
        freq_y = frequency_pos[return_index(strength_pos[:,1],3)].mean()
        freq_z = frequency_pos[return_index(strength_pos[:,2],3)].mean()

        # Static summaries per axis
        mean_arr = sample_features.mean(axis=0)
        mag_arr = abs(sample_features).mean(axis=0)
        std_arr = sample_features.std(axis=0)
        # Pairwise covariances
        cov_xy = np.cov(sample_features[:,0], sample_features[:,1])[0][1]
        cov_yz = np.cov(sample_features[:,1], sample_features[:,2])[0][1]
        cov_zx = np.cov(sample_features[:,2], sample_features[:,0])[0][1]
        # NOTE(review): np.cov normalises by n-1 while ndarray.std normalises by
        # n, so these ratios are Pearson correlations scaled by n/(n-1). Left
        # as is so the feature values do not change.
        cor_xy = cov_xy / (std_arr[0]*std_arr[1])
        cor_yz = cov_yz / (std_arr[1]*std_arr[2])
        cor_zx = cov_zx / (std_arr[2]*std_arr[0])

        # Direction change between the averaged first and last three samples.
        start_point = sample_features[:3,:].mean(axis=0)
        end_point = sample_features[-3:,:].mean(axis=0)
        # NOTE(review): dot / (|start| * |end|) is already the cosine of the
        # angle between the two vectors, so np.cos is applied to a cosine, not
        # to an angle. Left as is so the feature values do not change.
        change_of_vector_st = np.cos((start_point * end_point).sum() / (np.sqrt(np.power(start_point, 2).sum()) * np.sqrt(np.power(end_point, 2).sum())))

        # Energy: squared difference between consecutive samples
        # (previous minus current), then the same spectral summary as above.
        step = sample_features[:-1] - sample_features[1:]
        energys = np.power(step, 2)
        energy_strength = abs(np.fft.fft(energys, axis=0))
        energy_frequency = np.fft.fftfreq(len(energy_strength), 1)
        energy_non_negative = energy_frequency >= 0
        energy_strength_pos = energy_strength[energy_non_negative]
        energy_frequency_pos = energy_frequency[energy_non_negative]
        energy_freq_x = energy_frequency_pos[return_index(energy_strength_pos[:,0],3)].mean()
        energy_freq_y = energy_frequency_pos[return_index(energy_strength_pos[:,1],3)].mean()
        energy_freq_z = energy_frequency_pos[return_index(energy_strength_pos[:,2],3)].mean()
        energy_mean = energys.mean(axis=0)
        energy_std = energys.std(axis=0)

        # Number of local maxima/minima per axis: count the places where the
        # sign of the step flips between neighbouring steps.
        direction = np.where(step >= 0, 1, -1)
        max_min_point_num = np.where(direction[1:] * direction[:-1] == -1, 1, 0).sum(axis=0)

        # Assemble the row; the id goes last so callers can split it off.
        freqs = [freq_x, freq_y, freq_z, energy_freq_x, energy_freq_y, energy_freq_z]
        id_sample_set = list(mean_arr) + list(mag_arr) + list(std_arr)\
        + [cor_xy, cor_yz, cor_zx]\
        + [change_of_vector_st]\
        + list(energy_mean) + list(energy_std) + list(max_min_point_num) + freqs + [id_idx]

        input_set.append(id_sample_set)
        target_set.append(label_by_id.loc[id_idx])

    input_set = np.array(input_set)
    target_set = np.array(target_set)
    print("Input set : {} / Target set : {}".format(input_set.shape, target_set.shape))
    return input_set, target_set

In [5]:
# Build one feature row per training id together with the matching label vector.
input_set, target_set = prerpocessing_using_static(
    features_set=train_features,
    label_set=train_label,
)

Input set : (3125, 29) / Target set : (3125,)


# Data Split

In [6]:
# Hold out 10% of the samples for validation, stratified on the label.
# A fixed random_state makes the split, and every score derived from it,
# reproducible across kernel restarts. train_test_split is already imported
# in the first cell.
x_train, x_val, y_train, y_val = train_test_split(
    input_set,
    target_set,
    test_size=0.1,
    stratify=target_set,
    random_state=42,
)

print(x_train.shape, y_train.shape, x_val.shape, y_val.shape)

# The last column of each feature row is the sample id appended during
# preprocessing; split it off so the model is not trained on it.
y_train_id = x_train[:,-1]
x_train = x_train[:,:-1]

y_val_id = x_val[:,-1]
x_val = x_val[:,:-1]

print(x_train.shape, y_train.shape, x_val.shape, y_val.shape)

(2812, 29) (2812,) (313, 29) (313,)
(2812, 28) (2812,) (313, 28) (313,)


# Model training

In [7]:
import lightgbm as lgb

In [18]:
# Wrap the train / validation arrays in LightGBM datasets.
train_ds = lgb.Dataset(x_train, label=y_train)
val_ds = lgb.Dataset(x_val, label=y_val)

params = {
    # Task
    'objective' : 'multiclass',
    'num_class':61,
    'metric' : 'multi_logloss',
    # Booster
    'boosting_type' : 'dart',
    'drop_rate' : 0.6,
    'learning_rate' : 0.05,
    # Tree shape
    # NOTE(review): with num_leaves=2 every tree has a single split, so
    # max_depth=3 can never be the binding limit.
    'max_depth' : 3,
    'num_leaves' : 2,
    'min_data_in_leaf': 100,
    # Sampling
    'feature_fraction' : 0.2,
    # NOTE(review): the LightGBM parameter docs say bagging is only enabled
    # when bagging_freq is non-zero; it is not set here, so this value is
    # presumably ignored. Confirm the intent before changing it.
    'bagging_fraction' : 0.2,
}

# NOTE(review): early stopping is reportedly not available with
# boosting_type='dart'; confirm whether early_stopping_rounds has any effect.
clf = lgb.train(
    params,
    train_ds,
    num_boost_round=1300,
    valid_sets=val_ds,
    verbose_eval=100,
    early_stopping_rounds=100,
)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6154
[LightGBM] [Info] Number of data points in the train set: 2812, number of used features: 28
[LightGBM] [Info] Start training from score -5.543756
[LightGBM] [Info] Start training from score -4.997212
[LightGBM] [Info] Start training from score -5.051279
[LightGBM] [Info] Start training from score -4.897129
[LightGBM] [Info] Start training from score -4.507664
[LightGBM] [Info] Start training from score -4.806157
[LightGBM] [Info] Start training from score -4.850609
[LightGBM] [Info] Start training from score -4.806157
[LightGBM] [Info] Start training from score -3.475743
[LightGBM] [Info] Start training from score -4.445144
[LightGBM] [Info] Start training from score -5.051279
[LightGBM] [Info] Start training from score -4.897129
[LightGBM] [Info] Start training from score -5.543756
[LightGBM] [Info] Start training from score -5.543756
[LightGBM] [Info] Start training from score -4.806157
[Light

# Result

In [19]:
# Raw model output for both splits. Each result is reduced with an argmax over
# axis 1 further down in this cell, i.e. it is treated as a 2-D score matrix.
train_pred = clf.predict(x_train)
val_pred = clf.predict(x_val)

def return_armax(arrs):
    """Return, for every row of `arrs`, the column index of its largest value."""
    winners = np.argmax(arrs, axis=1)
    return winners
# Collapse each score row to the index of its highest-scoring class.
train_pred = return_armax(train_pred)
val_pred = return_armax(val_pred)

from sklearn.metrics import confusion_matrix, accuracy_score

# Training accuracy on the first line, validation accuracy on the second.
print(
    accuracy_score(y_train, train_pred),
    accuracy_score(y_val, val_pred),
    sep="\n",
)

0.9626600284495022
0.7763578274760383


# Submission

In [10]:

def prerpocessing_using_static_sub(features_set):
    """Collapse every recording in `features_set` into one row of summary features.

    Label-free counterpart of `prerpocessing_using_static`, used for the test
    recordings. Rows are grouped by their `id` column and the `acc_x`, `acc_y`
    and `acc_z` columns of each group are reduced to 28 statistics. Column
    layout of one output row:

        mean (3), mean absolute value (3), std (3), pairwise correlation (3),
        start/end direction change (1), energy mean (3), energy std (3),
        direction-change count (3), dominant raw frequencies (3),
        dominant energy frequencies (3)

    The feature formulas repeat those of `prerpocessing_using_static`; the two
    must stay in sync or train and test features stop matching.

    Parameters
    ----------
    features_set : pandas.DataFrame
        Must contain the columns `id`, `acc_x`, `acc_y` and `acc_z`.

    Returns
    -------
    numpy.ndarray
        One 28-column row per id present in `features_set`, in ascending id
        order. The id itself is not part of the row.
    """
    axis_columns = ["acc_x", "acc_y", "acc_z"]
    input_set = []

    # groupby splits the table in a single pass and yields only the ids that are
    # actually present, in ascending order. Filtering the whole table once per id
    # rescans every row for every id, and walking range(min, max + 1) breaks on
    # the first gap in the id sequence (an empty sample cannot be transformed).
    for _, id_rows in features_set.groupby("id", sort=True):
        sample_features = id_rows[axis_columns].values

        # Fourier transform: keep the non-negative half of the spectrum
        # (the zero-frequency bin is included).
        strength = abs(np.fft.fft(sample_features, axis=0))
        frequency = np.fft.fftfreq(len(sample_features), 1)
        non_negative = frequency >= 0
        strength_pos = strength[non_negative]
        frequency_pos = frequency[non_negative]
        # Per axis: mean frequency of the three strongest bins.
        freq_x = frequency_pos[return_index(strength_pos[:,0],3)].mean()
        freq_y = frequency_pos[return_index(strength_pos[:,1],3)].mean()
        freq_z = frequency_pos[return_index(strength_pos[:,2],3)].mean()

        # Static summaries per axis
        mean_arr = sample_features.mean(axis=0)
        mag_arr = abs(sample_features).mean(axis=0)
        std_arr = sample_features.std(axis=0)
        # Pairwise covariances
        cov_xy = np.cov(sample_features[:,0], sample_features[:,1])[0][1]
        cov_yz = np.cov(sample_features[:,1], sample_features[:,2])[0][1]
        cov_zx = np.cov(sample_features[:,2], sample_features[:,0])[0][1]
        # NOTE(review): np.cov normalises by n-1 while ndarray.std normalises by
        # n, so these ratios are Pearson correlations scaled by n/(n-1). Left
        # as is so the feature values do not change.
        cor_xy = cov_xy / (std_arr[0]*std_arr[1])
        cor_yz = cov_yz / (std_arr[1]*std_arr[2])
        cor_zx = cov_zx / (std_arr[2]*std_arr[0])

        # Direction change between the averaged first and last three samples.
        start_point = sample_features[:3,:].mean(axis=0)
        end_point = sample_features[-3:,:].mean(axis=0)
        # NOTE(review): dot / (|start| * |end|) is already the cosine of the
        # angle between the two vectors, so np.cos is applied to a cosine, not
        # to an angle. Left as is so the feature values do not change.
        change_of_vector_st = np.cos((start_point * end_point).sum() / (np.sqrt(np.power(start_point, 2).sum()) * np.sqrt(np.power(end_point, 2).sum())))

        # Energy: squared difference between consecutive samples
        # (previous minus current), then the same spectral summary as above.
        step = sample_features[:-1] - sample_features[1:]
        energys = np.power(step, 2)
        energy_strength = abs(np.fft.fft(energys, axis=0))
        energy_frequency = np.fft.fftfreq(len(energy_strength), 1)
        energy_non_negative = energy_frequency >= 0
        energy_strength_pos = energy_strength[energy_non_negative]
        energy_frequency_pos = energy_frequency[energy_non_negative]
        energy_freq_x = energy_frequency_pos[return_index(energy_strength_pos[:,0],3)].mean()
        energy_freq_y = energy_frequency_pos[return_index(energy_strength_pos[:,1],3)].mean()
        energy_freq_z = energy_frequency_pos[return_index(energy_strength_pos[:,2],3)].mean()
        energy_mean = energys.mean(axis=0)
        energy_std = energys.std(axis=0)

        # Number of local maxima/minima per axis: count the places where the
        # sign of the step flips between neighbouring steps.
        direction = np.where(step >= 0, 1, -1)
        max_min_point_num = np.where(direction[1:] * direction[:-1] == -1, 1, 0).sum(axis=0)

        # Assemble the row (no id column here).
        freqs = [freq_x, freq_y, freq_z, energy_freq_x, energy_freq_y, energy_freq_z]
        id_sample_set = list(mean_arr) + list(mag_arr) + list(std_arr)\
        + [cor_xy, cor_yz, cor_zx]\
        + [change_of_vector_st]\
        + list(energy_mean) + list(energy_std) + list(max_min_point_num) + freqs

        input_set.append(id_sample_set)

    input_set = np.array(input_set)
    print("Input set : {}".format(input_set.shape))
    return input_set

In [11]:
# Extract the static features for the test recordings (no labels involved).
sub_input_set = prerpocessing_using_static_sub(
    features_set=test_features,
)
print(sub_input_set.shape)

Input set : (782, 28)
(782, 28)


In [12]:
# Score the test features with the trained booster. The result is read below
# as sub_pred[:, i], i.e. as a 2-D matrix with one column per class.
sub_pred = clf.predict(sub_input_set)

In [14]:
# Take the class count from the prediction matrix instead of hard-coding it.
n_classes = sub_pred.shape[1]
submission_ids = sample_submissuib.id.values

# The test feature rows are produced in ascending id order, so the submission
# ids must list exactly those ids in that order. Otherwise every probability
# row would silently be attached to the wrong id.
expected_ids = np.sort(test_features.id.unique())
assert np.array_equal(submission_ids, expected_ids), \
    "sample submission ids do not match the sorted test ids"

# Build the frame in one step (columns 0..n_classes-1), then put the id first.
result_df = pd.DataFrame(sub_pred, columns=range(n_classes))
result_df.insert(0, 'id', submission_ids)

In [15]:
# Preview the first five rows of the submission table.
result_df.head(n=5)

Unnamed: 0,id,0,1,2,3,4,5,6,7,8,...,51,52,53,54,55,56,57,58,59,60
0,3125,0.001991,6.7e-05,0.00033,0.001266,0.0003190235,1.5e-05,0.0004003686,2.9e-05,0.000554,...,0.009534,0.000946,0.000905,5.4e-05,0.000468,0.000137,2e-05,0.00756,5.021264e-08,0.000122
1,3126,0.003855,5.6e-05,0.000162,0.001198,4.088824e-06,6e-05,9.199024e-05,0.000766,8.3e-05,...,0.000118,0.000153,5.5e-05,0.000122,7e-06,5.2e-05,6e-06,1.8e-05,3.519291e-07,0.00038
2,3127,0.002162,0.702971,0.000113,0.001469,4.897223e-07,0.000426,0.00074308,0.000222,0.002062,...,0.000165,0.000138,0.000144,0.007392,0.000258,0.002136,8e-06,0.000133,4.176305e-05,6.6e-05
3,3128,5.1e-05,3.6e-05,9.3e-05,7e-06,4.505711e-06,2.6e-05,2.093899e-07,0.00241,0.000125,...,3.3e-05,7e-06,5e-06,0.0018,1e-05,4.2e-05,1.3e-05,4e-06,6.364746e-07,0.040401
4,3129,0.000565,0.002646,3.6e-05,0.000266,2.044785e-05,6.9e-05,0.0001205354,0.000654,0.000128,...,0.000662,2.9e-05,0.000805,3e-05,4e-06,0.00024,0.000335,3e-05,1.434244e-06,0.000364


In [16]:
# Create the output directory first so the final step does not fail on a
# machine where the result/ folder has not been made yet.
result_dir = "/works/Data/Dacon/health_data_clf/result/"
os.makedirs(result_dir, exist_ok=True)
result_df.to_csv(result_dir + "210217.csv", index=False)