# UCI HAR Dataset

In [1]:
import os
import numpy as np
import pandas as pd
from typing import List
import matplotlib.pyplot as plt

In [2]:
# --- Configuration: dataset identity plus input / output locations ---
dataset_name = 'uci_har'

# Folders holding the raw train and test splits, relative to the directory
# the notebook is run from.
train_dataset_dir, test_dataset_dir = './train', './test'

# Processed CSVs go into a per-dataset folder under ../../processed/.
output_dir = f'./../../processed/{dataset_name}/'
train_outp_fname = os.path.join(output_dir, f'{dataset_name}_train.csv')
test_outp_fname = os.path.join(output_dir, f'{dataset_name}_test.csv')

# Create the destination folder now; exist_ok=True makes re-running this
# cell a no-op when the folder is already there.
os.makedirs(output_dir, exist_ok=True)


In [3]:
# Build the feature column names from features.txt.
# Each non-blank line is expected to hold exactly two whitespace-separated
# fields, "<index> <name>". The index is prefixed onto the name
# ("<index>_<name>") so that every column name carries its position in the
# file (presumably to keep names distinct if a raw name repeats -- verify
# against the dataset).
features = []
with open('features.txt', 'r') as f:
    for line in f:
        fields = line.split()
        if not fields:
            # Blank or whitespace-only line (e.g. a stray empty line at the
            # end of the file): nothing to parse, so skip it instead of
            # failing on the two-value unpack below.
            continue
        # Deliberately strict: a line with any other number of fields is
        # malformed and still raises ValueError here.
        idx, name = fields
        features.append(f'{idx}_{name}')

features

['1_tBodyAcc-mean()-X',
 '2_tBodyAcc-mean()-Y',
 '3_tBodyAcc-mean()-Z',
 '4_tBodyAcc-std()-X',
 '5_tBodyAcc-std()-Y',
 '6_tBodyAcc-std()-Z',
 '7_tBodyAcc-mad()-X',
 '8_tBodyAcc-mad()-Y',
 '9_tBodyAcc-mad()-Z',
 '10_tBodyAcc-max()-X',
 '11_tBodyAcc-max()-Y',
 '12_tBodyAcc-max()-Z',
 '13_tBodyAcc-min()-X',
 '14_tBodyAcc-min()-Y',
 '15_tBodyAcc-min()-Z',
 '16_tBodyAcc-sma()',
 '17_tBodyAcc-energy()-X',
 '18_tBodyAcc-energy()-Y',
 '19_tBodyAcc-energy()-Z',
 '20_tBodyAcc-iqr()-X',
 '21_tBodyAcc-iqr()-Y',
 '22_tBodyAcc-iqr()-Z',
 '23_tBodyAcc-entropy()-X',
 '24_tBodyAcc-entropy()-Y',
 '25_tBodyAcc-entropy()-Z',
 '26_tBodyAcc-arCoeff()-X,1',
 '27_tBodyAcc-arCoeff()-X,2',
 '28_tBodyAcc-arCoeff()-X,3',
 '29_tBodyAcc-arCoeff()-X,4',
 '30_tBodyAcc-arCoeff()-Y,1',
 '31_tBodyAcc-arCoeff()-Y,2',
 '32_tBodyAcc-arCoeff()-Y,3',
 '33_tBodyAcc-arCoeff()-Y,4',
 '34_tBodyAcc-arCoeff()-Z,1',
 '35_tBodyAcc-arCoeff()-Z,2',
 '36_tBodyAcc-arCoeff()-Z,3',
 '37_tBodyAcc-arCoeff()-Z,4',
 '38_tBodyAcc-correlation()

In [4]:
def _read_split(dataset_dir, split, feature_names):
    """Read the three raw files of one dataset split.

    Parameters
    ----------
    dataset_dir : str
        Folder containing ``X_<split>.txt``, ``y_<split>.txt`` and
        ``subject_<split>.txt``.
    split : str
        Split suffix used in the file names (``'train'`` or ``'test'``).
    feature_names : list of str
        Column names assigned to the whitespace-separated ``X`` file.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(subject, X, y)`` where ``subject`` has the single column
        ``'subject_id'``, ``X`` has one column per entry of
        ``feature_names`` and ``y`` has the single column ``'activity'``.
    """
    # The X file is whitespace-delimited. The separator is a raw string so
    # that the regex `\s+` is not parsed as an (invalid) string escape.
    X = pd.read_csv(
        os.path.join(dataset_dir, f'X_{split}.txt'),
        header=None,
        sep=r'\s+',
        names=feature_names,
    )
    # The label and subject files are read as single-column tables.
    y = pd.read_csv(
        os.path.join(dataset_dir, f'y_{split}.txt'),
        header=None,
        names=['activity'],
    )
    subject = pd.read_csv(
        os.path.join(dataset_dir, f'subject_{split}.txt'),
        header=None,
        names=['subject_id'],
    )
    return subject, X, y


subject_train, X_train, y_train = _read_split(train_dataset_dir, 'train', features)
subject_test, X_test, y_test = _read_split(test_dataset_dir, 'test', features)

In [5]:
# Assemble one table per split by placing the pieces side by side:
# subject id first, then the feature columns, then the activity label.
train_parts = [subject_train, X_train, y_train]
test_parts = [subject_test, X_test, y_test]

train_df = pd.concat(train_parts, axis='columns')
test_df = pd.concat(test_parts, axis='columns')

In [7]:
# Add a per-subject running counter 'time_step' (0, 1, 2, ...) that numbers
# each subject's rows in the order they currently appear in train_df, then
# order the table by subject and step.
#
# groupby().cumcount() yields the same numbering as filtering the frame once
# per subject and inserting range(len(...)), but in a single pass and without
# a loop variable that shadows the builtin `id`.
train_steps = train_df.groupby('subject_id', sort=False).cumcount()

# Drop any 'time_step' left over from a previous run of this cell so the
# cell can be re-executed without insert() raising "already exists".
train_df = train_df.drop(columns='time_step', errors='ignore')

# Position 1 puts the counter right after 'subject_id'.
train_df.insert(1, 'time_step', train_steps)
train_df = train_df.sort_values(['subject_id', 'time_step'])

In [9]:
# Add a per-subject running counter 'time_step' (0, 1, 2, ...) that numbers
# each subject's rows in the order they currently appear in test_df, then
# order the table by subject and step.
#
# groupby().cumcount() yields the same numbering as filtering the frame once
# per subject and inserting range(len(...)), but in a single pass and without
# a loop variable that shadows the builtin `id`.
test_steps = test_df.groupby('subject_id', sort=False).cumcount()

# Drop any 'time_step' left over from a previous run of this cell so the
# cell can be re-executed without insert() raising "already exists".
test_df = test_df.drop(columns='time_step', errors='ignore')

# Position 1 puts the counter right after 'subject_id'.
test_df.insert(1, 'time_step', test_steps)
test_df = test_df.sort_values(['subject_id', 'time_step'])

In [10]:
# Write both processed splits to their CSV destinations. index=False leaves
# the DataFrame index out of the files.
for frame, destination in (
    (train_df, train_outp_fname),
    (test_df, test_outp_fname),
):
    frame.to_csv(destination, index=False)