# UCI HAR Dataset
The experiments have been carried out with a group of 30 volunteers within an age bracket of 19-48 years. Each person performed six activities (WALKING, WALKING_UPSTAIRS, WALKING_DOWNSTAIRS, SITTING, STANDING, LAYING) wearing a smartphone (Samsung Galaxy S II) on the waist. Using its embedded accelerometer and gyroscope, we captured 3-axial linear acceleration and 3-axial angular velocity at a constant rate of 50Hz. The experiments have been video-recorded to label the data manually. The obtained dataset has been randomly partitioned into two sets, where 70% of the volunteers were selected for generating the training data and 30% for the test data.

The sensor signals (accelerometer and gyroscope) were pre-processed by applying noise filters and then sampled in fixed-width sliding windows of 2.56 sec and 50% overlap (128 readings/window). The sensor acceleration signal, which has gravitational and body motion components, was separated using a Butterworth low-pass filter into body acceleration and gravity. The gravitational force is assumed to have only low frequency components, therefore a filter with 0.3 Hz cutoff frequency was used. From each window, a vector of features was obtained by calculating variables from the time and frequency domain.

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import pickle as pkl
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
import random

In [2]:
# Root of the extracted "UCI HAR Dataset" directory, relative to this notebook.
# The trailing slash matters: the paths below are built by plain string
# concatenation onto this value.
uci_folder = "../../../Datasets/UCI HAR Dataset/"

In [3]:
# Show what the dataset root contains (sanity check that the path resolves).
os.listdir(uci_folder)

['.DS_Store',
 'activity_labels.txt',
 'features.txt',
 'features_info.txt',
 'README.txt',
 'test',
 'train']

In [5]:
# Locations inside the dataset root; every path is `uci_folder` plus a fixed
# relative suffix.

# Directories holding the raw inertial-signal files of each split.
train_folder = f"{uci_folder}train/Inertial Signals/"
test_folder = f"{uci_folder}test/Inertial Signals/"

# Activity label files of each split.
train_labels_file = f"{uci_folder}train/y_train.txt"
test_labels_file = f"{uci_folder}test/y_test.txt"

# Subject-id files of each split.
train_subjects_ids_file = f"{uci_folder}train/subject_train.txt"
test_subjects_ids_file = f"{uci_folder}test/subject_test.txt"


In [7]:
# Read the subject id of every training sample as integers, then display the
# distinct ids together with the number of samples recorded for each one.
train_subjects_ids = np.loadtxt(train_subjects_ids_file, dtype = int)
np.unique(train_subjects_ids, return_counts=True)

(array([ 1,  3,  5,  6,  7,  8, 11, 14, 15, 16, 17, 19, 21, 22, 23, 25, 26,
        27, 28, 29, 30]),
 array([347, 341, 302, 325, 308, 281, 316, 323, 328, 366, 368, 360, 408,
        321, 372, 409, 392, 376, 382, 344, 383], dtype=int64))

In [8]:
# Read the subject id of every test sample as integers, then display the
# distinct ids together with the number of samples recorded for each one.
test_subjects_ids = np.loadtxt(test_subjects_ids_file, dtype=int)
np.unique(test_subjects_ids, return_counts=True)

(array([ 2,  4,  9, 10, 12, 13, 18, 20, 24]),
 array([302, 317, 288, 294, 320, 327, 364, 354, 381], dtype=int64))

In [10]:
# Subject ids of all samples, train ids first and test ids after them. The
# data and label arrays are concatenated in the same train-then-test order.
all_subject_ids = np.concatenate([train_subjects_ids, test_subjects_ids])

In [12]:
# Load the body-acceleration signals of the training split. Each file holds one
# axis as a 2-D table (one row per window, one column per reading); the three
# axes are combined along a new last axis, giving (windows, readings, 3).
uci_har_x = np.loadtxt(train_folder + 'body_acc_x_train.txt')
uci_har_y = np.loadtxt(train_folder + 'body_acc_y_train.txt')
uci_har_z = np.loadtxt(train_folder + 'body_acc_z_train.txt')

# Number of windows and readings per window, kept available as notebook globals.
total, len_ = uci_har_x.shape

# np.stack replaces the per-row Python copy loop. It also raises a ValueError
# when the three axis files disagree in shape, rather than failing midway
# through the copy or silently ignoring surplus rows.
uci_har_train_data = np.stack([uci_har_x, uci_har_y, uci_har_z], axis=-1)

In [14]:
# Load the body-acceleration signals of the test split. Each file holds one
# axis as a 2-D table (one row per window, one column per reading); the three
# axes are combined along a new last axis, giving (windows, readings, 3).
uci_har_x = np.loadtxt(test_folder + 'body_acc_x_test.txt')
uci_har_y = np.loadtxt(test_folder + 'body_acc_y_test.txt')
uci_har_z = np.loadtxt(test_folder + 'body_acc_z_test.txt')

# Number of windows and readings per window, kept available as notebook globals.
total, len_ = uci_har_x.shape

# np.stack replaces the per-row Python copy loop. It also raises a ValueError
# when the three axis files disagree in shape, rather than failing midway
# through the copy or silently ignoring surplus rows.
uci_har_test_data = np.stack([uci_har_x, uci_har_y, uci_har_z], axis=-1)

In [15]:
# Display the shapes of the stacked train and test signal arrays.
uci_har_train_data.shape, uci_har_test_data.shape

((7352, 128, 3), (2947, 128, 3))

In [16]:
# Read the activity label of every sample in each split as integers.
uci_train_labels = np.loadtxt(train_labels_file, dtype=int)
uci_test_labels = np.loadtxt(test_labels_file, dtype=int)

In [18]:
# Check that each split has exactly one label per sample (first dimensions must match).
uci_har_train_data.shape, uci_train_labels.shape, uci_har_test_data.shape, uci_test_labels.shape

((7352, 128, 3), (7352,), (2947, 128, 3), (2947,))

In [19]:
# Human-readable names of the six activities, in the same order as the
# activity list in the dataset description above.
# NOTE(review): presumably position k corresponds to label id k + 1 from
# activity_labels.txt; confirm against that file before indexing with labels.
uci_har_activities = ['Walking',
                     'Walking up',
                     'Walking Down',
                     'Sitting',
                     'Standing',
                     'Lying Down']

In [59]:
# Merge both splits into a single data set, training rows first.
uci_x = np.concatenate((uci_har_train_data, uci_har_test_data), axis=0)

# Merge the labels in the same order and shift every label down by one
# (presumably turning 1-based activity ids into 0-based class indices;
# confirm against activity_labels.txt).
uci_y = np.concatenate((uci_train_labels, uci_test_labels), axis=0) - 1

In [60]:
# Check that data, labels and subject ids all have the same number of samples.
uci_x.shape, uci_y.shape, all_subject_ids.shape

((10299, 128, 3), (10299,), (10299,))

In [24]:
# Sorted array of the distinct subject ids present across both splits.
subject_ids = np.unique(all_subject_ids)

In [25]:
# Display the distinct subject ids.
subject_ids

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30])

In [26]:
# Randomly split the subjects into two groups: `source_ids` (a NumPy array
# holding half of the ids, in sampling order) and `target_ids` (a list of the
# remaining ids, in the order they appear in `subject_ids`).
#
# Positions are drawn from range(len(subject_ids)). The previous upper bound,
# max(subject_ids), is a valid position range only when the ids happen to be
# exactly 1..N without gaps; with any other id set it can index out of bounds
# or leave some subjects impossible to select.
#
# NOTE(review): the sampling is unseeded, so each run produces a different
# split; call random.seed(...) beforehand if the split must be reproducible.
source_positions = random.sample(range(len(subject_ids)), len(subject_ids) // 2)
source_ids = subject_ids[source_positions]
target_ids = [r for r in subject_ids if r not in source_ids]

In [27]:
# Display the subject ids assigned to each group.
source_ids, target_ids

(array([25,  8, 16, 12,  6,  5, 26, 17,  7, 27, 19, 20,  9, 23,  4]),
 [1, 2, 3, 10, 11, 13, 14, 15, 18, 21, 22, 24, 28, 29, 30])

In [68]:
# Element-wise comparison of the two id sequences, position by position.
# NOTE(review): this only shows that no position holds the same id in both
# sequences; it does not prove the two groups are disjoint. If the intent is
# an overlap check, np.intersect1d(source_ids, target_ids) would test that.
source_ids == target_ids

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False])

In [69]:
def get_data_for_subject_ids(x, y, subject_ids, selection_ids):
    """Return the rows of ``x`` and ``y`` that belong to the selected subjects.

    Rows are grouped by subject in the order the subjects appear in
    ``selection_ids``; within one subject the original row order is kept.
    A subject listed more than once contributes its rows once per listing.

    Args:
        x: NumPy array of samples, indexed along its first axis.
        y: NumPy array of labels, aligned with ``x`` along the first axis.
        subject_ids: Subject id of every row of ``x``/``y`` (array or any
            sequence; it is converted to an array before comparing).
        selection_ids: Iterable of subject ids whose rows should be returned.

    Returns:
        Tuple ``(x_selected, y_selected)``. Both are empty along the first
        axis when ``selection_ids`` is empty or matches no row.
    """
    # Without the conversion, a plain list makes `subject_ids == s` a single
    # False instead of a per-row mask, and no rows would ever be selected.
    subject_ids = np.asarray(subject_ids)

    per_subject_rows = [np.flatnonzero(subject_ids == s) for s in selection_ids]

    # np.concatenate rejects an empty list, so an empty selection is handled
    # explicitly with an empty integer index.
    if per_subject_rows:
        index = np.concatenate(per_subject_rows)
    else:
        index = np.empty(0, dtype=np.intp)

    return x[index], y[index]

In [70]:
# Samples and labels of the subjects assigned to the source group.
source_x, source_y = get_data_for_subject_ids(uci_x, uci_y, all_subject_ids, source_ids)

In [71]:
# Samples and labels of the subjects assigned to the target group.
target_x, target_y = get_data_for_subject_ids(uci_x, uci_y, all_subject_ids, target_ids)

In [72]:
# Check that the source and target sample counts add up to the full data set.
source_x.shape, target_x.shape, uci_x.shape

((5138, 128, 3), (5161, 128, 3), (10299, 128, 3))

In [75]:
# Output directory for the pickled data sets, relative to this notebook. The
# trailing slash matters: file names are appended by plain string concatenation.
data_folder = "../Processed data/"

In [79]:
# Show the files currently present in the output directory.
os.listdir(data_folder)

['adl_activity_dataset.pickle',
 'adl_activity_dataset_small.pickle',
 'adl_activity_dataset_small_minmax_scaled.pickle',
 'adl_activity_feature_dataset_small.pickle',
 'adl_dataset_small_minmax_scaled_feature.pickle',
 'adl_posture_data.pickle',
 'adl_posture_dataset.pickle',
 'adl_posture_dataset_small.pickle',
 'adl_posture_feature_dataset.pickle',
 'adl_posture_feature_dataset_1.pickle',
 'mHealth_ankle_dataset.pickle',
 'mHealth_ankle_feature_dataset.pickle',
 'mHealth_chest_dataset.pickle',
 'mHealth_chest_feature_dataset.pickle',
 'mHealth_wrist_dataset.pickle',
 'mHealth_wrist_feature_dataset.pickle',
 'mHealth_wrist_feature_dataset_1.pickle',
 'mHealth_wrist_source_dataset.pickle',
 'mHealth_wrist_target_dataset.pickle',
 'uci_body_acc_dataset.pickle',
 'uci_feature_dataset.pickle',
 'uci_feature_dataset_1.pickle',
 'uci_source_dataset.pickle',
 'uci_target_dataset.pickle']

In [77]:
# Persist the source group as a pickled list [samples, labels, subject ids].
# The context manager closes the file even when pickling raises, which the
# manual open()/close() pair did not guarantee.
with open(data_folder+"uci_source_dataset.pickle", "wb") as f:
    pkl.dump([source_x, source_y, source_ids], f)

In [78]:
# Persist the target group as a pickled list [samples, labels, subject ids].
# The context manager closes the file even when pickling raises, which the
# manual open()/close() pair did not guarantee.
with open(data_folder+"uci_target_dataset.pickle", "wb") as f:
    pkl.dump([target_x, target_y, target_ids], f)