In [7]:
import os
import random
import numpy as np
from math import sqrt


In [12]:
TRAIN_DIR = 'UCI HAR Dataset/train'
TEST_DIR = 'UCI HAR Dataset/test'


def read_a_file(file):
    """
    read_a_file: Read the data file, Each row is corresponding to one subject's motion. 
    file: the file path
    
    """
    f = open(file, 'r', encoding='utf-8')
    data = []
    for line in f:
        line = line.strip().split()
        x = [float(v) for v in line]
        data.append(x)
    f.close()
    data = np.array(data)
    return data

In [13]:
def read_dataset(data_dir):
    """
    read_dataset: Read X and y form 'UCI HAR Dataset/train' or 'UCI HAR Dataset/test' . The returned X and y are used
    as training or testing data set.

    data_dir: file address 'UCI HAR Dataset/train' or 'UCI HAR Dataset/test'
     X (A n * f matrix,where n is the number of samples, f is the number of features),
     y (A n*1 matrix where n is corresponding to the number of labels)
    """
    kind = 'train' if 'train' in data_dir else 'test'

    # X has 2 parts: for train, it comes form X_train.txt, and Inertial Signals
    # Part 1. X_train.txt or X_test.txt
    X1 = read_a_file(os.path.join(data_dir, 'X_{}.txt'.format(kind)))
    print('The data shape of X_{}.txt is {} × {}'.format(kind, X1.shape[0], X1.shape[1]))

    
    # Part 2. Inertial Signals
    # IS_dir = os.path.join(data_dir, 'Inertial Signals')
    # X2 = []
    # for file_name in os.listdir(IS_dir):
    #     file_path = os.path.join(IS_dir, file_name)
    #     tmp = read_a_file(file_path)
    #     X2.append(tmp)
    # X2 = np.concatenate(X2, axis=1)
    # print('The data shape of Inertial Signals is {} × {}'.format(X2.shape[0], X2.shape[1]))

    # X = np.concatenate([X1, X2], axis=1)
    
    X = X1
  
    # Read the y data
    y = read_a_file(os.path.join(data_dir, 'y_{}.txt'.format(kind)))
    y = y.reshape(-1)
    y = [int(v) for v in y]
    return X, y


In [14]:
# The original 6 labels is divided into two new labels: static state and active state
#labels 1, 2, 3 are corresponding to active state and it is converted as -1 in new label
#labels 4, 5, 6 are corresponding to static state and it is converted as 1 in new label

if __name__ == '__main__':
    X1, y1 = read_dataset(TRAIN_DIR)
    print()
    X2, y2 = read_dataset(TEST_DIR)
    print()
    X = np.concatenate([X1, X2], axis=0)
    y = y1 + y2
    y= [-1 if v in [1, 2, 3] else 1 for v in y]
    
    # Divide all data into 4 sets, then using 2 sets as the training dataset
    #Setting the other 2 sets as the testing dataset
    sets = [[0, 2574], [2574, 2574*2], [2574*2, 2574*3], [2574*3, 2574*4]]
    split_X = [X[g[0]:g[1], :] for g in sets]
    split_y = [y[g[0]:g[1]] for g in sets]
    for i, g in enumerate(sets):
        
        train_X = []
        train_y = []
        for j in range(4):
            if j != i:
                train_X.append(split_X[j])
                train_y.extend(split_y[j])
        train_X = np.concatenate(train_X, axis=0)
        
        test_X = split_X[i]
        test_y = split_y[i]
        
        # w = (X^T X)^(-1)X^T y
        weights = (train_X.T @ train_X) ** -1 @ train_X.T @ train_y
        error_rate = (np.mean(np.sign(test_X @ weights) != test_y))
        
        print('Using data from {} to {} for test, other for training'.format(g[0], g[1]))
        print('Error rate : '+str(error_rate))

The data shape of X_train.txt is 7352 × 561

The data shape of X_test.txt is 2947 × 561

Using data from 0 to 2574 for test, other for training
Error rate : 0.7703962703962703
Using data from 2574 to 5148 for test, other for training
Error rate : 0.45998445998445997
Using data from 5148 to 7722 for test, other for training
Error rate : 0.4627039627039627
Using data from 7722 to 10296 for test, other for training
Error rate : 0.45765345765345766
