In [1]:
import numpy as np
from scipy.spatial import distance
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

#helper function required later
#custom metric
def DTW(a, b):
    """Dynamic time warping (DTW) distance between two 1-D series.

    Both inputs are flattened, so any array-like (NumPy array, list, tuple)
    is accepted. The local cost between two samples is their Euclidean
    distance, which for scalars is the absolute difference.

    Parameters
    ----------
    a, b : array-like
        The two series to align. They may have different lengths.

    Returns
    -------
    numpy.float64
        Minimal cumulative cost of a warping path that aligns all of `a`
        with all of `b`.
    """
    a = np.asarray(a)
    b = np.asarray(b)
    an = a.size
    bn = b.size
    pointwise_distance = distance.cdist(a.reshape(-1, 1), b.reshape(-1, 1))

    # cumdist[i, j] is the cheapest cost of aligning the first i samples of
    # `a` with the first j samples of `b`. Row/column 0 is an "empty prefix"
    # border kept at +inf so a path can only start from cumdist[0, 0].
    # A plain ndarray is used instead of np.matrix (no longer recommended by
    # NumPy and noticeably slower for element-wise indexing).
    cumdist = np.full((an + 1, bn + 1), np.inf)
    cumdist[0, 0] = 0

    for ai in range(an):
        for bi in range(bn):
            # Cheapest predecessor: step in `a` only, step in `b` only,
            # or a diagonal step in both.
            minimum_cost = np.min((cumdist[ai, bi + 1],
                                   cumdist[ai + 1, bi],
                                   cumdist[ai, bi]))
            cumdist[ai + 1, bi + 1] = pointwise_distance[ai, bi] + minimum_cost

    return cumdist[an, bn]

### Code to Retrieve Raw Accelerometer Data

In [2]:
# Import the RAW HAR dataset (total acceleration, one text file per axis).
def _read_float_rows(path):
    """Return one list of floats per line of the whitespace-separated file at `path`."""
    # `with` guarantees the file is closed once it has been parsed.
    with open(path, 'r') as fh:
        return [[float(ts) for ts in line.split()] for line in fh]


def _read_int_labels(path):
    """Return the integer class label found on each line of the file at `path`."""
    with open(path, 'r') as fh:
        return [int(line.rstrip('\n')) for line in fh]


x_train_raw_x = _read_float_rows('data/UCI-HAR-Dataset/train/Inertial Signals/total_acc_x_train.txt')
x_train_raw_y = _read_float_rows('data/UCI-HAR-Dataset/train/Inertial Signals/total_acc_y_train.txt')
x_train_raw_z = _read_float_rows('data/UCI-HAR-Dataset/train/Inertial Signals/total_acc_z_train.txt')
y_train_raw = _read_int_labels('data/UCI-HAR-Dataset/train/y_train.txt')

x_test_raw_x = _read_float_rows('data/UCI-HAR-Dataset/test/Inertial Signals/total_acc_x_test.txt')
x_test_raw_y = _read_float_rows('data/UCI-HAR-Dataset/test/Inertial Signals/total_acc_y_test.txt')
x_test_raw_z = _read_float_rows('data/UCI-HAR-Dataset/test/Inertial Signals/total_acc_z_test.txt')
y_test_raw = _read_int_labels('data/UCI-HAR-Dataset/test/y_test.txt')

# Concatenate the three axes column-wise: each sample becomes [x..., y..., z...].
x_train_raw = np.hstack([np.array(x_train_raw_x), np.array(x_train_raw_y), np.array(x_train_raw_z)])
x_test_raw = np.hstack([np.array(x_test_raw_x), np.array(x_test_raw_y), np.array(x_test_raw_z)])
y_train_raw = np.array(y_train_raw)
y_test_raw = np.array(y_test_raw)

# Merge the provided train/test splits into one full dataset.
x_full_raw = np.append(x_train_raw, x_test_raw, axis=0)
y_full_raw = np.append(y_train_raw, y_test_raw, axis=0)

# Reduce the dataset so that classes are balanced: keep the first 500
# samples encountered for each of the six labels.
x_subset_raw = []
y_subset_raw = []
di = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0}  # samples kept so far, per label
for item, label in zip(x_full_raw, y_full_raw):
    if di[label] < 500:
        x_subset_raw.append(item)
        y_subset_raw.append(label)
        di[label] += 1

# Convert to numpy type
x_subset_raw = np.array(x_subset_raw)
y_subset_raw = np.array(y_subset_raw)
print("Shape of reduced data = ", x_subset_raw.shape)
unique, counts = np.unique(y_subset_raw, return_counts=True)
print("[Label: Count] of reduced data")
dict(zip(unique, counts))


Shape of reduced data =  (3000, 384)
[Label: Count] of reduced data


{1: 500, 2: 500, 3: 500, 4: 500, 5: 500, 6: 500}

In [4]:
# Class distribution of the full (train + test) raw dataset, i.e. before
# the balanced 500-per-class subset is taken.
unique, counts = np.unique(y_full_raw, return_counts=True)
print("[Label: Count] of full data")
dict(zip(unique, counts))

[Label: Count] of reduced data


{1: 1722, 2: 1544, 3: 1406, 4: 1777, 5: 1906, 6: 1944}

## Learn ML Model On Time Series Data

In [None]:
# Hold out 33% of the balanced raw-signal subset for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    x_subset_raw,
    y_subset_raw,
    test_size=0.33,
    random_state=42,
)

# Train: nearest-neighbour classifier using DTW as the distance, wrapped in a
# 3-fold grid search over a single candidate (n_neighbors=1).
parameters = {'n_neighbors': [1]}
clf = GridSearchCV(
    estimator=KNeighborsClassifier(metric=DTW),
    param_grid=parameters,
    cv=3,
    verbose=1,
    n_jobs=-1,
)
clf.fit(X_train, y_train)

# Evaluate on the held-out samples.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 1 candidates, totalling 3 fits


### Code To Retrieve Curated Features

In [3]:
# Import the HAR dataset (curated feature vectors).
# Each file is read inside a `with` block so it is closed as soon as it has
# been parsed, instead of staying open for the life of the kernel.
with open('data/UCI-HAR-Dataset/train/X_train.txt', 'r') as x_train_file:
    x_train = [[float(ts) for ts in row.split()] for row in x_train_file]

with open('data/UCI-HAR-Dataset/train/y_train.txt', 'r') as y_train_file:
    y_train = [int(row.rstrip('\n')) for row in y_train_file]

with open('data/UCI-HAR-Dataset/test/X_test.txt', 'r') as x_test_file:
    x_test = [[float(ts) for ts in row.split()] for row in x_test_file]

with open('data/UCI-HAR-Dataset/test/y_test.txt', 'r') as y_test_file:
    y_test = [int(row.rstrip('\n')) for row in y_test_file]

# Mapping table for classes
labels = {1:'WALKING', 2:'WALKING UPSTAIRS', 3:'WALKING DOWNSTAIRS',
          4:'SITTING', 5:'STANDING', 6:'LAYING'}

# Convert to numpy for efficiency
x_train = np.array(x_train)
y_train = np.array(y_train)
x_test = np.array(x_test)
y_test = np.array(y_test)

# Merge the provided train/test splits into one full dataset for CV.
x_full = np.append(x_train, x_test, axis=0)
y_full = np.append(y_train, y_test, axis=0)
print("Dimension of input = ",x_full.shape)
print("Dimension of output = ",y_full.shape)

# Balance the classes: keep the first 500 samples encountered per label.
x_subset = []
y_subset = []
di = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0}  # samples kept so far, per label
for item, label in zip(x_full, y_full):
    if di[label] < 500:
        x_subset.append(item)
        y_subset.append(label)
        di[label] += 1

# Convert to numpy type
x_subset = np.array(x_subset)
y_subset = np.array(y_subset)
print("Shape of reduced data = ", x_subset.shape)
unique, counts = np.unique(y_subset, return_counts=True)
print("[Label: Count] of reduced data")
dict(zip(unique, counts))

Dimension of input =  (10299, 561)
Dimension of output =  (10299,)
Shape of reduced data =  (3000, 561)
[Label: Count] of reduced data


{1: 500, 2: 500, 3: 500, 4: 500, 5: 500, 6: 500}

In [None]:
# Hold out 33% of the balanced curated-feature subset for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    x_subset,
    y_subset,
    test_size=0.33,
    random_state=42,
)

# Train: nearest-neighbour classifier using DTW as the distance, wrapped in a
# 3-fold grid search over a single candidate (n_neighbors=1).
parameters = {'n_neighbors': [1]}
clf = GridSearchCV(
    estimator=KNeighborsClassifier(metric=DTW),
    param_grid=parameters,
    cv=3,
    verbose=1,
    n_jobs=-1,
)
clf.fit(X_train, y_train)

# Evaluate on the held-out samples.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 1 candidates, totalling 3 fits
