In [1]:
# including total acc and gyroscope signals too, will stick to np for now

In [2]:
import numpy as np

from sklearn import preprocessing, svm
from sklearn.metrics import confusion_matrix, classification_report, f1_score
import matplotlib.pyplot as plt

import scipy.stats as st
from scipy.fftpack import fft, fftfreq
from scipy.signal import argrelextrema
import operator

In [3]:
# load body acceleration raw signals 
# x axis
body_acc_x_train = np.loadtxt('./HARDataset/train/Inertial Signals/body_acc_x_train.txt')
body_acc_x_test = np.loadtxt('./HARDataset/test/Inertial Signals/body_acc_x_test.txt')
# y axis
body_acc_y_train = np.loadtxt('./HARDataset/train/Inertial Signals/body_acc_y_train.txt')
body_acc_y_test = np.loadtxt('./HARDataset/test/Inertial Signals/body_acc_y_test.txt')
# z axis
body_acc_z_train = np.loadtxt('./HARDataset/train/Inertial Signals/body_acc_z_train.txt')
body_acc_z_test = np.loadtxt('./HARDataset/test/Inertial Signals/body_acc_z_test.txt')

In [4]:
# load total acceleration raw signals 
# x axis
total_acc_x_train = np.loadtxt('./HARDataset/train/Inertial Signals/total_acc_x_train.txt')
total_acc_x_test = np.loadtxt('./HARDataset/test/Inertial Signals/total_acc_x_test.txt')
# y axis
total_acc_y_train = np.loadtxt('./HARDataset/train/Inertial Signals/total_acc_y_train.txt')
total_acc_y_test = np.loadtxt('./HARDataset/test/Inertial Signals/total_acc_y_test.txt')
# z axis
total_acc_z_train = np.loadtxt('./HARDataset/train/Inertial Signals/total_acc_z_train.txt')
total_acc_z_test = np.loadtxt('./HARDataset/test/Inertial Signals/total_acc_z_test.txt')

In [5]:
# load body gyroscope raw signals 
# x axis
body_gyro_x_train = np.loadtxt('./HARDataset/train/Inertial Signals/body_gyro_x_train.txt')
body_gyro_x_test = np.loadtxt('./HARDataset/test/Inertial Signals/body_gyro_x_test.txt')
# y axis
body_gyro_y_train = np.loadtxt('./HARDataset/train/Inertial Signals/body_gyro_y_train.txt')
body_gyro_y_test = np.loadtxt('./HARDataset/test/Inertial Signals/body_gyro_y_test.txt')
# z axis
body_gyro_z_train = np.loadtxt('./HARDataset/train/Inertial Signals/body_gyro_z_train.txt')
body_gyro_z_test = np.loadtxt('./HARDataset/test/Inertial Signals/body_gyro_z_test.txt')

In [6]:
body_acc_x_train.shape, body_acc_x_test.shape

((7352, 128), (2947, 128))

In [7]:
total_acc_x_train.shape, total_acc_x_test.shape

((7352, 128), (2947, 128))

In [8]:
body_gyro_x_train.shape, body_gyro_x_test.shape

((7352, 128), (2947, 128))

In [9]:
# load label vectors
y_train = np.loadtxt('./HARDataset/train/y_train.txt')
y_test = np.loadtxt('./HARDataset/test/y_test.txt')

In [10]:
y_train.shape, y_test.shape

((7352,), (2947,))

In [11]:
label_names = ['Walking', 'Walking upstairs', 'Walking downstairs', 'Sitting', 'Standing', 'Laying']

In [83]:
def stat_area_features(x, Te=1.0):
    # mean
    mean_ts = np.mean(x, axis=1).reshape(-1, 1)
    # max
    max_ts = np.amax(x, axis=1).reshape(-1, 1)
    # min
    min_ts = np.amin(x, axis=1).reshape(-1, 1)
    # std
    std_ts = np.std(x, axis=1).reshape(-1, 1)
    # skew
    skew_ts = st.skew(x, axis=1).reshape(-1, 1)
    # kurtosis
    kurtosis_ts = st.kurtosis(x, axis=1).reshape(-1, 1)
    # interquartile range
    iqr_ts = st.iqr(x, axis=1).reshape(-1, 1)
    # median absolute deviation
    mad_ts = np.median(np.sort(abs(x - np.mean(x, axis=1).reshape(-1, 1)), axis=1), axis=1).reshape(-1, 1)
    # area under curve
    area_ts = np.trapz(x, axis=1, dx=Te).reshape(-1, 1)
    # area under curve ** 2
    sq_area_ts = np.trapz(x ** 2, axis=1, dx=Te).reshape(-1, 1)
    
    return np.concatenate((mean_ts, max_ts, min_ts, std_ts, skew_ts, kurtosis_ts, iqr_ts, 
                           mad_ts, area_ts, sq_area_ts), axis=1)

In [13]:
# stats for train data

In [14]:
body_acc_x_train_stats = stat_area_features(body_acc_x_train)
body_acc_y_train_stats = stat_area_features(body_acc_y_train)
body_acc_z_train_stats = stat_area_features(body_acc_z_train)

In [15]:
body_acc_x_train_stats.shape, body_acc_y_train_stats.shape, body_acc_z_train_stats.shape

((7352, 10), (7352, 10), (7352, 10))

In [16]:
total_acc_x_train_stats = stat_area_features(total_acc_x_train)
total_acc_y_train_stats = stat_area_features(total_acc_y_train)
total_acc_z_train_stats = stat_area_features(total_acc_z_train)

In [17]:
total_acc_x_train_stats.shape, total_acc_y_train_stats.shape, total_acc_z_train_stats.shape

((7352, 10), (7352, 10), (7352, 10))

In [18]:
body_gyro_x_train_stats = stat_area_features(body_gyro_x_train)
body_gyro_y_train_stats = stat_area_features(body_gyro_y_train)
body_gyro_z_train_stats = stat_area_features(body_gyro_z_train)

In [19]:
body_gyro_x_train_stats.shape, body_gyro_y_train_stats.shape, body_gyro_z_train_stats.shape

((7352, 10), (7352, 10), (7352, 10))

In [20]:
# stats for test data

In [21]:
body_acc_x_test_stats = stat_area_features(body_acc_x_test)
body_acc_y_test_stats = stat_area_features(body_acc_y_test)
body_acc_z_test_stats = stat_area_features(body_acc_z_test)

In [22]:
body_acc_x_test_stats.shape, body_acc_y_test_stats.shape, body_acc_z_test_stats.shape

((2947, 10), (2947, 10), (2947, 10))

In [23]:
total_acc_x_test_stats = stat_area_features(total_acc_x_test)
total_acc_y_test_stats = stat_area_features(total_acc_y_test)
total_acc_z_test_stats = stat_area_features(total_acc_z_test)

In [24]:
total_acc_x_test_stats.shape, total_acc_y_test_stats.shape, total_acc_z_test_stats.shape

((2947, 10), (2947, 10), (2947, 10))

In [25]:
body_gyro_x_test_stats = stat_area_features(body_gyro_x_test)
body_gyro_y_test_stats = stat_area_features(body_gyro_y_test)
body_gyro_z_test_stats = stat_area_features(body_gyro_z_test)

In [26]:
body_gyro_x_test_stats.shape, body_gyro_y_test_stats.shape, body_gyro_z_test_stats.shape

((2947, 10), (2947, 10), (2947, 10))

In [27]:
# creating X_train and X_test 

In [28]:
X_train = np.concatenate((body_acc_x_train_stats, body_acc_y_train_stats, body_acc_z_train_stats, 
                          total_acc_x_train_stats, total_acc_y_train_stats, total_acc_z_train_stats, 
                          body_gyro_x_train_stats, body_gyro_y_train_stats, body_gyro_z_train_stats), axis=1)

In [29]:
X_test = np.concatenate((body_acc_x_test_stats, body_acc_y_test_stats, body_acc_z_test_stats, 
                          total_acc_x_test_stats, total_acc_y_test_stats, total_acc_z_test_stats, 
                          body_gyro_x_test_stats, body_gyro_y_test_stats, body_gyro_z_test_stats), axis=1)

In [30]:
X_train.shape, X_test.shape

((7352, 90), (2947, 90))

In [31]:
# scaling data makes big differnce! to all! 

In [32]:
from sklearn.preprocessing import StandardScaler

In [33]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [34]:
#can one do a simple log reg first?
from sklearn.linear_model import LogisticRegression

In [35]:
logreg = LogisticRegression()

In [36]:
logreg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [37]:
logreg.score(X_train, y_train)

0.9530739934711643

In [38]:
y_pred_logreg = logreg.predict(X_test)

In [39]:
print(classification_report(y_test, y_pred_logreg))

              precision    recall  f1-score   support

         1.0       0.82      0.76      0.79       496
         2.0       0.82      0.76      0.79       471
         3.0       0.76      0.93      0.84       420
         4.0       0.88      0.90      0.89       491
         5.0       0.90      0.89      0.90       532
         6.0       1.00      0.95      0.97       537

    accuracy                           0.87      2947
   macro avg       0.86      0.87      0.86      2947
weighted avg       0.87      0.87      0.87      2947



In [40]:
from sklearn import svm

svm = svm.SVC()

In [41]:
svm.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [42]:
svm.score(X_train, y_train)

0.9581066376496191

In [43]:
y_pred_svm = svm.predict(X_test)

In [44]:
print(classification_report(y_test, y_pred_svm))

              precision    recall  f1-score   support

         1.0       0.85      0.83      0.84       496
         2.0       0.85      0.86      0.86       471
         3.0       0.91      0.93      0.92       420
         4.0       0.81      0.82      0.82       491
         5.0       0.85      0.82      0.83       532
         6.0       0.97      1.00      0.98       537

    accuracy                           0.88      2947
   macro avg       0.87      0.88      0.88      2947
weighted avg       0.88      0.88      0.88      2947



In [45]:
from sklearn.ensemble import RandomForestClassifier

In [46]:
rfc = RandomForestClassifier()

In [47]:
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [48]:
rfc.score(X_train, y_train)

1.0

In [49]:
y_pred_rfc = rfc.predict(X_test)

In [50]:
print(classification_report(y_test, y_pred_rfc))

              precision    recall  f1-score   support

         1.0       0.76      0.79      0.77       496
         2.0       0.82      0.79      0.80       471
         3.0       0.89      0.89      0.89       420
         4.0       0.81      0.92      0.86       491
         5.0       0.92      0.80      0.86       532
         6.0       1.00      1.00      1.00       537

    accuracy                           0.87      2947
   macro avg       0.87      0.87      0.86      2947
weighted avg       0.87      0.87      0.87      2947



In [51]:
# cool this got better - code still messy and can be improved - how? use functions? 
# add more features? perhaps 

In [52]:
# will quickly try out pca - to reduce dimenionality of dataset and then fit data 
# not sure this is particularly useful when i don't have soooo many dimensions in the data 
# but this is more for practice 

from sklearn.decomposition import PCA

In [53]:
pca = PCA(0.9) # to capture 90% of the variance 

In [54]:
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.fit_transform(X_test)

In [55]:
logreg.fit(X_train_pca, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [56]:
logreg.score(X_train_pca, y_train)

0.8809847660500544

In [57]:
# got worse! 

In [58]:
svm.fit(X_train_pca, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [59]:
svm.score(X_train_pca, y_train)

0.9266866158868335

In [60]:
# also got worse! 

In [61]:
rfc.fit(X_train_pca, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [62]:
rfc.score(X_train_pca, y_train)

1.0

In [63]:
y_pred_rfc_pca = rfc.predict(X_test_pca)

In [64]:
print(classification_report(y_test, y_pred_rfc_pca))

              precision    recall  f1-score   support

         1.0       0.48      0.49      0.48       496
         2.0       0.36      0.32      0.34       471
         3.0       0.63      0.69      0.66       420
         4.0       0.63      0.75      0.68       491
         5.0       0.74      0.66      0.70       532
         6.0       1.00      0.92      0.96       537

    accuracy                           0.65      2947
   macro avg       0.64      0.64      0.64      2947
weighted avg       0.65      0.65      0.64      2947



In [65]:
# decidedly worse - so as thought before, PCA for low dimensional dataset only makes it worse 

In [66]:
# whats with the jerk signals bit? esp the first derivative part?

In [134]:
q = np.array([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 6, 7, 8, 9, 10], [6, 7, 8, 9, 10, 6, 7, 8, 9, 10, 6, 7, 8, 9, 10], [6, 7, 8, 9, 10, 4, 5, 6, 7, 8, 6, 7, 8, 9, 10]] )

In [135]:
q

array([[ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10,  6,  7,  8,  9, 10],
       [ 6,  7,  8,  9, 10,  6,  7,  8,  9, 10,  6,  7,  8,  9, 10],
       [ 6,  7,  8,  9, 10,  4,  5,  6,  7,  8,  6,  7,  8,  9, 10]])

In [136]:
stat_area_features(q, Te=1.0)

array([[ 6.33333333e+00,  1.00000000e+01,  1.00000000e+00,
         2.74873708e+00, -4.45837546e-01, -9.02076125e-01,
         4.00000000e+00,  2.33333333e+00,  8.95000000e+01,
         6.64500000e+02],
       [ 8.00000000e+00,  1.00000000e+01,  6.00000000e+00,
         1.41421356e+00,  0.00000000e+00, -1.30000000e+00,
         2.00000000e+00,  1.00000000e+00,  1.12000000e+02,
         9.22000000e+02],
       [ 7.33333333e+00,  1.00000000e+01,  4.00000000e+00,
         1.69967317e+00, -1.20686852e-01, -7.65088757e-01,
         2.50000000e+00,  1.33333333e+00,  1.02000000e+02,
         7.82000000e+02]])

In [137]:
q[:, 1:]

array([[ 2,  3,  4,  5,  6,  7,  8,  9, 10,  6,  7,  8,  9, 10],
       [ 7,  8,  9, 10,  6,  7,  8,  9, 10,  6,  7,  8,  9, 10],
       [ 7,  8,  9, 10,  4,  5,  6,  7,  8,  6,  7,  8,  9, 10]])

In [138]:
q[:, :-1]

array([[ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10,  6,  7,  8,  9],
       [ 6,  7,  8,  9, 10,  6,  7,  8,  9, 10,  6,  7,  8,  9],
       [ 6,  7,  8,  9, 10,  4,  5,  6,  7,  8,  6,  7,  8,  9]])

In [139]:
(q[:, 1:]-q[:, :-1])

array([[ 1,  1,  1,  1,  1,  1,  1,  1,  1, -4,  1,  1,  1,  1],
       [ 1,  1,  1,  1, -4,  1,  1,  1,  1, -4,  1,  1,  1,  1],
       [ 1,  1,  1,  1, -6,  1,  1,  1,  1, -2,  1,  1,  1,  1]])

In [140]:
q2 = stat_area_features(q[:, 1:]-q[:, :-1])

In [141]:
q2 

array([[ 0.64285714,  1.        , -4.        ,  1.28769688, -3.32820118,
         9.07692308,  0.        ,  0.35714286,  8.        , 28.        ],
       [ 0.28571429,  1.        , -4.        ,  1.74963553, -2.04124145,
         2.16666667,  0.        ,  0.71428571,  3.        , 43.        ],
       [ 0.28571429,  1.        , -6.        ,  1.90595201, -2.64020432,
         5.6143795 ,  0.        ,  0.71428571,  3.        , 51.        ]])

In [114]:
# still lost about the use of Te here: esp in context below which does not work 
# features_xt_jerk = stat_area_features((x[:, 1:]-x[:, :-1])/Te, Te=Te)

In [115]:
# trying without Te business and sees what it gives for raw data

In [116]:
lol = stat_area_features(body_acc_x_train, Te=1.0)

In [117]:
lol

array([[ 2.26869044e-03,  1.08102500e-02, -4.29425300e-03, ...,
         2.44426240e-03,  2.89514201e-01,  1.76470832e-03],
       [ 1.73739553e-04,  5.25084200e-03, -6.70614800e-03, ...,
         9.99486447e-04,  2.24911623e-02,  5.04263828e-04],
       [ 4.28091216e-04,  8.16653600e-03, -1.04828100e-02, ...,
         1.73074978e-03,  5.19412136e-02,  1.09754309e-03],
       ...,
       [-8.63149109e-04,  6.62930500e-01, -3.95135000e-01, ...,
         1.61759151e-01,  1.10483064e-01,  8.23674326e+00],
       [ 2.48912541e-03,  7.01377900e-01, -4.13906100e-01, ...,
         1.64010275e-01,  4.87184787e-01,  8.22770611e+00],
       [ 1.52350776e-02,  8.12856600e-01, -4.13906100e-01, ...,
         1.42122228e-01,  2.13282473e+00,  7.24051114e+00]])

In [132]:
lol_jerk = stat_area_features((body_acc_x_train[:, 1:]-body_acc_x_train[:, :-1])/1.0, Te=1.0)

In [133]:
lol_jerk

array([[ 1.09814843e-05,  9.95770850e-03, -6.76514900e-03, ...,
         9.82247484e-04, -4.48308850e-03,  4.60391132e-04],
       [-2.12008110e-05,  5.27592377e-03, -5.53020420e-03, ...,
         1.10281419e-03, -5.09195650e-03,  3.64013168e-04],
       [-1.06583307e-05,  9.00938000e-03, -7.41563700e-03, ...,
         1.48491073e-03, -2.60944000e-04,  7.45909893e-04],
       ...,
       [ 9.05724409e-05,  2.54225300e-01, -3.53026300e-01, ...,
         5.91451876e-02, -2.70417000e-02,  1.25776991e+00],
       [-1.63418528e-03,  3.39070600e-01, -3.24523700e-01, ...,
         4.94311853e-02, -2.29172965e-01,  1.10208825e+00],
       [ 1.84817323e-04,  4.38307700e-01, -4.39394790e-01, ...,
         4.30934027e-02,  2.75327500e-02,  1.35522443e+00]])