In [1]:
# trial for part of the SVM 
# with just basic statistical features 

In [2]:
import numpy as np
from sklearn import preprocessing, svm
from sklearn.metrics import confusion_matrix, classification_report, f1_score
import matplotlib.pyplot as plt

import scipy.stats as st
from scipy.fftpack import fft, fftfreq
from scipy.signal import argrelextrema
import operator

In [3]:
# load body acceleration raw signals 
# x axis
X_train_x_raw = np.loadtxt('./HARDataset/train/Inertial Signals/body_acc_x_train.txt')
X_test_x_raw = np.loadtxt('./HARDataset/test/Inertial Signals/body_acc_x_test.txt')

# y axis
X_train_y_raw = np.loadtxt('./HARDataset/train/Inertial Signals/body_acc_y_train.txt')
X_test_y_raw = np.loadtxt('./HARDataset/test/Inertial Signals/body_acc_y_test.txt')

# z axis
X_train_z_raw = np.loadtxt('./HARDataset/train/Inertial Signals/body_acc_z_train.txt')
X_test_z_raw = np.loadtxt('./HARDataset/test/Inertial Signals/body_acc_z_test.txt')

In [4]:
X_train_x_raw.shape, X_test_x_raw.shape, X_train_y_raw.shape, X_test_y_raw.shape, X_train_z_raw.shape, X_test_z_raw.shape

((7352, 128), (2947, 128), (7352, 128), (2947, 128), (7352, 128), (2947, 128))

In [5]:
# load label vectors
y_train = np.loadtxt('./HARDataset/train/y_train.txt')
y_test = np.loadtxt('./HARDataset/test/y_test.txt')

In [6]:
y_train.shape, y_test.shape

((7352,), (2947,))

In [7]:
label_names = ['Walking', 'Walking upstairs', 'Walking downstairs', 'Sitting', 'Standing', 'Laying']

In [8]:
def stat_area_features(x): #, Te=1.0):
    # mean
    mean_ts = np.mean(x, axis=1).reshape(-1, 1)
    # max
    max_ts = np.amax(x, axis=1).reshape(-1, 1)
    # min
    min_ts = np.amin(x, axis=1).reshape(-1, 1)
    # std
    std_ts = np.std(x, axis=1).reshape(-1, 1)
    # skew
    skew_ts = st.skew(x, axis=1).reshape(-1, 1)
    # kurtosis
    kurtosis_ts = st.kurtosis(x, axis=1).reshape(-1, 1)
    # interquartile range
    iqr_ts = st.iqr(x, axis=1).reshape(-1, 1)
    # median absolute deviation
    mad_ts = np.median(np.sort(abs(x - np.mean(x, axis=1).reshape(-1, 1)), axis=1), axis=1).reshape(-1, 1)
    # area under curve
    #area_ts = np.trapz(x, axis=1, dx=Te).reshape(-1, 1)
    # area under curve ** 2
    #sq_area_ts = np.mean(x ** 2, axis=1, dx=Te).reshape(-1, 1)
    
    return np.concatenate((mean_ts, max_ts, min_ts, std_ts, skew_ts, kurtosis_ts, iqr_ts, 
                           mad_ts), axis=1)

In [11]:
# whats with Te 1/50? 

X_train_x_raw_stats = stat_area_features(X_train_x_raw)#, Te=Te)
X_train_y_raw_stats = stat_area_features(X_train_y_raw)
X_train_z_raw_stats = stat_area_features(X_train_z_raw)

In [12]:
X_train_x_raw_stats.shape, X_train_y_raw_stats.shape, X_train_z_raw_stats.shape

((7352, 8), (7352, 8), (7352, 8))

In [13]:
X_train = np.concatenate((X_train_x_raw_stats, X_train_y_raw_stats, X_train_z_raw_stats), axis=1)

In [14]:
X_train.shape

(7352, 24)

In [15]:
X_test_x_raw_stats = stat_area_features(X_test_x_raw)#, Te=Te)
X_test_y_raw_stats = stat_area_features(X_test_y_raw)
X_test_z_raw_stats = stat_area_features(X_test_z_raw)

In [16]:
X_test_x_raw_stats.shape, X_test_y_raw_stats.shape, X_test_z_raw_stats.shape

((2947, 8), (2947, 8), (2947, 8))

In [17]:
X_test = np.concatenate((X_test_x_raw_stats, X_test_y_raw_stats, X_test_z_raw_stats), axis=1)

In [18]:
X_test.shape

(2947, 24)

In [19]:
y_train.shape

(7352,)

In [20]:
y_test.shape

(2947,)

In [21]:
#can one do a simple log reg first?
from sklearn.linear_model import LogisticRegression

In [22]:
logreg = LogisticRegression()

In [23]:
logreg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [26]:
logreg.score(X_train, y_train)

0.6392818280739935

In [27]:
y_pred_logreg = logreg.predict(X_test)

In [29]:
print(classification_report(y_test, y_pred_logreg))

              precision    recall  f1-score   support

         1.0       0.59      0.70      0.64       496
         2.0       0.66      0.54      0.60       471
         3.0       0.88      0.87      0.87       420
         4.0       0.52      0.19      0.27       491
         5.0       0.52      0.69      0.59       532
         6.0       0.60      0.76      0.67       537

    accuracy                           0.62      2947
   macro avg       0.63      0.62      0.61      2947
weighted avg       0.62      0.62      0.60      2947



In [33]:
from sklearn import svm

svm = svm.SVC()

In [34]:
svm.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [35]:
svm.score(X_train, y_train)

0.6711099020674647

In [36]:
y_pred_svm = svm.predict(X_test)

In [37]:
print(classification_report(y_test, y_pred_svm))

              precision    recall  f1-score   support

         1.0       0.57      0.74      0.64       496
         2.0       0.64      0.43      0.51       471
         3.0       0.87      0.89      0.88       420
         4.0       0.44      0.22      0.30       491
         5.0       0.49      0.74      0.59       532
         6.0       0.65      0.63      0.64       537

    accuracy                           0.60      2947
   macro avg       0.61      0.61      0.59      2947
weighted avg       0.60      0.60      0.59      2947



In [38]:
from sklearn.ensemble import RandomForestClassifier

In [39]:
rfc = RandomForestClassifier()

In [40]:
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [41]:
rfc.score(X_train, y_train)

1.0

In [42]:
y_pred_rfc = rfc.predict(X_test)

In [43]:
print(classification_report(y_test, y_pred_rfc))

              precision    recall  f1-score   support

         1.0       0.62      0.69      0.66       496
         2.0       0.62      0.59      0.61       471
         3.0       0.90      0.83      0.86       420
         4.0       0.63      0.49      0.56       491
         5.0       0.66      0.77      0.71       532
         6.0       0.85      0.87      0.86       537

    accuracy                           0.71      2947
   macro avg       0.72      0.71      0.71      2947
weighted avg       0.71      0.71      0.71      2947



In [44]:
# okay i got some basics working with just one set of file and a minimal set of statistical features 
# lemme first expand this to include all three axes of body acceleration data 
# would putting them all into pandas dataframes be the most sensible thing to do? 
# i am 