In [1]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [2]:
#helper function to convert 2d array to 1D
def flatten_list(_2d_list):
    flat_list = []
    # Iterate through the outer list
    for element in _2d_list:
        if type(element) is list:
            # If the element is of type list, iterate through the sublist
            for item in element:
                flat_list.append(item)
        else:
            flat_list.append(element)
    return flat_list

### Prepare data so that we have a class-balanced dataset

### Code to Retrive Raw Acceleromter Data

In [22]:
# Import the RAW HAR dataset
x_train_raw_x = []
x_train_raw_y = []
x_train_raw_z = []
y_train_raw = []
x_test_raw_x = []
x_test_raw_y = []
x_test_raw_z = []
y_test_raw = []

x_train_file_raw_x = open('data/UCI-HAR-Dataset/train/Inertial Signals/total_acc_x_train.txt', 'r')
x_train_file_raw_y = open('data/UCI-HAR-Dataset/train/Inertial Signals/total_acc_y_train.txt', 'r')
x_train_file_raw_z = open('data/UCI-HAR-Dataset/train/Inertial Signals/total_acc_z_train.txt', 'r')

y_train_file_raw = open('data/UCI-HAR-Dataset/train/y_train.txt', 'r')

x_test_file_raw_x = open('data/UCI-HAR-Dataset/test/Inertial Signals/total_acc_x_test.txt', 'r')
x_test_file_raw_y = open('data/UCI-HAR-Dataset/test/Inertial Signals/total_acc_y_test.txt', 'r')
x_test_file_raw_z= open('data/UCI-HAR-Dataset/test/Inertial Signals/total_acc_z_test.txt', 'r')

y_test_file_raw = open('data/UCI-HAR-Dataset/test/y_test.txt', 'r')


for x in x_train_file_raw_x:
    x_train_raw_x.append([float(ts) for ts in x.split()])
for x in x_train_file_raw_y:
    x_train_raw_y.append([float(ts) for ts in x.split()])
for x in x_train_file_raw_z:
    x_train_raw_z.append([float(ts) for ts in x.split()])
    
    
for y in y_train_file_raw:
    y_train_raw.append(int(y.rstrip('\n')))
    
for x in x_test_file_raw_x:
    x_test_raw_x.append([float(ts) for ts in x.split()])
for x in x_test_file_raw_y:
    x_test_raw_y.append([float(ts) for ts in x.split()])
for x in x_test_file_raw_z:
    x_test_raw_z.append([float(ts) for ts in x.split()])
    
    
for y in y_test_file_raw:
    y_test_raw.append(int(y.rstrip('\n')))

    
x_train_raw = np.hstack([np.array(x_train_raw_x),np.array(x_train_raw_y),np.array(x_train_raw_z)])
x_test_raw = np.hstack([np.array(x_test_raw_x), np.array(x_test_raw_y), np.array(x_test_raw_z)])
y_train_raw = np.array(y_train_raw)
y_test_raw = np.array(y_test_raw)

#lets have the full data 
x_full_raw = np.append(x_train_raw, x_test_raw, axis=0)
y_full_raw = np.append(y_train_raw, y_test_raw, axis=0)


#reduce dataset so that we have a balanced class dataset
x_subset_raw = []
y_subset_raw = []
di = {1:0,2:0,3:0,4:0,5:0,6:0}
for i, item in enumerate(x_full_raw):
    if di[y_full_raw[i]] < 500:
        x_subset_raw.append(item)
        y_subset_raw.append(y_full_raw[i])
        di[y_full_raw[i]]+=1;
#convert to numpy type
x_subset_raw = np.array(x_subset_raw)
y_subset_raw = np.array(y_subset_raw)
print("Shape of reduced data = ", x_subset_raw.shape)
unique, counts = np.unique(y_subset_raw, return_counts=True)
print("[Label: Count] of reduced data") 
dict(zip(unique, counts))


Shape of reduced data =  (3000, 384)
[Label: Count] of reduced data


{1: 500, 2: 500, 3: 500, 4: 500, 5: 500, 6: 500}

## Training Using Raw Acceleromter Data

In [24]:
X_train, X_test, y_train, y_test = train_test_split(x_subset_raw, y_subset_raw, test_size=0.33, random_state=42)
classifier=svm.SVC()
parameters=[{'kernel': ['rbf'], 'gamma': [0.001, 0.0001], 'C': [1, 10, 100, 1000]}, {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
model=GridSearchCV(classifier,parameters,n_jobs=-1,cv=4,verbose=4)
model.fit(X_train,y_train)
y_pred=model.predict(X_test)
accuracy=accuracy_score(y_test,y_pred)
print('Best Parameters: '+ str(model.best_params_))
print('Accuracy Score: '+ str(accuracy*100) + ' %')
print(classification_report(y_test, y_pred))

Fitting 4 folds for each of 12 candidates, totalling 48 fits
Best Parameters: {'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}
Accuracy Score: 88.18181818181819 %
              precision    recall  f1-score   support

           1       0.79      0.83      0.81       173
           2       0.83      0.83      0.83       148
           3       0.96      0.84      0.90       184
           4       0.81      0.95      0.87       156
           5       0.92      0.85      0.88       165
           6       1.00      1.00      1.00       164

    accuracy                           0.88       990
   macro avg       0.88      0.88      0.88       990
weighted avg       0.89      0.88      0.88       990



## Code To Retrieve Curated Features

In [41]:
# Import the HAR dataset
x_train_file = open('data/UCI-HAR-Dataset/train/X_train.txt', 'r')
y_train_file = open('data/UCI-HAR-Dataset/train/y_train.txt', 'r')

x_test_file = open('data/UCI-HAR-Dataset/test/X_test.txt', 'r')
y_test_file = open('data/UCI-HAR-Dataset/test/y_test.txt', 'r')

# Create empty lists
x_train = []
y_train = []
x_test = []
y_test = []

# Mapping table for classes
labels = {1:'WALKING', 2:'WALKING UPSTAIRS', 3:'WALKING DOWNSTAIRS',
          4:'SITTING', 5:'STANDING', 6:'LAYING'}

# Loop through datasets
for x in x_train_file:
    x_train.append([float(ts) for ts in x.split()])
    
for y in y_train_file:
    y_train.append(int(y.rstrip('\n')))
    
for x in x_test_file:
    x_test.append([float(ts) for ts in x.split()])
    
for y in y_test_file:
    y_test.append(int(y.rstrip('\n')))
    
# Convert to numpy for efficiency
x_train = np.array(x_train)
y_train = np.array(y_train)
x_test = np.array(x_test)
y_test = np.array(y_test)

#lets have the full data 
x_full = np.append(x_train, x_test, axis=0)
y_full = np.append(y_train, y_test, axis=0)


#reduce dataset so that we have a balanced class dataset
x_subset = []
y_subset = []
di = {1:0,2:0,3:0,4:0,5:0,6:0}
for i, item in enumerate(x_full):
    if di[y_full[i]] < 500:
        x_subset.append(item)
        y_subset.append(y_full[i])
        di[y_full[i]]+=1;
#convert to numpy type
x_subset = np.array(x_subset)
y_subset = np.array(y_subset)
print("Shape of reduced data = ", x_subset.shape)
unique, counts = np.unique(y_subset, return_counts=True)
print("[Label: Count] of reduced data") 
dict(zip(unique, counts))


Shape of reduced data =  (3000, 561)
[Label: Count] of reduced data


{1: 500, 2: 500, 3: 500, 4: 500, 5: 500, 6: 500}

### Cross-validation and model testing with all features

In [6]:
X_train, X_test, y_train, y_test = train_test_split(x_subset, y_subset, test_size=0.33, random_state=42)
classifier=svm.SVC()
parameters=[{'kernel': ['rbf'], 'gamma': [0.001, 0.0001], 'C': [1, 10, 100, 1000]}, {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
model=GridSearchCV(classifier,parameters,n_jobs=-1,cv=4,verbose=4)
model.fit(X_train,y_train)
y_pred=model.predict(X_test)
accuracy=accuracy_score(y_test,y_pred)
print('Best Parameters: '+ str(model.best_params_))
print('Accuracy Score: '+ str(accuracy*100) + ' %')
print(classification_report(y_test, y_pred))

Fitting 4 folds for each of 12 candidates, totalling 48 fits
Best Parameters: {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
Accuracy Score: 98.08080808080808 %
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       173
           2       0.99      1.00      1.00       148
           3       1.00      1.00      1.00       184
           4       0.96      0.92      0.94       156
           5       0.93      0.96      0.95       165
           6       1.00      1.00      1.00       164

    accuracy                           0.98       990
   macro avg       0.98      0.98      0.98       990
weighted avg       0.98      0.98      0.98       990



### Understanding Important Features
**Dataset Desctiption**
1. The raw acceleromter signal was separated into **body** and **gravity** acceleration signals using a low pass Butterworth filter with a corner frequency of 0.3 Hz. (```tBodyAcc-XYZ and tGravityAcc-XYZ```)
2. Linear acceleration and angular velocity were derived in time to obtain Jerk signals (```tBodyAccJerk-XYZ and tBodyGyroJerk-XYZ```)
3. Fast Fourier Transform (FFT) was applied to some of these signals producing ```BodyAcc-XYZ, fBodyAccJerk-XYZ, fBodyGyro-XYZ, fBodyAccJerkMag, fBodyGyroMag, fBodyGyroJerkMag```

For each of the above the following statistical features were also computed
```
mean(): Mean value
std(): Standard deviation
mad(): Median absolute deviation 
max(): Largest value in array
min(): Smallest value in array
sma(): Signal magnitude area
energy(): Energy measure. Sum of the squares divided by the number of values. 
iqr(): Interquartile range 
entropy(): Signal entropy
arCoeff(): Autorregresion coefficients with Burg order equal to 4
correlation(): correlation coefficient between two signals
maxInds(): index of the frequency component with largest magnitude
meanFreq(): Weighted average of the frequency components to obtain a mean frequency
skewness(): skewness of the frequency domain signal 
kurtosis(): kurtosis of the frequency domain signal 
bandsEnergy(): Energy of a frequency interval within the 64 bins of the FFT of each window.
angle(): Angle between to vectors
```

Below are the index information for time domain features:<br>

1. x_subset[:,00:40] All statistical components of ```tBodyAcc```
2. x_subset[:,41:80] All statistical components of ```tGravityAcc```
3. x_subset[:,81:120] All statistical components of ```tBodyAccJerk```
4. x_subset[:,121:160] All statistical components of ```tBodyGyro```
5. x_subset[:,161:200] All statistical components of ```tBodyGyroJerk```
5. x_subset[:,201:213] All statistical components of ```tBodyAccMag```
6. x_subset[:,214:226] All statistical components of ```tGravityAccMag```
7. x_subset[:,227:239] All statistical components of ```tBodyAccJerkMag```
8. x_subset[:,240:252] All statistical components of ```tBodyGyroMag```
9. x_subset[:,253:265] All statistical components of ```tBodyGyroJerkMag```

Frequency doamin features follow
1. x_subset[:,265:344] All statistical components of ```fBodyAcc```
2. x_subset[:,345:423] All statistical components of ```fBodyAccJerk```
3. x_subset[:,424:502] All statistical components of ```fBodyGyro```
4. x_subset[:,503:515] All statistical components of ```fBodyAccMag```
5. x_subset[:,516:528] All statistical components of ```fBodyBodyAccJerkMag```
6. x_subset[:,529:541] All statistical components of ```fBodyBodyGyroMag```
7. x_subset[:,542:554] All statistical components of ```fBodyBodyGyroJerkMag```
8. x_subset[:,555:561] Angle Information


**Question 1: Which class predictions remain accurate even after removing Gyroscope data?**

In [36]:
#removes gyroscope features
indicesToRemove =  [[i for i in range(120,200)], [i for i in range(240,265)], [i for i in range(424,502)], [i for i in range(529,554)]]
indicesToRemove = flatten_list(indicesToRemove)
x_subset_wo_gyro = np.delete(x_subset, indicesToRemove, axis=1)

In [37]:
X_train, X_test, y_train, y_test = train_test_split(x_subset_wo_gyro, y_subset, test_size=0.33, random_state=42)
classifier=svm.SVC()
parameters=[{'kernel': ['rbf'], 'gamma': [0.001, 0.0001], 'C': [1, 10, 100, 1000]}, {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
model=GridSearchCV(classifier,parameters,n_jobs=-1,cv=4,verbose=4)
model.fit(X_train,y_train)
y_pred=model.predict(X_test)
accuracy=accuracy_score(y_test,y_pred)
print('Best Parameters: '+ str(model.best_params_))
print('Accuracy Score: '+ str(accuracy*100) + ' %')
print(classification_report(y_test, y_pred))

Fitting 4 folds for each of 12 candidates, totalling 48 fits
Best Parameters: {'C': 1, 'kernel': 'linear'}
Accuracy Score: 96.96969696969697 %
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       173
           2       0.99      1.00      1.00       148
           3       1.00      1.00      1.00       184
           4       0.91      0.89      0.90       156
           5       0.90      0.92      0.91       165
           6       1.00      1.00      1.00       164

    accuracy                           0.97       990
   macro avg       0.97      0.97      0.97       990
weighted avg       0.97      0.97      0.97       990



**Question 2: What will be the affect on accuracy if frequency domain information is removed?** <br>

In [44]:
#removes gyroscope features
indicesToRemove =  [[i for i in range(265,561)]]
indicesToRemove = flatten_list(indicesToRemove)
x_subset_wo_freq = np.delete(x_subset, indicesToRemove, axis=1)

In [46]:
X_train, X_test, y_train, y_test = train_test_split(x_subset_wo_gyro, y_subset, test_size=0.33, random_state=42)
classifier=svm.SVC()
parameters=[{'kernel': ['rbf'], 'gamma': [0.001, 0.0001], 'C': [1, 10, 100, 1000]}, {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
model=GridSearchCV(classifier,parameters,n_jobs=-1,cv=4,verbose=4)
model.fit(X_train,y_train)
y_pred=model.predict(X_test)
accuracy=accuracy_score(y_test,y_pred)
print('Best Parameters: '+ str(model.best_params_))
print('Accuracy Score: '+ str(accuracy*100) + ' %')
print(classification_report(y_test, y_pred))

Fitting 4 folds for each of 12 candidates, totalling 48 fits
Best Parameters: {'C': 1, 'kernel': 'linear'}
Accuracy Score: 96.96969696969697 %
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       173
           2       0.99      1.00      1.00       148
           3       1.00      1.00      1.00       184
           4       0.91      0.89      0.90       156
           5       0.90      0.92      0.91       165
           6       1.00      1.00      1.00       164

    accuracy                           0.97       990
   macro avg       0.97      0.97      0.97       990
weighted avg       0.97      0.97      0.97       990

