In [1]:
import pandas as pd
import numpy as np

In [8]:
# Lift the row cap so displayed DataFrames are rendered in full rather than truncated.
pd.options.display.max_rows = None

In [16]:

# Feature names, one per line. The context manager closes the file handle,
# which a bare open(...).readlines() would leave open.
with open('features.txt', 'r') as features_file:
    features = [line.strip() for line in features_file]

# Lookup table mapping the numeric activity id ('type') to its label ('str').
# A third column is named 'dum' and dropped straight away
# (presumably it absorbs trailing whitespace in the file -- TODO confirm).
activity_labels = pd.read_csv(
    'activity_labels.txt', 
    sep=r'\s+', 
    names=['type', 'str', 'dum']
).drop(labels=['dum'], axis=1)


# Test split: feature matrix and numeric activity ids.
X_test = pd.read_csv('Test/X_test.txt', sep=' ', names=features)

y_test = pd.read_csv('Test/Y_test.txt', sep=' ', names=['type'])
# Left join on the id column so every test row is kept, in its original order,
# even when an id has no entry in activity_labels (its 'str' is then NaN).
# The default inner join would silently drop such rows, leaving
# y_test_readable shorter than X_test.
y_test_readable = y_test.merge(activity_labels, on='type', how='left')

# Training split: feature matrix and numeric activity ids.
X_train = pd.read_csv('Train/X_train.txt', sep=' ', names=features)
y_train = pd.read_csv('Train/Y_train.txt', sep=' ', names=['type'])


In [24]:
# Display the activity id ('type') -> label ('str') lookup table.
activity_labels

Unnamed: 0,type,str
0,1,WALKING
1,2,WALKING_UPSTAIRS
2,3,WALKING_DOWNSTAIRS
3,4,SITTING
4,5,STANDING
5,6,LAYING
6,7,STAND_TO_SIT
7,8,SIT_TO_STAND
8,9,SIT_TO_LIE
9,10,LIE_TO_SIT


In [26]:
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

Evaluate the performance of an out-of-the-box SVC (default hyperparameters): fit on the full training set and score on the test set.

In [28]:
# Baseline: an SVC with default hyperparameters, fitted on the training split.
# The single-column label frame is flattened to the 1-D array that fit() expects.
svc_model = SVC().fit(X_train, y_train.to_numpy().ravel())

# Predicted activity ids for the held-out test split.
svc_y_pred = svc_model.predict(X_test)

In [29]:
# Score the baseline SVC predictions (svc_y_pred) against the true test labels.
# The prediction variable must be svc_y_pred: no `y_pred` is defined in the
# visible notebook, so using that name fails on a fresh kernel.
print(classification_report(y_test, svc_y_pred))
print(confusion_matrix(y_test, svc_y_pred))

              precision    recall  f1-score   support

           1       0.93      0.99      0.96       496
           2       0.92      0.95      0.94       471
           3       0.99      0.91      0.95       420
           4       0.93      0.89      0.91       508
           5       0.91      0.95      0.93       556
           6       1.00      1.00      1.00       545
           7       0.86      0.78      0.82        23
           8       1.00      0.80      0.89        10
           9       0.64      0.88      0.74        32
          10       0.67      0.72      0.69        25
          11       0.82      0.55      0.66        49
          12       0.71      0.56      0.63        27

    accuracy                           0.94      3162
   macro avg       0.87      0.83      0.84      3162
weighted avg       0.94      0.94      0.94      3162

[[489   4   3   0   0   0   0   0   0   0   0   0]
 [ 22 449   0   0   0   0   0   0   0   0   0   0]
 [ 10  26 384   0   0   0   0  

<p>
    Performance from:
    
    Davide Anguita, Alessandro Ghio, Luca Oneto, Xavier Parra and Jorge L. Reyes-Ortiz. Human Activity Recognition on Smartphones using a Multiclass Hardware-Friendly Support Vector Machine. International Workshop of Ambient Assisted Living (IWAAL 2012). Vitoria-Gasteiz, Spain. Dec 2012
    
</p>
<img src='benchmark.png'/>

The scikit-learn SVM classifier (`SVC`) out of the box seems to perform better than what was reported in the paper the original data came from.

Possible reasons are:
* This SVM uses an RBF kernel (the scikit-learn `SVC` default).
* `SVC` makes multiclass predictions from one-vs-one comparisons, whereas the paper uses one-vs-all (one-vs-rest).

In [41]:
# 5-fold cross-validated comparison of SVC kernels with C held at 1.
grid_search = GridSearchCV(
    SVC(),
    dict(C=[1], kernel=['poly', 'rbf', 'sigmoid']),
    cv=5,
)

# Kept as the cell's final expression so the fitted search is echoed as output.
grid_search.fit(X_train, y_train.to_numpy().ravel())

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': [1], 'kernel': ['poly', 'rbf', 'sigmoid']})

In [42]:
def print_results(gs):
    """Print the best result of a parameter search and tabulate every candidate.

    Parameters
    ----------
    gs : object exposing ``best_params_``, ``best_score_`` and ``cv_results_``
        ``cv_results_`` must provide the keys ``'mean_test_score'``,
        ``'std_test_score'`` and ``'params'``.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``cv_results_['params']``, with columns ``stds``
        and ``means`` first, followed by one column per parameter.
    """
    print('Best Params: ', gs.best_params_)
    print('Best score : ', gs.best_score_)

    results = gs.cv_results_
    summary = pd.DataFrame(results['params'])

    # Each insert goes to position 0, so the column inserted last ('stds')
    # ends up leftmost, ahead of 'means'.
    for label, key in (('means', 'mean_test_score'), ('stds', 'std_test_score')):
        summary.insert(0, label, results[key])

    return summary

# Print the best kernel and tabulate the mean/std CV score of every kernel tried.
print_results(grid_search)

Best Params:  {'C': 1, 'kernel': 'poly'}
Best score :  0.9326698605513801


Unnamed: 0,stds,means,C,kernel
0,0.020992,0.93267,1,poly
1,0.019589,0.919021,1,rbf
2,0.019317,0.782805,1,sigmoid


In [43]:
# 5-fold cross-validated sweep over C for the polynomial kernel only.
# Note: this rebinds `grid_search`, replacing the earlier kernel comparison.
grid_search = GridSearchCV(
    SVC(),
    dict(C=[0.1, 0.5, 1], kernel=['poly']),
    cv=5,
)

# Kept as the cell's final expression so the fitted search is echoed as output.
grid_search.fit(X_train, y_train.to_numpy().ravel())

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': [0.1, 0.5, 1], 'kernel': ['poly']})

In [44]:
# Print the best C for the polynomial kernel and tabulate the CV score of each C tried.
print_results(grid_search)

Best Params:  {'C': 1, 'kernel': 'poly'}
Best score :  0.9326698605513801


Unnamed: 0,stds,means,C,kernel
0,0.017392,0.907948,0.1,poly
1,0.02068,0.928936,0.5,poly
2,0.020992,0.93267,1.0,poly


Polynomial Kernel