# SVM Model for Human Activity Recognition of WISDM Dataset


## A. 
### 1. Read data

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
%matplotlib inline

class Reader:
    def __init__(self, path, mode='f'):
        
        self.df = self.wrapper(path,mode)
    
    # read arff file
    def readarff(self, filename,collect=True): #collect if you need to collect attribute names
        with open(filename) as f:
            content = f.read().splitlines()
        data = False
        metalist = [] # storets metadata as list of rows
        datalist = [] # store data as list of rows
        
        # read data line-by-line
        for line in content:
            if data == True:
                line = line.split(",")
                datalist.append(line)
            elif line == "@data":
                data = True # read lines before "@data" as metadata and after as data
            else:
              # clean up metadata header
              if collect:
                line = line.replace(' "', ".")
                line = line.replace('" ', ".")
                line = line.replace(" ","")
                line = line.split(".")
                if len(line)==3: #ignore first two lines of file
                    line = line[1:3] #remove repetitive "@attribute"
                    metalist.append(line)
        
        # create dataframes from lists of rows
        if not collect:
            dataframe = pd.DataFrame(datalist,dtype=float)
            return dataframe
        else:
            dataframe = pd.DataFrame(datalist,dtype=float)
            metaframe = pd.DataFrame(metalist,columns=["attribute","description"])
            attributes = metaframe["attribute"].rename("SAMPLE")
            return dataframe, attributes
    
    def readdirectory(self, path,quiet=False): # make sure path ends in a slash
        alldata = []
        count = 0
        for filename in os.listdir(path):
            if filename.endswith(".arff"):
                if count == 0: #only collect attributes once
                    if not quiet:
                          print("processing "+filename+"; collecting attribute names")
                    dataframe, attributes = self.readarff(path+filename)
                    alldata.append(dataframe)
                else:
                    if not quiet:
                        print("processing "+filename)
                    dataframe = self.readarff(path+filename,collect=False)
                    alldata.append(dataframe)
                count += 1
                continue
            else:
                continue
        if not quiet:
            print("Concatenating data")
        alldata = pd.concat(alldata).reset_index(drop=True) #reset indices so it is continuous
        alldata.columns = attributes #assign column names
        return alldata
    
    def wrapper(self, path, mode='f'):
        if mode == 'f':
            try:
                df =  self.readarff(path, collect = True)
                return df
            except:
                print("make sure you inputted the correct arff FILE path")
        elif mode == 'd':
            try:
                df =  self.readdirectory(path, quiet=True)
                return df
            except:
                print("make sure you inputted the correct arff DIRECTORY path, ending with a slash")
        else:
            print("mode must either be 'f' or 'd'")
            return 0

In [2]:
phone_accel = Reader("phone_accel/",mode='d').df
phone_accel

SAMPLE,ACTIVITY,X0,X1,X2,X3,X4,X5,X6,X7,X8,...,ZMFCC11,ZMFCC12,XYCOS,XZCOS,YZCOS,XYCOR,XZCOR,YZCOR,RESULTANT,class
0,A,0.060,0.110,0.215,0.255,0.240,0.070,0.030,0.010,0.005,...,0.550898,0.543819,0.751094,-0.005809,-0.376951,0.383184,0.377359,-0.103380,10.74990,1610.0
1,A,0.050,0.130,0.170,0.220,0.285,0.090,0.020,0.020,0.010,...,0.545153,0.538148,0.741898,0.069865,-0.368142,0.361264,0.488030,-0.070615,10.86330,1610.0
2,A,0.070,0.135,0.165,0.250,0.190,0.130,0.025,0.015,0.010,...,0.580717,0.573254,0.689070,0.227904,-0.299957,0.255459,0.568801,-0.134642,10.97220,1610.0
3,A,0.075,0.145,0.140,0.205,0.275,0.105,0.010,0.025,0.020,...,0.578014,0.570586,0.686601,0.244267,-0.280844,0.259784,0.567380,-0.128378,10.98140,1610.0
4,A,0.085,0.145,0.145,0.175,0.285,0.100,0.030,0.020,0.005,...,0.568111,0.560811,0.700471,0.057924,-0.399338,0.311718,0.398679,-0.245695,10.87960,1610.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23069,S,0.085,0.870,0.045,0.000,0.000,0.000,0.000,0.000,0.000,...,0.301115,0.297245,0.819750,0.843326,0.885157,-0.240015,0.404147,-0.194501,9.86525,1623.0
23070,S,0.045,0.930,0.025,0.000,0.000,0.000,0.000,0.000,0.000,...,0.250726,0.247505,0.882858,0.843529,0.924976,-0.375267,0.105853,-0.347694,9.83254,1623.0
23071,S,0.115,0.850,0.035,0.000,0.000,0.000,0.000,0.000,0.000,...,0.314565,0.310523,0.846374,0.783956,0.816690,-0.096962,0.283083,-0.055892,9.85928,1623.0
23072,S,0.085,0.850,0.060,0.005,0.000,0.000,0.000,0.000,0.000,...,0.353186,0.348647,0.739531,0.765779,0.813776,0.034647,0.414515,-0.073013,9.88992,1623.0


### 2. Preprocess data

In [3]:
# FORMAT DATA
# remove ACTIVITY, RESULTANT, MFCC values, COS values, and CORRELATION values (same as publication)
X = phone_accel[['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9',
       'Y0', 'Y1', 'Y2', 'Y3', 'Y4', 'Y5', 'Y6', 'Y7', 'Y8', 'Y9', 'Z0', 'Z1',
       'Z2', 'Z3', 'Z4', 'Z5', 'Z6', 'Z7', 'Z8', 'Z9', 'XAVG', 'YAVG', 'ZAVG',
       'XPEAK', 'YPEAK', 'ZPEAK', 'XABSOLDEV', 'YABSOLDEV', 'ZABSOLDEV',
       'XSTANDDEV', 'YSTANDDEV', 'ZSTANDDEV', 'XVAR', 'YVAR', 'ZVAR', 'RESULTANT']]
columns = X.columns
# FORMAT LABELS
# TODO, group similar activities
conversions = {'A':0, 'B':1, 'C':2, 'D':3, 'E':4, 'F':5, 'G':6, 'H':7, 'I':8, 
               'J':9, 'K':10, 'L':11, 'M':12, 'O':13, 'P':14, 'Q':15, 'R':16, 'S':17}
y_raw = phone_accel['ACTIVITY']
y = []
# convert letters to 1's and 0's in a matrix
for i in y_raw:
    j = conversions[i] # convert activity letter to a number, j
    row = [0]*18 # initialize an array of zeros
    row[j]=1 # set the j'th item in the row to 1, indicating that the sample belongs to the j'th category
    y.append(row)
y = pd.DataFrame(y)
y.columns = conversions.keys()
print("Class labels:")
y.head()

Class labels:


Unnamed: 0,A,B,C,D,E,F,G,H,I,J,K,L,M,O,P,Q,R,S
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [4]:
from sklearn.preprocessing import MinMaxScaler

# Scale data
scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(X))
X.columns = columns
print("Attribute values:")
X.head()

Attribute values:


SAMPLE,X0,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,XABSOLDEV,YABSOLDEV,ZABSOLDEV,XSTANDDEV,YSTANDDEV,ZSTANDDEV,XVAR,YVAR,ZVAR,RESULTANT
0,0.06,0.11,0.215,0.255,0.24,0.07,0.03,0.033898,0.029412,0.014925,...,0.193351,0.181758,0.191292,0.201925,0.179769,0.242555,0.43898,0.414337,0.479647,0.117847
1,0.05,0.13,0.17,0.22,0.285,0.09,0.02,0.067797,0.058824,0.014925,...,0.207436,0.177501,0.194296,0.211756,0.17584,0.247004,0.449987,0.409602,0.484254,0.125911
2,0.07,0.135,0.165,0.25,0.19,0.13,0.025,0.050847,0.058824,0.029851,...,0.224824,0.183938,0.210485,0.228667,0.182259,0.290345,0.468341,0.417311,0.527131,0.133655
3,0.075,0.145,0.14,0.205,0.275,0.105,0.01,0.084746,0.117647,0.0,...,0.231089,0.184034,0.204281,0.231104,0.184393,0.290078,0.47093,0.419843,0.526878,0.134309
4,0.085,0.145,0.145,0.175,0.285,0.1,0.03,0.067797,0.029412,0.029851,...,0.231962,0.174996,0.182487,0.229676,0.177651,0.247354,0.469416,0.411791,0.484616,0.12707


### 3. Train model with SVC


In [6]:
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import train_test_split

pca = PCA(n_components=15)
pca.fit(X)
# print(sum(pca.explained_variance_ratio_))
X = pca.fit_transform(X)
y = phone_accel['ACTIVITY'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

params_grid = [{'kernel': ['rbf'], 'gamma': [10,1,1e-1,1e-2],
                     'C': [10, 100,500,1000]}]
svc = GridSearchCV(SVC(), params_grid, cv=5)
svc.fit(X_train, y_train)

# View the best parameters for the model found using grid search
print('Best C:',svc.best_estimator_.C) 
print('Best Kernel:',svc.best_estimator_.kernel)
print('Best Gamma:',svc.best_estimator_.gamma)

Best C: 1000
Best Kernel: rbf
Best Gamma: 10


### 4. Evaluate model performance

In [7]:
preds = svc.predict(X_test)
acc_svc = (preds == y_test).sum().astype(float) / len(preds)*100

print("Scikit-Learn's Support Vector Machine Classifier's prediction accuracy is: %3.2f" % (acc_svc))


Scikit-Learn's Support Vector Machine Classifier's prediction accuracy is: 80.70
