In [33]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy

from sklearn import svm
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

from numpy import asarray as arr
from numpy import atleast_2d as twod

In [34]:
#Helper Functions

#To shuffle data
def shuffleData(X, Y=None):
    """
    Shuffle (randomly reorder) data in X and Y.

    Parameters
    ----------
    X : MxN numpy array: N feature values for each of M data points
    Y : Mx1 numpy array (optional): target values associated with each data point

    Returns
    -------
    X,Y  :  (tuple of) numpy arrays of shuffled features and targets
            only returns X (not a tuple) if Y is not present or None
    
    Ex:
    X2    = shuffleData(X)   : shuffles the rows of the data matrix X
    X2,Y2 = shuffleData(X,Y) : shuffles rows of X,Y, preserving correspondence
    """
    nx,dx = twod(X).shape
    Y = arr(Y).flatten()
    ny = len(Y)

    np.random.seed(0)
    pi = np.random.permutation(nx)
    X = X[pi,:]

    if ny > 0:
        assert ny == nx, 'shuffleData: X and Y must have the same length'
        Y = Y[pi] if Y.ndim <= 1 else Y[pi,:]
        return X,Y

    return X

#No of seconds considered for window size
no_of_sec = 10;

#finds Mean of data given
def findMean(data, window_size = no_of_sec*50, stride =25):
    return ([np.mean(data[i:i+window_size]) for i in range(0,len(data),stride) if i+window_size<=len(data)])

#finds Median of data given
def findMedian(data, window_size = no_of_sec*50, stride =25):
    return ([np.median(data[i:i+window_size]) for i in range(0,len(data),stride) if i+window_size<=len(data)])

#finds Standard Deviation of data given
def findStd(data, window_size = no_of_sec*50, stride =25):
    return ([np.std(data[i:i+window_size]) for i in range(0,len(data),stride) if i+window_size<=len(data)])

In [35]:
#Read data from CSV datafile
dataset = pd.read_excel("data.xlsx", nrows=35000, indexcols=[0-19]) 
array_data = np.array(dataset);

In [36]:
#Values of activity 1 (Sitting)
x_1 = array_data[:,2];
y_1 = array_data[:,3];
z_1 = array_data[:,4];

#Values of activity 2 (Sleeping)
x_2 = array_data[:,7];
y_2 = array_data[:,8];
z_2 = array_data[:,9];

#Values of activity 3 (Standing)
x_3 = array_data[:,12];
y_3 = array_data[:,13];
z_3 = array_data[:,14];

#Values of activity 4 (Walking)
x_4 = array_data[:,17];
y_4 = array_data[:,18];
z_4 = array_data[:,19];

In [37]:
#Finding mean along each axis; appending results for all activities in a single column
x_mean = findMean(x_1) + findMean(x_2) + findMean(x_3) + findMean(x_4)
y_mean = findMean(y_1) + findMean(y_2) + findMean(y_3) + findMean(y_4)
z_mean = findMean(z_1) + findMean(z_2) + findMean(z_3) + findMean(z_4)

In [38]:
#Finding median along each axis; appending results for all activities in a single column
x_median = findMedian(x_1) + findMedian(x_2) + findMedian(x_3) + findMedian(x_4)
y_median = findMedian(y_1) + findMedian(y_2) + findMedian(y_3) + findMedian(y_4)
z_median = findMedian(z_1) + findMedian(z_2) + findMedian(z_3) + findMedian(z_4)

In [39]:
#Finding standard deviation along each axis; appending results for all activities in a single column
x_std = findStd(x_1) + findStd(x_2) + findStd(x_3) + findStd(x_4)
y_std = findStd(y_1) + findStd(y_2) + findStd(y_3) + findStd(y_4)
z_std = findStd(z_1) + findStd(z_2) + findStd(z_3) + findStd(z_4)

In [40]:
#Preparing data for training the model
data = pd.DataFrame(data={'x_mean':x_mean, 'y_mean':y_mean, 'z_mean':z_mean,
                              'x_med':x_median, 'y_med': y_median,'z_med':z_median,
                               'x_std': x_std, 'y_std':y_std, 'z_std':z_std})
#To remove any null values
data = data.fillna(method='ffill')
data_X = np.array(data)
#Preparing the classes column for training
no_of_repeats = len(x_mean)/4;    #number of feature rows extracted for each activity
data_Y = np.repeat(1,no_of_repeats).tolist() + np.repeat(2,no_of_repeats).tolist() + np.repeat(3,no_of_repeats).tolist()+ np.repeat(4,no_of_repeats).tolist()

In [41]:
#Shuffling the data so the rows pertaining to each data are distributed in training and test data equally (on an average)
data_X, data_Y = shuffleData(data_X, data_Y)

ind = int(np.round(0.8*len(data_X)))
#Splitting training data as the first 80% of data
train_X = data_X[:ind]
train_Y = data_Y[:ind]

#Splitting test data as the last 20% of data
test_X = data_X[ind:]
test_Y = data_Y[ind:]

In [42]:
#Train a model and predict results
def perform_model(clf, test_X = test_X, test_Y = test_Y):
    clf.fit(train_X, train_Y)
    #Predicting class for test and training data
    pred_Y=clf.predict(test_X)
    pred_Y_train = clf.predict(train_X)

    errTrain=0
    for j in range(0,len(train_Y)):
        if train_Y[j]!=pred_Y_train[j]: #predicted!=actual
            errTrain+=1
    print "Error rate for training data: ", 100*errTrain/len(train_Y)
    
    errTest=0
    for j in range(0,len(test_Y)):
        if test_Y[j]!=pred_Y[j]: #predicted!=actual
            errTest+=1
    print "Error rate for training data: ", 100*errTest/len(test_Y)

    print "Score: ", clf.score(test_X,test_Y);
    print "Accuracy score: ", accuracy_score(test_Y, pred_Y)

In [45]:
#SVM Classifier
print "SVM Classifier results:"
clf = svm.SVC(kernel = 'rbf')
perform_model(clf);

SVM Classifier results:
Error rate for training data:  23
Error rate for training data:  24
Score:  0.759298245614
Accuracy score:  0.759298245614


In [44]:
#Decision Tree
print "\nDecision Tree results:"
clf = tree.DecisionTreeClassifier(max_depth=5)
perform_model(clf);

#KNN Classifier
print "\nKNN Classifier results:"
clf = KNeighborsClassifier(n_neighbors = 5)
perform_model(clf);


Decision Tree results:
Error rate for training data:  21
Error rate for training data:  22
Score:  0.774736842105
Accuracy score:  0.774736842105

KNN Classifier results:
Error rate for training data:  16
Error rate for training data:  27
Score:  0.724912280702
Accuracy score:  0.724912280702
