## Data pre-processing

In [7]:
import pandas as pd
import os, glob
import pickle
import os.path
import numpy as np
#from sklearn.preprocessing import Imputer

def preprocess_data(basepath, infile, outfile, wrt):
    headers = ["timestamp", "activityid", "heartrate", "imu1temp", "imu1ac1_x", "imu1ac1_y", "imu1ac1_z", "imu1ac2_x", "imu1ac2_y", "imu1ac2_z",
               "imu1gy1_x", "imu1gy1_y", "imu1gy1_z", "imu1mag1_x", "imu1mag1_y", "imu1mag1_z", "inv11", "inv12", "inv13", "inv14", "imu2temp",
               "imu2ac1_x", "imu2ac1_y", "imu2ac1_z", "imu2ac2_x", "imu2ac2_y", "imu2ac2_z", "imu2gy1_x", "imu2gy1_y", "imu2gy1_z", "imu2mag1_x",
               "imu2mag1_y", "imu2mag1_z", "inv21", "inv22", "inv23", "inv24", "imu3temp", "imu3ac1_x", "imu3ac1_y", "imu3ac1_z", "imu3ac2_x",
               "imu3ac2_y", "imu3ac2_z", "imu3gy1_x", "imu3gy1_y", "imu3gy1_z", "imu3mag1_x", "imu3mag1_y", "imu3mag1_z", "inv31", "inv32", "inv33",
               "inv34"]
    subject = pd.read_csv(basepath + infile, sep = '\s+', names = headers)
    drop_columns = ["inv11", "inv12", "inv13", "inv14", "inv21", "inv22", "inv23", "inv24", "inv31", "inv32", "inv33", "inv34", "imu1ac2_x", 
                    "imu1ac2_y", "imu1ac2_z", "imu2ac2_x", "imu2ac2_y", "imu2ac2_z", "imu3ac2_x", "imu3ac2_y", "imu3ac2_z"]
    
    
    #Interpolate nans
    subject = subject.astype(float).interpolate(method = 'linear', limit_direction = 'forward', axis = 0)
    #imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    #subject = imp.fit_transform(subject)
    #subject = pd.DataFrame(subject)
    #subject.columns = headers
    subject = subject.drop(drop_columns, axis = 1)
    subject = subject[subject.activityid != 0]
    
    if wrt == 'subject':
        target = subject['activityid']
        subject = subject.drop(['activityid'], axis = 1)
        subject['subjectid'] = int(infile.split('.')[0][7:])
        
    if wrt == 'activity':
        target = infile.split('.')[0][7:]
        
    subject_data = {'data': subject, 'target': target}
    
    #Store processed data into pickle file  
    if wrt == 'subject':
        with open(outfile, 'wb') as file:
            pickle.dump(subject_data, file)
    
    elif wrt == 'activity':
        activities = subject.activityid.unique()
        for activity in activities:
            activity_df = subject.loc[subject['activityid'] == activity]
            activity_df.drop(['activityid'], axis = 1)
            rows_count = activity_df.shape[0]
            activity_target_list = [target] * rows_count
            index = np.array(list(range(rows_count)))
            activity_target = pd.Series(activity_target_list, index.tolist())
            activity_data = {'data': activity_df, 'target': activity_target}
            
            if os.path.exists(basepath + 'activity' + str(int(activity)) + '.pkl'):
                activity_file = open(basepath + 'activity' + str(int(activity)) + '.pkl', 'rb')
                act = pickle.load(activity_file)
                rows = act['data'].shape[0]
                act['data'] = act['data'].append(activity_df)
                index = index + rows
                act['target'] = act['target'].append(activity_target)
                activity_data = act
            
            with open('activity' + str(int(activity)) + '.pkl', 'wb') as file:
                    pickle.dump(activity_data, file)
        
#basepath = os.path.abspath('PAMAP2_Dataset/Protocol/')
#basepath = 'F:/Study/2nd_Semester/AML/Project/Data/PAMAP2_Dataset/Protocol'
basepath = '/content'

os.chdir(basepath)
data_files = glob.glob('*.dat')
old_pickle_files = glob.glob('*.pkl')

for oldfile in old_pickle_files:
    os.remove(oldfile)

for infile in data_files:
    print(infile)
    outfile = infile.split('.')[0] + '.pkl'
    preprocess_data(basepath + '/', infile, outfile, 'subject')
    preprocess_data(basepath + '/', infile, outfile, 'activity')

subject103.dat
subject102.dat
subject101.dat


## Data Windowing

In [8]:
import pickle
import numpy as np
import pandas as pd
import os, glob

def downsample(x,factor):
    n = int(x.shape[0]/factor)*factor
    d1 = x[:n].values.reshape(-1, factor, x.shape[1]).mean(1)
    if x.shape[0] % n == 0: dfn = pd.DataFrame(d1)
    else:
        d2 = x[n:].values.mean(axis = 0).reshape(1,x.shape[1])
        dfn = pd.DataFrame(np.concatenate((d1,d2),axis = 0))
    dfn.columns = x.columns
    return dfn
    
def window_stack(a, width, stepsize=1):
    target = a['target'].iloc[0]
    a = a.drop(['target'], axis = 1)
    a = downsample(a, 10)
    a = a.drop(['timestamp'], axis = 1)
    if a.shape[0] < width:
        return '', pd.DataFrame()
    return target, np.hstack( a[i:1+i-width or None:stepsize] for i in range(0,width))

def getWIndowedDataInContinuousChunks(dataframe):
    new_dataframe = pd.DataFrame()
    startIdx = 0
    idx = startIdx
    size = dataframe.shape[0]
    if size > 489 and round(dataframe.values[0,0] + ((size - 1) * 0.01), 2) == dataframe.values[-1,0]:
        target, df = window_stack(dataframe, 50)
        df =  pd.DataFrame(df)
        df['target'] = target
        return df
    while idx < size - 1:
        if (dataframe['timestamp'].index[idx+1] - dataframe['timestamp'].index[idx]) == 1:
            idx += 1
        else:
            start = dataframe['timestamp'].index[startIdx]
            end = dataframe['timestamp'].index[idx]
            df = dataframe.loc[start : end - 1, : ]
            target, df = window_stack(df, 50)
            df = pd.DataFrame(df)
            startIdx = idx + 1
            idx = startIdx
            if df.shape[0] > 0:
                df['target'] = target
                new_dataframe = new_dataframe.append(df)
    return new_dataframe

def getChunk(file, outfile):
    new_df = pd.DataFrame()
    pklFile = open(file, 'rb')
    data_from_pickle = pickle.load(pklFile)
    target = data_from_pickle['target']
    data = data_from_pickle['data']
    if file[0] == 's':
        data = data.drop(['subjectid'], axis = 1)
    elif file[0] == 'a':
        data = data.drop(['activityid'], axis = 1)
    groups = target.unique()
    data['target'] = target.values
    outdf = pd.DataFrame()
    for group in groups:
        df = data.loc[data['target'] == group]
        df = df.sort_values(by=['timestamp'])
        df = getWIndowedDataInContinuousChunks(df)
        new_df = new_df.append(df)
    outdf = outdf.append(new_df)
    with open(outfile, 'wb') as file:
            pickle.dump(outdf, file)

#basepath = os.path.abspath('../../Data/PAMAP2_Dataset/Protocol/')
#basepath = 'F:/Study/2nd_Semester/AML/Project/Data/PAMAP2_Dataset/Protocol'
basepath = '/content'

os.chdir(basepath)

old_pickle_files = glob.glob('windowed*.pkl')
for oldfile in old_pickle_files:
    os.remove(oldfile)

pickle_files = glob.glob('*.pkl')

for file in pickle_files:
    print('Processing', file)
    outfile = 'windowed_' + file 
    getChunk(file, outfile)

Processing activity7.pkl
Processing activity16.pkl
Processing activity5.pkl
Processing activity4.pkl
Processing activity12.pkl
Processing subject101.pkl
Processing subject102.pkl
Processing activity6.pkl
Processing subject103.pkl
Processing activity1.pkl
Processing activity13.pkl
Processing activity2.pkl
Processing activity3.pkl
Processing activity24.pkl
Processing activity17.pkl


## LOSO and nn Model training and testing

In [3]:
import os, glob, pickle
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

def preprocess_dataframe(data,split=False):
    y=data['target'].values
    data=data.drop(['target'],axis=1)
    y=y.astype(np.int)
    X=data.values
    data=None
    X=preprocessing.normalize(X)
    if split:
        return train_test_split(X,y)
    else:
        return X,y

def Run_LOSO():
    #basepath = os.path.abspath('../Data/PAMAP2_Dataset/Protocol/')
    #basepath = 'F:/Study/2nd_Semester/AML/Project/Data/PAMAP2_Dataset/Protocol'
    basepath = '/content'
    os.chdir(basepath)
    subject_files = glob.glob('windowed_subject*.pkl')
    for i in range(len(subject_files)):
        print(subject_files[i])
        temp_file = np.copy(subject_files).tolist()
        pklfile = open(subject_files[i], 'rb')
        test_data = pickle.load(pklfile)
        X_test,y_test=preprocess_dataframe(test_data)
        temp_file.remove(subject_files[i])
        train_data = pd.DataFrame()
        for file in temp_file:
            pklfile = open(file, 'rb')
            data_from_pickle = pickle.load(pklfile)
            train_data = train_data.append(data_from_pickle) 
        X_train,y_train = preprocess_dataframe(train_data)
        
        sess = tf.Session()
        X_train = tf.convert_to_tensor(X_train, dtype=tf.float32)
        y_train = tf.convert_to_tensor(y_train, dtype=tf.int32)
        
        X_test = tf.convert_to_tensor(X_test, dtype=tf.float32)
        y_test = tf.convert_to_tensor(y_test, dtype=tf.int32)
        
        X_train = sess.run(X_train)
        y_train = tf.cast(y_train, tf.int32)
        y_train = tf.one_hot(y_train, len(np.unique(y_train)))
        y_train = sess.run(y_train)
        
        X_test = sess.run(X_test)
        y_test = tf.cast(y_test, tf.int32)
        y_test = tf.one_hot(y_test, len(np.unique(y_test)))
        y_test = sess.run(y_test)
        
        #print(type(X_train), type(X_test), type(y_train), type(y_test))
        nn(X_train, y_train, X_test, y_test)

def nn(X_train, y_train, X_test, y_test):
    print('inside nn')
    training_epochs = 5
    n_neurons_in_h1 = 10
    n_neurons_in_h2 = 10
    learning_rate = 0.01
    n_features = X_train.shape[1]
    n_classes = len(np.unique(y_train))
    
    X = tf.placeholder(tf.float32, [None, n_features], name='features')
    Y = tf.placeholder(tf.float32, [None, n_classes], name='labels')        
    
    W1 = tf.Variable(tf.truncated_normal([n_features, n_neurons_in_h1], mean=0, stddev=1 / np.sqrt(n_features)), name='weights1')
    b1 = tf.Variable(tf.truncated_normal([n_neurons_in_h1],mean=0, stddev=1 / np.sqrt(n_features)), name='biases1')
    y1 = tf.nn.tanh((tf.matmul(X, W1)+b1), name='activationLayer1')
    
    #network parameters(weights and biases) are set and initialized(Layer2)
    W2 = tf.Variable(tf.random_normal([n_neurons_in_h1, n_neurons_in_h2],mean=0,stddev=1/np.sqrt(n_features)),name='weights2')
    b2 = tf.Variable(tf.random_normal([n_neurons_in_h2],mean=0,stddev=1/np.sqrt(n_features)),name='biases2')
    #activation function(sigmoid)
    y2 = tf.nn.sigmoid((tf.matmul(y1,W2)+b2),name='activationLayer2')
    
    #output layer weights and biasies
    Wo = tf.Variable(tf.random_normal([n_neurons_in_h2, n_classes], mean=0, stddev=1/np.sqrt(n_features)), name='weightsOut')
    bo = tf.Variable(tf.random_normal([n_classes], mean=0, stddev=1/np.sqrt(n_features)), name='biasesOut')
    #activation function(softmax)
    a = tf.nn.softmax((tf.matmul(y2, Wo) + bo), name='activationOutputLayer')
    
    #cost function
    cross_entropy = tf.reduce_mean(-tf.reduce_sum(Y * tf.log(a),reduction_indices=[1]))
    
    #optimizer
    train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(cross_entropy)
    
    #compare predicted value from network with the expected value/target
    correct_prediction = tf.equal(tf.argmax(a, 1), tf.argmax(Y, 1))

    #accuracy determination
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="Accuracy")
    
    # initialization of all variables
    initial = tf.global_variables_initializer()
    
    #creating a session
    with tf.Session() as sess:
        sess.run(initial)
        merged_summary = tf.summary.merge_all()
        
        # training loop over the number of epoches
        batchsize=10000
        for epoch in range(training_epochs):
            if len(X_train)%batchsize == 0:
                Totalbatches = int(len(X_train)/batchsize)
            else:
                Totalbatches = int(len(X_train)/batchsize + 1)
                
            for i in range(Totalbatches):
                start=i
                end=i+batchsize
                x_batch=X_train[start:end]
                y_batch=y_train[start:end]
                
                # feeding training data/examples
                sess.run(train_step, feed_dict={X:x_batch , Y:y_batch})
                i+=batchsize
            
            # feeding testing data to determine model accuracy
            y_pred = sess.run(tf.argmax(a, 1), feed_dict={X: X_test})
            y_true = sess.run(tf.argmax(y_test, 1))

            acc = sess.run([accuracy], feed_dict={X: X_test, Y: y_test})

            # print accuracy for each epoch
            print('epoch',epoch, acc)
            print ('---------------')
            #print(y_pred, y_true)
    
Run_LOSO()



windowed_subject103.pkl
inside nn
epoch 0 [1.0]
---------------
epoch 1 [1.0]
---------------
epoch 2 [1.0]
---------------
epoch 3 [1.0]
---------------
epoch 4 [1.0]
---------------
windowed_subject102.pkl
inside nn
epoch 0 [1.0]
---------------
epoch 1 [1.0]
---------------
epoch 2 [1.0]
---------------
epoch 3 [1.0]
---------------
epoch 4 [1.0]
---------------
windowed_subject101.pkl
inside nn
epoch 0 [1.0]
---------------
epoch 1 [1.0]
---------------
epoch 2 [1.0]
---------------
epoch 3 [1.0]
---------------
epoch 4 [1.0]
---------------
