In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun May 22 18:49:50 2022

@author: poojap

This is an algorithm detecting eating movements. This is calculated taking many
input actions and descerning eating movements from the rest.

It is an algorithm meant to take movement data from a wearable device in a designated time period of seconds,
and determine if the action is eating or non eating.

An accelerometer dataset is provided by watch measurements and gives us x, y, and z coordinates in space.

Time datasets are best analyzed by sectioning off a specific portion as a consequence of some number of seconds.

Windows are extracted with some overlap to ensure continuity even within consecutive windows. The windows
are further analyzed for feature engineering.

The large dataset is minimized via feature engineering. New features are calculated by the extracted windows.
The following features are extracted from each sliding window: 
    mean, median, mode, standard deviation, absolute average deviation, log average,
    square root average, squared average, minimum, maximum, range 
    
The data is considerably reduced after feature enigneering and the data is more appropriate and adept to 
train a classifier network.

A support vector machine (SVM) is used to classify the data as eating and non eating.
The following metrics are used to calculate model performance:
        Accuracy, Precision, Recall


"""

'\nCreated on Sun May 22 18:49:50 2022\n\n@author: poojap\n\nThis is an algorithm detecting eating movements. This is calculated taking many\ninput actions and descerning eating movements from the rest.\n\nIt is an algorithm meant to take movement data from a wearable device in a designated time period of seconds,\nand determine if the action is eating or non eating.\n\nAn accelerometer dataset is provided by watch measurements and gives us x, y, and z coordinates in space.\n\nTime datasets are best analyzed by sectioning off a specific portion as a consequence of some number of seconds.\n\nWindows are extracted with some overlap to ensure continuity even within consecutive windows. The windows\nare further analyzed for feature engineering.\n\nThe large dataset is minimized via feature engineering. New features are calculated by the extracted windows.\nThe following features are extracted from each sliding window: \n    mean, median, mode, standard deviation, absolute average deviation

In [2]:
# To import all necessary packages, please run: pip install -r ./requirements.txt 
!pip install -r ./requirements.txt

from sklearn.svm import SVC
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn import svm, datasets
from sklearn.metrics import *
from mpl_toolkits.mplot3d import Axes3D
from sklearn import datasets
from sklearn.decomposition import PCA
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold 
import os
import shutil
import glob
import statistics
import csv
import matplotlib.pyplot as plt
import math
import numpy as np
from sklearn import datasets
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
import time
from sklearn.model_selection import TimeSeriesSplit



Collecting matplotlib==3.4.3
  Downloading matplotlib-3.4.3-cp39-cp39-macosx_10_9_x86_64.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting numpy==1.20.3
  Using cached numpy-1.20.3-cp39-cp39-macosx_10_9_x86_64.whl (16.1 MB)
Collecting pandas==1.3.4
  Downloading pandas-1.3.4-cp39-cp39-macosx_10_9_x86_64.whl (11.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting scikit_learn==1.1.1
  Using cached scikit_learn-1.1.1-cp39-cp39-macosx_10_13_x86_64.whl (8.6 MB)
Installing collected packages: numpy, pandas, matplotlib, scikit_learn
  Attempting uninstall: numpy
    Found existing installation: numpy 1.21.5
    Uninstalling numpy-1.21.5:
      Successfully uninstalled numpy-1.21.5
  Attempting uninstall: pandas
    Found existing installation: pandas 1.4.4
    Uninstall

In [3]:
def classifyData():
    """
    Takes in numerous .txt files all in one directory, with the filepath: '../DataSetFiles/raw/watch/accel'
    Inserts a header in all the .txt files and saves to a new directory: '../DataSetFiles/HeaderFiles'
    Concatenates all the data in the new folder 'HeaderFiles' and converts to a pandas dataframe
    Returns:
        'df': dataframe pulled directly from all concatenated data
        'dataframe': df converted to Pandas Dataframe
        'dataset': df converted to Numpy Array
        'target': Binary class indicating eating (1) or noneating (0)
        'x': feature of dataset
        'y': feature of dataset
        'z':feature of dataset
        Binary class indicating eating or noneating: 'target'
        
    """
    
    #Pathway of current algorithm
    dir_path = os.getcwd()
    
    #Entering one directory above current algorithm to access dataset files
    path_parent = os.path.dirname(dir_path)
    
    #Pathway for dataset all '.txt' files
    data ='DataSetFiles/raw/watch/accel'

    # Concatening the file paths to access dataset directory
    readData = os.path.join(path_parent, data)

    #Concatening all the .txt files to read in python
    files = os.path.join(readData, "*.txt")

    # list of merged files returned
    joinedfiles = glob.glob(files)

    #Header describing each column in .txt file
    header = ['SubjectID', 'Class', 'TimeStamp', 'x', 'y', 'z']
    
    #New directory name to put files with appended header 'HeaderFiles'
    newdirname = "DataSetFiles/HeaderFiles"
    
    #Create path filename for new directory
    newdir = os.path.join(path_parent, newdirname)
    
    #Remove header directory if it already exists
    if os.path.exists(newdir):
        shutil.rmtree(newdir)
        
    #Make new directory to put files with appended header
    os.makedirs(newdir)

    #Copy all the files from dataset folder to newly created header folder
    for f in joinedfiles:
        shutil.copy(f, newdir)
    
    #Join all the files in new 'HeaderFiles'
    joinnew = os.path.join(newdir, "*.txt")
    
    #List of merged files in 'HeaderFiles'
    newfiles = glob.glob(joinnew)
    
    #Loop to read data already existing in file
    for filename in joinedfiles:
        with open(filename) as infile:
            text = infile.read()
            reader = csv.reader(infile, delimiter=',' )
            
    #Loop to input headers followed by initial data
    for filename in newfiles:
        with open(filename, 'w') as outfile:
            # join the headers into a string with commas and add a newline
            outfile.write(f"{','.join(header)}\n") 
            outfile.write(text)
    
    #Concatenating all the new files with headers from 'HeaderFiles' directory and making a dataframe
    df = pd.concat(map(pd.read_csv, newfiles), ignore_index=True)
    
    #Replace Unknown time values with 0
    df['TimeStamp'] = df['TimeStamp'].replace(np.nan, 0)

    #Converting df dataframe to a Pandas Dataframe
    dataframe = pd.DataFrame(df)
    
    #Filling any unknown values with 0
    dataframe.fillna(0)

    #Convert dataframe to a numpy array
    dataset = dataframe.to_numpy()
    
    #Feature x from column 'x'
    x = df['x']
    
    #Feature y from column 'y'
    y = df['y']
    
    #Feature z from column 'z'
    z = df['z']
    
    #Removing all the semicolons from column 'z'
    df['z'] = df['z'].str.replace(';','')

    #Checking size of each feature to make sure they match
    print("Length of x feature: ", len(x), "Length of z feature: ",len(z))
    
    
    #Creating a new column 'binary_eating' to indicate eating or non-eating in a binary fahsion
    df.insert(loc=6,
          column='binary_eating',
          value=0)
    
    #Creating a list of all the Class values in column 'Class'
    classes = df['Class']
    
    #Copying the data from column 'Class' into column 'binary_eating'    
    df['binary_eating'] = df['Class']
    
    #Classes that indicate eating
    eatingClasses = ['H', 'I', 'J', 'K', 'L']
    
    #Classes that do not indicate eating
    noneatingclasses = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T']
    
    #Replacing all the eating classes with a value of 1
    for value in eatingClasses:
        df.loc[df['Class'] == value, 'binary_eating'] = 1
        
    #Replacing all the non-eating classes with a value of 0
    for nonval in noneatingclasses:
        df.loc[df['Class'] == nonval, 'binary_eating'] = 0

    #Creating a list of all the values in the column 'binary_eating'
    target = df['binary_eating']
    
    print(df.head())

    return df, dataframe, target, x, y, z 


#Call the function to print necessary output
df, dataframe, target, x, y, z = classifyData()

Length of x feature:  8200749 Length of z feature:  8200749
   SubjectID Class         TimeStamp         x         y           z  \
0       1638     A  1138138097322000  7.302415 -5.419930    4.485872   
1       1638     A  1138138117418000  6.540799 -3.321892  0.71371585   
2       1638     A  1138138137546000  3.264412 -2.723137  0.22513184   
3       1638     A  1138138157791000  1.070574 -3.319497   1.3771362   
4       1638     A  1138138177887000 -1.621428 -3.827241   1.0035132   

  binary_eating  
0             0  
1             0  
2             0  
3             0  
4             0  


In [4]:
def extract_windows(array, mHZ, seconds, overlap):
    """
    Function to extract windows across the entire dataset.
    The window size is described by mHZ multiplied by time second window
    The input parameters are the following:
        'array': array (dataframe or list) indicating the feature of interest
        'mHZ': Signal frequency of how many measurments taken a second
        'seconds': How many seconds are taken for the window
        'overlap': How much overlap between consecutive windows
    The function returns a list of windows sectioning the input feature vector:
        'windows': list of arrays of the extracted windows
    """
    #Initializing list holding all windows
    windows = []
    
    #Convert input list/dataframe into numpy array
    array = array.to_numpy()
    
    #Calculate how many datapoints should be in each window
    windowSize = mHZ * seconds
    
    #How many points should overlap between consecutive windows
    overlapamt = int(windowSize* overlap)
    
    #For loop appending each window to the list until no more data points are avaialble
    for i in range(len(array)):
        window = array[ (overlapamt *i) : (overlapamt*i) +windowSize ]
        if len(window)!= 0:
            windows.append(window)
            
    return windows

#Call the function to get necessary output

# 5 second windows

# Cutting the x feature into windows forming a list of arrays. 20 mHz taken per second for 10 second windows
# with 50% overlap
x_windows = extract_windows(x, 20, 5,  .5)

# Cutting the y feature into windows forming a list of arrays. 20 mHz taken per second for 10 second windows
# with 50% overlap
y_windows = extract_windows(y, 20, 5,  .5)

# Cutting the z feature into windows forming a list of arrays. 20 mHz taken per second for 10 second windows
# with 50% overlap
z_windows = extract_windows(z, 20, 5, .5)

# Cutting the target classification list into windows forming a list of arrays. 20 mHz taken per second for 10 second windows
# with 50% overlap
# Done so classes dimensions match that of input features
target_window = extract_windows(target, 20, 5, .5)
    

In [None]:


def featureEngineering(x_windows, y_windows, z_windows, target_windows):
    """
    This function engineers features to reduce dataset complexity and 
    make the data more readable for the model. This reduces the number of data points
    needing to be handled, significantly.
    The input parameters are the following:
        'x_windows': list of windows extracted from x feature
        'y_windows': list of windows extracted from y feature
        'z_windows': list of windows extracted from z feature
        'target_windows': list of windows extracted from target 
    The following features are engineereed and returned in terms of x, y, and z:
        'mean': average of the window
        'median': middle value of the entire window
        'mode': most common value in the window
        'std': standard deviation of the window
        'aad': absolute average deviation of the window
        'range': difference in values in each window
        'minimum': minimum value in the window
        'maximum': maximum value in the window
        'log': log of every value in the window averaged together
        'sqrt': square root of every value in the window averaged together
        'square': every value in the window squared and then averaged
        'sum': sum of all the values in the window
        
    """
    #Convert all string values to float
    vector = np.vectorize(float)
    for j in range(0, len(x_windows)):
        # print("array", j, ": ", x_windows[j])
        x_windows[j] = vector(x_windows[j])
        y_windows[j] = vector(y_windows[j])
        z_windows[j] = vector(z_windows[j])

#Engineer the mean
    #x feature
    x_mean = []
    for i in range(len(x_windows)):
        xavg = np.mean(x_windows[1])
        x_mean.append(xavg)

    #y feature
    y_mean = []
    for i in range(len(y_windows)):
        yavg = np.mean(y_windows[i])
        y_mean.append(yavg)
        
    #z feature 
    z_mean = []
    for i in range(len(z_windows)):
        zavg = np.mean(z_windows[i])
        z_mean.append(zavg)
        
#Engineer standard deviation
    #x std
    x_std = []
    for i in range(len(x_windows)):
        xstd = np.std(x_windows[i])
        x_std.append(xstd)
    #y std
    y_std = []
    for i in range(len(y_windows)):
        ystd = np.std(y_windows[i])
        y_std.append(ystd)
    
    #z std
    z_std = []
    for i in range(len(z_windows)):
        zstd = np.std(z_windows[i])
        z_std.append(zstd)
        
#Engineer absolute average deviation
    x_absavgdev = []
    for i in range(len(x_windows)):
        xaad = np.mean(np.absolute(x_windows[i] - np.mean(x_windows[i])))
        x_absavgdev.append(xaad)
    # np.mean(np.absolute(data - np.mean(data)))
    
    y_absavgdev = []
    for i in range(len(y_windows)):
        yaad = np.mean(np.absolute(y_windows[i] - np.mean(y_windows[i])))
        y_absavgdev.append(yaad)
    
    z_absavgdev = []
    for i in range(len(z_windows)):
        zaad = np.mean(np.absolute(z_windows[i] - np.mean(z_windows[i])))
        z_absavgdev.append(zaad)
        
        
#Engineer minimum value
    x_min = []
    for i in range(len(x_windows)):
        xmin = min(x_windows[i])
        x_min.append(xmin)
        
    y_min = []
    for i in range(len(y_windows)):
        ymin = min(y_windows[i])
        y_min.append(ymin)
        
    z_min = []
    for i in range(len(z_windows)):
        zmin = min(z_windows[i])
        z_min.append(zmin)
        
#Engineer maximum value
    x_max = []
    for i in range(len(x_windows)):
        xmax = min(x_windows[i])
        x_max.append(xmax)
    
    y_max = []
    for i in range(len(y_windows)):
        ymax = min(y_windows[i])
        y_max.append(ymax)
    
    z_max = []
    for i in range(len(z_windows)):
        zmax = min(z_windows[i])
        z_max.append(zmax)
#Median feature

    x_median = []
    for i in range(len(x_windows)):
        xmedian = statistics.median(x_windows[j])
        x_median.append(xmedian)
        
    y_median = []
    for i in range(len(y_windows)):
        ymedian = statistics.median(y_windows[j])
        y_median.append(ymedian)
        
    z_median = []
    print(len(z_windows))
    for i in range(len(z_windows)):
        zmedian = statistics.median(z_windows[i])
        z_median.append(zmedian)
        

#Mode Feature

    x_mode = []
    for i in range(len(x_windows)):
        xstat = statistics.mode(x_windows[i])
        x_mode.append(xstat)
        
    y_mode = []
    for i in range(len(y_windows)):
        ystat = statistics.mode(y_windows[i])
        y_mode.append(ystat)
        
    z_mode = []
    for i in range(len(z_windows)):
        zstat = statistics.mode(z_windows[i])
        z_mode.append(zstat)
        
# Range feature 

    x_range = []
    for i in range(len(x_windows)):
        xrange = np.ptp(x_windows[i])
        x_range.append(xrange)
        
    y_range = []
    for i in range(len(y_windows)):
        yrange = np.ptp(y_windows[i])
        y_range.append(yrange)
        
    z_range = []
    for i in range(len(z_windows)):
        zrange = np.ptp(z_windows[i])
        z_range.append(zrange)
        
        

# Log of data point and then all log(data) averaged together 

    x_log = []
    x_logavg =[]
    for i in range(len(x_windows)):
        xlogarr = x_windows[i]
        for j in range(len(xlogarr)):
            value = xlogarr[j]
            valuesqrt = math.sqrt(abs(value))
            xlogarr[j] = valuesqrt   
        x_log.append(xlogarr)
        x_loga = np.mean(x_log[i])
        x_logavg.append(x_loga)
        
    
    y_log = []
    y_logavg = []
    for i in range(len(y_windows)):
        ylogarr = y_windows[i]
        for j in range(len(ylogarr)):
            value = ylogarr[j]
            valuesqrt = math.sqrt(abs(value))
            ylogarr[j] = valuesqrt  
        y_log.append(ylogarr)
        y_loga = np.mean(y_log[i])
        y_logavg.append(y_loga)
        
        
    z_log = [] 
    z_logavg = []
    for i in range(len(z_windows)):
        zlogarr = z_windows[i]
        for j in range(len(zlogarr)):
            value = zlogarr[j]
            valuesqrt = math.sqrt(abs(value))
            zlogarr[j] = valuesqrt          
        z_log.append(zlogarr)
        z_loga = np.mean(z_log[i])
        z_logavg.append(z_loga)            


# Square root of each datapoint in the window and averaging them together to one value
    # feature x 
    sqrtx = []
    sqrtxavg = []
    for i in range(len(x_windows)):
        xsqrtarr = x_windows[i]
        for j in range(len(xsqrtarr)):
            value = xsqrtarr[j]
            valuesqrt = math.sqrt(abs(value))
            xsqrtarr[j] = valuesqrt
        sqrtx.append(xsqrtarr)
        x_sqrta = np.mean(sqrtx[i])
        sqrtxavg.append(x_sqrta)

    # feature y 
    sqrty = []
    sqrtyavg = []
    for i in range(len(y_windows)):
        ysqrtarr = y_windows[i]
        for j in range(len(ysqrtarr)):
            value = ysqrtarr[j]
            valuesqrt = math.sqrt(abs(value))
            ysqrtarr[j] = valuesqrt        
        sqrty.append(ysqrtarr)
        y_sqrta = np.mean(sqrty[i])
        sqrtyavg.append(y_sqrta)
        
    # feature z
    sqrtz = []
    sqrtzavg = []
    for i in range(len(z_windows)):
        zsqrtarr = z_windows[i]
        for j in range(len(zsqrtarr)):
            value = zsqrtarr[j]
            valuesqrt = math.sqrt(abs(value))
            zsqrtarr[j] = valuesqrt                
        sqrtz.append(zsqrtarr)
        z_sqrta = np.mean(sqrtz[i])
        sqrtzavg.append(z_sqrta)
        
# Square of each datapoint in the window and averaging them to one value
    # x feature
    x_sqrd = []
    x_sqrdavg = []
    for i in range(len(x_windows)):
        xsqrd = x_windows[i]
        for j in range(len(xsqrd)):
            xsqrd[j] = np.square(xsqrd[j])
        x_sqrd.append(xsqrd)
        x_sqrda = np.mean(x_sqrd[i])
        x_sqrdavg.append(x_sqrda)

    # y feature
    y_sqrd = []
    y_sqrdavg = []
    for i in range(len(y_windows)):
        ysqrd = y_windows[i]
        for j in range(len(ysqrd)):
            ysqrd[j] = np.square(ysqrd[j])
        y_sqrd.append(ysqrd)
        y_sqrda = np.mean(y_sqrd[i])
        y_sqrdavg.append(y_sqrda)

    # z feature 
    
    z_sqrd = []
    z_sqrdavg = []
    for i in range(len(z_windows)):
        zsqrd = z_windows[i]
        for j in range(len(zsqrd)):
            zsqrd[j] = np.square(zsqrd[j])
        z_sqrd.append(zsqrd)
        z_sqrda = np.mean(z_sqrd[i])
        z_sqrdavg.append(z_sqrda)

#Sum Feature
    # x feature
    x_sum = []
    for i in range(len(x_windows)):
        xsum = np.sum(x_windows[i])
        x_sum.append(xsum)
        
    # y feature
    y_sum = []
    for i in range(len(y_windows)):
        ysum = np.sum(y_windows[i])
        y_sum.append(ysum)
    
    # z feature
    z_sum = []
    for i in range(len(z_windows)):
        zsum = np.sum(z_windows[i])
        z_sum.append(zsum)
        
    #Target mode to reduce complexity and match feature sizes 
    target_mode = []
    for i in range(len(target_windows)):
        targetv = statistics.mode(target_windows[i])
        target_mode.append(targetv)
        
        
        
    return x_mean, y_mean, z_mean, x_std, y_std, z_std, \
            x_absavgdev, y_absavgdev, z_absavgdev, x_min, y_min, z_min, \
            x_max, y_max, z_max, x_median, y_median, z_median, x_mode, \
                y_mode, z_mode, target_mode, x_range, y_range, z_range \
                    , x_logavg, y_logavg,\
                z_logavg, sqrtxavg, sqrtyavg, sqrtzavg, x_sqrdavg, y_sqrdavg, z_sqrdavg \
                    , x_sum, y_sum, z_sum
    
#Call the function to get necessary output

x_mean, y_mean, z_mean, x_std, y_std, z_std, \
            x_absavgdev, y_absavgdev, z_absavgdev, x_min, y_min, z_min, \
            x_max, y_max, z_max, x_median, y_median, z_median, x_mode, \
                y_mode, z_mode, target_mode, x_range, y_range, z_range, x_logavg, y_logavg,\
                z_logavg, sqrtxavg, sqrtyavg, sqrtzavg, x_sqrdavg, y_sqrdavg, z_sqrdavg \
                    , x_sum, y_sum, z_sum = featureEngineering(x_windows, y_windows, z_windows, target_window)

In [36]:
# def AppendNewFeatures(x_mean, y_mean, z_mean, x_std, y_std, z_std, \
#             x_absavgdev, y_absavgdev, z_absavgdev, x_min, y_min, z_min, \
#             x_max, y_max, z_max, x_median, y_median, z_median, x_mode, \
#                 y_mode, z_mode, target_mode, x_range, y_range, z_range , df):

def AppendNewFeatures(x_mean, y_mean, z_mean, x_std, y_std, z_std, \
        x_absavgdev, y_absavgdev, z_absavgdev, x_min, y_min, z_min, \
        x_max, y_max, z_max, x_median, y_median, z_median, x_mode, \
            y_mode, z_mode, target_mode, x_range, y_range, z_range, x_logavg, y_logavg,\
            z_logavg, sqrtxavg, sqrtyavg, sqrtzavg, x_sqrdavg, y_sqrdavg, z_sqrdavg \
                , x_sum, y_sum, z_sum, df):
    """
    Create a new dataframe with newly engineered features.
    This dataframe will be used in the classification model, and allows for more robust analysis.
    The input parameters are the following:
        All input features are presented in terms of x, y, and z
        'mean': average of the window
        'median': middle value of the entire window
        'mode': most common value in the window
        'std': standard deviation of the window
        'aad': absolute average deviation of the window
        'range': difference in values in each window
        'minimum': minimum value in the window
        'maximum': maximum value in the window
        'log': log of every value in the window averaged together
        'sqrt': square root of every value in the window averaged together
        'square': every value in the window squared and then averaged
        'sum': every value in the window summed
        'df': Initial dataframe to compare with features dataframe
    This function returns:
        'df_features': New dataset with all engineered features
    
    """
   
    #Create a new dataframe with newly engineered feature 'x_mean'
    df_features = pd.DataFrame(data = x_mean, columns = ['x_mean'])
    
    #Append all newly engineered features to the dataframe with an appropriate column name
    df_features['y_mean'] = y_mean
    df_features['z_mean'] = z_mean
    
    df_features['x_std'] = x_std
    df_features['y_std'] = y_std
    df_features['z_std'] = z_std
    
    df_features['x_absavgdev'] = x_absavgdev
    df_features['y_absavgdev'] = y_absavgdev
    df_features['z_absavgdev'] = z_absavgdev
    
    df_features['x_min'] = x_min
    df_features['y_min'] = y_min
    df_features['z_min'] = z_min
    
    df_features['x_max'] = x_max
    df_features['y_max'] = y_max
    df_features['z_max'] = z_max
    
    df_features['x_median'] = x_median
    df_features['y_median'] = y_median
    df_features['z_median'] = z_median
    
    df_features['x_mode'] = x_mode
    df_features['y_mode'] = y_mode
    df_features['z_mode'] = z_mode
    
    df_features['x_logavg'] = x_logavg
    df_features['y_logavg'] = y_logavg
    df_features['z_logavg'] = z_logavg


    df_features['x_sqrtavg'] = sqrtxavg
    df_features['y_sqrtavg'] = sqrtyavg
    df_features['z_sqrtavg'] = sqrtzavg

    df_features['x_sqrdavg'] = x_sqrdavg
    df_features['y_sqrdavg'] = y_sqrdavg
    df_features['z_sqrdavg'] = z_sqrdavg


    df_features['x_sum'] = x_sum
    df_features['y_sum'] = y_sum
    df_features['z_sum'] = z_sum
    
    
    # Comparing the initial dataframe with the feature engineering dataframe
    print(df_features.head())
    print(df.head())
    
    #Comparing the length of the initial dataframe with the featured dataframe
    print("Initial dataframe length: ", np.shape(df))
    print("Features dataframe length: ", np.shape(df_features))

    return df_features

# #Call function to return output
appended_df = AppendNewFeatures(x_mean, y_mean, z_mean, x_std, y_std, z_std, \
                x_absavgdev, y_absavgdev, z_absavgdev, x_min, y_min, z_min, \
                x_max, y_max, z_max, x_median, y_median, z_median, x_mode, \
                    y_mode, z_mode, target_mode, x_range, y_range, z_range, x_logavg, y_logavg,\
                    z_logavg, sqrtxavg, sqrtyavg, sqrtzavg, x_sqrdavg, y_sqrdavg, z_sqrdavg \
                        , x_sum, y_sum, z_sum, df)
     

     x_mean    y_mean    z_mean     x_std     y_std     z_std  x_absavgdev  \
0  4.216959 -8.957493  0.423080  3.963555  4.527882  3.174834     2.840014   
1  4.216959 -8.609281  0.307425  3.795516  4.507444  2.952651     2.704928   
2  4.216959 -8.327435 -0.034680  3.932344  4.269922  3.040833     3.017129   
3  4.216959 -8.239730 -0.172034  3.638833  4.023284  3.183217     2.913560   
4  4.216959 -8.264662 -0.235885  3.563615  3.994478  3.080063     2.713212   

   y_absavgdev  z_absavgdev     x_min  ...  z_logavg  x_sqrtavg  y_sqrtavg  \
0     3.855340     2.516984 -9.101074  ...  1.472255   1.409860   1.683980   
1     3.937267     2.325417 -6.821016  ...  1.408520   1.417566   1.661942   
2     3.669260     2.255652 -6.267766  ...  1.334817   1.422222   1.648142   
3     3.280723     2.425811 -3.949387  ...  1.391960   1.415428   1.653570   
4     3.248390     2.413294 -6.217471  ...  1.411450   1.448140   1.661377   

   z_sqrtavg  x_sqrdavg  y_sqrdavg  z_sqrdavg       x_sum     

In [37]:
def TrainTestData(dataframe, Y):
    """
    Split the dataset to a training and testing set in terms of x (features) and y(classes)
    The input parameters are the following:
        'dataframe': dataframe used as the x variable to form Xtrain set and Xtest set
        'Y': list/array presented as the classification variables to for Ytrain set and Ytest set
    Returns:
        'X_train': X training set used to train the model with features
        'X_test': X testing set used to test model performance
        'Y_train': Y training set used to train the model in terms of classification
        'Y_test': Y testing set used to test the model in terms of classification
    """
    
    # Convert the classification list into a dataframe so the train_test_split function can utilize appropriately
    Y = pd.DataFrame(Y)
    
    # Split the 'dataframe' and 'Y'into a training and testing dataset
    # Outputs a list of the indexes in each set. The test size can be specified with the parameter 'test_size'
    X_trainindex , X_testindex, y_trainindex , y_testindex = train_test_split(dataframe.index,Y.index, \
                                                                              test_size=0.2, random_state= None)
    
    # Take the indexing of X_trainindex and create a new dataframe with the appropriate values for X training set
    #    from the input dataframe
    X_train = dataframe.iloc[X_trainindex] # return dataframe train
    
    # Take the indexing of X_testindex and create a new dataframe with the appropriate values 
    #     for X testing set from input dataframe
    X_test = dataframe.iloc[X_testindex]
    
    # Take the indexing of Y_trainindex and create a new dataframe with the appropriate values 
    #     for Y training set from input Y
    Y_train = Y.iloc[y_trainindex]
    
    # Take the indexing of Y_testindex and create a new dataframe with the appropriate values 
    #     for Y testing set from input Y
    Y_test = Y.iloc[y_testindex]
    
    return X_train, X_test, Y_train , Y_test

# Call function to return output
Xtrain , Xtest, Ytrain, Ytest = TrainTestData(appended_df, target_mode)


In [38]:
def SVM(Xtrain, Xtest, Ytrain, Ytest):
    """ 
    Classification model named Support Vector Machine that is able to distinguish between two-group classification
    problems. After training the model with labeled data for each class/group, the model is able to categorize
    new data with predictions.
    The input parameters are the following:
        'Xtrain': Training set used to train the model with features
        'Xtest': Testing set used to test how model performance as control group
        'Ytrain': Training set used to test the model's features
        'Ytest': Testing set used to test model performance as control group
    The model is trained on given input data of the features engineered, and model performance is tested with
    known data used for testing. Model performance is evaluated using accuracy, precision, F1, and recall.
    Returns:
        'accuracy': Percentage reflecting how much of model output is equivalent to real output
        'precision': Ratio of True positives by All positives. The ability of the classifier
                    to not label a sample positive, when it is negative (label non-eating as non-eating)
        'recall': Ratio of True positives by True Positives + False Negatives.
                    Ratio of ability of the classifier to find all positive samples (label eating as eating)
        'F1': Mean of precision and recall. A value of 1 is optimal, 0 is worst. Calculated by
                        F1 = 2 * (precision * recall) / (precision + recall)
    """
    
    #Flattening Ytrain to pass through SVM 
    Ytrain = Ytrain.values.ravel()
    
    #Flattening Ytest to pass through SVM
    Ytest = Ytest.values.ravel()
    
    print("Check model is running before model initialization")
    # Initalizing model as a Support Vector Classification
    model = SVC()

    # Fit the model to data
    model.fit(Xtrain, Ytrain)
    
    # Make predictions with Xtest
    y_pred = model.predict(Xtest)
    print("Check model is running before accuracy")
    #Accuracy Score comparing model predictions with model output
    accuracy = accuracy_score(Ytest, y_pred)
    print('Model Accuracy score: ', accuracy)    
    
    #Precision score comparing model predictions with model output with binary classification (Eating/Noneating)
    precision = precision_score(Ytest, y_pred, average = 'binary')
    print('\nModel Precision Score: ', precision)
    
    #Recall score comparing model predictions with model output with binary classification (Eating/Noneating)
    recall = recall_score(Ytest, y_pred, average = 'binary')
    print('\nModel Recall Score: ', recall)
    
    #F1 or F-score of model
    F1 = f1_score(Ytest, y_pred, average = 'binary')
    print('\nModel F1 Score: ', F1)
    
    return accuracy, precision, recall, F1
    
#Call function to return output
accuracy, precision, recall, F1 = SVM(Xtrain, Xtest, Ytrain, Ytest)



before model
before accuracy
Model Accuracy score:  0.8996128402889979

Model Precision Score:  0.7503040834057342

Model Recall Score:  0.9537272225289896

Model F1 Score:  0.8398735716022367


In [39]:
def main():
    #Starting timer
    start_time = time.time()
    
    # Calling classifyData() to read data and assign features and classes
    df, dataframe, target, x, y, z = classifyData()
    # Cutting the x feature into windows forming a list of arrays. 20 mHz taken per second for 10 second windows
    # with 50% overlap
    x_windows = extract_windows(x, 20, 10,  .5)
    
    # Cutting the y feature into windows forming a list of arrays. 20 mHz taken per second for 10 second windows
    # with 50% overlap
    y_windows = extract_windows(y, 20, 10,  .5)
    
    # Cutting the z feature into windows forming a list of arrays. 20 mHz taken per second for 10 second windows
    # with 50% overlap
    z_windows = extract_windows(z, 20, 10, .5)
    
    # Cutting the target classification list into windows forming a list of arrays. 20 mHz taken per second for 10 second windows
    # with 50% overlap
    # Done so classes dimensions match that of input features
    target_window = extract_windows(target, 20, 10, .5)
    
    
    # Engineering features from the extracted windows, and creating an appropriate categorical class array
    
    x_mean, y_mean, z_mean, x_std, y_std, z_std, \
        x_absavgdev, y_absavgdev, z_absavgdev, x_min, y_min, z_min, \
        x_max, y_max, z_max, x_median, y_median, z_median, x_mode, \
            y_mode, z_mode, target_mode, x_range, y_range, z_range, x_logavg, y_logavg,\
            z_logavg, sqrtxavg, sqrtyavg, sqrtzavg, x_sqrdavg, y_sqrdavg, z_sqrdavg \
                , x_sum, y_sum, z_sum = featureEngineering(x_windows, y_windows, z_windows, target_window)
            
    # Creating a new dataframe to be input into the classification model with new features
    
#     appended_df = AppendNewFeatures(x_mean, y_mean, z_mean, x_std, y_std, z_std, \
#             x_absavgdev, y_absavgdev, z_absavgdev, x_min, y_min, z_min, \
#             x_max, y_max, z_max, x_median, y_median, z_median, x_mode, \
#                 y_mode, z_mode, target_mode, x_range, y_range, z_range , df)
    
    appended_df = AppendNewFeatures(x_mean, y_mean, z_mean, x_std, y_std, z_std, \
                x_absavgdev, y_absavgdev, z_absavgdev, x_min, y_min, z_min, \
                x_max, y_max, z_max, x_median, y_median, z_median, x_mode, \
                    y_mode, z_mode, target_mode, x_range, y_range, z_range, x_logavg, y_logavg,\
                    z_logavg, sqrtxavg, sqrtyavg, sqrtzavg, x_sqrdavg, y_sqrdavg, z_sqrdavg \
                        , x_sum, y_sum, z_sum, df)
     
    # Splitting the features dataframe and targets to training and testing data
    Xtrain , Xtest, Ytrain, Ytest = TrainTestData(appended_df, target_mode)

    # Calling Support Vector Machine to create a classification model with training and testing data
    accuracy, precision, recall, F1= SVM(Xtrain, Xtest, Ytrain, Ytest)


    
    
    print("--- %s seconds ---" % (time.time() - start_time))
   

    

In [40]:
if __name__ == '__main__':
    main()

Length of x feature:  8200749 Length of z feature:  8200749
   SubjectID Class         TimeStamp         x         y           z  \
0       1638     A  1138138097322000  7.302415 -5.419930    4.485872   
1       1638     A  1138138117418000  6.540799 -3.321892  0.71371585   
2       1638     A  1138138137546000  3.264412 -2.723137  0.22513184   
3       1638     A  1138138157791000  1.070574 -3.319497   1.3771362   
4       1638     A  1138138177887000 -1.621428 -3.827241   1.0035132   

  binary_eating  
0             0  
1             0  
2             0  
3             0  
4             0  
82008
     x_mean    y_mean    z_mean     x_std     y_std     z_std  x_absavgdev  \
0  4.651607 -8.642464  0.194200  3.963171  4.412053  3.116970     2.933002   
1  4.651607 -8.296049 -0.135283  3.752544  4.134614  3.062164     2.865794   
2  4.651607 -8.423248  0.000647  4.016029  4.152091  2.967418     3.008576   
3  4.651607 -8.589977  0.227036  4.285565  4.238946  3.257598     3.331848   
4  

In [41]:
F1

0.8398735716022367