**Objective**: Exploring methods for building a model for identifying eating activity in Capture24

In [66]:
import os
import numpy as np
import pandas as pd
from glob import glob
import scipy.stats as stats
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from joblib import Parallel, delayed
import urllib
import shutil
from tqdm.auto import tqdm
import utils  # helper functions -- check out utils.py
import zipfile
import re

# For reproducibility
np.random.seed(42)

### Load data

In [5]:
def load_all_and_make_windows(datafiles, N=999):

    def worker(datafile):
        X, Y, T = utils.make_windows(utils.load_data(datafile), winsec=30)
        pid = os.path.basename(datafile).split(".")[0]  # participant ID
        pid = np.asarray([pid] * len(X))
        return X, Y, T, pid

    results = Parallel(n_jobs=4)(
        delayed(worker)(datafile) for datafile in tqdm(datafiles[:N])
    )

    X = np.concatenate([result[0] for result in results])
    Y = np.concatenate([result[1] for result in results])
    T = np.concatenate([result[2] for result in results])
    pid = np.concatenate([result[3] for result in results])

    return X, Y, T, pid

In [6]:
# get all accelerometer data files

datafiles = os.path.expanduser("~/capture24/accelerometer/P[0-9][0-9][0-9].csv.gz")
X, Y, T, pid = load_all_and_make_windows(glob(datafiles))

# save the arrays for later
outputpath = os.path.expanduser("~/eating_detect/data/")
os.makedirs(outputpath + "processed_data/", exist_ok=True)
np.save(outputpath + "processed_data/X.npy", X)
np.save(outputpath + "processed_data/Y.npy", Y)
np.save(outputpath + "processed_data/T.npy", T)
np.save(outputpath + "processed_data/pid.npy", pid)


  0%|          | 0/151 [00:00<?, ?it/s]

In [71]:
# Load processed files
X = np.load(outputpath + 'processed_data/X.npy', mmap_mode='r')
Y = np.load(outputpath + 'processed_data/Y.npy')
T = np.load(outputpath + 'processed_data/T.npy')
pid = np.load(outputpath + 'processed_data/pid.npy')


In [10]:
def extract_features(xyz):
    ''' Extract features. xyz is an array of shape (N,3) '''

    feats = {}
    feats['xMean'], feats['yMean'], feats['zMean'] = np.mean(xyz, axis=0)
    feats['xStd'], feats['yStd'], feats['zStd'] = np.std(xyz, axis=0)
    v = np.linalg.norm(xyz, axis=1)  # magnitude stream
    feats['mean'], feats['std'] = np.mean(v), np.std(v)

    return feats

# Extract features
X_feats = pd.DataFrame(Parallel(n_jobs=4)(delayed(extract_features)(x) for x in tqdm(X)))
X_feats.to_pickle(outputpath + 'processed_data/X_feats.pkl')
print(X_feats)


  0%|          | 0/312730 [00:00<?, ?it/s]

           xMean     yMean     zMean      xStd      yStd      zStd      mean  \
0      -0.472182 -0.534437  0.655677  0.006469  0.006348  0.003302  0.968795   
1      -0.475536 -0.531443  0.656365  0.007745  0.004139  0.002418  0.969249   
2      -0.477615 -0.530214  0.656677  0.007741  0.004491  0.003096  0.969811   
3      -0.478286 -0.530208  0.656646  0.007620  0.004762  0.003020  0.970119   
4      -0.477083 -0.530458  0.656750  0.007806  0.004137  0.003163  0.969732   
...          ...       ...       ...       ...       ...       ...       ...   
312725  0.108823  0.343010  0.813089  0.004272  0.003462  0.005327  0.889182   
312726  0.109135  0.342750  0.813104  0.003862  0.003888  0.005163  0.889134   
312727  0.108943  0.342651  0.813172  0.003960  0.004016  0.005083  0.889135   
312728  0.109828  0.342396  0.812891  0.004674  0.005394  0.005798  0.888900   
312729  0.109615  0.342552  0.813000  0.003904  0.004235  0.005030  0.889022   

             std  
0       0.004515  
1

In [72]:
# As before, let's map the text annotations to simplified labels
eat_indices = np.array([index for index, element in enumerate(Y) if 'eat' in element])

# Let's load the dictionary that maps the text labels to simplified labels
# and apply it to the Y array

# Load the dictionary
label_dict_path = os.path.expanduser("~/capture24/annotation-label-dictionary.csv")
anno_label_dict = pd.read_csv(
    label_dict_path,
    index_col='annotation', 
    dtype='string'
)

# remove the last bit of string after the last ";" in Y
#pattern = ";MET\ .*"
#Y = np.array([re.sub(pattern, '', element) for element in Y])


# apply the dictionary to simplify the labels
Y = np.array([anno_label_dict.loc[y, 'label:Willetts2018'] for y in Y])


array(['bicycling', 'mixed', 'sit-stand', 'sleep', 'vehicle', 'walking'],
      dtype='<U9')

In [70]:
# check if 'home activity' is part the string in any of the index of the dictionary
is_hc = np.array(['mixed-activity' in anno_label_dict.index[i] for i in range(len(anno_label_dict))])
# get the row index label based on the boolean array
hc_label = anno_label_dict.index[is_hc]
hc_label

Index([], dtype='string', name='annotation')

In [46]:
is_hc

array([False, False,  True, False,  True,  True, False,  True,  True,
        True,  True,  True,  True, False, False,  True,  True, False,
       False, False, False, False,  True,  True, False, False,  True,
       False,  True,  True, False, False,  True,  True, False, False,
       False, False,  True, False, False, False, False,  True, False,
        True, False, False, False,  True, False,  True, False,  True,
        True,  True,  True, False,  True, False, False,  True,  True,
       False, False,  True,  True,  True, False, False,  True, False,
        True, False, False, False,  True,  True,  True,  True, False,
       False, False, False, False, False,  True, False, False,  True,
        True, False,  True, False,  True, False, False,  True, False,
       False,  True, False, False, False, False, False,  True, False,
       False,  True,  True,  True, False,  True,  True, False, False,
       False,  True,  True,  True, False,  True, False, False, False,
       False,  True,

<class 'pandas.core.indexes.base.Index'>


In [27]:

data.head()

Unnamed: 0_level_0,x,y,z,temperature,light,annotation
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-10-11 02:30:00+00:00,0.90625,-0.15625,0.3125,23.75,3.417625,7030 sleeping;MET 0.95
2014-10-11 02:30:00.010000+00:00,0.90625,-0.15625,0.3125,23.75,3.417625,7030 sleeping;MET 0.95
2014-10-11 02:30:00.020000+00:00,0.90625,-0.171875,0.3125,23.75,3.417625,7030 sleeping;MET 0.95
2014-10-11 02:30:00.030000+00:00,0.90625,-0.15625,0.3125,23.75,3.417625,7030 sleeping;MET 0.95
2014-10-11 02:30:00.040000+00:00,0.90625,-0.15625,0.3125,23.75,3.417625,7030 sleeping;MET 0.95
