**Objective**: Exploring methods for building a model for identifying eating activity in Capture24

In [1]:
import os
import numpy as np
import pandas as pd
from glob import glob
import scipy.stats as stats
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from joblib import Parallel, delayed
import urllib
import shutil
from tqdm.auto import tqdm
import utils  # helper functions -- check out utils.py
import zipfile
import re

# For reproducibility: seed NumPy's global (legacy) random number generator
np.random.seed(42)

### Load data

In [5]:
def load_all_and_make_windows(datafiles, N=999, n_jobs=4, winsec=30):
    """Load accelerometer files in parallel and stack their windows.

    Each file is read with ``utils.load_data`` and cut into windows with
    ``utils.make_windows``; the per-file results are concatenated in the
    order in which the files are given.

    Parameters
    ----------
    datafiles : sequence of str
        Paths of the files to process. The participant ID is taken from
        each file's base name, up to the first ".".
    N : int, optional
        Only the first ``N`` entries of ``datafiles`` are processed.
    n_jobs : int, optional
        Number of joblib workers used to process the files.
    winsec : int, optional
        Window length forwarded to ``utils.make_windows``
        (presumably in seconds -- confirm in utils.py).

    Returns
    -------
    X, Y, T, pid : np.ndarray
        The concatenated outputs of ``utils.make_windows`` plus ``pid``,
        which repeats the participant ID once per element of that
        participant's ``X``.

    Raises
    ------
    ValueError
        If there is no file to process.
    """
    selected = list(datafiles[:N])
    if not selected:
        # np.concatenate would fail on an empty list anyway, but with a
        # message that does not point at the real cause.
        raise ValueError("No data files to load: `datafiles` is empty")

    def worker(datafile):
        X, Y, T = utils.make_windows(utils.load_data(datafile), winsec=winsec)
        pid = os.path.basename(datafile).split(".")[0]  # participant ID
        pid = np.asarray([pid] * len(X))
        return X, Y, T, pid

    results = Parallel(n_jobs=n_jobs)(
        delayed(worker)(datafile) for datafile in tqdm(selected)
    )

    # results is a list of (X, Y, T, pid) tuples; regroup per component
    X, Y, T, pid = (np.concatenate(parts) for parts in zip(*results))

    return X, Y, T, pid

In [6]:
# get all accelerometer data files
datafiles = os.path.expanduser("~/capture24/accelerometer/P[0-9][0-9][0-9].csv.gz")

# glob() returns matches in arbitrary (filesystem-dependent) order; sort them so
# the row order of the saved arrays is the same on every run and every machine
X, Y, T, pid = load_all_and_make_windows(sorted(glob(datafiles)))

# save the arrays for later
outputpath = os.path.expanduser("~/eating_detect/data/")
os.makedirs(outputpath + "processed_data/", exist_ok=True)
np.save(outputpath + "processed_data/X.npy", X)
np.save(outputpath + "processed_data/Y.npy", Y)
np.save(outputpath + "processed_data/T.npy", T)
np.save(outputpath + "processed_data/pid.npy", pid)


  0%|          | 0/151 [00:00<?, ?it/s]

In [9]:
# Reload the arrays saved under processed_data/
outputpath = os.path.expanduser("~/eating_detect/data/")

# X is opened as a read-only memory map rather than read fully into memory
X = np.load(outputpath + 'processed_data/X.npy', mmap_mode='r')

# the remaining arrays are loaded normally
Y, T, pid = (
    np.load(outputpath + relpath)
    for relpath in (
        'processed_data/Y.npy',
        'processed_data/T.npy',
        'processed_data/pid.npy',
    )
)


In [4]:
def extract_features(xyz):
    ''' Summarise one window as a flat dict of statistics.

    xyz is an array of shape (N,3). The returned dict holds the per-column
    mean and standard deviation (xMean..zMean, xStd..zStd) and the mean and
    standard deviation of the row-wise Euclidean norm (mean, std).
    '''
    x_mean, y_mean, z_mean = np.mean(xyz, axis=0)
    x_std, y_std, z_std = np.std(xyz, axis=0)

    # magnitude stream: Euclidean norm of every row
    magnitude = np.linalg.norm(xyz, axis=1)

    return {
        'xMean': x_mean,
        'yMean': y_mean,
        'zMean': z_mean,
        'xStd': x_std,
        'yStd': y_std,
        'zStd': z_std,
        'mean': np.mean(magnitude),
        'std': np.std(magnitude),
    }

# Extract features: one dict per element of X, computed by 4 joblib workers
feature_rows = Parallel(n_jobs=4)(
    delayed(extract_features)(window) for window in tqdm(X)
)

# one row per window, one column per feature; keep a copy on disk
X_feats = pd.DataFrame(feature_rows)
X_feats.to_pickle(outputpath + 'processed_data/X_feats.pkl')
print(X_feats)


  0%|          | 0/312730 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [10]:
# Let's map the text annotations to simplified labels

# Positions of the annotations that contain the substring 'eat'.
# dtype=int matters when nothing matches: np.array([]) would otherwise be
# float64, and a float array cannot be used to index Y.
eat_indices = np.array(
    [index for index, element in enumerate(Y) if 'eat' in element],
    dtype=int,
)

# Let's load the dictionary that maps the text labels to simplified labels
# and apply it to the Y array

# Load the dictionary (indexed by the raw annotation text)
label_dict_path = os.path.expanduser("~/capture24/annotation-label-dictionary.csv")
anno_label_dict = pd.read_csv(
    label_dict_path,
    index_col='annotation',
    dtype='string'
)

# remove the last bit of string after the last ";" in Y
#pattern = ";MET\ .*"
#Y = np.array([re.sub(pattern, '', element) for element in Y])


# apply the dictionary to simplify the labels
# (an annotation missing from the dictionary raises KeyError)
Y_simple = np.array([anno_label_dict.loc[y, 'label:Willetts2018'] for y in Y])


In [18]:
# collect the distinct annotations found at the eating-candidate positions
eating_labels = np.unique(Y[eat_indices])

# dump them, one per line, to a text file for manual inspection
with open(outputpath + 'eating_labels.txt', 'w') as f:
    f.writelines("%s\n" % item for item in eating_labels)

In [60]:
# after inspection, I have manually created a dictionary that maps the eating labels to
# simplified labels

eating_label_dict_path = os.path.expanduser("~/eating_detect/data/eating_labels_simple.tsv")
# The lookup below is by annotation text, so the annotation column has to be
# the index (as for the annotation dictionary read with index_col='annotation').
# NOTE(review): assumes the first TSV column holds the raw annotation -- confirm.
eating_label_dict = pd.read_csv(
    eating_label_dict_path,
    sep='\t',
    index_col=0,
    dtype='string'
)

# modify the Y_simple array to add eating-specific labels
# only replace with eating and maybe-eating labels, and ignore not-eating labels
#
# Work on an object-dtype copy: a fixed-width unicode array would silently
# truncate any new label longer than the longest label already in Y_simple.
Y_simple_eating = Y_simple.astype(object)
for i in eat_indices:
    label = Y[i]
    eating_label = eating_label_dict.loc[label, 'simple']
    if eating_label != 'not-eating':
        Y_simple_eating[i] = eating_label

# Back to a unicode array, now sized to fit the longest label. Skipped when
# Y_simple was not a unicode array, to leave non-string entries untouched.
if Y_simple.dtype.kind == 'U':
    Y_simple_eating = Y_simple_eating.astype(str)


In [62]:
# now let's read the features extracted from the accelerometer data
# (same path the feature-extraction cell writes with X_feats.to_pickle;
# only unpickle files you created yourself)
X_feats = pd.read_pickle(outputpath + 'processed_data/X_feats.pkl')

array([], dtype=float64)