In [1]:
import copy
import math
import struct
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random
import math
import csv
from scipy.interpolate import interp1d
from scipy.stats import pearsonr, mode, kurtosis, skew
from scipy.signal import savgol_filter
import xgboost as xgb
import sklearn
import copy
import warnings
import multiprocessing

from scipy.stats import kurtosis
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import mean_squared_error, plot_confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from scipy.ndimage import gaussian_filter1d
from joblib import Parallel, delayed
import tsfel
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedShuffleSplit


if "/home/grads/s/" in os.getcwd():
    addressPrefix = "/home/grads/s/sorush.omidvar/CGMDataset/Hoover/"
else:
    addressPrefix = "C:/Users/sorush.omidvar/Google Drive/Documents/Educational/TAMU/Research/CGM Dataset/Hoover/"
    if not os.path.exists(addressPrefix):
        addressPrefix = "C:/GDrive/Documents/Educational/TAMU/Research/CGM Dataset/Hoover/"

pd.options.mode.chained_assignment = None  # default='warn'
plt.style.use({"figure.facecolor": "white"})
coreNumber = np.min([multiprocessing.cpu_count(), 24])
pd.set_option("display.max_rows", 500)

# import warnings
# warnings.filterwarnings("ignore")


In [2]:
def pdFormatter(df):
    for counter in range(len(df)):
        tempStr = df.iloc[counter, 1]
        tempVal = int(tempStr[0:2]) * 3600 + int(tempStr[3:5]) * 60 + int(tempStr[6:8])
        tempVal *= 1000
        df.iloc[counter, 1] = tempVal

        tempStr = df.iloc[counter, 2]
        tempVal = int(tempStr[0:2]) * 3600 + int(tempStr[3:5]) * 60 + int(tempStr[6:8])
        tempVal *= 1000
        df.iloc[counter, 2] = tempVal
        if df.iloc[counter, 1] > df.iloc[counter, 2]:
            df.iloc[counter, 2] += 24 * 3600 * 1000
    df.sort_values(by=["Name", "Start", "End"], inplace=True)

    return df


def labelReader(addressPrefix):
    labelFiles = []
    for root, dirs, files in os.walk(addressPrefix, topdown=False):
        for name in files:
            if ".txt" in name:
                labelFiles.append([os.path.join(root, name), name])
    mealTime = []
    sensorTiming = []
    for element in labelFiles:
        nameTemp = element[1]
        nameTemp = nameTemp[: nameTemp.find("-events")]
        with open(element[0], "r+") as txtfile:
            fileData = txtfile.read()
            fileData = fileData.splitlines()
            while "" in fileData:
                fileData.remove("")
            tempStart = fileData[0]
            tempStart = tempStart.split()
            tempStart = tempStart[2]

            tempEnd = fileData[-1]
            tempEnd = tempEnd.split()
            tempEnd = tempEnd[2]

            sensorTiming.append([nameTemp, tempStart, tempEnd])
            for counter in range(1, len(fileData) - 1):
                tempStr = fileData[counter]
                tempStr = tempStr.split()
                mealTime.append([nameTemp, tempStr[1], tempStr[2]])

    dfMeal = pd.DataFrame(mealTime, columns=["Name", "Start", "End"])
    dfMeal = pdFormatter(dfMeal)

    dfTime = pd.DataFrame(sensorTiming, columns=["Name", "Start", "End"])
    dfTime = pdFormatter(dfTime)

    return dfMeal, dfTime


def shimmerReader(element):
    nameTemp = element[1]
    dataList = []
    tempList = []
    with open(element[0], mode="rb") as txtfile:
        fileData = txtfile.read()
        for i in range(int(len(fileData) / 4)):
            if i % 6 == 0 and i != 0:
                tempList.append(nameTemp)
                dataList.append(tempList)
                tempList = []
            tempVal = fileData[i * 4 : (i + 1) * 4]
            tempVal = struct.unpack("f", tempVal)
            tempVal = tempVal[0]
            tempList.append(tempVal)
    txtfile.close()

    dfSensor = pd.DataFrame(dataList, columns=["X", "Y", "Z", "Yaw", "Pitch", "Roll", "Name"])
    dfSensor = dfSensor[["Name", "X", "Y", "Z", "Yaw", "Pitch", "Roll"]]
    del dataList
    return dfSensor


def timeFinder(dfSensor, dfTime):
    dfSensor.insert(1, "Time", float("nan"))
    name = dfSensor["Name"].tolist()
    name = name[0]
    dfTemp = dfTime[dfTime["Name"] == name]

    if len(dfTemp) > 1:
        print("More than one event file for:", name)
        return
    elif len(dfTemp) == 0:
        print("No event file for:", name)
        return
    startTemp = dfTemp["Start"].tolist()
    endTemp = dfTemp["End"].tolist()
    tempTimeStamp = np.linspace(startTemp, endTemp, num=len(dfSensor))
    dfSensor["Time"] = tempTimeStamp
    return dfSensor


def labelExtractor(dfMeal, features):
    dataTotal = []
    for feature in features:
        dataTemp = []
        windowName = feature[-3]
        windowStart = feature[-2]
        windowEnd = feature[-1]
        dfTemp = dfMeal[dfMeal["Name"] == windowName]
        if len(dfTemp) == 0:
            print("skipped", windowName)
            break
        eatingFlag = False
        for counter in range(0, len(dfTemp)):
            if dfTemp.iloc[counter, 1] < windowEnd and dfTemp.iloc[counter, 2] > windowStart:
                eatingFlag = True
                break
        dataTemp.extend(feature[: len(feature) - 3])
        dataTemp.extend([windowName, eatingFlag])
        dataTotal.append(dataTemp)

    return dataTotal


def labelFinder(dfMeals, windowLength, name, start, end):
    dfMeal = dfMeals[dfMeals["Name"] == name]
    eatingFlag = False
    for counter in range(0, len(dfMeal)):
        if dfMeal.iloc[counter]["Start"] <= end - windowLength / 2 and dfMeal.iloc[counter].loc["End"] >= start + windowLength / 2:
            eatingFlag = True
            break
    return eatingFlag


def windowMaker(dfSensor, dfMeal):
    windowLength = 60 * 1000
    features = []
    startTime = dfSensor["Time"].min()
    endTime = startTime + windowLength
    while startTime < 24 * 3600 * 1000:
        dfTemp = dfSensor[dfSensor["Time"] >= startTime]
        dfTemp = dfTemp[dfTemp["Time"] < endTime]
        if len(dfTemp) > 40 * 15:
            eatingFlag = labelFinder(dfMeal, windowLength, dfTemp.iloc[0].loc["Name"], dfTemp["Time"].min(), dfTemp["Time"].max())
            dfTemp.insert(8, "EatingFlag", eatingFlag)
            features.append(featureExtractor(dfTemp))
        startTime += windowLength
        endTime += windowLength
    columnList = ["Name", "EatingFlag", "Start", "End", "F1-Mean", "F1-Std", "F1-Range", "F2-Mean", "F2-Std", "F2-Range"]
    features = pd.DataFrame(features, columns=columnList)
    # print(features)
    return features


def featureExtractor(df):
    # cfg = tsfel.get_features_by_domain()
    name = df.iloc[0, 0]
    start = df.iloc[:, 1].min()
    end = df.iloc[:, 1].max()
    eatingFlag = df["EatingFlag"].iloc[0]

    f2 = df["X"].abs() + df["Y"].abs() + df["Z"].abs() + 0.0001  # To avoid getting nan for F1
    f2 = np.asarray(f2)
    f1 = df["Yaw"].abs() + df["Pitch"].abs() + df["Roll"].abs()
    f1 = np.asarray(f1)
    f1 = f1 / f2
    featureData = [name, eatingFlag, start, end, np.mean(f1), np.std(f1), np.max(f1) - np.min(f1), np.mean(f2), np.std(f2), np.max(f2) - np.min(f2)]
    return featureData


def parallelCall(element, dfMeal, dfTime):
    dfSensor = shimmerReader(element)
    dfSensor = timeFinder(dfSensor, dfTime)
    features = windowMaker(dfSensor, dfMeal)
    return features


def main(addressPrefix):
    shimmerFiles = []
    dfMeal, dfTime = labelReader(os.path.join(addressPrefix, "EVENTfiles"))
    for root, dirs, files in os.walk(os.path.join(addressPrefix, "SHMfiles"), topdown=False):
        for name in files:
            if ".shm" in name:
                shimmerFiles.append([os.path.join(root, name), name[:-4]])
    # shimmerFiles=shimmerFiles[0:20]
    print("The number of core to be used:", coreNumber)
    dfFeatures = Parallel(n_jobs=coreNumber)(delayed(parallelCall)(i, dfMeal, dfTime) for i in tqdm(shimmerFiles))
    # dfFeatures=parallelCall(shimmerFiles[0],dfMeal,dfTime)
    dfFeatures = pd.concat(dfFeatures)
    dfFeatures.sort_values(by=["Name", "Start"])
    dfFeatures.reset_index(drop=True, inplace=True)
    dfFeatures.to_csv(os.path.join(addressPrefix, "dfFeatures.csv"), index=False)

    return dfFeatures


dfFeatures = main(addressPrefix)


The number of core to be used: 24


100%|██████████| 354/354 [10:50<00:00,  1.84s/it]


In [52]:
def modelVisualizer(testData, testLabels, modelBest):
    slidingWindowPrediction = modelBest.predict(testData)
    confMatrix = sklearn.metrics.confusion_matrix(testLabels, slidingWindowPrediction, normalize="all")
    accuracy = sklearn.metrics.accuracy_score(testLabels, slidingWindowPrediction)
    recall = sklearn.metrics.recall_score(testLabels, slidingWindowPrediction)
    precision = sklearn.metrics.precision_score(testLabels, slidingWindowPrediction)

    print("Testing on test dataset:")
    print(confMatrix)
    print("Accuracy", np.round(100 * accuracy, 0), "Recall", np.round(100 * recall, 0), "Precision", np.round(100 * precision, 0))

    titles_options = [("Confusion matrix, without normalization", None), ("Normalized confusion matrix", "true")]

    for title, normalize in titles_options:
        disp = plot_confusion_matrix(modelBest, testData, testLabels, display_labels=["Non Eating", "Eating"], cmap=plt.cm.Blues, normalize=normalize)
        disp.ax_.set_title(title)
    plt.show()
    return confMatrix


def xgClassifier(xTrain, yTrain, xTest, yTest, randomSeed, NormalFlag, SMOTEFlag):
    if normalFlag:
        for dimensionCounter in range(xTrain.shape[1]):
            xTest[:, dimensionCounter] -= np.mean(xTrain[:, dimensionCounter])
            xTest[:, dimensionCounter] /= np.std(xTrain[:, dimensionCounter])

            xTrain[:, dimensionCounter] -= np.mean(xTrain[:, dimensionCounter])
            xTrain[:, dimensionCounter] /= np.std(xTrain[:, dimensionCounter])

    stratShuffle = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=randomSeed)
    for trainInd, valInd in stratShuffle.split(xTrain, yTrain):
        xTrain, xVal = xTrain[trainInd, :], xTrain[valInd, :]
        yTrain, yVal = yTrain[trainInd], yTrain[valInd]

    if SMOTEFlag:
        oversample = SMOTE()
        xTrain, yTrain = oversample.fit_resample(xTrain, yTrain)

    f1Best = -1
    for maxDepth in np.arange(4, 7, 5):
        for estimator in np.arange(200, 301, 150):
            posWeight = len(yTrain) / np.sum(yTrain)
            clf = xgb.XGBClassifier(n_estimators=estimator, max_depth=maxDepth, objective="binary:logistic", scale_pos_weight=posWeight, random_state=randomSeed, n_jobs=-1, eval_metric="logloss", use_label_encoder=False)
            clf.fit(xTrain, yTrain)
            valPrediction = clf.predict(xVal)
            accuracy = sklearn.metrics.accuracy_score(yVal, valPrediction)
            recall = sklearn.metrics.recall_score(yVal, valPrediction)
            precision = sklearn.metrics.precision_score(yVal, valPrediction)
            f1Score = sklearn.metrics.f1_score(yVal, valPrediction, average="weighted")

            if f1Score > f1Best:
                f1Best = f1Score
                modelBest = clf

    confMatrixTest = modelVisualizer(xTest, yTest, modelBest)
    return clf, f1Score, confMatrixTest


# def randomForestClassifier(dataList, labelList, randomSeed):
#     trainData, testData, trainLabels, testLabels = train_test_split(dataList, labelList, test_size=0.25, random_state=randomSeed)
#     trainData, valData, trainLabels, valLabels = train_test_split(trainData, trainLabels, test_size=0.33, random_state=randomSeed)
#     oversample = SMOTE()
#     trainData, trainLabels = oversample.fit_resample(trainData, trainLabels)
#     f1Best = 0
#     for treeNum in np.arange(10, 100, 10):
#         for maxDepth in np.arange(3, 10):
#             clf = RandomForestClassifier(n_estimators=treeNum, criterion="entropy", random_state=0, max_depth=maxDepth, n_jobs=coreNumber)
#             clf.fit(trainData, trainLabels)
#             slidingWindowPrediction = clf.predict(valData)
#             accuracy = sklearn.metrics.accuracy_score(valLabels, slidingWindowPrediction)
#             recall = sklearn.metrics.recall_score(valLabels, slidingWindowPrediction)
#             precision = sklearn.metrics.precision_score(valLabels, slidingWindowPrediction)
#             f1 = sklearn.metrics.f1_score(valLabels, slidingWindowPrediction, average="weighted")

#             if f1 > f1Best:
#                 f1Best = f1
#                 maxDepthBest = maxDepth
#                 treeNumBest = treeNum
#                 accuracyBest = accuracy
#                 modelBest = clf
#                 recallBest = recall
#                 precisionBest = precision

#     modelVisualizer(testData, testLabels, modelBest)


# def dataSplitter(normalFlag,randomSeed):




def foldRunnerAux(df,xData,yData,xTrain,yTrain,xTest,yTest):
    tempVal = df["Train"].to_list()
    tempVal = np.asarray(tempVal).astype(int)
    tempVal = tempVal.flatten()
    if len(yTrain)==0:
        xTrain = xData[tempVal, :]
        yTrain = yData[tempVal]
    else:
        xTrain = np.concatenate((xTrain, xData[tempVal, :]), axis=0)
        yTrain = np.concatenate((yTrain, yData[tempVal]), axis=0)        

    tempVal = df["Test"].to_list()
    tempVal = np.asarray(tempVal).astype(int)
    tempVal = tempVal.flatten()
    if len(yTest)==0:
        xTest = xData[tempVal, :]
        yTest = yData[tempVal]
    else:
        xTest = np.concatenate((xTest, xData[tempVal, :]), axis=0)
        yTest = np.concatenate((yTest, yData[tempVal]), axis=0)              

def foldRunner(dfData,posXData,posYData,negXData,negYData,xTrain,yTrain,xTest,yTest):
    dfTemp = dfData[dfData["Set"] == setData]
    dfPos = dfTemp[dfTemp["Stat"] == "Pos"]    
    foldRunnerAux(dfPos,posXData,posYData,xTrain,yTrain,xTest,yTest)

    dfTemp = dfData[dfData["Set"] == setData]
    dfNeg = dfTemp[dfTemp["Stat"] == "Neg"]
    foldRunnerAux(dfNeg,negXData,negYData,xTrain,yTrain,xTest,yTest)

def posNegSeparator(df,xData,yData,statLabel):
    dfTemp=df[df["EatingFlag"]==statLabel]
    yData = dfTemp["EatingFlag"].to_list()
    yData = np.asarray(yData).astype(float)

    df.drop(columns=["EatingFlag", "Name", "Start", "End"], inplace=True)
    xData = dfTemp.values
    xData = np.asarray(xData).astype(float)

def foldMaker(statLabel,yData,dfTemp):
    setCounter=0
    kf = KFold(n_splits=5, shuffle=False)
    for trainIndex, testIndex in kf.split(yData):
        dfTemp.append([statLabel, trainIndex, testIndex, setCounter])
        setCounter += 1


def dataSummarizer(f1ScoreSets,confMatrixSets):
    f1ScoreSets = np.asarray(f1ScoreSets).astype(float)
    f1ScoreSets = np.round(f1ScoreSets, 3)
    confMatrixMean = np.mean(np.asarray(confMatrixSets), axis=0)
    print("All F1 scores", f1ScoreSets)
    print("Average F1:", np.mean(f1ScoreSets))
    print("Average Conf Matrix:", confMatrixMean)
    accuracy = (confMatrixMean[0][0] + confMatrixMean[1][1]) / (confMatrixMean[0][0] + confMatrixMean[0][1] + confMatrixMean[1][0] + confMatrixMean[1][1])  # (TP+NP)/(TP+NP+FP+FN)
    sensitivity = confMatrixMean[1][1] / (confMatrixMean[1][0] + confMatrixMean[1][1])  # TP/(TP+FN)
    specificity = confMatrixMean[0][0] / (confMatrixMean[0][0] + confMatrixMean[0][1])  # TN/(TN+FP)
    precision = confMatrixMean[1][1] / (confMatrixMean[1][1] + confMatrixMean[0][1])  # TP/(TP+FP)
    # CONF Matrix Struct=[TN,FP;FN,TP]
    print("Average value of accuracy:", np.round(accuracy, 2), "\t recal:", np.round(sensitivity, 2), "\t specificity:", np.round(specificity, 2), "\t precisiion", np.round(precision, 2))

def dataSplitter(SMOTEFlag,normalFlag,randomSeed):
    dfData = pd.read_csv(os.path.join(addressPrefix, "dfFeatures.csv"))
    dfData.sort_values(["Name", "Start"], ascending=(True, True), inplace=True)
    dfData.reset_index(drop=True, inplace=True)

    xDataPos=[]
    yDataPos=[]    
    posNegSeparator(dfData,xDataPos,yDataPos,1)

    xDataNeg=[]
    yDataNeg=[]
    posNegSeparator(dfData,xDataNeg,yDataNeg,0)

    dfTemp = []
    foldMaker("Pos",yDataPos,dfTemp)
    foldMaker("Neg",yDataNeg,dfTemp)

    dfData = pd.DataFrame(dfData, columns=["Stat", "Train", "Test", "Set"])
    setDatas = list(set(dfData["Set"].to_list()))
    f1ScoreSets = []
    confMatrixSets = []
    for counter in len(setDatas):
        xTrain=[]
        yTrain=[]
        xTest=[]
        yTest=[]
        foldRunner(dfData,xDataPos,yDataPos,xDataNeg,yDataNeg,xTrain,yTrain,xTest,yTest)
        clf, f1ScoreTemp, confMatrixTest = xgClassifier(xTrain, yTrain, xTest, yTest, randomSeed, normalFlag, SMOTEFlag)
        f1ScoreSets.append(f1ScoreTemp)
        confMatrixSets.append(confMatrixTest)
    dataSummarizer()

normalFlag = True
SMOTEFlag=True
randomSeed = 78
dataSplitter(SMOTEFlag,normalFlag,randomSeed)



# posYData = positiveData["EatingFlag"].to_list()
# posYData = np.asarray(posYData).astype(float)

# negYData = negativeData["EatingFlag"].to_list()
# negYData = np.asarray(negYData).astype(float)

# positiveData.drop(columns=["EatingFlag", "Name", "Start", "End"], inplace=True)
# posXData = positiveData.values
# posXData = np.asarray(posXData).astype(float)

# negativeData.drop(columns=["EatingFlag", "Name", "Start", "End"], inplace=True)

# negXData = negativeData.values
# negXData = np.asarray(negXData).astype(float)

# kf = KFold(n_splits=5, shuffle=False)
# dfData = []
# setCounter = 0
# for trainIndex, testIndex in kf.split(posYData):
#     dfData.append(["Pos", trainIndex, testIndex, setCounter])
#     setCounter += 1

# setCounter = 0
# for trainIndex, testIndex in kf.split(negYData):
#     dfData.append(["Neg", trainIndex, testIndex, setCounter])
#     setCounter += 1

# dfData = pd.DataFrame(dfData, columns=["Stat", "Train", "Test", "Set"])
# setDatas = list(set(dfData["Set"].to_list()))
# f1ScoreSets = []
# confMatrixSets = []
# for setData in setDatas:
#     dfTemp = dfData[dfData["Set"] == setData]
#     dfPos = dfTemp[dfTemp["Stat"] == "Pos"]

#     tempVal = dfPos["Train"].to_list()
#     tempVal = np.asarray(tempVal).astype(int)
#     tempVal = tempVal.flatten()
#     xTrain = posXData[tempVal, :]
#     yTrain = posYData[tempVal]

#     tempVal = dfPos["Test"].to_list()
#     tempVal = np.asarray(tempVal).astype(int)
#     tempVal = tempVal.flatten()
#     xTest = posXData[tempVal, :]
#     yTest = posYData[tempVal]

#     dfTemp = dfData[dfData["Set"] == setData]
#     dfNeg = dfTemp[dfTemp["Stat"] == "Neg"]

#     tempVal = dfNeg["Train"].to_list()
#     tempVal = np.asarray(tempVal).astype(int)
#     tempVal = tempVal.flatten()
#     xTrain = np.concatenate((xTrain, negXData[tempVal, :]), axis=0)
#     yTrain = np.concatenate((yTrain, negYData[tempVal]), axis=0)

#     tempVal = dfNeg["Test"].to_list()
#     tempVal = np.asarray(tempVal).astype(int)
#     tempVal = tempVal.flatten()
#     xTest = np.concatenate((xTest, negXData[tempVal, :]), axis=0)
#     yTest = np.concatenate((yTest, negYData[tempVal]), axis=0)

#     clf, f1ScoreTemp, confMatrixTest = xgClassifier(xTrain, yTrain, xTest, yTest, randomSeed, NormalFlag=True, SMOTEFlag=True)
#     f1ScoreSets.append(f1ScoreTemp)
#     confMatrixSets.append(confMatrixTest)

# f1ScoreSets = np.asarray(f1ScoreSets).astype(float)
# f1ScoreSets = np.round(f1ScoreSets, 3)
# confMatrixMean = np.mean(np.asarray(confMatrixSets), axis=0)
# print("All F1 scores", f1ScoreSets)
# print("Average F1:", np.mean(f1ScoreSets))
# print("Average Conf Matrix:", confMatrixMean)
# accuracy = (confMatrixMean[0][0] + confMatrixMean[1][1]) / (confMatrixMean[0][0] + confMatrixMean[0][1] + confMatrixMean[1][0] + confMatrixMean[1][1])  # (TP+NP)/(TP+NP+FP+FN)
# sensitivity = confMatrixMean[1][1] / (confMatrixMean[1][0] + confMatrixMean[1][1])  # TP/(TP+FN)
# specificity = confMatrixMean[0][0] / (confMatrixMean[0][0] + confMatrixMean[0][1])  # TN/(TN+FP)
# precision = confMatrixMean[1][1] / (confMatrixMean[1][1] + confMatrixMean[0][1])  # TP/(TP+FP)
# # CONF Matrix Struct=[TN,FP;FN,TP]
# print("Average value of accuracy:", np.round(accuracy, 2), "\t recal:", np.round(sensitivity, 2), "\t specificity:", np.round(specificity, 2), "\t precisiion", np.round(precision, 2))


ValueError: could not convert string to float: 'P2001'