In [2]:
import copy
import math
from datetime import datetime,timedelta
from dateutil.relativedelta import relativedelta
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random
import math
import csv
from scipy.interpolate import interp1d
from scipy.stats import pearsonr, mode
from scipy.signal import savgol_filter
import xgboost as xgb
import sklearn
import copy
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from scipy.ndimage import gaussian_filter1d

addressPrefix='C:/Users/sorush.omidvar/Google Drive/Documents/Educational/TAMU/Research/CGM Dataset/Hoover/'
if not os.path.exists(addressPrefix):
    addressPrefix='C:/GDrive/Documents/Educational/TAMU/Research/CGM Dataset/Hoover/'
pd.options.mode.chained_assignment = None  # default='warn'
plt.style.use({'figure.facecolor':'white'})

import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_rows', 500)

In [3]:
def dfCompactor(df):
    df['Date']=df['Date'].astype(int)
    df['Time']=df['Time']*1000
    df['Time']=df['Time'].astype(int)
    df.rename(columns={"Time": "Time[ms]"})

    df['GyroX']=df['GyroX'].astype(float)
    df['GyroX']=df['GyroX']*1000*1000
    df['GyroX']=df['GyroX'].astype(int)
    df.rename(columns={"GyroX": "GyroX[microD/s]"})

    df['GyroY']=df['GyroY'].astype(float)
    df['GyroY']=df['GyroY']*1000*1000
    df['GyroY']=df['GyroY'].astype(int)
    df.rename(columns={"GyroY": "GyroY[microD/s]"})

    df['GyroZ']=df['GyroZ'].astype(float)
    df['GyroZ']=df['GyroZ']*1000*1000
    df['GyroZ']=df['GyroZ'].astype(int)
    df.rename(columns={"GyroZ": "GyroZ[microD/s]"})

    df['AccelX']=df['AccelX'].astype(float)
    df['AccelX']=df['AccelX']*1000*1000
    df['AccelX']=df['AccelX'].astype(int)
    df.rename(columns={"AccelX": "AccelX[microm/s2]"})

    df['AccelY']=df['AccelY'].astype(float)
    df['AccelY']=df['AccelY']*1000*1000
    df['AccelY']=df['AccelY'].astype(int)
    df.rename(columns={"AccelY": "AccelY[microm/s2]"})

    df['AccelZ']=df['AccelZ'].astype(float)
    df['AccelZ']=df['AccelZ']*1000*1000
    df['AccelZ']=df['AccelZ'].astype(int)
    df.rename(columns={"AccelZ": "AccelZ[microm/s2]"})

    return df

def dfOrganizer(df):
    df.columns.values[2]='TimeStamp'

    df.columns.values[8]='GyroX'
    df.columns.values[9]='GyroY'
    df.columns.values[10]='GyroZ'

    df.columns.values[11]='AccelX'
    df.columns.values[12]='AccelY'
    df.columns.values[13]='AccelZ'

    df = df.filter(['Name','TimeStamp','GyroX','GyroY','GyroZ','AccelX','AccelY','AccelZ'])
    df['TimeStamp'] = df['TimeStamp'].astype(float)
    df['TimeStamp']=df['TimeStamp']-1000*3600*4 #fixing the timezone

    df.insert(2,'Date',float('nan'))
    df.insert(3,'Time',float('nan'))

    df['Date']=pd.to_datetime(df['TimeStamp'],unit='ms')
    df['Time']=pd.to_datetime(df['TimeStamp'],unit='ms')
    df['Date']=df['Date'].dt.dayofyear
    df['Time']=df['Time'].dt.hour*3600+df['Time'].dt.minute*60+df['Time'].dt.second+df['Time'].dt.microsecond*0.001*0.001

    df.drop(columns=['TimeStamp'],inplace=True)
    return df

def csvReader(addressPrefix):
    dataFiles=[]
    for root, dirs, files in os.walk(addressPrefix, topdown=False):
       for name in files:
           if '.csv' in name:
               dataFiles.append([os.path.join(root,name)])
    for counter,element in enumerate(dataFiles):
        print(element)
        rows = []
        with open(element[0], 'r') as csvfile:
            csvreader = csv.reader(csvfile,delimiter = "\t")
            next(csvreader) #skipping the first junk line
            headers = next(csvreader) #column titles
            while '' in headers:
                headers.remove("")
            if len(headers)!=17:
                continue
            next(csvreader) #skipping the units
            for row in csvreader:
                rows.append(row)
        df = pd.DataFrame(rows,columns=headers)
        participantName=element[0]
        participantName=participantName[participantName.find('CSV1')+5:participantName.find('CSV1')+10]
        df.insert(0,'Name',participantName)

        df=dfOrganizer(df)
        df=dfCompactor(df)

        if counter==0:
            dfTotal=df
        else:
            frames=[dfTotal,df]
            dfTotal=pd.concat(frames)

        # if counter==1:
        #     break
    dfTotal.sort_values(by=['Name', 'Date','Time'],inplace=True)
    return dfTotal

def preProcessor(dfTotal,R):
    columns=dfTotal.columns.values
    names=dfTotal['Name'].tolist()
    names=list(set(names))

    dfProc=pd.DataFrame([],columns=columns)
    for counter,name in enumerate(names):
        print(name, (counter+1)/len(names))
        df=dfTotal[dfTotal['Name']==name]
        for column in columns:
            if column!='Time' and column!='Date' and column!='Name':
                df[column]=list(gaussian_filter1d(df[column].tolist(),sigma=R))
            df['GyroX']=df['GyroX'].astype(float)

        df['GyroX']=df['GyroX'].astype(int)
        df['GyroY']=df['GyroY'].astype(int)
        df['GyroZ']=df['GyroZ'].astype(int)
        df['AccelX']=df['AccelX'].astype(int)
        df['AccelY']=df['AccelY'].astype(int)
        df['AccelZ']=df['AccelZ'].astype(int)

        frames=[dfProc,df]
        dfProc=pd.concat(frames)
        nameIndex = dfTotal[(dfTotal.Name == name)].index
        dfTotal.drop(nameIndex,inplace=True)
    return dfProc

def funcCaller(addressPrefix):
    if os.path.exists(os.path.join(addressPrefix,'RawData.csv')):
        dfRaw=pd.read_csv(os.path.join(addressPrefix,'RawData.csv'))
    else:
        dfRaw=csvReader(os.path.join(addressPrefix,'CSV1'))
        dfRaw.to_csv(os.path.join(addressPrefix,'RawData.csv'),index=False)
    names=dfRaw['Name'].tolist()
    names=list(set(names))
    print('Total particpant number=',len(names))

    if os.path.exists(os.path.join(addressPrefix,'FilteredData.csv')):
        dfProcessed=pd.read_csv(os.path.join(addressPrefix,'FilteredData.csv'))
    else:
        dfProcessed=preProcessor(dfRaw,R=3)
        dfProcessed.to_csv(os.path.join(addressPrefix,'FilteredData.csv'),index=False)
    return dfProcessed

dfTotal=funcCaller(addressPrefix)

['C:/Users/sorush.omidvar/Google Drive/Documents/Educational/TAMU/Research/CGM Dataset/Hoover/CSV1\\P2001\\data\\csv\\P105_10_30_Session1_P105_10_30_Calibrated.csv']
['C:/Users/sorush.omidvar/Google Drive/Documents/Educational/TAMU/Research/CGM Dataset/Hoover/CSV1\\P2011\\data\\fcsv\\P011_09_15_Session2_P011_9_15_Calibrated.csv']
['C:/Users/sorush.omidvar/Google Drive/Documents/Educational/TAMU/Research/CGM Dataset/Hoover/CSV1\\P2012\\data\\csv\\P012_10_06_Session1_P012_10_06_Calibrated.csv']
['C:/Users/sorush.omidvar/Google Drive/Documents/Educational/TAMU/Research/CGM Dataset/Hoover/CSV1\\P2013\\data\\csv\\P013_10_14_Session1_P013_10_14_Calibrated.csv']
['C:/Users/sorush.omidvar/Google Drive/Documents/Educational/TAMU/Research/CGM Dataset/Hoover/CSV1\\P2014\\data\\csv\\P014_09_15_Session1_P014_09_15_Calibrated.csv']
['C:/Users/sorush.omidvar/Google Drive/Documents/Educational/TAMU/Research/CGM Dataset/Hoover/CSV1\\P2015\\data\\csv\\P015_10_22_Session1_P015_10_22_Calibrated.csv']
['C:

In [3]:
def labelReader(addressPrefix):
    dataFiles=[]
    for root, dirs, files in os.walk(addressPrefix, topdown=False):
       for name in files:
           if '.txt' in name:
               dataFiles.append([os.path.join(root,name),name])
    mealTime=[]
    for counter,element in enumerate(dataFiles):
        nameTemp=element[1]
        nameTemp=nameTemp[:nameTemp.find('-events')]
        with open(element[0], 'r+') as txtfile:
            fileData = txtfile.read()
            fileData=fileData.splitlines()
            while '' in fileData:
                fileData.remove('')
            for counter in range(1,len(fileData)-1):
                tempStr=fileData[counter]
                tempStr=tempStr.split()
                mealTime.append([nameTemp,tempStr[1],tempStr[2]])
    df=pd.DataFrame(mealTime,columns=['Name','Start','End'])
    df['Start'] = pd.to_datetime(df['Start'],format= '%H:%M:%S',errors='coerce')
    df = df[df['Start'].notna()]
    df['Start']=df['Start'].dt.hour*3600*1000+df['Start'].dt.minute*60*1000+df['Start'].dt.second*1000

    df['End'] = pd.to_datetime(df['End'],format= '%H:%M:%S',errors='coerce')
    df = df[df['End'].notna()]
    df['End']=df['End'].dt.hour*3600*1000+df['End'].dt.minute*60*1000+df['End'].dt.second*1000
    df.sort_values(by=['Name', 'Start','End'],inplace=True)
    return df

def featureExtractor(df):
    windowLength=30*1000
    featureData=[]
    names=df['Name'].tolist()
    names=list(set(names))
    for name in names:
        dfName=df[df['Name']==name]
        startTime=dfName['Time'].min()
        endTime=startTime+windowLength
        while startTime<24*3600*1000:
            dfTemp=dfName[dfName['Time']>=startTime]
            dfTemp=dfTemp[dfTemp['Time']<endTime]
            if len(dfTemp)>5:
                f2=abs(dfTemp['AccelX'].values)+abs(dfTemp['AccelY'].values)+abs(dfTemp['AccelZ'].values)
                f1=abs(dfTemp['GyroX'].values)+abs(dfTemp['GyroY'].values)+abs(dfTemp['GyroZ'].values)
                f1=f1/f2
                f1=np.mean(f1)
                f2=np.mean(f2)
                featureData.append([name,startTime,endTime,f1,f2])
            startTime+=windowLength
            endTime+=windowLength
        break
    return featureData

dfLabel=labelReader(os.path.join(addressPrefix,'EVENTfiles'))
featureData=featureExtractor(dfTotal)

In [4]:
def labelExtractor(dfLabel,features):
    dataTotal=[]
    for feature in features:
        windowName=feature[0]
        windowStart=feature[1]
        windowEnd=feature[2]
        f1=feature[3]
        f2=feature[4]
        dfTemp=dfLabel[dfLabel['Name']==windowName]
        if len(dfTemp)==0:
            print('skipped',windowName)
            continue
        eatingFlag=False
        for counter in range(0,len(dfTemp)):
            if dfTemp.iloc[counter,1]<windowEnd and dfTemp.iloc[counter,2]>windowStart:
                eatingFlag=True
                break
        dataTotal.append([f1,f2,eatingFlag])
    dataTotal=np.asarray(dataTotal)
    return dataTotal

data=labelExtractor(dfLabel,featureData)

In [6]:
def XGClassifier(dataList, labelList,randomSeed):
    trainData, testData, trainLabels, testLabels = train_test_split(dataList, labelList, test_size=0.25,random_state=randomSeed)
    trainData, valData, trainLabels, valLabels = train_test_split(trainData, trainLabels, test_size=0.33,random_state=randomSeed)
    accuracyBest=0
    for maxDepth in np.arange(3,30):
        for estimator in np.arange(5,50,2):
            clf = xgb.XGBClassifier(n_estimators=estimator,max_depth=maxDepth,objective = "binary:logistic",
                                    eval_metric = "logloss",use_label_encoder =False,scale_pos_weight=20)
            clf.fit(trainData, trainLabels)
            slidingWindowPrediction = clf.predict_proba(valData)
            slidingWindowPrediction=slidingWindowPrediction[:,1]
            slidingWindowPrediction[slidingWindowPrediction>=0.5]=1
            slidingWindowPrediction[slidingWindowPrediction<0.5]=0

            confMatrix=sklearn.metrics.confusion_matrix(valLabels,slidingWindowPrediction)
            accuracy=sklearn.metrics.accuracy_score(valLabels,slidingWindowPrediction)
            recall=sklearn.metrics.recall_score(valLabels,slidingWindowPrediction)
            precision=sklearn.metrics.precision_score(valLabels,slidingWindowPrediction)

            if accuracy>accuracyBest:
                confMatrixBest=confMatrix
                accuracyBest=accuracy
                modelBest=clf
                recallBest=recall
                precisionBest=precision
    print('Testing on validation dataset:')
    print(confMatrixBest)
    print('Accuracy',np.round(100*accuracyBest,0),'Recall',np.round(100*recallBest,0),'Precision',np.round(100*precisionBest,0))

    slidingWindowPrediction = modelBest.predict_proba(testData)
    slidingWindowPrediction=slidingWindowPrediction[:,1]
    slidingWindowPrediction[slidingWindowPrediction>=0.5]=1
    slidingWindowPrediction[slidingWindowPrediction<0.5]=0

    confMatrix=sklearn.metrics.confusion_matrix(testLabels,slidingWindowPrediction)
    accuracy=sklearn.metrics.accuracy_score(testLabels,slidingWindowPrediction)
    recall=sklearn.metrics.recall_score(testLabels,slidingWindowPrediction)
    precision=sklearn.metrics.precision_score(testLabels,slidingWindowPrediction)

    print('Testing on validation dataset:')
    print(confMatrix)
    print('Accuracy',np.round(100*accuracy,0),'Recall',np.round(100*recall,0),'Precision',np.round(100*precision,0))

allData=data[:,0:2]
allLabel=data[:,2]
XGClassifier(allData, allLabel,randomSeed=53)

Testing on validation dataset:
[[332   6]
 [  6   2]]
Accuracy 97.0 Recall 25.0 Precision 25.0
Testing on validation dataset:
[[325  12]
 [ 11   1]]
Accuracy 93.0 Recall 8.0 Precision 8.0
