### General Imports

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime as dt
import os
import glob
from cleanUp import cleanUp
from fillDf import fillDf
from fixYearStamp import fixYearStamp
from sklearn.cluster import KMeans
import time as clock


In [2]:
# Record the wall-clock start time; the last cell prints the elapsed seconds (end - begin).
begin = clock.time()

### Data Cleaning
Pass the sensor data through the cleanUp function to fix timestamps and delete null timestamps.

In [3]:
# Collect every raw sensor log (*.txt) found under ./Data
all_csv_files = glob.glob("./Data/*.txt")
# insert the desired start time (cutOffTime) and end time (endTime) of the analysis window
cutOffTime = '3/22/2021 9:30'
endTime = '3/22/2021 13:00'
# insert the time-rectifying offset for each sensor that needs one; use {'':0} when no sensor needs an offset
# NOTE(review): the unit of the offset is defined inside cleanUp -- confirm there
sensorConditions = {'S-01':7,'S-02':7,'S-03':7,'S-04':7,'S-05':7,'S-06':7,'S-15':7,'S-19':7}
#This indicates which columns to keep. Here we're taking all of the dP info and the timestamps
columns = [0,1,6,7,8,9,10,11]
# Enable Data Checking (runs the time-interval scan cell further down)
DataChecking = False
# Here are observed timestamps that need to be removed from the data
badTimes = ['     0/0/0      0:0:0','2165/165/165 165:165:85']
# Controls whether zones are created automatically with k-means clustering (True)
# or taken from the manual zoneList (False)
ZoneAutomation = False
# Number of k-means clusters when ZoneAutomation is True; replaced by len(zoneList) when it is False
numberOfZones = 4
# Sensors to exclude from the k-means zoning (only read when ZoneAutomation is True)
outdoorSensors = ['S-16','S-17','S-18','S-19']
# Number of 10-second samples before nebulization to include in the experiment csv files
preCursorFactor = 6
# which particle to analyze (used as the column name read from every sensor frame)
particle = 'Dp>0.3'

In [4]:
# Start timestamp of every trial, grouped by experiment name.
# The keys must match the keys of expTLen below.
expTRange = {

    'EE502 Door Closed':
    [#pd.Timestamp('3/22/2021 9:40'), # removed this one due to odd behavior
    pd.Timestamp('3/22/2021 10:05:23'),
    pd.Timestamp('3/22/2021 10:23:52')],
    'EE502 Door Open':
    [pd.Timestamp('3/22/2021 10:42:43'),
    pd.Timestamp('3/22/2021 10:59:23'),
    pd.Timestamp('3/22/2021 11:15:37')],
    'EE502 Negative Pressure':
    [pd.Timestamp('3/22/2021 11:32:21'),
    pd.Timestamp('3/22/2021 11:42:27'),
    pd.Timestamp('3/22/2021 11:53:47')],
    'EE504 Door Open':
    [pd.Timestamp('3/22/2021 12:19:12'),
    pd.Timestamp('3/22/2021 12:30:10'),
    pd.Timestamp('3/22/2021 12:40:15')],
}

# enter the experiment length in units of 10 seconds (seconds/10), e.g. 15*6 = 90 samples = 15 minutes
expTLen = {
    'EE502 Door Closed' : 15*6,
    'EE502 Door Open':15*6,
    'EE502 Negative Pressure':10*6,
    'EE504 Door Open':10*6,
}

# Manual zone setup: zone name -> sensors in that zone. Note that S-14 is not assigned to any zone.
zoneList = {
    'Zone 1' : ['S-01','S-04'],
    'Zone 2' : ['S-02','S-03','S-05','S-06'],
    'Zone 3' : ['S-07','S-08','S-09','S-10','S-11','S-12','S-13','S-15','S-16'],
    'Zone 4' : ['S-17','S-18'],
    'Zone 5' : ['S-19']
}
# With manual zones the zone count is dictated by zoneList, overriding the numberOfZones set earlier.
if not ZoneAutomation:
    numberOfZones = len(zoneList)

In [5]:
all_csv_files

['./Data\\S-01.txt',
 './Data\\S-02.txt',
 './Data\\S-03.txt',
 './Data\\S-04.txt',
 './Data\\S-05.txt',
 './Data\\S-06.txt',
 './Data\\S-07.txt',
 './Data\\S-08.txt',
 './Data\\S-09.txt',
 './Data\\S-10.txt',
 './Data\\S-11.txt',
 './Data\\S-12.txt',
 './Data\\S-13.txt',
 './Data\\S-15.txt',
 './Data\\S-16.txt',
 './Data\\S-17.txt',
 './Data\\S-18.txt',
 './Data\\S-19.txt']

This cell was changed to markdown so that it won't run twice; it only had to run once to fix the timestamps on S-12.
filePath        = all_csv_files[11]
incorrectString = '21/3/22'
date            = '3/22/2021'
charTimeStart   = 11
charTimeEnd     = 21
offset          = 0
fixYearStamp(filePath,incorrectString,date,charTimeStart,charTimeEnd,offset)

In [6]:
# Clean every raw sensor file with the settings from the configuration cells.
# The result is iterated below as a mapping of string key -> DataFrame (presumably keyed by sensor name -- confirm in cleanUp).
data = cleanUp(cutOffTime,sensorConditions,all_csv_files,columns,badTimes)

S-01     2021-03-22 09:30:00      2021-03-22 13:00:19       mod: yes
S-02     2021-03-22 09:30:01      2021-03-22 12:59:21       mod: yes
S-03     2021-03-22 09:30:00      2021-03-22 12:59:09       mod: yes
S-04     2021-03-22 09:30:00      2021-03-22 12:59:00       mod: yes
S-05     2021-03-22 09:30:00      2021-03-22 12:59:29       mod: yes
S-06     2021-03-22 09:30:00      2021-03-22 13:00:49       mod: yes
S-07     2021-03-22 09:30:05      2021-03-22 12:58:50       mod: no
S-08     2021-03-22 09:30:05      2021-03-22 12:58:46       mod: no
S-09     2021-03-22 09:30:16      2021-03-22 12:59:26       mod: no
S-10     2021-03-22 09:30:06      2021-03-22 12:59:32       mod: no
S-11     2021-03-22 09:30:03      2021-03-22 12:59:53       mod: no
S-12     2021-03-22 09:30:08      2021-03-22 12:59:30       mod: no
S-13     2021-03-22 09:30:05      2021-03-22 12:59:15       mod: no
S-15     2021-03-22 09:30:00      2021-03-22 12:59:03       mod: yes
S-16     2021-03-22 09:30:00      2021-03

### Exporting Data
Here we can export the organized data frames as csv files

In [7]:
# Write every cleaned frame in `data` to ./proccessedData/<key>.csv
directory = './proccessedData'
# Create the output folder once, up front, rather than re-checking it on every loop iteration;
# exist_ok=True makes the call a no-op when the folder is already there.
os.makedirs(directory, exist_ok=True)
for x in data:
    temp = data[x]
    location = os.path.join(directory, x + '.csv')
    temp.to_csv(location, index=False)

### Checking Data
Here we scan through the data for irregularities in data recording.

In [8]:
if DataChecking:
    # Scan every frame for consecutive timestamps that are not exactly `interval` seconds apart
    # and log a per-frame summary to ./dataInfo/time_Frequency_Error_Log.txt
    directory = './dataInfo'
    os.makedirs(directory, exist_ok=True)
    errors = {}
    errorCount = {}
    # Enter the expected interval here
    interval = 10
    expectedGap = pd.Timedelta(seconds=interval)
    # `with` guarantees the log is closed even if one of the frames raises part-way through
    with open('./dataInfo/time_Frequency_Error_Log.txt','wt') as fout:
        for x in data:
            # errors keeps track of the length of each time interval error that occurs
            errors[x] = set()
            # errorCount keeps track of how many times each time interval error occurred
            errorCount[x] = {}
            # counter keeps track of the total time interval errors per sensor
            counter = 0
            temp = data[x]
            # Compare neighbouring rows by position, so the scan does not depend on the frame's
            # index labels and needs no exception handling to stop at the last row.
            stamps = list(temp['Date_Time'])
            for i, following in zip(stamps, stamps[1:]):
                timeErr = following - i
                if timeErr == expectedGap:
                    continue
                # .seconds is the seconds component of the gap (days are not included)
                gapKey = str(timeErr.seconds)
                errorCount[x][gapKey] = errorCount[x].get(gapKey, 0) + 1
                errors[x].add(timeErr)
                counter += 1

            # An empty frame has no intervals at all; report 0 instead of dividing by zero
            percent = round(counter/len(temp)*100,2) if len(temp) else 0.0
            print(str(percent),'% potential error in ', x)
            fout.write('potential error in '+ x +'\n' + str(percent)+'%'+'\n')

            # display the different types of errors
            lst = [i.seconds for i in errors[x]]
            frmt = "{:>4}"*len(lst)
            print(frmt.format(*lst))
            fout.write("Time Errors" + frmt.format(*lst)+ '\n')

            # display the quantity of each type of error
            lst = [errorCount[x][str(i.seconds)] for i in errors[x]]
            frmt = "{:>4}"*len(lst)
            print(frmt.format(*lst))
            fout.write("# Observed " + frmt.format(*lst)+ '\n')

            print()
            fout.write('\n')

Notice there are quite a few repeating errors in our data set. We can either interpolate the data in between or pad it with 0s. Gaps shorter than 40 s are interpolated, while gaps longer than 40 s are 0-padded.

In [9]:

# Fill the gaps of every cleaned frame with fillDf and log how much of each frame was changed.
# ./dataInfo is otherwise only created by the data-checking cell, which is skipped when
# DataChecking is False, so make sure the folder exists before opening the log file.
logDirectory = './dataInfo'
os.makedirs(logDirectory, exist_ok=True)
interpDF = {}
# Settings handed to fillDf for every frame: gap threshold (40) and resampling frequency ('10S').
# NOTE(review): per the notebook text, gaps below the cutoff are interpolated and larger ones 0-padded -- confirm in fillDf
cutoff = 40
freq = '10S'

# `with` closes the log even if fillDf raises something other than IndexError
with open('./dataInfo/interpolation_Effect_Log.txt','wt') as fout:
    for x in data:
        df = data[x]
        try:
            interpDF[x],accuracy = fillDf(df,freq,cutOffTime,endTime,cutoff)
            print(x,' ',accuracy)
            fout.write(x+' '+ '\n' + accuracy[0]+ '\n'+ accuracy[1]+ '\n'+ accuracy[2] +'\n\n')
        except IndexError:
            # An IndexError is reported as "no data" for this frame, as in the original cell
            print(x,'NO DATA')
            fout.write(x+'NO DATA'+'\n')

S-01   ['% of values from interpolation : 0.0', '% of values from 0-padding : 0.0', '% of values not changed : 100.0']
S-02   ['% of values from interpolation : 0.0', '% of values from 0-padding : 0.0', '% of values not changed : 100.0']
S-03   ['% of values from interpolation : 0.159', '% of values from 0-padding : 0.0', '% of values not changed : 99.841']
S-04   ['% of values from interpolation : 0.398', '% of values from 0-padding : 0.0', '% of values not changed : 99.602']
S-05   ['% of values from interpolation : 0.477', '% of values from 0-padding : 6.762', '% of values not changed : 92.761']
S-06   ['% of values from interpolation : 0.0', '% of values from 0-padding : 0.0', '% of values not changed : 100.0']
S-07   ['% of values from interpolation : 33.413', '% of values from 0-padding : 0.0', '% of values not changed : 66.587']
S-08   ['% of values from interpolation : 33.36', '% of values from 0-padding : 0.0', '% of values not changed : 66.64']
S-09   ['% of values from inter

### Export Data
Export the newly interpolated data.

In [10]:
# Write every interpolated frame in `interpDF` to ./interpolatedData/<key>.csv
directory = './interpolatedData'
# Create the output folder once, up front, rather than re-checking it on every loop iteration;
# exist_ok=True makes the call a no-op when the folder is already there.
os.makedirs(directory, exist_ok=True)
for x in interpDF:
    temp = interpDF[x]
    location = os.path.join(directory, x + '.csv')
    temp.to_csv(location, index=False)

### Merge the DataFrames

In [11]:
# Row count of every interpolated frame, in the iteration order of interpDF
length = [len(frame) for frame in interpDF.values()]
# Smallest row count, and the position of the first frame that has it
index = min(length)
lowIDX = length.index(index)
lowValue = length[lowIDX]
print(lowIDX, lowValue)

7 1253


for count,key in enumerate(list(interpDF.keys())):
    print(count+1,key,temp[count+1])

In [12]:
# Build one frame with a shared time axis and one column per interpDF key, holding that
# frame's `particle` column.
# NOTE: this rebinds `columns` (previously the raw-file column indices given to cleanUp),
# so the cleanUp cell must not be re-run after this one without re-running the config cell.
columns = list(interpDF.keys())
# The shortest frame supplies the time axis; longer frames are trimmed to it by index alignment
mergedData = pd.DataFrame({'Date_Time':interpDF[columns[lowIDX]]['Date_Time']})
for column in columns:
    mergedData[column] = interpDF[column][particle]
# Row statistics are taken over the sensor columns only. Reducing the whole frame would pull
# the datetime column into a numeric reduction, which newer pandas versions reject.
sensorReadings = mergedData[columns]
Average = sensorReadings.mean(axis=1)
# ddof=0 (population variance) matches what np.var computed here before
Variance = sensorReadings.var(axis=1, ddof=0)
mergedData['Average'] = Average
mergedData['Variance'] = Variance
mergedData

Unnamed: 0,Date_Time,S-01,S-02,S-03,S-04,S-05,S-06,S-07,S-08,S-09,...,S-11,S-12,S-13,S-15,S-16,S-17,S-18,S-19,Average,Variance
0,2021-03-22 09:30:00,0,0,0,0,0,21,9,0,702,...,0,0,0,0,0,0,0,0,40.666667,25753.222222
1,2021-03-22 09:30:10,0,0,0,0,0,30,0,0,702,...,0,0,0,0,0,9,0,0,41.166667,25737.805556
2,2021-03-22 09:30:20,0,0,0,0,0,21,0,0,99,...,0,0,0,0,0,9,0,0,7.166667,522.138889
3,2021-03-22 09:30:30,42,9,0,0,0,0,0,0,99,...,0,0,0,0,45,0,0,0,10.833333,642.138889
4,2021-03-22 09:30:40,0,9,0,0,0,0,0,0,49,...,21,30,0,0,0,0,0,4,6.277778,173.867284
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1248,2021-03-22 12:58:00,9,0,0,0,0,0,0,9,0,...,0,0,0,0,0,30,0,0,2.666667,51.888889
1249,2021-03-22 12:58:10,0,0,0,0,0,0,0,9,0,...,0,0,0,0,0,30,0,0,2.166667,49.805556
1250,2021-03-22 12:58:20,0,9,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.500000,4.250000
1251,2021-03-22 12:58:30,0,9,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.500000,4.250000


### Increase Resolution on mergedData

In [13]:

# Linearly interpolate mergedData to 10x resolution: every row but the last is expanded into
# 10 evenly spaced steps towards the next row, and the last row is kept once.
# (The original wrapped this in a loop over the columns of mergedData, which rebuilt the
# same result once per column; a single pass is enough.)
tempFrame = mergedData.values
tempList = []
for x, following in zip(tempFrame[:-1], tempFrame[1:]):
    increment = (following - x)/10
    for count in range(10):
        tempList.append(x+increment*count)
# The final row has no successor to interpolate towards
if len(tempFrame):
    tempList.append(tempFrame[-1])
hiResMergedDF = pd.DataFrame(tempList, columns = mergedData.keys())

### Export Merged Frames

In [14]:
# Write the high-resolution merged frame to ./mergedData/mergedFrame.csv
directory = './mergedData/'
os.makedirs(directory, exist_ok=True)

# Let os.path.join combine folder and file name instead of concatenating the strings first
location = os.path.join(directory, 'mergedFrame.csv')
hiResMergedDF.to_csv(location,index=False)

### Create csv files for each animation
Each experiment has up to 3 trials, which we want to average across the experiment's time range.

In [15]:
# mergedData = pd.read_csv('./mergedData/mergedFrame.csv',parse_dates=[0])

In [16]:
# For every trial start time, find the position of the first merged row at or after it.
time = mergedData['Date_Time']
expIndexes = {}
for label, trialStarts in expTRange.items():
    positions = []
    for trialStart in trialStarts:
        firstHit = next(
            (pos for pos, stamp in enumerate(time) if stamp >= trialStart),
            None,
        )
        # A start time later than every row yields no entry for that trial
        if firstHit is not None:
            positions.append(firstHit)
    expIndexes[label] = positions

## Determining Zones
Here we first create the `averagedFrame`s. This is a dictionary that holds, at each label (the name of an experiment), a pandas DataFrame containing the results of all of the trials in that experiment summed and then divided by the total number of trials.
Any time you adjust the zones, everything below here must be re-run, because the values of many of these DataFrames are mutated.

In [17]:
expIndexes

{'EE502 Door Closed': [213, 324],
 'EE502 Door Open': [437, 537, 634],
 'EE502 Negative Pressure': [735, 795, 863],
 'EE504 Door Open': [1016, 1081, 1142]}

In [18]:
# Cut one window per trial out of mergedData and average the trials of each experiment.
# preCursorFactor is defined at the start
averagedFrame = {}
expirementFrame = {}

for label in expIndexes:
    trialFrames = []
    for idx, trialStart in enumerate(expIndexes[label]):
        start = trialStart - preCursorFactor
        end = trialStart + expTLen[label]
        if start < 0:
            # A negative start would make iloc count from the END of the frame and silently
            # return the wrong rows, so fail loudly instead.
            raise ValueError(
                label + ' Exp ' + str(idx+1) + ' starts ' + str(trialStart)
                + ' samples into the data, fewer than preCursorFactor=' + str(preCursorFactor)
            )
        trialName = label+' Exp '+str(idx+1)
        # Column 0 is Date_Time; keep only the sensor/statistic columns and restart the index
        # at 0 so the trials line up row by row.
        expirementFrame[trialName] = mergedData.iloc[ start : end , 1: ].reset_index(drop = True)
        trialFrames.append(expirementFrame[trialName])

    if not trialFrames:
        # No trial of this experiment was located in the data; nothing to average
        print(label, 'has no trials to average')
        continue
    # sum() starts from 0 and adds element-wise, leaving the per-trial frames untouched
    averagedFrame[label] = sum(trialFrames)/len(trialFrames)

Calculating the correct zones for each experiment.

In [19]:
if ZoneAutomation:
    # Cluster the indoor sensors of every averaged experiment into numberOfZones zones with k-means.
    # numberOfZones is defined at the start
    ZoneAssignments = {}
    for frame in averagedFrame:
        # At this point averagedFrame should just be the average of the expirementFrame trials.
        # The last two columns are the overall average and variance, so they are ignored.
        avgFrm = averagedFrame[frame]
        # the spelling in outdoorSensors must match the column names exactly
        columns = sorted(set(avgFrm.keys()[:-2]) - set(outdoorSensors))

        # One feature pair per sensor: [log(peak value), row position of the peak].
        # On ties, max() picks the latest row.
        # NOTE(review): a sensor whose peak is 0 yields log(0) = -inf here -- confirm inputs are positive
        X = []
        for column in columns:
            peakValue,peakIndex = max((value,index) for index,value in enumerate(avgFrm[column]))
            X.append(np.array([np.log(peakValue),peakIndex]))
        kmeans = KMeans(n_clusters=numberOfZones,random_state=0).fit(X)
        ZoneAssignments[frame] = kmeans.labels_
    # Outdoor sensors all get the extra zone index `numberOfZones`
    z = numberOfZones
    ZDf = pd.DataFrame(ZoneAssignments)
    # One row per outdoor sensor, one column per experiment. (The original built this frame
    # transposed, which only worked while both counts happened to be equal.)
    outdoorRows = pd.DataFrame(
        [[z]*len(ZoneAssignments)]*len(outdoorSensors),
        columns = list(ZoneAssignments.keys()),
    )
    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x
    ZDf = pd.concat([ZDf, outdoorRows], ignore_index=True)
    ZoneAssignments = ZDf

In [20]:
if not ZoneAutomation:
    # Translate the manual zoneList into one list of zone indices per experiment,
    # ordered like the sorted sensor column names.
    ZoneAssignments = {}
    for frame in averagedFrame:
        # averagedFrame holds the averaged trials here; its last two columns
        # (overall average and variance) are not sensors and are left out.
        avgFrm = averagedFrame[frame]
        columns = sorted(set(avgFrm.keys()[:-2]))
        # Sensors that appear in no zone keep the default index 0
        assignment = [0]*len(columns)
        for value,(zone,members) in enumerate(zoneList.items()):
            for sensor in members:
                # the spelling in zoneList must match the column names exactly
                assignment[columns.index(sensor)] = value
        ZoneAssignments[frame] = assignment
    ZDf = pd.DataFrame(ZoneAssignments)

Zone 1
Zone 2
Zone 3
Zone 4
Zone 5
Zone 1
Zone 2
Zone 3
Zone 4
Zone 5
Zone 1
Zone 2
Zone 3
Zone 4
Zone 5
Zone 1
Zone 2
Zone 3
Zone 4
Zone 5


In [25]:
# Save the zone index of every sensor (one column per experiment) to ./dataInfo/ZoneAssignments.csv
directory = './dataInfo'
location = os.path.join(directory, 'ZoneAssignments.csv')
directoryMissing = not os.path.exists(directory)
if directoryMissing:
    os.makedirs(directory)
ZDf.to_csv(location, index=False)

Zoning the experiment data.

## Zoning the Data

In [27]:
# Add one 'Zone N' column per zone to every averaged experiment frame, holding the mean of the
# sensors assigned to that zone.
# relies on `columns` still being the sorted sensor names from the zone-assignment cell
zonedAvgFrame = {}
for key in ZoneAssignments:
    # Count the sensors of every zone by zone index, so the counts stay aligned with
    # zoneRunSum even when a zone has no sensors at all.
    assigned = list(ZoneAssignments[key])
    occourances = [assigned.count(zoneIndex) for zoneIndex in range(numberOfZones)]
    zoneRunSum = [0]*numberOfZones
    # This is the same object as averagedFrame[key], not a copy: the Zone columns added below
    # also appear in averagedFrame.
    zonedAvgFrame[key] = averagedFrame[key]
    for idx,column in enumerate(columns):
        zoneRunSum[ZoneAssignments[key][idx]] += zonedAvgFrame[key][column]
    for idx in range(numberOfZones):
        if occourances[idx]:
            zonedAvgFrame[key]['Zone '+str(idx+1)] = zoneRunSum[idx]/occourances[idx]
        else:
            # A zone without sensors has no mean; mark it as missing instead of dividing by zero
            zonedAvgFrame[key]['Zone '+str(idx+1)] = np.nan

In [28]:
# NOTE(review): `key` and `idx` are not set in this cell; they are whatever values the loops of an
# earlier cell finished on, so the column shown depends on which cells ran last.
zonedAvgFrame[key]['Zone '+str(idx)]

0     40.5
1     40.5
2      3.0
3     27.5
4     33.5
      ... 
61    45.0
62    42.5
63    12.5
64    48.0
65    48.5
Name: Zone 4, Length: 66, dtype: float64

In [29]:
# Add one 'Zone N' column per zone to every single-trial frame, holding the mean of the sensors
# assigned to that zone.
# relies on columns still being the values of S-01 - last sensor

# Dictionary that ends up holding the zoned frame of every trial
zonedExpFrame = {}
# All trial names ('<experiment> Exp <n>') in expirementFrame, so that we can iterate through them
labels = list(expirementFrame.keys())
# The same names with the ' Exp <n>' suffix removed: the experiment each trial belongs to, which is
# the key used to look up that trial's zone assignments
keyList = [x.split(' Exp')[0] for x in labels]

for index,exp in enumerate(labels):
    # experiment name that corresponds to this trial
    key = keyList[index]
    # running sum of the sensor columns of every zone
    zoneRunSum = [0]*numberOfZones
    # This is the same object as expirementFrame[exp], not a copy: the Zone columns added below
    # also appear in expirementFrame.
    zonedExpFrame[exp] = expirementFrame[exp]
    # Count the sensors of every zone by zone index, so the counts stay aligned with
    # zoneRunSum even when a zone has no sensors at all.
    assigned = list(ZoneAssignments[key])
    occourances = [assigned.count(zoneIndex) for zoneIndex in range(numberOfZones)]
    for idx,column in enumerate(columns):
        zoneRunSum[ZoneAssignments[key][idx]] += zonedExpFrame[exp][column]
    for idx in range(numberOfZones):
        if occourances[idx]:
            zonedExpFrame[exp]['Zone '+str(idx+1)] = zoneRunSum[idx]/occourances[idx]
        else:
            # A zone without sensors has no mean; mark it as missing instead of dividing by zero
            zonedExpFrame[exp]['Zone '+str(idx+1)] = np.nan
        


In [30]:
column

'S-19'

In [31]:
# NOTE(review): `exp` and `idx` are not set in this cell; they are whatever values the loops of an
# earlier cell finished on. Compares that trial's zone column with the same column of 'EE504 Door Open Exp 1'.
zonedExpFrame[exp]['Zone '+str(idx+1)]==zonedExpFrame['EE504 Door Open Exp 1']['Zone '+str(idx+1)]

0     False
1     False
2     False
3      True
4     False
      ...  
61     True
62     True
63    False
64    False
65    False
Name: Zone 5, Length: 66, dtype: bool

In [32]:
column

'S-19'

In [33]:
# Write every averaged experiment frame to ./averagedData/<experiment>.csv
directory = './averagedData'
directoryMissing = not os.path.exists(directory)
if directoryMissing:
    os.makedirs(directory)
for experimentName, experimentAverage in averagedFrame.items():
    csvPath = os.path.join(directory, experimentName + '.csv')
    experimentAverage.to_csv(csvPath, index=False)



In [34]:
# Write every single-trial frame to ./expirementData/<trial>.csv
directory = './expirementData'
directoryMissing = not os.path.exists(directory)
if directoryMissing:
    os.makedirs(directory)
for trialName, trialFrame in expirementFrame.items():
    csvPath = os.path.join(directory, trialName + '.csv')
    trialFrame.to_csv(csvPath, index=False)

### Increase the Resolution
Pad out the DataFrames so that they have values for every second.

In [35]:
# Linearly interpolate every averaged frame to 10x resolution: each row but the last is expanded
# into 10 evenly spaced steps towards the next row, and the last row is kept once.
stretchedDF = {}
for i in averagedFrame:
    tempFrame = averagedFrame[i].values
    tempList = []
    for x, following in zip(tempFrame[:-1], tempFrame[1:]):
        increment = (following - x)/10
        for count in range(10):
            tempList.append(x+increment*count)
    # The final row has no successor to interpolate towards
    if len(tempFrame):
        tempList.append(tempFrame[-1])
    # Label the result with this frame's own columns rather than borrowing them from an
    # unrelated expirementFrame entry
    stretchedDF[i] = pd.DataFrame(tempList, columns = averagedFrame[i].columns)

In [36]:
# Linearly interpolate every single-trial frame to 10x resolution: each row but the last is
# expanded into 10 evenly spaced steps towards the next row, and the last row is kept once.
stretchExpDf = {}
for i in expirementFrame:
    tempFrame = expirementFrame[i].values
    tempList = []
    for x, following in zip(tempFrame[:-1], tempFrame[1:]):
        increment = (following - x)/10
        for count in range(10):
            tempList.append(x+increment*count)
    # The final row has no successor to interpolate towards
    if len(tempFrame):
        tempList.append(tempFrame[-1])
    # Label the result with this frame's own columns rather than those of the first trial
    stretchExpDf[i] = pd.DataFrame(tempList, columns = expirementFrame[i].columns)

In [37]:
# Write every stretched averaged frame to ./stretchedAvgData/<experiment>.csv
directory = './stretchedAvgData'
directoryMissing = not os.path.exists(directory)
if directoryMissing:
    os.makedirs(directory)
for experimentName, stretchedAverage in stretchedDF.items():
    csvPath = os.path.join(directory, experimentName + '.csv')
    stretchedAverage.to_csv(csvPath, index=False)

In [38]:
# Write every stretched single-trial frame to ./stretchedExpirementData/<trial>.csv
directory = './stretchedExpirementData'
directoryMissing = not os.path.exists(directory)
if directoryMissing:
    os.makedirs(directory)
for trialName, stretchedTrial in stretchExpDf.items():
    csvPath = os.path.join(directory, trialName + '.csv')
    stretchedTrial.to_csv(csvPath, index=False)

In [39]:
# Print the total wall-clock run time in seconds, measured from `begin` (set in the second cell).
end = clock.time()
print(end-begin)

17.53507113456726
