### General Imports

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime as dt
import os
import glob
from cleanUp import cleanUp
from fillDf import fillDf
from fixYearStamp import fixYearStamp
from sklearn.cluster import KMeans

### Data Cleaning
Pass the sensor data through the cleanUp function to fix timestamps and delete null timestamps.

In [2]:
# ---- Inputs ----
# Every raw sensor log found in ./Data, in sorted order.
all_csv_files = sorted(glob.glob("./Data/*.txt"))

# ---- Time window ----
# Desired start time of the analysis window.
cutOffTime = '4/19/2021 17:00:00'
# End of the analysis window.
endTime = '4/19/2021 20:00:00'

# ---- Sensor clean-up settings ----
# Time-rectifying offset per sensor; use {'': 0} when no sensor needs one.
sensorConditions = dict.fromkeys(
    ['S-01', 'S-02', 'S-03', 'S-04', 'S-05', 'S-06', 'S-15', 'S-19'],
    7,
)
# Column positions to keep: the timestamps plus all of the dP readings.
columns = [0, 1, *range(6, 12)]
# Observed junk timestamps that must be removed from the data.
badTimes = ['     0/0/0      0:0:0','2165/165/165 165:165:85']

# ---- Zoning ----
# False: zones come from the manual zone list; True: zones come from k-means clusters.
ZoneAutomation = False
# Sensors to exclude from the zones.
outdoorSensors = ['S-16', 'S-17', 'S-18', 'S-19']
# Number of zones (controls the binning).
numberOfZones = 3

# ---- Experiment export ----
# Number of 10-second samples before nebulization to include in the experiment csv files.
preCursorFactor = 6
# Which particle column to analyze.
particle = 'Dp>0.3'

In [3]:
# Calendar day shared by every trial start time below.
day = '4/19/2021'

# Start time of every trial, grouped by experiment name.
# NOTE(review): the last 'OR 5 Unblocked' entry (6:25:20 PM) is identical to the
# last 'OR 5 Blocked' trial — confirm it really belongs to both experiments.
expTRange = {
    'OR 5 Unblocked': [
        pd.Timestamp(day + ' 5:23:24 PM'),
        pd.Timestamp(day + ' 5:32:20 PM'),
        pd.Timestamp(day + ' 5:42:00 PM'),
        pd.Timestamp(day + ' 5:52:00 PM'),
        pd.Timestamp(day + ' 5:58:00 PM'),
        pd.Timestamp(day + ' 6:25:20 PM'),
    ],
    'OR 5 Blocked': [
        pd.Timestamp(day + ' 6:08:50 PM'),
        pd.Timestamp(day + ' 6:16:50 PM'),
        pd.Timestamp(day + ' 6:25:20 PM'),
    ],
    'OR 12 Unblocked': [
        pd.Timestamp(day + ' 6:52:50 PM'),
        pd.Timestamp(day + ' 7:03:30 PM'),
        pd.Timestamp(day + ' 7:13:30 PM'),
    ],
    'OR 12 Blocked': [
        pd.Timestamp(day + ' 7:25:24 PM'),
        pd.Timestamp(day + ' 7:34:45 PM'),
        pd.Timestamp(day + ' 7:38:24 PM'),
    ],
}

# Experiment length expressed as seconds/10 (i.e. number of 10-second samples);
# the "n*6" form presumably reads as n minutes — confirm.
expTLen = {
    'OR 5 Unblocked': 5*6,
    'OR 5 Blocked': 5*6,
    'OR 12 Unblocked': 8*6,
    'OR 12 Blocked': 7*6,
}

# Manual zone set up.
# Sensor IDs use the same zero-padded form as the data files (S-07, not S-7), and
# every entry is comma-separated: the previous "'S-10''S-11'" silently concatenated
# into the single bogus ID 'S-10S-11'.
# NOTE(review): 'Zone 3' lists S-18 and S-19, which are also in outdoorSensors — confirm.
zoneList = {
    'Zone 1': ['S-01', 'S-02', 'S-03', 'S-04', 'S-05', 'S-06'],
    'Zone 2': ['S-07', 'S-08', 'S-09', 'S-10', 'S-11', 'S-12', 'S-13', 'S-14'],
    'Zone 3': ['S-15', 'S-18', 'S-19'],
}

In [4]:
# Display the sorted list of discovered sensor files to confirm which logs were picked up.
all_csv_files

['./Data\\S-01.txt',
 './Data\\S-02.txt',
 './Data\\S-03.txt',
 './Data\\S-04.txt',
 './Data\\S-05.txt',
 './Data\\S-06.txt',
 './Data\\S-07.txt',
 './Data\\S-08.txt',
 './Data\\S-09.txt',
 './Data\\S-10.txt',
 './Data\\S-11.txt',
 './Data\\S-12.txt',
 './Data\\S-13.txt',
 './Data\\S-14.txt',
 './Data\\S-15.txt',
 './Data\\S-16.txt',
 './Data\\S-18.txt',
 './Data\\S-19.txt']

This cell was changed to markdown so it won't run twice; it was a one-time fix for the timestamps in S-12.
filePath        = all_csv_files[11]
incorrectString = '21/3/22'
date            = '3/22/2021'
charTimeStart   = 11
charTimeEnd     = 21
offset          = 0
fixYearStamp(filePath,incorrectString,date,charTimeStart,charTimeEnd,offset)

In [5]:
# Run every raw sensor file through the project's cleanUp helper using the settings
# from the configuration cell. Later cells iterate the result by sensor key (data[x]).
# NOTE(review): endTime is not passed here — presumably cleanUp only trims the start; confirm.
data = cleanUp(cutOffTime,sensorConditions,all_csv_files,columns,badTimes)

S-01     2021-04-19 17:22:10      2021-04-19 19:58:59       mod: yes
S-02     2021-04-19 17:03:05      2021-04-19 19:59:29       mod: yes
S-03     2021-04-19 17:19:10      2021-04-19 19:59:09       mod: yes
S-04     2021-04-19 17:39:10      2021-04-19 19:58:59       mod: yes
S-05     2021-04-19 17:06:10      2021-04-19 19:58:29       mod: yes
S-06     2021-04-19 17:11:10      2021-04-19 19:59:09       mod: yes
S-07     2021-04-19 17:07:34      2021-04-19 19:59:18       mod: no
S-08     2021-04-19 17:05:08      2021-04-19 19:59:11       mod: no
S-09     2021-04-19 17:10:11      2021-04-19 19:58:58       mod: no
S-10     2021-04-19 17:09:59      2021-04-19 19:58:36       mod: no
S-11     2021-04-19 17:19:22      2021-04-19 19:59:09       mod: no
S-12     2021-04-19 17:11:50      2021-04-19 19:58:57       mod: no
S-13     2021-04-19 17:21:07      2021-04-19 19:59:14       mod: no
S-14     2021-04-19 17:05:18      2021-04-19 19:59:15       mod: no
S-15     2021-04-19 17:12:10      2021-04-

### Exporting Data
Here we can export the organized data frames as csv files

In [6]:
# Write each cleaned sensor frame to ./proccessedData/<sensor>.csv.
directory = './proccessedData'
for sensorName in data:
    sensorFrame = data[sensorName]
    # The output folder is created lazily, right before the first write.
    if os.path.exists(directory) is False:
        os.makedirs(directory)
    sensorFrame.to_csv(os.path.join(directory, sensorName + '.csv'), index=False)

### Checking Data
Here we scan through the data for irregularities in data recording.

In [7]:
# Scan each sensor's timestamps for sampling steps that differ from the expected
# interval, print a per-sensor summary and mirror it into a log file.
directory = './dataInfo'
os.makedirs(directory, exist_ok=True)

# errors[x]     : set of the distinct irregular step lengths (pd.Timedelta) seen for sensor x
# errorCount[x] : {str(seconds): how many times that irregular step occurred} for sensor x
errors = {}
errorCount = {}
# Enter the expected interval here
interval = 10
expectedStep = pd.Timedelta(seconds=interval)

# The context manager guarantees the log is closed even if a sensor frame raises.
with open('./dataInfo/time_Frequency_Error_Log.txt','wt') as fout:
    for x in data:
        temp = data[x]

        # Step between each pair of consecutive rows, taken positionally so the
        # result does not depend on the frame's index labels.
        steps = pd.to_datetime(temp['Date_Time']).diff().iloc[1:]
        # Missing timestamps produce NaT steps; those are skipped, not counted.
        irregular = steps[steps.notna() & (steps != expectedStep)]

        errors[x] = set(irregular)
        errorCount[x] = {}
        for timeErr in irregular:
            # NOTE: Timedelta.seconds ignores whole days, so a gap of a day or more
            # is keyed by its sub-day remainder.
            secondsKey = str(timeErr.seconds)
            errorCount[x][secondsKey] = errorCount[x].get(secondsKey, 0) + 1

        # counter is the total number of irregular steps for this sensor
        counter = len(irregular)
        # An empty frame reports 0 % instead of dividing by zero.
        percent = round(counter/len(temp)*100,2) if len(temp) else 0.0

        print(str(percent),'% potential error in ', x)
        fout.write('potential error in '+ x +'\n' + str(percent)+'%'+'\n')

        # Sorted so the two rows below line up and are reproducible between runs.
        observed = sorted(errors[x])

        # display the different types of errors
        lst = [i.seconds for i in observed]
        frmt = "{:>4}"*len(lst)
        print(frmt.format(*lst))
        fout.write("Time Errors" + frmt.format(*lst)+ '\n')

        # display the quantity of each type of error
        lst = [errorCount[x][str(i.seconds)] for i in observed]
        frmt = "{:>4}"*len(lst)
        print(frmt.format(*lst))
        fout.write("# Observed " + frmt.format(*lst)+ '\n')

        print()
        fout.write('\n')

0.21 % potential error in  S-01
   9  20
   1   1

0.09 % potential error in  S-02
  14
   1

0.21 % potential error in  S-03
   9  20
   1   1

0.6 % potential error in  S-04
  11   9
   2   3

0.68 % potential error in  S-05
  16   9  40  30  20   4
   1   1   1   1   2   1

0.2 % potential error in  S-06
   9  20
   1   1

19.9 % potential error in  S-07
  27 627  20
   1   1 159

49.86 % potential error in  S-08
  59  27  20
   1   2 343

50.0 % potential error in  S-09
  26  21  22  18  19  20  11
   1   2   1   1   2 330   1

49.85 % potential error in  S-10
  17  20
   1 336

19.88 % potential error in  S-11
  27  20
   1 158

99.8 % potential error in  S-12
  27  20
   1 500

20.08 % potential error in  S-13
  23  19  20  58  27
   1   1 154   1   1

49.92 % potential error in  S-14
 939  21  20  27
   1   1 314   1

0.49 % potential error in  S-15
   9  17  13  15  11
   1   1   1   1   1

0.1 % potential error in  S-16
  14
   1

0.39 % potential error in  S-18
   9  14  17  

Notice there are quite a few repeating errors here in our data set. We can either interpolate the data in between or pad it with 0s. For gaps <40 s I will interpolate, but for gaps >40 s I will 0-pad.

In [8]:
# Pass every cleaned sensor frame through the project's fillDf helper and log
# the accuracy summary it returns for each sensor.
# cutoff and freq are the gap threshold and sampling frequency handed to fillDf
# (per the note above: gaps under the cutoff are interpolated, larger ones are 0-padded).
cutoff = 40
freq = '10S'
interpDF = {}

# Make sure the log folder exists even if the data-checking cell was not run first.
os.makedirs('./dataInfo', exist_ok=True)

# The context manager closes the log even if fillDf raises something unexpected.
with open('./dataInfo/interpolation_Effect_Log.txt','wt') as fout:
    for x in data:
        df = data[x]
        try:
            interpDF[x],accuracy = fillDf(df,freq,cutOffTime,endTime,cutoff)
        except IndexError:
            # presumably raised by fillDf when the sensor has no rows in the window — confirm
            print(x,'NO DATA')
            fout.write(x+'NO DATA'+'\n')
        else:
            print(x,' ',accuracy)
            fout.write(x+' '+ '\n' + accuracy[0]+ '\n'+ accuracy[1]+ '\n'+ accuracy[2] +'\n\n')

S-01   ['% of values from interpolation : 0.186', '% of values from 0-padding : 12.477', '% of values not changed : 87.337']
S-02   ['% of values from interpolation : 0.0', '% of values from 0-padding : 1.764', '% of values not changed : 98.236']
S-03   ['% of values from interpolation : 0.186', '% of values from 0-padding : 10.791', '% of values not changed : 89.023']
S-04   ['% of values from interpolation : 0.372', '% of values from 0-padding : 21.974', '% of values not changed : 77.654']
S-05   ['% of values from interpolation : 1.027', '% of values from 0-padding : 3.548', '% of values not changed : 95.425']
S-06   ['% of values from interpolation : 0.186', '% of values from 0-padding : 6.326', '% of values not changed : 93.488']
S-07   ['% of values from interpolation : 29.74', '% of values from 0-padding : 10.13', '% of values not changed : 60.13']
S-08   ['% of values from interpolation : 64.312', '% of values from 0-padding : 3.439', '% of values not changed : 32.249']
S-09   

### Export Data
Export the newly interpolated data.

In [9]:
# Write each interpolated sensor frame to ./interpolatedData/<sensor>.csv.
directory = './interpolatedData'
for sensorName, sensorFrame in interpDF.items():
    # The output folder is created lazily, right before the first write.
    if os.path.exists(directory) is False:
        os.makedirs(directory)
    sensorFrame.to_csv(os.path.join(directory, sensorName + '.csv'), index=False)

### Merge the DataFrames

In [10]:
# Row count of every interpolated frame, in dictionary order.
length = [len(frame) for frame in interpDF.values()]
# Smallest row count, and the position of the first frame that has it.
index = min(length)
lowIDX = length.index(index)
lowValue = length[lowIDX]
print(lowIDX,lowValue)

4 1071


# NOTE(review): leftover scratch loop with no execution count in this view. It reads the
# global `temp`, which is only defined as a side effect of earlier cells, and indexes it
# by integer — confirm whether this cell is still needed or should be removed.
for count,key in enumerate(list(interpDF.keys())):
    print(count+1,key,temp[count+1])

In [11]:
# Build one frame holding the chosen particle reading of every sensor side by side.
# A dedicated name is used for the sensor list so the `columns` setting from the
# configuration cell is not overwritten here.
sensorNames = list(interpDF.keys())

# Timestamps come from the shortest interpolated frame (lowIDX, found above).
mergedData = pd.DataFrame({'Date_Time':interpDF[sensorNames[lowIDX]]['Date_Time']})
for sensorName in sensorNames:
    mergedData[sensorName] = interpDF[sensorName][particle]

# Row-wise statistics over the sensor columns only. Selecting them explicitly keeps
# the datetime column out of the reduction (mixing it in raises on pandas >= 2.0).
sensorReadings = mergedData[sensorNames]
Average = sensorReadings.mean(axis=1)
# ddof=0 gives the population variance, matching what np.var produced before.
Variance = sensorReadings.var(axis=1, ddof=0)
mergedData['Average'] = Average
mergedData['Variance'] = Variance
mergedData

Unnamed: 0,Date_Time,S-01,S-02,S-03,S-04,S-05,S-06,S-07,S-08,S-09,...,S-11,S-12,S-13,S-14,S-15,S-16,S-18,S-19,Average,Variance
0,2021-04-19 17:00:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.000000,0.000000
1,2021-04-19 17:00:10,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.000000,0.000000
2,2021-04-19 17:00:20,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.000000,0.000000
3,2021-04-19 17:00:30,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.000000,0.000000
4,2021-04-19 17:00:40,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1066,2021-04-19 19:57:40,51,96,81,69,138,84,90,18,94,...,198,81,63,154,9,102,33,18,81.111111,2221.432099
1067,2021-04-19 19:57:50,72,114,129,69,108,72,63,18,84,...,117,105,126,144,30,96,54,27,83.833333,1273.472222
1068,2021-04-19 19:58:00,57,81,105,69,114,21,108,57,84,...,81,129,126,144,48,84,0,39,78.222222,1404.506173
1069,2021-04-19 19:58:10,153,54,114,60,84,84,108,129,79,...,81,121,157,126,60,51,0,48,86.666667,1622.888889


### Increase Resolution on mergedData

In [12]:

# Linearly interpolate between consecutive rows of mergedData so that every
# original row becomes 10 evenly spaced rows; the final row is kept once.
# The work is done a single time — it used to be repeated, with identical results,
# once for every column of mergedData.
tempFrame = mergedData.values
tempList = []
for current, following in zip(tempFrame[:-1], tempFrame[1:]):
    # One tenth of the step from this row to the next, column by column.
    increment = (following - current)/10
    for count in range(10):
        tempList.append(current+increment*count)
# The last row has no successor to interpolate towards.
if len(tempFrame):
    tempList.append(tempFrame[-1])
hiResMergedDF = pd.DataFrame(tempList, columns = mergedData.keys())

### Export Merged Frames

In [13]:
# Save the high-resolution merged frame as ./mergedData/mergedFrame.csv.
directory = './mergedData/'
if os.path.exists(directory) is False:
    os.makedirs(directory)

# directory already ends with a separator, so plain concatenation yields the path.
location = directory + 'mergedFrame.csv'
hiResMergedDF.to_csv(location, index=False)

### Create csv files for each animation
Each experiment has several trials (3 in most cases) that we want to average across the range.

In [14]:
# mergedData = pd.read_csv('./mergedData/mergedFrame.csv',parse_dates=[0])

In [15]:
# For every trial start time, find the position of the first merged row whose
# timestamp is at or after it. expIndexes[experiment] is the list of those positions.
time = mergedData['Date_Time']
expIndexes = {}
for i in expTRange:
    expIndexes[i] = []
    for x in expTRange[i]:
        # Boolean mask of the rows at/after the trial start; argmax is the first True.
        reached = (time >= x).values
        if reached.any():
            expIndexes[i].append(int(reached.argmax()))
        else:
            # The trial is left out (as before), but no longer silently: a dropped
            # trial shifts the numbering and the averages of the later trials.
            print('WARNING:', i, 'trial starting at', x, 'is after the last timestamp and was skipped')

## Determining Zones
Here we first create 'averagedFrame's. These are dictionaries in which each 'label' (the name of an experiment) maps to a pandas DataFrame containing the results of all of the trials in that experiment summed, and then divided by the total number of trials.
Any time you adjust the zones, everything below here must be re-run, because the values of many of these DataFrames are mutated.

In [16]:
# preCursorFactor is defined at the start
# expirementFrame['<label> Exp <n>'] : the window of mergedData rows for trial n
# averagedFrame['<label>']           : element-wise mean of that label's trial windows
averagedFrame = {}
expirementFrame = {}

for label, trialStarts in expIndexes.items():
    # Without this guard the average was computed with a stale (or undefined)
    # trial count left over from the previous label.
    if not trialStarts:
        raise ValueError('No trial start indexes were found for ' + label)

    trialFrames = []
    for trialNumber, startIndex in enumerate(trialStarts, start=1):
        windowStart = startIndex - preCursorFactor
        windowEnd = startIndex + expTLen[label]
        # A negative start would be read by iloc as "count from the end" and
        # return the wrong rows, so it is reported instead.
        if windowStart < 0:
            raise ValueError(label + ' Exp ' + str(trialNumber) +
                             ' starts too early for preCursorFactor = ' + str(preCursorFactor))
        # Column 0 (Date_Time) is dropped; rows are re-indexed from 0 so the
        # trials line up when they are added together.
        trial = mergedData.iloc[ windowStart : windowEnd , 1: ].reset_index(drop = True)
        expirementFrame[label+' Exp '+str(trialNumber)] = trial
        trialFrames.append(trial)

    averagedFrame[label] = sum(trialFrames)/len(trialFrames)

Calculating the correct zones for each experiment.

In [17]:
# Automatic zoning: cluster the indoor sensors of every averaged experiment with
# k-means, using (log of peak value, row position of the peak) as the two features.
if ZoneAutomation:
    # numberOfZones is defined at the start
    ZoneAssignments = {}
    for frame in averagedFrame:
        # at this point averagedFrame should just be the averaged sum of the expirementFrame trials.
        # Last two columns are overall average and variance so they should be ignored.
        avgFrm = averagedFrame[frame]
        # outdoorSensors must have its spelling exactly match
        columns = sorted(set(avgFrm.keys()[:-2]) - set(outdoorSensors))

        features = []
        for column in columns:
            # Largest reading and where it occurs (ties resolve to the later row).
            value,index = max((value,index) for index,value in enumerate(avgFrm[column]))
            # NOTE(review): a sensor whose peak is 0 gives log(0) = -inf here — confirm
            # that cannot happen for the sensors being clustered.
            features.append(np.array([np.log(value),index]))
        kmeans = KMeans(n_clusters=numberOfZones,random_state=0).fit(features)
        ZoneAssignments[frame] = kmeans.labels_

    # Outdoor sensors get their own extra label (== numberOfZones), appended as one
    # row per outdoor sensor and one column per experiment.
    z = numberOfZones
    ZDf = pd.DataFrame(ZoneAssignments)
    outdoorRows = pd.DataFrame([[z]*len(ZoneAssignments)]*len(outdoorSensors),
                               columns = list(ZoneAssignments.keys()))
    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
    ZDf = pd.concat([ZDf, outdoorRows], ignore_index=True)
    ZoneAssignments = ZDf

In [18]:
# Manual zoning: every experiment gets the same positional zone list built from zoneList.
if not ZoneAutomation:
    ZoneAssignments = {}

    # Zone index repeated once per sensor listed in that zone. Iterating zoneList
    # yields its keys, so the sensor list has to be looked up: len() of the key
    # itself ('Zone 1') is always 6 regardless of how many sensors the zone holds.
    manualZones = []
    for value,zone in enumerate(zoneList):
        manualZones.extend([value]*len(zoneList[zone]))

    for frame in averagedFrame:
        # at this point averagedFrame should just be the averaged sum of the expirementFrame trials.
        # Last two columns are overall average and variance so they should be ignored.
        avgFrm = averagedFrame[frame]
        # outdoorSensors must have its spelling exactly match
        columns = list(set(avgFrm.keys()[:-2])- set(outdoorSensors))
        columns.sort()
        # Assignments are matched to `columns` by position, so there must be at
        # least one assignment per sensor column.
        if len(manualZones) < len(columns):
            raise ValueError('zoneList assigns ' + str(len(manualZones)) +
                             ' sensors but ' + str(len(columns)) + ' sensor columns need a zone')
        ZoneAssignments[frame] = list(manualZones)
    ZDf = pd.DataFrame(ZoneAssignments)

In [19]:
# Save the zone assignment table as ./dataInfo/ZoneAssignments.csv.
directory = './dataInfo'
if os.path.exists(directory) is False:
    os.makedirs(directory)
ZDf.to_csv(os.path.join(directory, 'ZoneAssignments.csv'), index=False)

Zoning the experiment data.

In [20]:
# Add one 'Zone n' column per zone to every averaged frame: the mean of the sensor
# columns assigned to that zone.
# relies on `columns` still being the sensor list produced by the zone-assignment cell
zonedAvgFrame = {}
for key in ZoneAssignments:
    # Only the first len(columns) assignments belong to sensors that are summed below;
    # any further entries (e.g. outdoor sensors) must not inflate the divisor.
    sensorZones = [ZoneAssignments[key][idx] for idx in range(len(columns))]
    # Sensors per zone, indexed by zone number (no dependence on set ordering).
    occourances = [sensorZones.count(zoneIdx) for zoneIdx in range(numberOfZones)]
    zoneRunSum = [0]*numberOfZones
    # NOTE: this is the same object as averagedFrame[key] (no copy), so the Zone
    # columns added below also show up in averagedFrame.
    zonedAvgFrame[key] = averagedFrame[key]
    for column,zoneIdx in zip(columns,sensorZones):
        zoneRunSum[zoneIdx] += zonedAvgFrame[key][column]
    for idx in range(numberOfZones):
        if occourances[idx]:
            zonedAvgFrame[key]['Zone '+str(idx+1)] = zoneRunSum[idx]/occourances[idx]
        else:
            # A zone with no sensors has no defined average.
            zonedAvgFrame[key]['Zone '+str(idx+1)] = np.nan

In [21]:
# relies on columns still being the values of S-01 - last sensor

# Dictionary that will hold every experiment trial frame with its Zone columns added
zonedExpFrame = {}
# All keys of expirementFrame ('<label> Exp <n>'), so we can iterate through the trials
labels = list(expirementFrame.keys())
# The same keys with the ' Exp <n>' suffix removed: the experiment label used to look
# up that trial's zone assignments
keyList = [x.split(' Exp')[0] for x in labels]

for index,exp in enumerate(labels):
    # experiment label corresponding to this trial
    key = keyList[index]
    # running sum of the sensor columns, one slot per zone
    zoneRunSum = [0]*numberOfZones
    # NOTE: same object as expirementFrame[exp] (no copy), so the Zone columns added
    # below also show up in expirementFrame.
    zonedExpFrame[exp] = expirementFrame[exp]
    # Only the first len(columns) assignments belong to sensors that are summed below;
    # any further entries (e.g. outdoor sensors) must not inflate the divisor.
    sensorZones = [ZoneAssignments[key][idx] for idx in range(len(columns))]
    occourances = [sensorZones.count(zoneIdx) for zoneIdx in range(numberOfZones)]
    for column,zoneIdx in zip(columns,sensorZones):
        # Sum this trial's own readings. Previously the averaged frame was summed
        # here, which gave every trial of an experiment identical Zone columns.
        zoneRunSum[zoneIdx] += zonedExpFrame[exp][column]
    for idx in range(numberOfZones):
        if occourances[idx]:
            zonedExpFrame[exp]['Zone '+str(idx+1)] = zoneRunSum[idx]/occourances[idx]
        else:
            # A zone with no sensors has no defined average.
            zonedExpFrame[exp]['Zone '+str(idx+1)] = np.nan
        


In [22]:
# Write each averaged experiment frame to ./averagedData/<experiment>.csv.
directory = './averagedData'
if os.path.exists(directory) is False:
    os.makedirs(directory)
for experimentName, experimentAverage in averagedFrame.items():
    experimentAverage.to_csv(os.path.join(directory, experimentName + '.csv'), index=False)



In [23]:
# Write each individual trial frame to ./expirementData/<trial>.csv.
directory = './expirementData'
if os.path.exists(directory) is False:
    os.makedirs(directory)
for trialName, trialFrame in expirementFrame.items():
    trialFrame.to_csv(os.path.join(directory, trialName + '.csv'), index=False)

### Increase the Resolution
Pad out the dataframes to have values for every second.

In [24]:
# Linearly interpolate between consecutive rows of every averaged frame so each
# original row becomes 10 evenly spaced rows; the final row is kept once.
stretchedDF = {}
for i in averagedFrame:
    tempFrame = averagedFrame[i].values
    tempList = []
    for current, following in zip(tempFrame[:-1], tempFrame[1:]):
        # One tenth of the step from this row to the next, column by column.
        increment = (following - current)/10
        for count in range(10):
            tempList.append(current+increment*count)
    # The last row has no successor to interpolate towards.
    if len(tempFrame):
        tempList.append(tempFrame[-1])
    # Label the result with the columns of the frame being stretched, rather than
    # those of an unrelated experiment frame that merely happens to match.
    stretchedDF[i] = pd.DataFrame(tempList, columns = averagedFrame[i].columns)

In [25]:
# Linearly interpolate between consecutive rows of every trial frame so each
# original row becomes 10 evenly spaced rows; the final row is kept once.
stretchExpDf = {}
for trialName, trialFrame in expirementFrame.items():
    trialRows = trialFrame.values
    stretchedRows = []
    for current, following in zip(trialRows[:-1], trialRows[1:]):
        # One tenth of the step from this row to the next, column by column.
        step = (following - current)/10
        for tenth in range(10):
            stretchedRows.append(current+step*tenth)
    # The last row has no successor to interpolate towards.
    if len(trialRows):
        stretchedRows.append(trialRows[-1])
    # Column labels are taken from the first trial frame in the dictionary.
    firstTrialColumns = next(iter(expirementFrame.values())).columns
    stretchExpDf[trialName] = pd.DataFrame(stretchedRows, columns = firstTrialColumns)

In [26]:
# Write each stretched averaged frame to ./stretchedAvgData/<experiment>.csv.
directory = './stretchedAvgData'
if os.path.exists(directory) is False:
    os.makedirs(directory)
for experimentName, stretchedAverage in stretchedDF.items():
    stretchedAverage.to_csv(os.path.join(directory, experimentName + '.csv'), index=False)

In [27]:
# Write each stretched trial frame to ./stretchedExpirementData/<trial>.csv.
directory = './stretchedExpirementData'
if os.path.exists(directory) is False:
    os.makedirs(directory)
for trialName, stretchedTrial in stretchExpDf.items():
    stretchedTrial.to_csv(os.path.join(directory, trialName + '.csv'), index=False)