### General Imports

In [69]:
import pandas as pd
import numpy as np
from datetime import datetime as dt
import os
import glob
from cleanUp import cleanUp
from fillDf import fillDf
from fixYearStamp import fixYearStamp
from sklearn.cluster import KMeans
import time as clock
import copy


In [70]:
begin = clock.time()

### Data Cleaning
Passing the sensor data through the cleanUp function to get fix timestamps and delete null timestamps.

In [71]:
all_csv_files = glob.glob("./Data/*.csv")
# insert the desired start time
cutOffTime = pd.Timestamp("8-8-20 10:30")
endTime = pd.Timestamp("8-8-20 14:00")
# insert the time rectifying offsets. default of for nothing {'':0}
# sensorConditions = {'S-01':7,'S-02':7,'S-03':7,'S-04':7,'S-05':7,'S-06':7,'S-15':7,'S-19':7}
#This indicates which columns to keep. Here we're taking all of the dP info and the timestamps
# columns = [0,1,6,7,8,9,10,11]
# Enable Data Checking
DataChecking = False
# Here are obversed timestamps that need to removed from the data
# badTimes = ['     0/0/0      0:0:0','2165/165/165 165:165:85']
# Controls wether zones will be created automatically or by k-means clusters
ZoneAutomation = False
# Sets either the binning or the manual zones
numberOfZones = 4
# Sensors to exclude from zone
outdoorSensors = []#['S-15','S-16','S-18','S-19']
# 10s of seconds before nebulization to include in the expirement csv files
preCursorFactor = 6
# which particle to analyze
particle = 'Dp>0.3'

In [72]:
expTRange = {
    'HMC Expirement 1': [
    # pd.Timestamp("2020-08-08 11:22:51"),
    pd.Timestamp("2020-08-08 11:36:01"),
    pd.Timestamp("2020-08-08 11:52:01")],
    'HMC Expirement 2': [
    pd.Timestamp("2020-08-08 12:00:11"),
    pd.Timestamp("2020-08-08 12:11:01")],
    # pd.Timestamp("2020-08-08 12:20:51")],
    'HMC Expirement 3': [
    pd.Timestamp("2020-08-08 12:20:51"),
    pd.Timestamp("2020-08-08 12:36:01"),
    pd.Timestamp("2020-08-08 12:51:21"),
    pd.Timestamp("2020-08-08 13:06:01")]

}

#enter in the expirement length as seconds/10
expTLen = {
    'HMC Expirement 1' : 8*6,
    'HMC Expirement 2' : 8*6,
    'HMC Expirement 3' : 14*6
}
# Manual Zone set up notice how we are missing S-14
zoneList = {
    'Zone 1' : ['B-10'],
    'Zone 2' : ['B-17','B-14','B-15','B-07','B-08'],
    'Zone 3' : ['B-01','B-02','B-13','B-18','B-23','B-04','B-05','B-06','B-11','B-12','B-21','B-22','B-16','B-09','B-19','B-20']
}

if not ZoneAutomation:
    numberOfZones = len(zoneList)

In [73]:
for i in expTRange:
        for time in expTRange[i]:
            print(i,time)

HMC Expirement 1 2020-08-08 11:36:01
HMC Expirement 1 2020-08-08 11:52:01
HMC Expirement 2 2020-08-08 12:00:11
HMC Expirement 2 2020-08-08 12:11:01
HMC Expirement 3 2020-08-08 12:20:51
HMC Expirement 3 2020-08-08 12:36:01
HMC Expirement 3 2020-08-08 12:51:21
HMC Expirement 3 2020-08-08 13:06:01


Removed file 13 here for some reason

In [74]:
# all_csv_files.pop(11)

Changed this to markdown so it won't run twice, had to fix the timestamps on S-12
filePath        = all_csv_files[11]
incorrectString = '21/3/22'
date            = '3/22/2021'
charTimeStart   = 11
charTimeEnd     = 21
offset          = 0
fixYearStamp(filePath,incorrectString,date,charTimeStart,charTimeEnd,offset)

In [75]:
data = {}
for x in all_csv_files:
    if not (x == './Data\\B-02.csv'):
        name = x.split('\\')[1].split('.')[0]
        data[name] = pd.read_csv(x, parse_dates=[[0,2]])
        data[name] = data[name][data[name].columns[[0,6]]]
    else:
        name = x.split('\\')[1].split('.')[0]
        data[name] = pd.read_csv(x, parse_dates=[[0,1]],usecols=[0,1,6])


### Exporting Data
Here we can export the organized data frames as csv files

In [76]:
directory = './proccessedData'
for x in data:
    temp=data[x]
    if not os.path.exists(directory):
        os.makedirs(directory)
    location = os.path.join(directory,x+'.csv')
    temp.to_csv(location,index=False)

### Checking Data
Here we scan through the data for irregularities in data recording.

In [77]:
if DataChecking:
    directory = './dataInfo'
    if not os.path.exists(directory):
        os.makedirs(directory)
    fout = open('./dataInfo/time_Frequency_Error_Log.txt','wt')
    errors = {}
    errorCount = {}
    # Enter the expected interval here
    interval = 10
    for x in data:
        # errors keeps track of length of each time interval error that occurs
        errors[x] = set(())
        # errorCount keeps track of how many times each time interval error occured
        errorCount[x] = {}
        # counter keeps track of the total time interval errors per sensor
        counter = 0
        #shows the total
        temp = data[x]
        for idx,i in enumerate(temp['Date_Time']):
            try:
                if not ((temp['Date_Time'][idx+1] - i) == pd.Timedelta(seconds=interval)):
                    timeErr = temp['Date_Time'][idx+1] - i
                    if str(timeErr.seconds) in errorCount[x]:
                        errorCount[x][str(timeErr.seconds)] +=1
                    else:
                        errorCount[x][str(timeErr.seconds)] = 1

                    errors[x].add(timeErr)


                    counter += 1
            except:
                continue

        print(str(round(counter/len(temp)*100,2)),'% potential error in ', x)
        fout.write('potential error in '+ x +'\n' + str(round(counter/len(temp)*100,2))+'%'+'\n')

        # display the different types of errors
        lst = [i.seconds for i in errors[x]]
        frmt = "{:>4}"*len(lst)
        print(frmt.format(*lst))
        fout.write("Time Errors" + frmt.format(*lst)+ '\n')

        # display the quantity of each type of error
        lst = [errorCount[x][str(i.seconds)] for i in errors[x]]
        frmt = "{:>4}"*len(lst)
        print(frmt.format(*lst))
        fout.write("# Observed " + frmt.format(*lst)+ '\n')

        print()
        fout.write('\n')


    fout.close()

else:
    print("Data Checking Flag is False, no tests ran")

Data Checking Flag is False, no tests ran


Notice there are quite a few repeating errors here in our data set. We can either choose to interpolate the data inbetween or pad it with 0s. For gaps <40s i will interpolate, but for gaps >40 i will 0 pad.

In [78]:

fout = open('./dataInfo/interpolation_Effect_Log.txt','wt')
interpDF = {}

for x in data:
    df = data[x]
    cutoff = 40
    freq = '10S'
    try:
        interpDF[x],accuracy = fillDf(df,freq,cutOffTime,endTime,cutoff)
        print(x,' ',accuracy)
        fout.write(x+' '+ '\n' + accuracy[0]+ '\n'+ accuracy[1]+ '\n'+ accuracy[2] +'\n\n')
    except IndexError:
        print(x,'NO DATA')
        fout.write(x+'NO DATA'+'\n')
fout.close()        

B-01   ['% of values from interpolation : 5.775', '% of values from 0-padding : 9.721', '% of values not changed : 84.504']
B-02   ['% of values from interpolation : 32.724', '% of values from 0-padding : 2.502', '% of values not changed : 64.774']
B-04   ['% of values from interpolation : 33.812', '% of values from 0-padding : 3.736', '% of values not changed : 62.452']
B-05   ['% of values from interpolation : 13.404', '% of values from 0-padding : 9.45', '% of values not changed : 77.146']
B-06   ['% of values from interpolation : 32.56', '% of values from 0-padding : 3.382', '% of values not changed : 64.058']
B-07   ['% of values from interpolation : 12.692', '% of values from 0-padding : 12.885', '% of values not changed : 74.423']
B-08   ['% of values from interpolation : 7.781', '% of values from 0-padding : 12.872', '% of values not changed : 79.347']
B-09   ['% of values from interpolation : 0.0', '% of values from 0-padding : 12.066', '% of values not changed : 87.934']
B-10

### Export Data
export the newly interpolated data

In [79]:
directory = './interpolatedData'
for x in interpDF:
    temp=interpDF[x]
    if not os.path.exists(directory):
        os.makedirs(directory)
    location = os.path.join(directory,x+'.csv')
    temp.to_csv(location,index=False)

### Merge the DataFrames

In [80]:
length = []
for x in interpDF:
    length.append(len(interpDF[x]))
index = min(length)
lowIDX,lowValue = [[i,value] for i,value in enumerate(length) if value == index][0]
print(lowIDX,lowValue)

4 1035


for count,key in enumerate(list(interpDF.keys())):
    print(count+1,key,temp[count+1])

In [81]:
columns = list(interpDF.keys())
mergedData = pd.DataFrame({'Date_Time':interpDF[columns[lowIDX]]['Date_Time']})
for idx,column in enumerate(columns):
    mergedData[column] = interpDF[column][particle]
Average = np.mean(mergedData,axis=1)
Variance = np.var(mergedData,axis=1)
Average = np.mean(mergedData,axis=1)
Variance = np.var(mergedData,axis=1)
mergedData['Average'] = Average
mergedData['Variance'] = Variance
mergedData

Unnamed: 0,Date_Time,B-01,B-02,B-04,B-05,B-06,B-07,B-08,B-09,B-10,...,B-16,B-17,B-18,B-19,B-20,B-21,B-22,B-23,Average,Variance
0,2020-08-08 10:30:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.000000,0.000000
1,2020-08-08 10:30:10,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.000000,0.000000
2,2020-08-08 10:30:20,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.000000,0.000000
3,2020-08-08 10:30:30,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.000000,0.000000
4,2020-08-08 10:30:40,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1030,2020-08-08 13:21:40,0,0,18,0,0,0,0,0,0,...,69,0,0,0,0,9,0,0,9.545455,648.520661
1031,2020-08-08 13:21:50,0,0,18,0,0,0,0,18,0,...,69,0,0,0,0,9,0,0,9.818182,549.966942
1032,2020-08-08 13:22:00,0,0,18,21,0,0,0,81,0,...,9,0,0,0,0,0,4,0,10.454545,621.975207
1033,2020-08-08 13:22:10,0,0,9,21,0,0,0,114,9,...,9,0,0,0,0,9,9,0,14.227273,1230.811983


### Increase Resolution on mergedData

In [82]:

for i in mergedData:
    tempFrame = mergedData.values
    tempList = []
    for idx,x in enumerate(tempFrame):
        try:
            increment = (tempFrame[idx+1] - x)/10
            for count in range(10):
                tempList.append(x+increment*count)
        except IndexError:
            tempList.append(x)
            continue
    hiResMergedDF = pd.DataFrame(tempList, columns = mergedData.keys())

### Export Merged Frames

In [83]:
directory = './mergedData/'
if not os.path.exists(directory):

    os.makedirs(directory)

location = os.path.join(directory+'mergedFrame.csv')
hiResMergedDF.to_csv(location,index=False)

### Create csv files for each animation
We have 3 expirements in each that we want to average across the range

In [84]:
# mergedData = pd.read_csv('./mergedData/mergedFrame.csv',parse_dates=[0])

In [85]:
time = mergedData['Date_Time']
expIndexes = {}
for i in expTRange:
    expIndexes[i] = []
    for x in expTRange[i]:
        for start,n in enumerate(time):
           if n >= x:
               expIndexes[i].append(start)
               break

## Determining Zones
Here we first create 'averagedFrame's. These are dictionaries that at each 'label' (which corresponds to the name of an expirement) we have a pandas dataframe containing the results of all of the trails in an expirement summed, and then divided by the total number of trails.
Anytime you are adjusting the Zones, everything below here must be run. The values of many of these DataFrames are mutated

In [86]:
# preCursorFactor is defined at the start
averagedFrame = {}
expirementFrame = {}

for label in expIndexes:
    runSumFrames = expIndexes[label][0]-expIndexes[label][0]
    for idx,time in enumerate(expIndexes[label]):
        start = expIndexes[label][idx] - preCursorFactor
        end = expIndexes[label][idx] + expTLen[label]
        expirementFrame[label+' Exp '+str(idx+1)] = mergedData.iloc[ start : end , 1: ].reset_index(drop = True)
        runSumFrames += expirementFrame[label+' Exp '+str(idx+1)]
        
    averagedFrame[label] = runSumFrames/(idx+1)

Calculating the correct Zones for each expirement

In [87]:
# numberOfZones -= 1
# numberOfZones is defined at the start
AutoZoneAssignments = {}
for frame in averagedFrame:
    # at this point averagedFrame should just be the averaged sum of the expirementFrame trails. Last two columns are overall average and varaince so they should be ignored.
    avgFrm = averagedFrame[frame]
    # outdoorSensors must have its spelling exactly match
    columns = list(set(avgFrm.keys()[:-2])- set(outdoorSensors))
    columns.sort()

    X = {}
    for column in columns:
        value,index = max([(value,index) for index,value in enumerate(avgFrm[column])]) 
        X[column] = np.array([np.log(value+.01),index])
    X = [X[i] for i in X]
    kmeans = KMeans(n_clusters=numberOfZones,random_state=0).fit(X)
    idx = np.argsort(kmeans.cluster_centers_.sum(axis=1))
    lut = np.zeros_like(idx)
    lut[idx] = np.arange(numberOfZones)
    #lut = lut[::-1]
    orderedZones = [[]]*numberOfZones
    for index, zone in enumerate(lut):
        orderedZones[index] = [index if zone == kmeans.labels_[i] else 0 for i in range(len(kmeans.labels_))]
    AutoZoneAssignments[frame] = np.sum(orderedZones,axis=0)
z = numberOfZones
ZDfAuto = pd.DataFrame(AutoZoneAssignments)
ZDfAuto = ZDfAuto.append(pd.DataFrame([[z]*len(expIndexes)]*len(outdoorSensors),columns = AutoZoneAssignments.keys()),ignore_index=True)
AutoZoneAssignments = ZDfAuto
# numberOfZones += 1

if not ZoneAutomation:
    ZoneAssignments = {}
    for frame in averagedFrame:
        # at this point averagedFrame should just be the averaged sum of the expirementFrame trails. Last two columns are overall average and varaince so they should be ignored.
        avgFrm = averagedFrame[frame]
        # outdoorSensors must have its spelling exactly match
        columns = list(set(avgFrm.keys()[:-2]))
        columns.sort()
        ZoneAssignments[frame] = [0]*len(columns)
        for value,zone in enumerate(zoneList):
            for sensor in zoneList[zone]:
                ZoneAssignments[frame][columns.index(sensor)] = value
    ZDf = pd.DataFrame(ZoneAssignments)

In [88]:
directory = './dataInfo'
if not os.path.exists(directory):
    os.makedirs(directory)
location = os.path.join(directory,'ZoneAssignments.csv')
ZDf.to_csv(location,index=False)

directory = './dataInfo'
if not os.path.exists(directory):
    os.makedirs(directory)
location = os.path.join(directory,'AutoZoneAssignments.csv')
ZDfAuto.to_csv(location,index=False)

expirementFrameAuto = copy.deepcopy(expirementFrame)
averagedFrameAuto = copy.deepcopy(averagedFrame)

## Zoning the Data

manual zones

In [89]:
exp

'HMC Expirement 3 Exp 4'

In [90]:
zonedAvgFrame = {}
for key in ZoneAssignments:
    occourances = [list(ZoneAssignments[key]).count(x) for x in set(ZoneAssignments[key])]
    zoneRunSum = [0]*numberOfZones
    zonedAvgFrame[key] = averagedFrame[key]
    for idx,column in enumerate(columns):
        zoneRunSum[ZoneAssignments[key][idx]] += zonedAvgFrame[key][column]
    for idx in range(numberOfZones):
        zonedAvgFrame[key]['Zone '+str(idx+1)] = zoneRunSum[idx]/occourances[idx]

# relies on columns still being the values of S-01 - last sensor

# Declare an empty dictionary for storing the averaged data for each expirement at the end
zonedExpFrame = {}
# create a list of all of the various dict keys in expirementFrame so that we can iterate through them to get the data
labels = list(expirementFrame.keys())
# Take the labels list and remove the Exp # from it, so that now we have a list of keys that we can use to correctly save to create correctly corresponding keys for a dictionary that will store the averages
keyList = [x.split(' Exp ')[0] for x in labels]

for index,exp in enumerate(labels):
    # set the key variable to correspond to the exp variable
    key = keyList[index]
    # Create a runnning sum to keep track of the values
    zoneRunSum = [0]*numberOfZones
    # set the give the zoneExpFrame the same 
    zonedExpFrame[exp] = expirementFrame[exp]
    occourances = [list(ZoneAssignments[key]).count(x) for x in set(ZoneAssignments[key])]
    for idx,column in enumerate(columns):
        zoneRunSum[ZoneAssignments[key][idx]] += zonedExpFrame[exp][column]
    for idx in range(numberOfZones):
        zonedExpFrame[exp]['Zone '+str(idx+1)] = zoneRunSum[idx]/occourances[idx]

        
zonedAvgFrameAuto = {}
for key in AutoZoneAssignments:
    occourances = [list(AutoZoneAssignments[key]).count(x) for x in set(AutoZoneAssignments[key])]
    zoneRunSum = [0]*numberOfZones
    zonedAvgFrameAuto[key] = averagedFrameAuto[key]
    for idx,column in enumerate(columns):
        zoneRunSum[AutoZoneAssignments[key][idx]] += zonedAvgFrameAuto[key][column]
    for idx in range(numberOfZones):
        zonedAvgFrameAuto[key]['Zone '+str(idx+1)] = zoneRunSum[idx]/occourances[idx]
        
# relies on columns still being the values of S-01 - last sensor

# Declare an empty dictionary for storing the averaged data for each expirement at the end
zonedExpFrameAuto = {}
# create a list of all of the various dict keys in expirementFrameAuto so that we can iterate through them to get the data
labels = list(expirementFrameAuto.keys())
# Take the labels list and remove the Exp # from it, so that now we have a list of keys that we can use to correctly save to create correctly corresponding keys for a dictionary that will store the averages
keyList = [x.split(' Exp ')[0] for x in labels]

for index,exp in enumerate(labels):
    # set the key variable to correspond to the exp variable
    key = keyList[index]
    # Create a runnning sum to keep track of the values
    zoneRunSum = [0]*numberOfZones
    # set the give the zoneExpFrame the same 
    zonedExpFrameAuto[exp] = expirementFrameAuto[exp]
    occourances = [list(AutoZoneAssignments[key]).count(x) for x in set(AutoZoneAssignments[key])]
    for idx,column in enumerate(columns):
        zoneRunSum[AutoZoneAssignments[key][idx]] += zonedExpFrameAuto[exp][column]
    for idx in range(numberOfZones):
        zonedExpFrameAuto[exp]['Zone '+str(idx+1)] = zoneRunSum[idx]/occourances[idx]

auto zones

In [91]:
directory = './averagedData'
if not os.path.exists(directory):
    os.makedirs(directory)
for x in averagedFrame:
    temp=averagedFrame[x]
    location = os.path.join(directory,x+'.csv')
    temp.to_csv(location,index=False)
directory = './averagedDataAuto'
if not os.path.exists(directory):
    os.makedirs(directory)
for x in averagedFrameAuto:
    temp=averagedFrameAuto[x]
    location = os.path.join(directory,x+'.csv')
    temp.to_csv(location,index=False)
directory = './expirementData'
if not os.path.exists(directory):
    os.makedirs(directory)
for x in expirementFrame:
    temp=expirementFrame[x]
    location = os.path.join(directory,x+'.csv')
    temp.to_csv(location,index=False)
directory = './expirementDataAuto'
if not os.path.exists(directory):
    os.makedirs(directory)
for x in expirementFrame:
    temp=expirementFrame[x]
    location = os.path.join(directory,x+'.csv')
    temp.to_csv(location,index=False)

### Increase the Resolution
pad out the dataframes to have values for every second.

In [92]:
expirementFrameAuto[list(expirementFrameAuto.keys())[0]].columns

Index(['B-01', 'B-02', 'B-04', 'B-05', 'B-06', 'B-07', 'B-08', 'B-09', 'B-10',
       'B-11', 'B-12', 'B-13', 'B-14', 'B-15', 'B-16', 'B-17', 'B-18', 'B-19',
       'B-20', 'B-21', 'B-22', 'B-23', 'Average', 'Variance', 'Zone 1',
       'Zone 2', 'Zone 3'],
      dtype='object')

In [93]:
stretchedDF = {}
for i in averagedFrame:
    tempFrame = averagedFrame[i].values
    tempList = []
    for idx,x in enumerate(tempFrame):
        try:
            increment = (tempFrame[idx+1] - x)/10
            for count in range(10):
                tempList.append(x+increment*count)
        except IndexError:
            tempList.append(x)
            continue
    stretchedDF[i] = pd.DataFrame(tempList, columns = expirementFrame[list(expirementFrame.keys())[0]].columns)

stretchExpDf = {}
for i in expirementFrame:
    tempFrame = expirementFrame[i].values
    tempList = []
    for idx,x in enumerate(tempFrame):
        try:
            increment = (tempFrame[idx+1] - x)/10
            for count in range(10):
                tempList.append(x+increment*count)
        except IndexError:
            tempList.append(x)
            continue
    stretchExpDf[i] = pd.DataFrame(tempList, columns = expirementFrame[list(expirementFrame.keys())[0]].columns) 
stretchedDFAuto = {}
for i in averagedFrameAuto:
    tempFrame = averagedFrameAuto[i].values
    tempList = []
    for idx,x in enumerate(tempFrame):
        try:
            increment = (tempFrame[idx+1] - x)/10
            for count in range(10):
                tempList.append(x+increment*count)
        except IndexError:
            tempList.append(x)
            continue
    stretchedDFAuto[i] = pd.DataFrame(tempList, columns = expirementFrameAuto[list(expirementFrameAuto.keys())[0]].columns) 

stretchExpDfAuto = {}
for i in expirementFrameAuto:
    tempFrame = expirementFrameAuto[i].values
    tempList = []
    for idx,x in enumerate(tempFrame):
        try:
            increment = (tempFrame[idx+1] - x)/10
            for count in range(10):
                tempList.append(x+increment*count)
        except IndexError:
            tempList.append(x)
            continue
    stretchExpDfAuto[i] = pd.DataFrame(tempList, columns = expirementFrameAuto[list(expirementFrameAuto.keys())[0]].columns)    

In [94]:
directory = './stretchedAvgData'
if not os.path.exists(directory):
    os.makedirs(directory)
for x in stretchedDF:
    temp=stretchedDF[x]
    location = os.path.join(directory,x+'.csv')
    temp.to_csv(location,index=False)
directory = './stretchedExpirementData'
if not os.path.exists(directory):
    os.makedirs(directory)
for x in stretchExpDf:
    temp=stretchExpDf[x]
    location = os.path.join(directory,x+'.csv')
    temp.to_csv(location,index=False)
directory = './stretchedAvgDataAuto'
if not os.path.exists(directory):
    os.makedirs(directory)
for x in stretchedDFAuto:
    temp=stretchedDFAuto[x]
    location = os.path.join(directory,x+'.csv')
    temp.to_csv(location,index=False)
directory = './stretchedExpirementDataAuto'
if not os.path.exists(directory):
    os.makedirs(directory)
for x in stretchExpDfAuto:
    temp=stretchExpDfAuto[x]
    location = os.path.join(directory,x+'.csv')
    temp.to_csv(location,index=False)

In [95]:
end = clock.time()
print(end-begin)

16.463077306747437
