In [1]:
import numpy as np
import pandas as pd

import glob
import os
import os.path

from datetime import datetime
from tqdm import tqdm

from argparse import ArgumentParser

# Gather input files (POPS + CSAT3b)
Create lists of the anemometer and POPS files to be processed. Make sure the files are sorted chronologically!

In [3]:
# Run configuration: input folder, anemometer clock time zone, instrument type and site label.
# Alternative data_dir used previously: "/Volumes/T7/FALCON_SITES/SGP"
data_dir = "./Example_data/CST_data"  # root folder; must contain the POPS 10 Hz and anemometer data folders
time_zone = 'US/Central'  # zone the anemometer timestamps are localized to before UTC conversion, e.g. 'UTC', 'US/Eastern', 'US/Central'
anemometer_type = "windmaster"  # "windmaster" or "csat3b"; also used as the file-name prefix when searching for anemometer CSVs
loc = "SGP"  # site label written into merged output file names, e.g. "SGP", "SCR" or "CHESS"

In [4]:
def import_POPS(POPS_source_directory):
    """Collect every POPS 10 Hz CSV found anywhere under a directory.

    The tree rooted at ``POPS_source_directory`` is walked recursively
    (e.g. daily folders such as F20230722 ... F20230730) and every file
    whose name ends in ``_10Hz.csv`` is kept.

    Parameters
    ----------
    POPS_source_directory : str
        Root folder to search.

    Returns
    -------
    list of str
        Matching paths (folder joined with file name) in plain string sort
        order. That order is chronological only if the folder and file names
        sort that way.
    """
    return sorted(
        os.path.join(folder, name)
        for folder, _subfolders, names in os.walk(POPS_source_directory)
        for name in names
        if name.endswith("_10Hz.csv")
    )

In [5]:
# Find every POPS 10 Hz file below data_dir and list what was found.
POPS_files_list = import_POPS(data_dir)
print('POPS files that will be processed:')
print("\n".join(POPS_files_list))

POPS files that will be processed:
/Volumes/T7/FALCON_SITES/SGP/pops_data/F20230201/Peak_20230201x001_10Hz.csv
/Volumes/T7/FALCON_SITES/SGP/pops_data/F20230201/Peak_20230201x002_10Hz.csv
/Volumes/T7/FALCON_SITES/SGP/pops_data/F20230201/Peak_20230201x003_10Hz.csv
/Volumes/T7/FALCON_SITES/SGP/pops_data/F20230201/Peak_20230201x004_10Hz.csv
/Volumes/T7/FALCON_SITES/SGP/pops_data/F20230201/Peak_20230201x005_10Hz.csv
/Volumes/T7/FALCON_SITES/SGP/pops_data/F20230201/Peak_20230201x006_10Hz.csv
/Volumes/T7/FALCON_SITES/SGP/pops_data/F20230201/Peak_20230201x007_10Hz.csv
/Volumes/T7/FALCON_SITES/SGP/pops_data/F20230201/Peak_20230201x008_10Hz.csv
/Volumes/T7/FALCON_SITES/SGP/pops_data/F20230201/Peak_20230201x009_10Hz.csv
/Volumes/T7/FALCON_SITES/SGP/pops_data/F20230201/Peak_20230201x010_10Hz.csv
/Volumes/T7/FALCON_SITES/SGP/pops_data/F20230201/Peak_20230201x011_10Hz.csv
/Volumes/T7/FALCON_SITES/SGP/pops_data/F20230201/Peak_20230201x012_10Hz.csv
/Volumes/T7/FALCON_SITES/SGP/pops_data/F20230201/Peak

In [6]:
def import_Anemometer(Anemometer_source_directory, anemometer_type):
    """Collect the anemometer CSV files found anywhere under a directory.

    Parameters
    ----------
    Anemometer_source_directory : str
        Root folder to search recursively.
    anemometer_type : str
        File-name prefix identifying the instrument; only files whose name
        starts with this prefix and ends in ``.csv`` are kept.

    Returns
    -------
    list of str
        Matching paths in plain string sort order.
    """
    matches = []
    for folder, _subfolders, names in os.walk(Anemometer_source_directory):
        matches.extend(
            os.path.join(folder, name)
            for name in names
            if name.startswith(anemometer_type) and name.endswith(".csv")
        )
    return sorted(matches)

In [7]:
# Find every anemometer CSV of the configured type below data_dir and list what was found.
anemometer_files_list = import_Anemometer(data_dir, anemometer_type)

print('Anemometer files that will be processed:')
print("\n".join(anemometer_files_list))


Anemometer files that will be processed:
/Volumes/T7/FALCON_SITES/SGP/windmaster_data/windmaster_230708090000.csv
/Volumes/T7/FALCON_SITES/SGP/windmaster_data/windmaster_230708093000.csv
/Volumes/T7/FALCON_SITES/SGP/windmaster_data/windmaster_230708100000.csv
/Volumes/T7/FALCON_SITES/SGP/windmaster_data/windmaster_230708103000.csv
/Volumes/T7/FALCON_SITES/SGP/windmaster_data/windmaster_230708110000.csv
/Volumes/T7/FALCON_SITES/SGP/windmaster_data/windmaster_230708113001.csv
/Volumes/T7/FALCON_SITES/SGP/windmaster_data/windmaster_230708120001.csv
/Volumes/T7/FALCON_SITES/SGP/windmaster_data/windmaster_230708123001.csv
/Volumes/T7/FALCON_SITES/SGP/windmaster_data/windmaster_230708130001.csv
/Volumes/T7/FALCON_SITES/SGP/windmaster_data/windmaster_230708133001.csv
/Volumes/T7/FALCON_SITES/SGP/windmaster_data/windmaster_230708140001.csv
/Volumes/T7/FALCON_SITES/SGP/windmaster_data/windmaster_230708143001.csv
/Volumes/T7/FALCON_SITES/SGP/windmaster_data/windmaster_230708150001.csv
/Volumes/T

# Resample perfect 10Hz CSAT3b data to perfect 30 min (10Hz) files
* Each CSAT3b file covers thirty minutes and is resampled to represent perfect 10 Hz data.
* While a file is 30 minutes long, it does not start on the hour or half hour due to time slippage.
* I account for time slippage by concatenating two consecutive CSAT files and then rechunking the result into 30-minute chunks.

In [8]:
### anemometer chunking function
def anemometer_thirty_chunk(anemometer_file, timezone, instrument_type):
    """Extract the fullest 30-minute window from raw anemometer rows at 10 Hz.

    The rows are time-stamped from the logger's date and time columns,
    converted to UTC, split on 30-minute clock boundaries, and the window
    holding the most rows is resampled onto a regular 100 ms grid.

    Parameters
    ----------
    anemometer_file : pandas.DataFrame
        Raw rows (typically two consecutive files concatenated). Must hold
        'Day_CPU(YYYY-MM-DD)' and 'time_CPU(HH:MM:SS.FFF)' as text plus the
        descriptor columns of the chosen instrument. It is not modified.
    timezone : str
        Time zone the logged timestamps are expressed in (e.g. 'US/Central').
    instrument_type : str
        'windmaster' selects the WindMaster descriptor columns; any other
        value selects the CSAT ones.

    Returns
    -------
    pandas.DataFrame
        Numeric columns averaged per 100 ms bin, indexed by a UTC
        'timestamp' index. Bins without data hold NaN.

    Raises
    ------
    ValueError
        If ``anemometer_file`` has no rows.
    KeyError
        If an expected column is missing.
    """
    day_col, time_col = 'Day_CPU(YYYY-MM-DD)', 'time_CPU(HH:MM:SS.FFF)'
    if len(anemometer_file) == 0:
        raise ValueError("anemometer_file has no rows; cannot extract a 30-minute chunk")

    # Join date and time with an explicit single space. Stripping first means the
    # result is the same whether or not the CSV fields carry padding, so parsing
    # never depends on incidental whitespace in the data.
    stamp_text = anemometer_file[day_col].str.strip() + ' ' + anemometer_file[time_col].str.strip()
    timestamps = pd.to_datetime(stamp_text, format='mixed')
    # Interpret the logged clock in the caller-supplied zone, then convert to UTC.
    # NOTE(review): tz_localize raises by default on local times that are ambiguous
    # or nonexistent around daylight-saving changes; files spanning a DST switch
    # would need explicit `ambiguous=` / `nonexistent=` handling.
    timestamps = timestamps.dt.tz_localize(timezone).dt.tz_convert('UTC').rename('timestamp')

    # Descriptor columns carrying no measurement (CSAT layout by default).
    drop_columns = ['Index', day_col, time_col,
                    'Diagnostic_code', 'record_counter', 'sig_hex']
    if instrument_type == 'windmaster':
        drop_columns = ['Index', day_col, time_col,
                        'Start_codon', 'unit_ident', 'Error_code', 'Check_sum']
    # Build a new frame instead of adding a column to the caller's frame.
    indexed = anemometer_file.drop(columns=drop_columns).set_index(timestamps)

    # Split on 30-minute clock boundaries; the window with the most rows is
    # taken to be the complete half hour (the first one wins a tie).
    time_series = [window for _, window in indexed.groupby(pd.Grouper(freq='30min'))]
    thirty_min_series = max(time_series, key=len)
    # Coerce every column to numeric (non-numeric entries become NaN), then
    # average onto an exact 10 Hz grid.
    thirty_min_series = thirty_min_series.apply(pd.to_numeric, errors='coerce')
    thirty_min_resample = thirty_min_series.resample('100ms').mean()
    return thirty_min_resample

In [9]:
def chunk_anem_files(anemometer_files_2_process, timezone, instrument_type):
    """Turn consecutive anemometer files into resampled 30-minute chunks.

    Each file is paired with its successor, the pair is concatenated and
    handed to ``anemometer_thirty_chunk``, which returns the fullest
    30-minute window of the pair.

    Parameters
    ----------
    anemometer_files_2_process : list of str
        CSV paths in chronological order.
    timezone : str
        Time zone of the logged timestamps, passed to ``anemometer_thirty_chunk``.
    instrument_type : str
        Instrument name, passed to ``anemometer_thirty_chunk``.

    Returns
    -------
    list of pandas.DataFrame
        One resampled chunk per successfully processed pair. Processing stops
        at the first empty or unreadable file, so the list may be shorter than
        ``len(anemometer_files_2_process) - 1``; a message is printed then.
    """
    resampled_anemometer_dfs = []
    n_files = len(anemometer_files_2_process)
    # The second file of one pair is the first file of the next, so keep it
    # instead of parsing every CSV twice.
    carried_df = None
    for i, file in enumerate(tqdm(anemometer_files_2_process)):
        if i + 1 >= n_files:
            continue  # the last file has no successor to pair with
        pair_paths = anemometer_files_2_process[i:i + 2]
        pair_dfs = []  # only what was read for THIS pair, so the error report is never stale
        try:
            if carried_df is None:
                carried_df = pd.read_csv(pair_paths[0], on_bad_lines='skip')
            pair_dfs.append(carried_df)
            pair_dfs.append(pd.read_csv(pair_paths[1], on_bad_lines='skip'))
            # check for empty anemometer files
            if any(len(df) == 0 for df in pair_dfs):
                print("there is no data in one of these files: ", pair_paths)
                break
            # Concatenate the two files into an hour-long frame, then cut out the 30-minute chunk
            anemometer_file = pd.concat(pair_dfs).reset_index(drop=True)
            resampled_data = anemometer_thirty_chunk(anemometer_file, timezone, instrument_type)
            resampled_anemometer_dfs.append(resampled_data)
        except Exception as e:
            # The read itself may be what failed, in which case there is no frame to measure.
            print("length of file", len(pair_dfs[0]) if pair_dfs else "unknown (file could not be read)")
            print("There is something strange about these files. Take a look!")
            print("failed files:", pair_paths)
            print("resulting error:\n", e)
            break
        carried_df = pair_dfs[1]
    return resampled_anemometer_dfs

In [10]:
# Pair consecutive anemometer files and extract one resampled 30-minute chunk per pair,
# using the time_zone and anemometer_type set in the configuration cell.
resampled_anemometer_list = chunk_anem_files(anemometer_files_list, time_zone, anemometer_type)

  0%|          | 0/1691 [00:00<?, ?it/s]

100%|██████████| 1691/1691 [03:18<00:00,  8.50it/s]


## A look into the anemometer files before and after chunking

In [84]:
# Display one raw anemometer file for comparison with the chunked output.
print('Here is a 10Hz anemometer file before chunking:')
print('notice that the file does not start exactly on the hour or half hour.')
pd.read_csv(anemometer_files_list[1])

Here is the a 10Hz anemometer file before chunking:
notice that the file starts about 2 seconds off of the 0 second point.


Unnamed: 0,Index,Day_CPU(YYYY-MM-DD),time_CPU(HH:MM:SS.FFF),Start_codon,u(m/s),v(m/s),w(m/s),unit_ident,SOS(m/s),T(Celsius),Error_code,Check_sum
0,0,2023-09-17,00:30:10.921928,Q,1.613,-1.274,-0.210,M,344.18,20.94,0,1A
1,1,2023-09-17,00:30:11.017925,Q,1.730,-1.205,-0.097,M,344.19,20.96,0,12
2,2,2023-09-17,00:30:11.113905,Q,1.665,-1.202,0.009,M,344.15,20.89,0,17
3,3,2023-09-17,00:30:11.209929,Q,1.606,-1.067,0.061,M,344.09,20.79,0,1F
4,4,2023-09-17,00:30:11.321831,Q,1.782,-0.949,0.110,M,344.11,20.82,0,1C
...,...,...,...,...,...,...,...,...,...,...,...,...
18018,18018,2023-09-17,01:00:10.431454,Q,1.302,-0.802,-0.030,M,344.26,21.08,0,1C
18019,18019,2023-09-17,01:00:10.543355,Q,1.303,-0.841,-0.019,M,344.26,21.08,0,11
18020,18020,2023-09-17,01:00:10.639329,Q,1.315,-0.809,-0.039,M,344.21,20.99,0,16
18021,18021,2023-09-17,01:00:10.735332,Q,1.301,-0.752,-0.037,M,344.20,20.97,0,13


In [85]:
# Display the first resampled 30-minute anemometer chunk.
print('Here is a 10 Hz anemometer file after chunking that is now perfectly 30 minutes:')
resampled_anemometer_list[0]

Here is an 10 Hz anemometer file after chunking that is now perfectly 30 minutes:


Unnamed: 0_level_0,u(m/s),v(m/s),w(m/s),SOS(m/s),T(Celsius)
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-09-17 05:30:00+00:00,1.476,-1.223,-0.151,344.30,21.15
2023-09-17 05:30:00.100000+00:00,1.595,-1.195,-0.187,344.19,20.96
2023-09-17 05:30:00.200000+00:00,1.563,-1.120,-0.220,344.20,20.97
2023-09-17 05:30:00.300000+00:00,1.461,-1.246,-0.217,344.12,20.84
2023-09-17 05:30:00.400000+00:00,1.093,-1.269,0.000,344.07,20.75
...,...,...,...,...,...
2023-09-17 05:59:59.500000+00:00,1.190,-0.845,-0.026,344.28,21.11
2023-09-17 05:59:59.600000+00:00,1.189,-0.828,-0.011,344.26,21.08
2023-09-17 05:59:59.700000+00:00,1.186,-0.878,-0.056,344.34,21.21
2023-09-17 05:59:59.800000+00:00,1.263,-0.863,-0.049,344.31,21.16


# Chunk POPS data into perfect 30 min
* POPS 10Hz data is cut into 30 minute segments
* Segments that are not 30 minutes long are relabeled as the rollover chunk and are appended to the next "30-minute" chunk.
* The POPS files are not uniform in how long they run for, so a file may be 2 hours long or 30 minutes long.
* For a file that starts at 1:35 pm and runs to 2:40 pm, cutting into 30 minute chunks results in 3 dataframes of different lengths. The first dataframe stretches from 1:35 - 2:00 pm, the second chunk is from 2:00 - 2:30 pm, and the third chunk is from 2:30 - 2:40 pm. Only the second chunk is recognized as a perfect 30-minute 10Hz file (>=17999 rows of data in the code, i.e. essentially 18000 rows at 10 Hz). The first chunk will be ignored if the POPS file is the first in the series. The third chunk is called the rollover and is concatenated to the next file. After concatenation, the new POPS file undergoes 30 minute chunking again.

In [11]:
### pops_chunking functions
def pops_thirty_chunk(pops_file):
    """Split a POPS frame on 30-minute boundaries and sort the pieces.

    Parameters
    ----------
    pops_file : pandas.DataFrame
        POPS rows indexed by timestamp.

    Returns
    -------
    (list of pandas.DataFrame, list of pandas.DataFrame)
        ``thirty_min_pops`` holds, in order, every piece with at least 17999
        rows, plus a short *leading* piece when the frame spans more than one
        30-minute bin. ``rollover_chunk`` holds every other short piece: a
        short piece after the first bin, or the only piece of a frame that
        fits inside a single bin.
    """
    pieces = [frame for _bin_start, frame in pops_file.groupby(pd.Grouper(freq='30min'))]
    spans_several_bins = len(pieces) > 1

    thirty_min_pops, rollover_chunk = [], []
    for position, frame in enumerate(pieces):
        is_full = len(frame) >= 17999
        # A short first piece of a multi-bin frame is kept with the outputs;
        # any gaps in it simply become NaN once merged onto the 10 Hz template.
        is_leading_partial = spans_several_bins and position == 0
        if is_full or is_leading_partial:
            thirty_min_pops.append(frame)
        else:
            rollover_chunk.append(frame)
    return thirty_min_pops, rollover_chunk


In [12]:
def chunk_pops_files(POPS_files_2_process):
    """Cut a sequence of POPS 10 Hz files into 30-minute chunks.

    Each file is read, indexed by its UTC-localized 'timestamp' column and
    split by ``pops_thirty_chunk``. Rows left over from one file are
    prepended to the next file before it is split.

    Parameters
    ----------
    POPS_files_2_process : list of str
        CSV paths in chronological order; each needs a 'timestamp' column.

    Returns
    -------
    list of pandas.DataFrame
        All chunks returned as complete by ``pops_thirty_chunk``, in file order.
        If chunking a file raises, a message is printed and the chunks
        collected so far are returned.
    """
    pops_rollover_df = pd.DataFrame()
    pops_perfect_thirty_archive = []
    for f in tqdm(POPS_files_2_process):
        # read csv data and set timestamp as index
        pops_raw_file = pd.read_csv(f)
        pops_raw_file['timestamp'] = pd.to_datetime(pops_raw_file['timestamp']).dt.tz_localize('UTC')  # put into tz format
        pops_file = pops_raw_file.set_index('timestamp')
        # prepend rows carried over from the previous file, if any
        if not pops_rollover_df.empty:
            pops_file = pd.concat([pops_rollover_df, pops_file])
        # chunk the file into 30-minute pieces
        try:
            thirty_min_pops, rollover_chunk = pops_thirty_chunk(pops_file)
        except Exception as e:
            # stop processing if there is a fatal error
            print("Stopped processing, fatal error!")
            print("exception clause:\n")
            print("initial file:", f)
            print(e, "\n")
            break
        if rollover_chunk:
            # NOTE(review): only the FIRST rollover piece is carried forward and its
            # last row is dropped, exactly as before; confirm both choices are intended.
            pops_rollover_df = rollover_chunk[0][:-1]
        else:
            # A file that ends on a chunk boundary leaves nothing to carry over.
            # That is a normal outcome, not an error, so keep its chunks and go on.
            pops_rollover_df = pd.DataFrame()
        pops_perfect_thirty_archive += thirty_min_pops
    return pops_perfect_thirty_archive

In [13]:
# Cut all POPS files into 30-minute chunks, carrying leftover rows into the next file.
chunked_pops = chunk_pops_files(POPS_files_list)

  pops_raw_file['timestamp'] = pd.to_datetime(pops_raw_file['timestamp']).dt.tz_localize('UTC') #put into tz format
100%|██████████| 2303/2303 [01:47<00:00, 21.44it/s]


## A look into the POPS files before and after chunking

In [89]:
# Display one raw POPS file for comparison with the chunked output.
# NOTE(review): the message says "first", but index 1 is the second entry of
# POPS_files_list, and the quoted start time is hard-coded text rather than
# read from the file — confirm both against the data being shown.
print('Here is the first 10Hz POPS file before chunking:')
print('notice that the file starts at 12:18.9')
pd.read_csv(POPS_files_list[1])

Here is the first 10Hz POPS file before chunking:
notice that the file starts at 12:18.9


Unnamed: 0,timestamp,119.98_138.88,138.88_164.03,164.03_200.06,200.06_280.73,280.73_464.13,464.13_676.38,676.38_1259.19,1259.19_1913.06,1913.06_2999.25
0,2023-09-17 01:31:01.600,43,26,13,0,1,0,0,0,0
1,2023-09-17 01:31:01.700,55,57,19,4,0,0,0,0,0
2,2023-09-17 01:31:01.800,64,57,29,2,0,0,0,0,0
3,2023-09-17 01:31:01.900,71,53,28,4,1,0,0,0,0
4,2023-09-17 01:31:02.000,88,42,27,5,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
24086,2023-09-17 02:11:10.200,73,54,40,3,1,0,0,0,0
24087,2023-09-17 02:11:10.300,86,32,52,6,0,1,0,0,0
24088,2023-09-17 02:11:10.400,88,59,34,4,1,0,0,0,0
24089,2023-09-17 02:11:10.500,81,45,38,7,0,0,0,0,0


In [90]:
# Display the second chunk produced by chunk_pops_files.
print('Here is an example of a 10Hz, perfect 30-minute POPS file after chunking:')
chunked_pops[1]

Here is an example of a 10Hz, perfect 30-minute POPS file after chunking:


Unnamed: 0_level_0,119.98_138.88,138.88_164.03,164.03_200.06,200.06_280.73,280.73_464.13,464.13_676.38,676.38_1259.19,1259.19_1913.06,1913.06_2999.25
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2023-09-17 01:00:00+00:00,75,53,26,3,1,0,0,1,0
2023-09-17 01:00:00.100000+00:00,70,49,25,2,2,0,0,1,0
2023-09-17 01:00:00.200000+00:00,51,44,28,3,0,1,0,0,0
2023-09-17 01:00:00.300000+00:00,62,62,38,1,0,0,0,0,0
2023-09-17 01:00:00.400000+00:00,88,41,34,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
2023-09-17 01:29:59.500000+00:00,67,55,41,0,0,0,0,0,0
2023-09-17 01:29:59.600000+00:00,78,47,28,1,0,0,0,0,0
2023-09-17 01:29:59.700000+00:00,69,41,25,2,0,0,0,0,0
2023-09-17 01:29:59.800000+00:00,83,46,31,4,0,0,0,0,0


In [91]:
export_pops = False  # set to True to also write every POPS chunk to its own CSV
# Export archive data to csv
if export_pops:
    pops_export_dir = "./POPS_thirty_min_csvs/"
    # to_csv does not create missing folders, so make sure the target exists.
    os.makedirs(pops_export_dir, exist_ok=True)
    for pops_chunk in chunked_pops:
        if len(pops_chunk) == 0:
            continue  # nothing to name the file after
        time_string = pops_chunk.index[0].strftime("%Y%m%d_%H%M")
        # NOTE(review): the "SCR" site tag is hard-coded here while the merged
        # files use the configurable `loc` label — confirm which is wanted.
        pops_chunk.to_csv(os.path.join(pops_export_dir, "POPS_SCR_" + time_string + ".csv"))

# Merge POPS and CSAT data
* At Scripps, CSAT timestamps are recorded as universal time (UTC).
* The POPS records Unix time, which is also universal.
* When Unix time is decoded to readable time, it is converted into local time (Mountain Time).
* I need to change the POPS timestamp, which is in Mountain Time, back to universal time.

In [17]:
def merge_anem_2_pops(chunked_pops_files, chunked_anemometer_files, merged_dir, location):
    """Join matching POPS and anemometer chunks and write one CSV per match.

    A POPS chunk and an anemometer chunk match when their first timestamps
    are equal. Both are left-joined onto a regular 100 ms template spanning
    the POPS chunk, so rows missing from either source become NaN.

    Parameters
    ----------
    chunked_pops_files : list of pandas.DataFrame
        POPS chunks indexed by timestamp.
    chunked_anemometer_files : list of pandas.DataFrame
        Anemometer chunks indexed by timestamp.
    merged_dir : str
        Output folder; created if missing.
    location : str
        Site label placed in each file name:
        ``ANEM-POPS_<location>_<YYYYmmdd_HHMM>.csv``.

    Returns
    -------
    None
        Results are written to ``merged_dir``. POPS chunks without a matching
        anemometer chunk produce no file.
    """
    # check if folder exists and make if it does not
    if not os.path.exists(merged_dir):
        print('{} does not exist. Creating it now!\n'.format(merged_dir))
        os.makedirs(merged_dir)

    # Index the anemometer chunks by their first timestamp once, instead of
    # scanning the whole list for every POPS chunk. When several chunks share a
    # start time the last one wins, which is the file the repeated overwrite
    # left on disk before.
    anem_by_start = {}
    for df_an in chunked_anemometer_files:
        if len(df_an) > 0:
            anem_by_start[df_an.index[0]] = df_an

    for df_p in tqdm(chunked_pops_files):
        if len(df_p) == 0:
            continue  # an empty chunk has no start time to match on
        start_idx, end_idx = df_p.index[0], df_p.index[-1]  # first and last timestamps
        df_an = anem_by_start.get(start_idx)
        if df_an is None:
            continue
        # template time series that both sources are aligned to
        ten_time_series = pd.date_range(start=start_idx, end=end_idx, freq='100ms')
        template_df = pd.DataFrame({'timestamp': ten_time_series}).set_index('timestamp')
        final_merge = template_df.join(df_an).join(df_p)
        # drop leftover "Unnamed" columns (typically a saved index column)
        unnamed_cols = final_merge.columns[final_merge.columns.str.contains('Unnamed', case=False)]
        final_merge = final_merge.drop(columns=unnamed_cols)
        # export final merged df to a csv named after its first timestamp
        time_string = final_merge.index[0].strftime("%Y%m%d_%H%M")
        f_name = "ANEM-POPS_" + location + "_" + time_string + ".csv"
        # os.path.join works whether or not merged_dir ends with a separator
        final_merge.to_csv(os.path.join(merged_dir, f_name))

In [18]:
# Merged files go into a sub-folder of data_dir. Keep the trailing slash:
# import_processed_counts appends its '*.csv' pattern directly to this string.
merged_dir = data_dir+"/Anem_POPS_csvs/"
merge_anem_2_pops(chunked_pops,resampled_anemometer_list, merged_dir, loc)

/Volumes/T7/FALCON_SITES/SGP/Anem_POPS_csvs/ does not exist. Creating it now!



  0%|          | 0/3639 [00:00<?, ?it/s]

100%|██████████| 3639/3639 [05:09<00:00, 11.77it/s]


## A look at each component of the merge 
The template, anemometer dataframe, and POPS dataframe are displayed. <br>
The merged dataframe is displayed last.

In [94]:
# Template dataframe
# (template_df is a local variable inside merge_anem_2_pops, so it cannot be displayed from this cell)
#template_df

In [95]:
# Anemometer dataframe: one resampled 30-minute anemometer chunk
resampled_anemometer_list[5]

Unnamed: 0_level_0,u(m/s),v(m/s),w(m/s),SOS(m/s),T(Celsius)
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-09-17 08:00:00+00:00,-0.7310,1.2150,0.0380,343.700,20.12
2023-09-17 08:00:00.100000+00:00,-0.7340,1.2220,0.0270,343.690,20.10
2023-09-17 08:00:00.200000+00:00,-0.6680,1.2380,-0.0030,343.580,19.92
2023-09-17 08:00:00.300000+00:00,-0.6740,1.2320,-0.0010,343.580,19.92
2023-09-17 08:00:00.400000+00:00,-0.6650,1.2260,-0.0030,343.610,19.97
...,...,...,...,...,...
2023-09-17 08:29:59.500000+00:00,-0.3490,0.1910,-0.0075,343.590,19.93
2023-09-17 08:29:59.600000+00:00,-0.3430,0.1900,0.0040,343.600,19.95
2023-09-17 08:29:59.700000+00:00,-0.3410,0.1940,0.0050,343.600,19.95
2023-09-17 08:29:59.800000+00:00,,,,,


In [96]:
# POPS dataframe: one 30-minute POPS chunk
chunked_pops[6]

Unnamed: 0_level_0,119.98_138.88,138.88_164.03,164.03_200.06,200.06_280.73,280.73_464.13,464.13_676.38,676.38_1259.19,1259.19_1913.06,1913.06_2999.25
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2023-09-17 03:30:00+00:00,76,63,34,7,0,0,0,0,0
2023-09-17 03:30:00.100000+00:00,62,64,44,3,1,0,0,0,0
2023-09-17 03:30:00.200000+00:00,83,65,32,3,0,0,0,0,0
2023-09-17 03:30:00.300000+00:00,71,42,38,7,1,0,0,0,0
2023-09-17 03:30:00.400000+00:00,96,57,41,4,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...
2023-09-17 03:59:59.500000+00:00,98,54,37,5,0,0,0,0,0
2023-09-17 03:59:59.600000+00:00,81,60,40,4,0,0,0,0,0
2023-09-17 03:59:59.700000+00:00,89,72,43,8,0,0,0,0,0
2023-09-17 03:59:59.800000+00:00,86,55,41,3,1,0,0,0,0


In [97]:
# Anemometer and POPS dataframes merged on template 10Hz. Missing data filled with NA
# NOTE(review): this path is hard-coded to one example output and does not follow
# data_dir / merged_dir — presumably it must be edited when another data set is processed.
pd.read_csv('./Example_data/CST_data/Anem_POPS_csvs/ANEM-POPS_SGP_20230917_0930.csv')

Unnamed: 0,timestamp,u(m/s),v(m/s),w(m/s),SOS(m/s),T(Celsius),119.98_138.88,138.88_164.03,164.03_200.06,200.06_280.73,280.73_464.13,464.13_676.38,676.38_1259.19,1259.19_1913.06,1913.06_2999.25
0,2023-09-17 09:30:00+00:00,-0.150,-0.277,0.001,343.99,20.62,107,84,57,6,0,0,0,0,0
1,2023-09-17 09:30:00.100000+00:00,-0.150,-0.277,-0.001,343.97,20.58,89,71,32,8,0,1,1,0,0
2,2023-09-17 09:30:00.200000+00:00,-0.150,-0.278,-0.003,343.99,20.62,93,78,56,7,1,0,0,0,0
3,2023-09-17 09:30:00.300000+00:00,-0.149,-0.277,-0.004,344.00,20.63,86,82,57,8,0,0,0,0,0
4,2023-09-17 09:30:00.400000+00:00,-0.149,-0.276,-0.005,344.01,20.65,115,94,52,3,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17995,2023-09-17 09:59:59.500000+00:00,0.826,-0.997,-0.020,343.43,19.66,100,84,57,6,0,0,0,0,0
17996,2023-09-17 09:59:59.600000+00:00,0.842,-1.086,-0.053,343.44,19.68,87,66,48,3,2,1,0,0,0
17997,2023-09-17 09:59:59.700000+00:00,0.844,-1.060,0.007,343.43,19.66,86,81,52,10,3,0,0,0,0
17998,2023-09-17 09:59:59.800000+00:00,0.846,-1.045,-0.021,343.41,19.63,90,72,53,6,2,0,0,0,0


# Post processing specifically for Eddy Pro

In [19]:
def count_to_conc(bin_col, flow_rate, sample_rate):
    """Convert per-sample particle counts of one size bin to concentration.

    Each value is scaled as ``count / flow_rate * 60 * sample_rate``.

    Parameters
    ----------
    bin_col : pandas.Series
        Counts per sample. Its name must have the form ``"<lower>_<upper>"``
        with both edges parseable as floats (e.g. ``"119.98_138.88"``).
    flow_rate : float
        Sample flow (the caller passes it in ccm).
    sample_rate : float
        Samples per second.

    Returns
    -------
    pandas.Series
        Concentrations with the same index and name as ``bin_col``.

    Raises
    ------
    ValueError
        If the column name is not a ``"<lower>_<upper>"`` bin header. The
        caller relies on an exception here to notice non-bin columns.
    """
    # Validate the header: it keeps non-bin columns from being converted silently.
    edges = str(bin_col.name).split("_")
    try:
        float(edges[0])
        float(edges[1])
    except (IndexError, ValueError) as err:
        raise ValueError(
            "column {!r} is not a '<lower>_<upper>' size-bin header".format(bin_col.name)
        ) from err
    # Plain Series arithmetic; same operations as the former per-element lambda.
    return (bin_col / flow_rate) * 60 * sample_rate

In [20]:
def import_processed_counts(processed_dir):
    """Return the sorted CSV paths matching ``processed_dir + '*.csv'``.

    The pattern is built by plain string concatenation, so ``processed_dir``
    must end with a path separator to mean "every CSV in this folder".
    """
    return sorted(glob.glob(processed_dir + '*.csv'))

In [21]:
# Collect the merged ANEM-POPS CSVs written to merged_dir and list what was found.
processed_count_list = import_processed_counts(merged_dir)

print('POPS binned counts that will be processed:')
print("\n".join(processed_count_list))

POPS binned counts that will be processed:
/Volumes/T7/FALCON_SITES/SGP/Anem_POPS_csvs/ANEM-POPS_SGP_20230708_1430.csv
/Volumes/T7/FALCON_SITES/SGP/Anem_POPS_csvs/ANEM-POPS_SGP_20230708_1500.csv
/Volumes/T7/FALCON_SITES/SGP/Anem_POPS_csvs/ANEM-POPS_SGP_20230708_1530.csv
/Volumes/T7/FALCON_SITES/SGP/Anem_POPS_csvs/ANEM-POPS_SGP_20230708_1600.csv
/Volumes/T7/FALCON_SITES/SGP/Anem_POPS_csvs/ANEM-POPS_SGP_20230708_1630.csv
/Volumes/T7/FALCON_SITES/SGP/Anem_POPS_csvs/ANEM-POPS_SGP_20230708_1700.csv
/Volumes/T7/FALCON_SITES/SGP/Anem_POPS_csvs/ANEM-POPS_SGP_20230708_1730.csv
/Volumes/T7/FALCON_SITES/SGP/Anem_POPS_csvs/ANEM-POPS_SGP_20230708_1800.csv
/Volumes/T7/FALCON_SITES/SGP/Anem_POPS_csvs/ANEM-POPS_SGP_20230708_1830.csv
/Volumes/T7/FALCON_SITES/SGP/Anem_POPS_csvs/ANEM-POPS_SGP_20230708_1900.csv
/Volumes/T7/FALCON_SITES/SGP/Anem_POPS_csvs/ANEM-POPS_SGP_20230708_1930.csv
/Volumes/T7/FALCON_SITES/SGP/Anem_POPS_csvs/ANEM-POPS_SGP_20230708_2000.csv
/Volumes/T7/FALCON_SITES/SGP/Anem_POPS_csvs/A

In [22]:
def binned_counts_2_conc(processed_list, dir_to_place, anemometer_type,
                         flow_rate=180, sample_rate=10):
    """Convert the POPS bin counts of merged files to concentrations.

    For every merged CSV the size-bin count columns are replaced by
    concentration columns ``b0, b1, ...``, a total-concentration column is
    appended, and the result is written as
    ``ANEM-POPS-CONC_<rest of the input name>.txt`` (comma separated).
    The original column headers are saved once to ``header_meta.txt``.

    Parameters
    ----------
    processed_list : list of str
        Paths of merged ANEM-POPS CSV files.
    dir_to_place : str
        Output folder; created if missing.
    anemometer_type : str
        'windmaster' means the bins start at column 6, otherwise at column 5.
    flow_rate : float, optional
        POPS sample flow in ccm (default 180).
    sample_rate : float, optional
        Samples per second (default 10).

    Returns
    -------
    None
        Files are written to ``dir_to_place``. Processing stops at the first
        file whose bins cannot be converted; the file and error are printed.
    """
    # check path for concentration directory and make if not there.
    if not os.path.exists(dir_to_place):
        print('{} does not exist. Creating it now!'.format(dir_to_place))
        os.makedirs(dir_to_place)

    print("Converting POPS counts to concentrations!")
    # Loop-invariant values, computed once.
    header_metadata = os.path.join(dir_to_place, 'header_meta.txt')
    bin_start = 6 if anemometer_type == 'windmaster' else 5

    for f in tqdm(processed_list):
        loaded_pops = pd.read_csv(f)
        if not os.path.isfile(header_metadata):
            print("Creating a header metadata file!")
            pd.DataFrame({'headers': loaded_pops.columns}).to_csv(header_metadata)

        # Everything from bin_start onwards is treated as a size-bin count column.
        bin_headers = loaded_pops.columns[bin_start:]
        # convert counts to concentration for each bin
        try:
            for i, bin_name in enumerate(bin_headers):
                loaded_pops["b" + str(i)] = count_to_conc(loaded_pops[bin_name], flow_rate, sample_rate)
        except Exception as e:
            print(f)
            print(e)
            break
        # leave just concentrations in the file
        concentration_POPS = loaded_pops.drop(columns=bin_headers)
        # After the drop, the b-columns occupy the positions from bin_start onwards.
        concentration_POPS['total concentration (#/cm^3)'] = concentration_POPS.iloc[:, bin_start:].sum(axis=1)
        # Regenerate the file name from the base name only, so an "ANEM-POPS" or a
        # '.' inside a folder name cannot corrupt it. The piece after "ANEM-POPS"
        # keeps its leading underscore, as before.
        base_name = os.path.basename(f)
        _, marker, tail = base_name.partition("ANEM-POPS")
        name_chunk = (tail if marker else base_name).split('.')[0]
        new_name = "ANEM-POPS-CONC_" + name_chunk + ".txt"
        concentration_POPS.to_csv(os.path.join(dir_to_place, new_name), index=False)
    print("Done.")

In [23]:
# Concentration files are written to their own sub-folder of data_dir.
conc_dir = data_dir+"/Anem_POPS_conc_csvs/"
binned_counts_2_conc(processed_count_list,conc_dir, anemometer_type)

/Volumes/T7/FALCON_SITES/SGP/Anem_POPS_conc_csvs/ does not exist. Creating it now!
Converting POPS counts to concentrations!


  0%|          | 0/1604 [00:00<?, ?it/s]

Creating a header metadata file!


100%|██████████| 1604/1604 [08:05<00:00,  3.31it/s]

Done.



