In [14]:
#!pip install chardet #run this line if chardet module is not found

import os
import pandas as pd
import re
from datetime import datetime
import chardet 

In [7]:
# specify the directory where the discrete IFCB Data is stored
dir = "/Volumes/My Passport/IFCB_PLIMS/IFCB_Cruise_data/Pioneer20_AR82_IFCB_data/AR82_shipboard_discrete_ifcb206/AR82a_ifcb206_discrete_dump"

# indicate the file parameters to target (these are the columns needed in the IFCB log)

#file_parameters = ["FileComment", "triggerCount", "roiCount", "runTime", "inhibitTime", "humidity", "temperature", "runTime",
#                       "PMTAhighVoltage", "PMTBhighVoltage", "humidity", "SyringeSampleVolume", "syringeSamplingSpeed", "temperature", "RunFastFactor",
#                   "sampleNumber", "runSampleFast"]

file_parameters_from_hdr_files = ["FileComment", "triggerCount", "roiCount", "runTime", "inhibitTime", "SyringeSampleVolume", "syringeSamplingSpeed", "temperature", "RunFastFactor",
                   "sampleNumber", "runSampleFast"]

# dictionary used to organize the filenames/sample ids with the associated parameters
all_fileparam_dicts = []

In [8]:
#### DECLARE the FUNCTION USED for PATTERN MATCHING within the HEADER FILE

def gather_values(text_content, param_list):
    file_dict = {"Filename": filename}
    for p in param_list:
        escaped_dynamic_string = re.escape(p)
        pattern = re.compile(r'{}:\s*(.+)'.format(escaped_dynamic_string))
        dynamic_match = pattern.search(hdr_content)

        if dynamic_match:
            value = dynamic_match.group(1)
            #return dynamic_match.group(1)
            #print(p, " value: ", value)
            file_dict[p] = value
        else: #return None
            print(filename, "No match found.")

    all_fileparam_dicts.append(file_dict)

In [11]:
#### APPLY gather_values() to ALL HDR FILES in the ESTABLISHED dir

for file in os.listdir(dir):
    if file.endswith(".hdr") and not file.startswith("._"):
        filename = os.path.splitext(file)[0]
        filepath = os.path.join(dir, file)
        with open(filepath, "r", encoding="iso-8859-1") as f:
            hdr_content = f.read()
            #print(hdr_content) # read out of all hdr content
            gather_values(hdr_content, file_parameters_from_hdr_files)

In [12]:
discrete_IFCB_hdr_output = pd.DataFrame(all_fileparam_dicts)
discrete_IFCB_hdr_output

Unnamed: 0,Filename,FileComment,triggerCount,roiCount,runTime,inhibitTime,SyringeSampleVolume,syringeSamplingSpeed,temperature,RunFastFactor,sampleNumber,runSampleFast
0,D20240330T151115_IFCB206,AR82 MOB beads run,10322,10340,1201.5091666666667,859.2371527777777,5,20,19.409803158617542,5,1,False
1,D20240403T154310_IFCB206,ar82a_c6n3_cn_7m,5260,4118,1201.440138888889,438.83458333333334,5,20,15.895391012436114,5,1,False
2,D20240403T160705_IFCB206,ar82a_c6n3_cn_7m,4073,3743,1201.3808333333334,340.1190972222222,5,20,15.897054245822858,5,2,False
3,D20240403T163100_IFCB206,ar82a_c6n3_cn_7m,3580,3424,1201.2030555555555,297.51996527777777,5,20,15.897054245822858,5,3,False
4,D20240403T165903_IFCB206,ar82a_c6n3_cn_7m,3619,3393,1201.1744444444444,301.45954861111113,5,20,15.895391012436114,5,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...
62,D20240416T225730_IFCB206,ar82b_cast4n6_b3_so_chlmax_23m,2953,2849,1200.8383333333334,246.41769097222223,5,20,16.32117875944153,5,2,False
63,D20240416T232124_IFCB206,ar82b_cast4n6_b3_so_chlmax_23m,2691,2534,1201.0645833333333,224.58078125,5,20,16.32117875944153,5,3,False
64,D20240417T150235_IFCB206,ar82b_c5n9_b1_no_surface,3678,3363,1200.890138888889,308.2913541666667,5,20,16.429288929579627,5,1,False
65,D20240417T152629_IFCB206,ar82b_c5n9_b1_no_surface,3131,2983,1200.9815277777777,261.29013888888886,5,20,16.429288929579627,5,2,False


In [15]:
timestamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
output_filename = f"HDR_Summaries/discrete_ifcb_hdr_summaries/ifcb_discrete_hdr_summary_{timestamp}.csv"
discrete_IFCB_hdr_output.to_csv(output_filename, index=False)

#### Adding Volume Analyzed (& lookTime, flowRate, and runSampleFast_Int) to the output dataframe

These can also be added via Excel functions rather than with this script.

In [16]:
# Pull date and time from the filename
discrete_IFCB_hdr_output['Datetime'] = pd.to_datetime(discrete_IFCB_hdr_output['Filename'].str[1:15], format='%Y%m%dT%H%M%S')

# Ensure needed numeric values are numeric and not strings
discrete_IFCB_hdr_output['runTime'] = pd.to_numeric(discrete_IFCB_hdr_output['runTime'], errors='coerce')
discrete_IFCB_hdr_output['inhibitTime'] = pd.to_numeric(discrete_IFCB_hdr_output['inhibitTime'], errors='coerce')
discrete_IFCB_hdr_output['syringeSamplingSpeed'] = pd.to_numeric(discrete_IFCB_hdr_output['syringeSamplingSpeed'], errors='coerce')
discrete_IFCB_hdr_output['SyringeSampleVolume'] = pd.to_numeric(discrete_IFCB_hdr_output['SyringeSampleVolume'], errors='coerce')
discrete_IFCB_hdr_output['RunFastFactor'] = pd.to_numeric(discrete_IFCB_hdr_output['RunFastFactor'], errors='coerce')


# Create a column for lookTime
discrete_IFCB_hdr_output['lookTime'] = discrete_IFCB_hdr_output['runTime'] - discrete_IFCB_hdr_output['inhibitTime']


# Create a column for runSampleFast_Int
## if runsamplefast = false then runsamplefast==1
discrete_IFCB_hdr_output['runSampleFast_Int'] = (discrete_IFCB_hdr_output['runSampleFast'].str.lower() != 'true').astype(int)
discrete_IFCB_hdr_output['runSampleFast_Int']

    
# Create a column for flowRate_mins ()
## syringeSamplingSpeed (usually 20 mins), SyringeSampleVolume (usually 5 ml)
discrete_IFCB_hdr_output['flowRate_mins'] = discrete_IFCB_hdr_output['SyringeSampleVolume'] / discrete_IFCB_hdr_output['syringeSamplingSpeed']


# Create a column for volumeAnalyzed
discrete_IFCB_hdr_output['volumeAnalyzed'] = (discrete_IFCB_hdr_output['RunFastFactor'] * discrete_IFCB_hdr_output['runSampleFast_Int']) * discrete_IFCB_hdr_output['flowRate_mins'] * (discrete_IFCB_hdr_output['lookTime']/60)/5

discrete_IFCB_hdr_output

Unnamed: 0,Filename,FileComment,triggerCount,roiCount,runTime,inhibitTime,SyringeSampleVolume,syringeSamplingSpeed,temperature,RunFastFactor,sampleNumber,runSampleFast,Datetime,lookTime,runSampleFast_Int,flowRate_mins,volumeAnalyzed
0,D20240330T151115_IFCB206,AR82 MOB beads run,10322,10340,1201.509167,859.237153,5,20,19.409803158617542,5,1,False,2024-03-30 15:11:01,342.272014,1,0.25,1.426133
1,D20240403T154310_IFCB206,ar82a_c6n3_cn_7m,5260,4118,1201.440139,438.834583,5,20,15.895391012436114,5,1,False,2024-04-03 15:43:01,762.605556,1,0.25,3.177523
2,D20240403T160705_IFCB206,ar82a_c6n3_cn_7m,4073,3743,1201.380833,340.119097,5,20,15.897054245822858,5,2,False,2024-04-03 16:07:00,861.261736,1,0.25,3.588591
3,D20240403T163100_IFCB206,ar82a_c6n3_cn_7m,3580,3424,1201.203056,297.519965,5,20,15.897054245822858,5,3,False,2024-04-03 16:31:00,903.683090,1,0.25,3.765346
4,D20240403T165903_IFCB206,ar82a_c6n3_cn_7m,3619,3393,1201.174444,301.459549,5,20,15.895391012436114,5,1,False,2024-04-03 16:59:00,899.714896,1,0.25,3.748812
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62,D20240416T225730_IFCB206,ar82b_cast4n6_b3_so_chlmax_23m,2953,2849,1200.838333,246.417691,5,20,16.32117875944153,5,2,False,2024-04-16 22:57:03,954.420642,1,0.25,3.976753
63,D20240416T232124_IFCB206,ar82b_cast4n6_b3_so_chlmax_23m,2691,2534,1201.064583,224.580781,5,20,16.32117875944153,5,3,False,2024-04-16 23:21:02,976.483802,1,0.25,4.068683
64,D20240417T150235_IFCB206,ar82b_c5n9_b1_no_surface,3678,3363,1200.890139,308.291354,5,20,16.429288929579627,5,1,False,2024-04-17 15:02:03,892.598785,1,0.25,3.719162
65,D20240417T152629_IFCB206,ar82b_c5n9_b1_no_surface,3131,2983,1200.981528,261.290139,5,20,16.429288929579627,5,2,False,2024-04-17 15:26:02,939.691389,1,0.25,3.915381


In [17]:
# SAVE THE HDR SUMMARY FILE CONTAINING THESE CALCULATED VALUES
timestamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
output_filename = f"HDR_Summaries/discrete_ifcb_hdr_summaries/ifcb_discrete_hdr_summary_with_calcd_values_{timestamp}.csv"
discrete_IFCB_hdr_output.to_csv(output_filename, index=False)