In [1]:
#!pip install chardet #run this line if chardet module is not found

import os
import pandas as pd
import re
from datetime import datetime
import chardet 
import numpy as np

In [9]:
# specify the directory where the discrete IFCB Data is stored
dir = "sample_hdrs"
cruiseNum = "fileTEST"

# indicate the file parameters to target (these are the columns needed in the IFCB log)

#file_parameters = ["FileComment", "triggerCount", "roiCount", "runTime", "inhibitTime", "humidity", "temperature", "runTime",
#                       "PMTAhighVoltage", "PMTBhighVoltage", "humidity", "SyringeSampleVolume", "syringeSamplingSpeed", "temperature", "RunFastFactor",
#                   "sampleNumber", "runSampleFast"]

file_parameters_from_hdr_files = ["FileComment", "triggerCount", "roiCount", "runTime", "inhibitTime", "SyringeSampleVolume", "syringeSamplingSpeed", "temperature", "RunFastFactor",
                   "sampleNumber", "runSampleFast"]

# dictionary used to organize the filenames/sample ids with the associated parameters
all_fileparam_dicts = []

In [10]:
#### DECLARE the FUNCTION USED for PATTERN MATCHING within the HEADER FILE

def gather_values(text_content, param_list):
    file_dict = {"Filename": filename}
    for p in param_list:
        escaped_dynamic_string = re.escape(p)
        pattern = re.compile(r'{}:\s*(.+)'.format(escaped_dynamic_string))
        dynamic_match = pattern.search(hdr_content)

        if dynamic_match:
            value = dynamic_match.group(1)
            #return dynamic_match.group(1)
            #print(p, " value: ", value)
            file_dict[p] = value
        else: #return None
            print(filename, "No match found.")

    all_fileparam_dicts.append(file_dict)

In [11]:
#### APPLY gather_values() to ALL HDR FILES in the ESTABLISHED dir

for file in os.listdir(dir):
    if file.endswith(".hdr") and not file.startswith("._"):
        filename = os.path.splitext(file)[0]
        filepath = os.path.join(dir, file)
        with open(filepath, "r", encoding="iso-8859-1") as f:
            hdr_content = f.read()
            #print(hdr_content) # read out of all hdr content
            gather_values(hdr_content, file_parameters_from_hdr_files)

In [12]:
discrete_IFCB_hdr_output = pd.DataFrame(all_fileparam_dicts)
discrete_IFCB_hdr_output

Unnamed: 0,Filename,FileComment,triggerCount,roiCount,runTime,inhibitTime,SyringeSampleVolume,syringeSamplingSpeed,temperature,RunFastFactor,sampleNumber,runSampleFast
0,D20240405T212510_IFCB199,CP10CNSM-00001 deployment,5399,5384,1200.89375,449.89739583333335,5,20,9.403791103990244,1,1,False
1,D20240403T141609_IFCB199,CP10CNSM-00001 deployment,6494,3666,1201.7823611111112,541.51,5,20,10.468260471503784,2,"manually edited test file SN jan 9, 2025",True
2,D20240404T122511_IFCB199,CP10CNSM-00001 deployment,6886,6868,1200.800277777778,574.8570138888889,5,20,10.042472724498367,1,1,False
3,D20240405T032510_IFCB199,CP10CNSM-00001 deployment,6644,6684,1201.181111111111,553.088125,5,20,9.510238040741598,1,1,False
4,D20240405T182512_IFCB199,CP10CNSM-00001 deployment,5135,5067,1201.7815277777777,427.6896527777778,5,20,9.403791103990244,1,1,False
5,D20240404T212511_IFCB199,CP10CNSM-00001 deployment,6367,6353,1201.4340277777778,530.3027430555555,5,20,9.616684977492952,1,1,False
6,D20240406T062512_IFCB199,CP10CNSM-00001 deployment,5538,5547,1202.0494444444444,461.5920833333333,5,20,9.615021744106215,1,1,False
7,D20240405T062512_IFCB199,CP10CNSM-00001 deployment,6452,6450,1202.055833333333,537.2093055555556,5,20,9.29734416723889,1,1,False
8,D20240406T032510_IFCB199,CP10CNSM-00001 deployment,5335,5319,1201.5975,443.9709027777778,5,20,9.403791103990244,1,1,False
9,D20240403T182514_IFCB199,CP10CNSM-00001 deployment,7476,4073,1201.294583333333,622.9629861111111,5,20,9.723131914244306,3,"manually edited file SN jan 9, 2026",True


In [13]:
timestamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
output_filename = f"HDR_Summaries/discrete_ifcb_hdr_summaries/{cruiseNum}_ifcb_discrete_hdr_summary_{timestamp}.csv"
#discrete_IFCB_hdr_output.to_csv(output_filename, index=False)

#### Adding Volume Analyzed (& lookTime, flowRate, and runSampleFast_Int) to the output dataframe

These can also be added via Excel functions rather than with this script.

In [15]:
# Pull date and time from the filename
discrete_IFCB_hdr_output['Datetime'] = pd.to_datetime(discrete_IFCB_hdr_output['Filename'].str[1:15], format='%Y%m%dT%H%M%S')

# Ensure needed numeric values are numeric and not strings
discrete_IFCB_hdr_output['runTime'] = pd.to_numeric(discrete_IFCB_hdr_output['runTime'], errors='coerce')
discrete_IFCB_hdr_output['inhibitTime'] = pd.to_numeric(discrete_IFCB_hdr_output['inhibitTime'], errors='coerce')
discrete_IFCB_hdr_output['syringeSamplingSpeed'] = pd.to_numeric(discrete_IFCB_hdr_output['syringeSamplingSpeed'], errors='coerce')
discrete_IFCB_hdr_output['SyringeSampleVolume'] = pd.to_numeric(discrete_IFCB_hdr_output['SyringeSampleVolume'], errors='coerce')
discrete_IFCB_hdr_output['RunFastFactor'] = pd.to_numeric(discrete_IFCB_hdr_output['RunFastFactor'], errors='coerce')


# Create a column for lookTime
discrete_IFCB_hdr_output['lookTime'] = discrete_IFCB_hdr_output['runTime'] - discrete_IFCB_hdr_output['inhibitTime']


# Create a column for runSampleFast_Int
## if runsamplefast = false then runsamplefast==1
discrete_IFCB_hdr_output['runSampleFast_Int'] = (discrete_IFCB_hdr_output['runSampleFast'].str.lower() != 'true').astype(int)
discrete_IFCB_hdr_output['runSampleFast_Int']

    
# Create a column for flowRate_mins 

## syringeSamplingSpeed (McLane sets this to 20 minutes, i.e., this is a fixed 20 in the header file, which is not always accurate)
#### 20 minutes is accurate in the cases where the sampling volume is 5 mL, but becomes inaccurate when the sampling times are altered.
#### The more operationally precise way to view this is to focus on the default flow rate being 0.25 mL/min (which is true, and is 
#### usually accurately represented by the defaults (5 mL SyringeSampleVolume / syringeSamplingSpeed).
#### 0.25 mL/min is only NOT accurate in cases where runSampleFast_Int is changed from 1 (the default) AND runSampleFast == TRUE.  

## SyringeSampleVolume (this is usually 5 ml unless the sample time is cut short by the operator (usually for testing reasons)
base_flow_rate = 0.25

discrete_IFCB_hdr_output['flowRate_mins'] = np.where(
    discrete_IFCB_hdr_output['runSampleFast'],
    base_flow_rate * discrete_IFCB_hdr_output["RunFastFactor"],
    base_flow_rate
)
    

# Create a column for volumeAnalyzed
#discrete_IFCB_hdr_output['volumeAnalyzed'] = (discrete_IFCB_hdr_output['RunFastFactor'] * discrete_IFCB_hdr_output['runSampleFast_Int']) * discrete_IFCB_hdr_output['flowRate_mins'] * (discrete_IFCB_hdr_output['lookTime']/60)/5
discrete_IFCB_hdr_output['volumeAnalyzed'] = discrete_IFCB_hdr_output['flowRate_mins'] * (discrete_IFCB_hdr_output['lookTime']/60)

discrete_IFCB_hdr_output

Unnamed: 0,Filename,FileComment,triggerCount,roiCount,runTime,inhibitTime,SyringeSampleVolume,syringeSamplingSpeed,temperature,RunFastFactor,sampleNumber,runSampleFast,Datetime,lookTime,runSampleFast_Int,flowRate_mins,volumeAnalyzed
0,D20240405T212510_IFCB199,CP10CNSM-00001 deployment,5399,5384,1200.89375,449.897396,5,20,9.403791103990244,1,1,False,2024-04-05 21:25:01,750.996354,1,0.25,3.129151
1,D20240403T141609_IFCB199,CP10CNSM-00001 deployment,6494,3666,1201.782361,541.51,5,20,10.468260471503784,2,"manually edited test file SN jan 9, 2025",True,2024-04-03 14:16:00,660.272361,0,0.5,5.50227
2,D20240404T122511_IFCB199,CP10CNSM-00001 deployment,6886,6868,1200.800278,574.857014,5,20,10.042472724498367,1,1,False,2024-04-04 12:25:01,625.943264,1,0.25,2.608097
3,D20240405T032510_IFCB199,CP10CNSM-00001 deployment,6644,6684,1201.181111,553.088125,5,20,9.510238040741598,1,1,False,2024-04-05 03:25:01,648.092986,1,0.25,2.700387
4,D20240405T182512_IFCB199,CP10CNSM-00001 deployment,5135,5067,1201.781528,427.689653,5,20,9.403791103990244,1,1,False,2024-04-05 18:25:01,774.091875,1,0.25,3.225383
5,D20240404T212511_IFCB199,CP10CNSM-00001 deployment,6367,6353,1201.434028,530.302743,5,20,9.616684977492952,1,1,False,2024-04-04 21:25:01,671.131285,1,0.25,2.79638
6,D20240406T062512_IFCB199,CP10CNSM-00001 deployment,5538,5547,1202.049444,461.592083,5,20,9.615021744106215,1,1,False,2024-04-06 06:25:01,740.457361,1,0.25,3.085239
7,D20240405T062512_IFCB199,CP10CNSM-00001 deployment,6452,6450,1202.055833,537.209306,5,20,9.29734416723889,1,1,False,2024-04-05 06:25:01,664.846528,1,0.25,2.770194
8,D20240406T032510_IFCB199,CP10CNSM-00001 deployment,5335,5319,1201.5975,443.970903,5,20,9.403791103990244,1,1,False,2024-04-06 03:25:01,757.626597,1,0.25,3.156777
9,D20240403T182514_IFCB199,CP10CNSM-00001 deployment,7476,4073,1201.294583,622.962986,5,20,9.723131914244306,3,"manually edited file SN jan 9, 2026",True,2024-04-03 18:25:01,578.331597,0,0.75,7.229145


In [16]:
# SAVE THE HDR SUMMARY FILE CONTAINING THESE CALCULATED VALUES
timestamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
output_filename = f"HDR_Summaries/discrete_ifcb_hdr_summaries/{cruiseNum}_ifcb_discrete_hdr_summary_with_calcd_values_{timestamp}.csv"
discrete_IFCB_hdr_output.to_csv(output_filename, index=False)