In [2]:
#!pip install geopy #run this line if geopy module is not found
#!pip install chardet #run this line if chardet module is not found
 
import os
import pandas as pd
import re
from datetime import datetime
from geopy.distance import geodesic
import chardet #character encoding detector

In [16]:
# specify the directory where the underway IFCB Data is stored
dir = "/Volumes/My Passport/IFCB_PLIMS/IFCB_Cruise_data/Pioneer21_AR87_IFCB_data/AR87_shipboard_underway_ifcb200"
#dir = "./Pioneer21_underwayGPS"

# indicate the file parameters to target (these are the columns needed in the IFCB log)

#file_parameters = ["FileComment", "triggerCount", "roiCount", "runTime", "inhibitTime", "humidity", "temperature", "runTime",
#                       "PMTAhighVoltage", "PMTBhighVoltage", "humidity", "SyringeSampleVolume", "syringeSamplingSpeed", "temperature", "RunFastFactor",
#                   "sampleNumber", "runSampleFast"]

file_parameters_from_hdr_files = ["FileComment", "triggerCount", "roiCount", "runTime", "inhibitTime", "SyringeSampleVolume", "syringeSamplingSpeed", "temperature", "RunFastFactor",
                   "sampleNumber", "runSampleFast"]

# dictionary used to organize the filenames/sample ids with the associated parameters
all_fileparam_dicts = []



In [17]:
#### DECLARE the FUNCTION USED for PATTERN MATCHING within the HEADER FILE

def gather_values(text_content, param_list):
    file_dict = {"Filename": filename}
    for p in param_list:
        escaped_dynamic_string = re.escape(p)
        pattern = re.compile(r'{}:\s*(.+)'.format(escaped_dynamic_string))
        dynamic_match = pattern.search(hdr_content)

        if dynamic_match:
            value = dynamic_match.group(1)
            #return dynamic_match.group(1)
            #print(p, " value: ", value)
            file_dict[p] = value
        else: #return None
            print(filename, "No match found.")

    all_fileparam_dicts.append(file_dict)

In [18]:
#### APPLY gather_values() to ALL HDR FILES in the ESTABLISHED dir

for file in os.listdir(dir):
    if file.endswith(".hdr") and not file.startswith("._"):
        filename = os.path.splitext(file)[0]
        filepath = os.path.join(dir, file)
        with open(filepath, "r", encoding="iso-8859-1") as f:
            hdr_content = f.read()
            #print(hdr_content) # read out of all hdr content
            gather_values(hdr_content, file_parameters_from_hdr_files)
        

In [19]:
underway_IFCB_hdr_output = pd.DataFrame(all_fileparam_dicts)
underway_IFCB_hdr_output

Unnamed: 0,Filename,FileComment,triggerCount,roiCount,runTime,inhibitTime,SyringeSampleVolume,syringeSamplingSpeed,temperature,RunFastFactor,sampleNumber,runSampleFast
0,D20250401T162025_IFCB200,ar87a_underway,3028,2901,1201.6904166666666,250.37109375,5,20,26.115960173952857,5,50,False
1,D20250401T155640_IFCB200,ar87a_underway,2937,2839,1200.8119444444444,242.88069444444446,5,20,26.21741741054398,5,49,False
2,D20250401T153256_IFCB200,ar87a_underway,2939,2815,1200.91,242.97210069444444,5,20,26.219080643930724,5,48,False
3,D20250401T150912_IFCB200,ar87a_underway,3407,3256,1200.9173611111112,281.8092013888889,5,20,26.22074387731746,5,47,False
4,D20250401T144528_IFCB200,ar87a_underway,2966,2853,1200.9620833333333,245.0948611111111,5,20,26.22074387731746,5,46,False
...,...,...,...,...,...,...,...,...,...,...,...,...
1036,D20250418T234938_IFCB200,ar87b_underway,2135,2119,1200.8508333333334,177.18835069444444,5,20,16.642182803082335,5,67,False
1037,D20250418T232554_IFCB200,ar87b_underway,2235,2196,1200.7445833333334,185.60505208333333,5,20,16.53573586633098,5,66,False
1038,D20250418T230209_IFCB200,ar87b_underway,1801,1795,1202.024861111111,149.3659722222222,5,20,16.746966506446945,5,65,False
1039,D20250418T223825_IFCB200,ar87b_underway,2523,2487,1201.5422222222223,209.4875,5,20,16.959860379949653,5,64,False


In [20]:
timestamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
output_filename = f"HDR_Summaries/underway_ifcb_hdr_summaries/AR87_ifcb_underway_hdr_summary_{timestamp}.csv"
underway_IFCB_hdr_output.to_csv(output_filename, index=False)

#### Adding Volume Analyzed (& lookTime, flowRate, and runSampleFast_Int) to the output dataframe

These can also be added via Excel functions rather than with this script.

In [21]:
# Pull date and time from the filename
underway_IFCB_hdr_output['Datetime'] = pd.to_datetime(underway_IFCB_hdr_output['Filename'].str[1:15], format='%Y%m%dT%H%M%S')

# Ensure needed numeric values are numeric and not strings
underway_IFCB_hdr_output['runTime'] = pd.to_numeric(underway_IFCB_hdr_output['runTime'], errors='coerce')
underway_IFCB_hdr_output['inhibitTime'] = pd.to_numeric(underway_IFCB_hdr_output['inhibitTime'], errors='coerce')
underway_IFCB_hdr_output['syringeSamplingSpeed'] = pd.to_numeric(underway_IFCB_hdr_output['syringeSamplingSpeed'], errors='coerce')
underway_IFCB_hdr_output['SyringeSampleVolume'] = pd.to_numeric(underway_IFCB_hdr_output['SyringeSampleVolume'], errors='coerce')
underway_IFCB_hdr_output['RunFastFactor'] = pd.to_numeric(underway_IFCB_hdr_output['RunFastFactor'], errors='coerce')


# Create a column for lookTime
underway_IFCB_hdr_output['lookTime'] = underway_IFCB_hdr_output['runTime'] - underway_IFCB_hdr_output['inhibitTime']


# Create a column for runSampleFast_Int
## if runsamplefast = false then runsamplefast==1
underway_IFCB_hdr_output['runSampleFast_Int'] = (underway_IFCB_hdr_output['runSampleFast'].str.lower() != 'true').astype(int)
underway_IFCB_hdr_output['runSampleFast_Int']

    
# Create a column for flowRate_mins ()
## syringeSamplingSpeed (usually 20 mins), SyringeSampleVolume (usually 5 ml)
underway_IFCB_hdr_output['flowRate_mins'] = underway_IFCB_hdr_output['SyringeSampleVolume'] / underway_IFCB_hdr_output['syringeSamplingSpeed']


# Create a column for volumeAnalyzed
underway_IFCB_hdr_output['volumeAnalyzed'] = (underway_IFCB_hdr_output['RunFastFactor'] * underway_IFCB_hdr_output['runSampleFast_Int']) * underway_IFCB_hdr_output['flowRate_mins'] * (underway_IFCB_hdr_output['lookTime']/60)/5

underway_IFCB_hdr_output

Unnamed: 0,Filename,FileComment,triggerCount,roiCount,runTime,inhibitTime,SyringeSampleVolume,syringeSamplingSpeed,temperature,RunFastFactor,sampleNumber,runSampleFast,Datetime,lookTime,runSampleFast_Int,flowRate_mins,volumeAnalyzed
0,D20250401T162025_IFCB200,ar87a_underway,3028,2901,1201.690417,250.371094,5,20,26.115960173952857,5,50,False,2025-04-01 16:20:02,951.319323,1,0.25,3.963831
1,D20250401T155640_IFCB200,ar87a_underway,2937,2839,1200.811944,242.880694,5,20,26.21741741054398,5,49,False,2025-04-01 15:56:04,957.931250,1,0.25,3.991380
2,D20250401T153256_IFCB200,ar87a_underway,2939,2815,1200.910000,242.972101,5,20,26.219080643930724,5,48,False,2025-04-01 15:32:05,957.937899,1,0.25,3.991408
3,D20250401T150912_IFCB200,ar87a_underway,3407,3256,1200.917361,281.809201,5,20,26.22074387731746,5,47,False,2025-04-01 15:09:01,919.108160,1,0.25,3.829617
4,D20250401T144528_IFCB200,ar87a_underway,2966,2853,1200.962083,245.094861,5,20,26.22074387731746,5,46,False,2025-04-01 14:45:02,955.867222,1,0.25,3.982780
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1036,D20250418T234938_IFCB200,ar87b_underway,2135,2119,1200.850833,177.188351,5,20,16.642182803082335,5,67,False,2025-04-18 23:49:03,1023.662483,1,0.25,4.265260
1037,D20250418T232554_IFCB200,ar87b_underway,2235,2196,1200.744583,185.605052,5,20,16.53573586633098,5,66,False,2025-04-18 23:25:05,1015.139531,1,0.25,4.229748
1038,D20250418T230209_IFCB200,ar87b_underway,1801,1795,1202.024861,149.365972,5,20,16.746966506446945,5,65,False,2025-04-18 23:02:00,1052.658889,1,0.25,4.386079
1039,D20250418T223825_IFCB200,ar87b_underway,2523,2487,1201.542222,209.487500,5,20,16.959860379949653,5,64,False,2025-04-18 22:38:02,992.054722,1,0.25,4.133561


In [22]:
# SAVE THE HDR SUMMARY FILE CONTAINING THESE CALCULATED VALUES
timestamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
output_filename = f"HDR_Summaries/underway_ifcb_hdr_summaries/AR87_ifcb_underway_hdr_summary_with_calcd_values_{timestamp}.csv"
underway_IFCB_hdr_output.to_csv(output_filename, index=False)

#### Adding Ship Latitude and Longitude

In [24]:
# This file is created via the Ship underway Lat & Lon Processing.ipynb notebook
underway_file = "Merged_Ship_Underway_Files/mergedunderway_AR87.csv"
#underway_ship_data = pd.read_csv(underway_file, dtype={'Datetime_GMT_underway': 'datetime64[ns]'})
underway_ship_data = pd.read_csv(underway_file, parse_dates=['Datetime_GMT_underway'], low_memory=False)
underway_ship_data['Datetime_GMT_underway'] = pd.to_datetime(underway_ship_data['Datetime_GMT_underway'], errors='coerce')
underway_ship_data['Datetime_GMT_underway'].dtype
underway_ship_data


# Uncomment these two following lines if you want to look at faulty dates that might be present in the ship data
#invalid_dates = underway_ship_data[underway_ship_data['Datetime_GMT_underway'].isna()]
#print(invalid_dates)

Unnamed: 0,DATE_GMT,TIME_GMT,CNAV_LAT,CNAV_LON,CNAV_COG,CNAV_SOG,DPS112_LAT,DPS112_LON,DPS_COG,SPD,GYRO_HDT,SEAPATH_LAT,SEAPATH_LON,SEAPATH_COG,SEAPATH_SOG,POSMV_LAT,POSMV_LON,CNAV_DECMIN,CNAV_DECMIN.1,Datetime_GMT_underway
0,2025/04/01,00:00:07.143,35.747,-74.853,178.2,9.1,35.747,-74.853,175.87,9.61,177.02,3544.795243,7451.17667,179.06,9.2,3544.79496,7451.1767,3544.801866,7451.172295,2025-04-01 00:00:07.143
1,2025/04/01,00:00:17.143,35.746,-74.853,178.5,9.09,35.746,-74.853,178.33,9.70,177.41,3544.769974,7451.176152,180.51,8.9,3544.76969,7451.17624,3544.776611,7451.171729,2025-04-01 00:00:17.143
2,2025/04/01,00:00:27.143,35.746,-74.853,178.6,9.12,35.746,-74.853,179.46,9.73,177.07,3544.744582,7451.1756,179.29,9.1,3544.74428,7451.1757,3544.751191,7451.171261,2025-04-01 00:00:27.143
3,2025/04/01,00:00:37.143,35.745,-74.853,179.5,9.11,35.745,-74.853,180.0,9.70,176.51,3544.719126,7451.175051,178.65,9.2,3544.71884,7451.17514,3544.725805,7451.17092,2025-04-01 00:00:37.143
4,2025/04/01,00:00:47.143,35.745,-74.853,177.4,9.1,35.745,-74.853,173.11,9.63,175.38,3544.694059,7451.17379,177.63,9.1,3544.69381,7451.17387,3544.700584,7451.16984,2025-04-01 00:00:47.143
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203225,2025/04/09,23:59:12.286,41.524,-70.672,153.400,0.000,41.524,-70.672,242.35,NAN,24.54,4131.44301,7040.33008,122.96,0.0,4131.44275,7040.33047,4131.439885,7040.336932,2025-04-09 23:59:12.286
203226,2025/04/09,23:59:22.286,41.524,-70.672,261.500,0.040,41.524,-70.672,298.43,NAN,24.60,4131.44302,7040.330059,154.33,0.0,4131.44274,7040.33051,4131.439887,7040.336987,2025-04-09 23:59:22.286
203227,2025/04/09,23:59:32.287,41.524,-70.672,199.900,0.090,41.524,-70.672,202.53,NAN,24.57,4131.443018,7040.330067,209.75,0.0,4131.4427,7040.33054,4131.439879,7040.337074,2025-04-09 23:59:32.287
203228,2025/04/09,23:59:42.287,41.524,-70.672,293.200,0.010,41.524,-70.672,210.35,NAN,24.54,4131.443014,7040.330087,206.76,0.0,4131.44268,7040.33052,4131.439966,7040.337154,2025-04-09 23:59:42.287


In [26]:
# Sort both dfs by date
underway_IFCB_hdr_output = underway_IFCB_hdr_output.sort_values('Datetime')
underway_ship_data = underway_ship_data.sort_values('Datetime_GMT_underway')

# Drop any rows in the ship data that do not have datetimes
underway_ship_data = underway_ship_data.dropna(subset=['Datetime_GMT_underway'])

# Merge the underway ship data to the IFCB HDR summary df
hdr_summary_with_lat_and_lon = pd.merge_asof(underway_IFCB_hdr_output, underway_ship_data, left_on='Datetime', right_on='Datetime_GMT_underway', direction='nearest')

In [27]:
# Display the df containing all hdr summary information and ship lat and lon
hdr_summary_with_lat_and_lon

Unnamed: 0,Filename,FileComment,triggerCount,roiCount,runTime,inhibitTime,SyringeSampleVolume,syringeSamplingSpeed,temperature,RunFastFactor,...,GYRO_HDT,SEAPATH_LAT,SEAPATH_LON,SEAPATH_COG,SEAPATH_SOG,POSMV_LAT,POSMV_LON,CNAV_DECMIN,CNAV_DECMIN.1,Datetime_GMT_underway
0,D20250328T171555_IFCB200,ar87a_underway,1613,1540,1200.901528,133.086493,5,20,13.129433890287643,5,...,186.36,4108.715124,7053.448185,185.22,9.8,4108.71484,7053.44787,4108.721044,7053.441689,2025-03-28 17:15:00.220
1,D20250328T173939_IFCB200,ar87a_underway,1073,983,1200.910972,88.414045,5,20,12.808429846646831,5,...,169.85,4104.857138,7053.097267,169.53,10.0,4104.85681,7053.09697,4104.864312,7053.093213,2025-03-28 17:39:00.230
2,D20250328T180323_IFCB200,ar87a_underway,867,812,1200.930139,71.513533,5,20,12.810093080033568,5,...,180.91,4100.936547,7052.987073,180.59,9.8,4100.93602,7052.98686,4100.943207,7052.981696,2025-03-28 18:03:00.239
3,D20250328T182707_IFCB200,ar87a_underway,872,817,1200.981111,71.783125,5,20,12.808429846646831,5,...,180.57,4057.05181,7053.01058,179.38,9.8,4057.05122,7053.01043,4057.058562,7053.005612,2025-03-28 18:27:00.248
4,D20250328T185052_IFCB200,ar87a_underway,1248,1164,1201.002222,102.831484,5,20,12.48908903639277,5,...,185.28,NAN,NAN,NAN,NAN,4053.30142,7053.07443,4053.30576,7053.068749,2025-03-28 18:50:06.045
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1036,D20250420T073711_IFCB200,ar87b_underway,4798,4286,1201.198333,400.173785,5,20,17.280864423590458,5,...,218.92,4021.801427,7052.928277,197.24,0.5,4021.80147,7052.92785,4021.803277,7052.920265,2025-04-19 23:59:57.787
1037,D20250420T080056_IFCB200,ar87b_underway,4440,4008,1201.521111,370.986146,5,20,17.387311360341812,5,...,218.92,4021.801427,7052.928277,197.24,0.5,4021.80147,7052.92785,4021.803277,7052.920265,2025-04-19 23:59:57.787
1038,D20250420T082440_IFCB200,ar87b_underway,4413,3981,1200.866806,368.031667,5,20,17.493758297093166,5,...,218.92,4021.801427,7052.928277,197.24,0.5,4021.80147,7052.92785,4021.803277,7052.920265,2025-04-19 23:59:57.787
1039,D20250420T084824_IFCB200,ar87b_underway,4885,4352,1201.439028,408.629028,5,20,17.60020523384452,5,...,218.92,4021.801427,7052.928277,197.24,0.5,4021.80147,7052.92785,4021.803277,7052.920265,2025-04-19 23:59:57.787


In [28]:
# Save the df containing all hdr summary information and ship lat and lon
timestamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
output_filename = f"HDR_Summaries/underway_ifcb_hdr_summaries/AR87_ifcb_underway_hdr_summary_with_lat_lon_{timestamp}.csv"
hdr_summary_with_lat_and_lon.to_csv(output_filename, index=False)

#### Adding Site (from Ship Latitude and Longitude)

In [29]:
mab_site_centers = {
    'site': ['CP10CNSM', 'CP11NOSM', 'CP11SOSM', 'CP12CNSW', 'CP12WESW', 'CP13EAPM', 
             'CP13NOPM', 'CP13SOPM', 'CP14NEPM', 'CP14SEPM'],
    'lat': [35.95, 36.175, 35.725, 35.95, 35.95, 35.95, 36.175, 35.725, 36.0536, 35.8514],
    'lon': [-75.125, -74.8267, -74.853, -75.125, -75.3333, -74.8457, -74.8267, -74.853, 
            -74.7776, -74.8482]
}

centers_df = pd.DataFrame(mab_site_centers)

In [35]:
hdr_summary_with_lat_and_lon.columns.tolist()
#hdr_summary_with_lat_and_lon[" Dec_LON"] - this was used for AR82, but this col is not in the AR87 files
hdr_summary_with_lat_and_lon[' CNAV_LON']
hdr_summary_with_lat_and_lon[' CNAV_LAT']

0       41.145
1       41.081
2       41.016
3       40.951
4       40.888
         ...  
1036    40.363
1037    40.363
1038    40.363
1039    40.363
1040    40.363
Name:  CNAV_LAT, Length: 1041, dtype: object

In [36]:
ifcb_lat = hdr_summary_with_lat_and_lon[" CNAV_LAT"]
ifcb_lon = hdr_summary_with_lat_and_lon[" CNAV_LON"]

hdr_summary_with_lat_and_lon_and_sites = hdr_summary_with_lat_and_lon

def check_within_radius(ifcb_lat, ifcb_lon, centers_df, radius):
    for _, center in centers_df.iterrows():
        center_lat = center['lat']
        center_lon = center['lon']
        center_name = center.get('site', 'Unnamed Site')

        # Calculate distance
        distance = geodesic((ifcb_lat, ifcb_lon), (center_lat, center_lon)).kilometers
        if distance <= radius:
            return True, center_name  # Return True and the site name if within radius
    return False, None  # Return False if not within any radius

# Define a wrapper function to apply row-wise
def apply_check_within_radius(row):
    return check_within_radius(row[' CNAV_LAT'], row[' CNAV_LON'], centers_df, radius=2)  # radius in kilometers



# Apply function to every row in the DataFrame
hdr_summary_with_lat_and_lon_and_sites[['within_radius', 'site_name']] = hdr_summary_with_lat_and_lon_and_sites.apply(apply_check_within_radius, axis=1, result_type="expand")

In [37]:
hdr_summary_with_lat_and_lon_and_sites

Unnamed: 0,Filename,FileComment,triggerCount,roiCount,runTime,inhibitTime,SyringeSampleVolume,syringeSamplingSpeed,temperature,RunFastFactor,...,SEAPATH_LON,SEAPATH_COG,SEAPATH_SOG,POSMV_LAT,POSMV_LON,CNAV_DECMIN,CNAV_DECMIN.1,Datetime_GMT_underway,within_radius,site_name
0,D20250328T171555_IFCB200,ar87a_underway,1613,1540,1200.901528,133.086493,5,20,13.129433890287643,5,...,7053.448185,185.22,9.8,4108.71484,7053.44787,4108.721044,7053.441689,2025-03-28 17:15:00.220,False,
1,D20250328T173939_IFCB200,ar87a_underway,1073,983,1200.910972,88.414045,5,20,12.808429846646831,5,...,7053.097267,169.53,10.0,4104.85681,7053.09697,4104.864312,7053.093213,2025-03-28 17:39:00.230,False,
2,D20250328T180323_IFCB200,ar87a_underway,867,812,1200.930139,71.513533,5,20,12.810093080033568,5,...,7052.987073,180.59,9.8,4100.93602,7052.98686,4100.943207,7052.981696,2025-03-28 18:03:00.239,False,
3,D20250328T182707_IFCB200,ar87a_underway,872,817,1200.981111,71.783125,5,20,12.808429846646831,5,...,7053.01058,179.38,9.8,4057.05122,7053.01043,4057.058562,7053.005612,2025-03-28 18:27:00.248,False,
4,D20250328T185052_IFCB200,ar87a_underway,1248,1164,1201.002222,102.831484,5,20,12.48908903639277,5,...,NAN,NAN,NAN,4053.30142,7053.07443,4053.30576,7053.068749,2025-03-28 18:50:06.045,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1036,D20250420T073711_IFCB200,ar87b_underway,4798,4286,1201.198333,400.173785,5,20,17.280864423590458,5,...,7052.928277,197.24,0.5,4021.80147,7052.92785,4021.803277,7052.920265,2025-04-19 23:59:57.787,False,
1037,D20250420T080056_IFCB200,ar87b_underway,4440,4008,1201.521111,370.986146,5,20,17.387311360341812,5,...,7052.928277,197.24,0.5,4021.80147,7052.92785,4021.803277,7052.920265,2025-04-19 23:59:57.787,False,
1038,D20250420T082440_IFCB200,ar87b_underway,4413,3981,1200.866806,368.031667,5,20,17.493758297093166,5,...,7052.928277,197.24,0.5,4021.80147,7052.92785,4021.803277,7052.920265,2025-04-19 23:59:57.787,False,
1039,D20250420T084824_IFCB200,ar87b_underway,4885,4352,1201.439028,408.629028,5,20,17.60020523384452,5,...,7052.928277,197.24,0.5,4021.80147,7052.92785,4021.803277,7052.920265,2025-04-19 23:59:57.787,False,


In [38]:
# SAVE this VERSION of the DF CONTAINING ALL SUMMARY DATA, LAT, LON, and SITE NAMES
timestamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
output_filename = f"HDR_Summaries/underway_ifcb_hdr_summaries/AR87_ifcb_underway_hdr_summary_with_lat_lon_and_sites_{timestamp}.csv"
hdr_summary_with_lat_and_lon_and_sites.to_csv(output_filename, index=False)