In [2]:
#!pip install geopy #run this line if geopy module is not found
#!pip install chardet #run this line if chardet module is not found
 
import os
import pandas as pd
import re
from datetime import datetime
from geopy.distance import geodesic
import chardet #character encoding detector

In [60]:
# specify the directory where the underway IFCB Data is stored
dir = "/Volumes/My Passport/IFCB_PLIMS/IFCB_Cruise_data/Pioneer20_AR82_IFCB_data/AR82_shipboard_underway_ifcb206"

# indicate the file parameters to target (these are the columns needed in the IFCB log)

#file_parameters = ["FileComment", "triggerCount", "roiCount", "runTime", "inhibitTime", "humidity", "temperature", "runTime",
#                       "PMTAhighVoltage", "PMTBhighVoltage", "humidity", "SyringeSampleVolume", "syringeSamplingSpeed", "temperature", "RunFastFactor",
#                   "sampleNumber", "runSampleFast"]

file_parameters_from_hdr_files = ["FileComment", "triggerCount", "roiCount", "runTime", "inhibitTime", "SyringeSampleVolume", "syringeSamplingSpeed", "temperature", "RunFastFactor",
                   "sampleNumber", "runSampleFast"]

# dictionary used to organize the filenames/sample ids with the associated parameters
all_fileparam_dicts = []



In [61]:
#### DECLARE the FUNCTION USED for PATTERN MATCHING within the HEADER FILE

def gather_values(text_content, param_list):
    file_dict = {"Filename": filename}
    for p in param_list:
        escaped_dynamic_string = re.escape(p)
        pattern = re.compile(r'{}:\s*(.+)'.format(escaped_dynamic_string))
        dynamic_match = pattern.search(hdr_content)

        if dynamic_match:
            value = dynamic_match.group(1)
            #return dynamic_match.group(1)
            #print(p, " value: ", value)
            file_dict[p] = value
        else: #return None
            print(filename, "No match found.")

    all_fileparam_dicts.append(file_dict)

In [62]:
#### APPLY gather_values() to ALL HDR FILES in the ESTABLISHED dir

for file in os.listdir(dir):
    if file.endswith(".hdr") and not file.startswith("._"):
        filename = os.path.splitext(file)[0]
        filepath = os.path.join(dir, file)
        with open(filepath, "r", encoding="iso-8859-1") as f:
            hdr_content = f.read()
            #print(hdr_content) # read out of all hdr content
            gather_values(hdr_content, file_parameters_from_hdr_files)
        

In [63]:
underway_IFCB_hdr_output = pd.DataFrame(all_fileparam_dicts)
underway_IFCB_hdr_output

Unnamed: 0,Filename,FileComment,triggerCount,roiCount,runTime,inhibitTime,SyringeSampleVolume,syringeSamplingSpeed,temperature,RunFastFactor,sampleNumber,runSampleFast
0,D20240401T151712_IFCB206,ar82a_underway,5811,5748,1200.9409722222222,483.21538194444446,5,20,11.743960479133293,5,1,False
1,D20240401T173608_IFCB206,ar82a_underway,7454,7347,1200.950138888889,620.1347916666666,5,20,10.679491111619754,5,1,False
2,D20240401T180004_IFCB206,ar82a_underway,6899,6745,1201.868888888889,575.4369444444444,5,20,10.5730441748684,5,2,False
3,D20240401T182359_IFCB206,ar82a_underway,6622,6448,1202.1619444444445,551.6087847222223,5,20,10.466597238117046,5,3,False
4,D20240401T184754_IFCB206,ar82a_underway,6347,6354,1202.0315277777777,528.6975694444444,5,20,10.466597238117046,5,4,False
...,...,...,...,...,...,...,...,...,...,...,...,...
879,D20240420T205326_IFCB206,ar82b_underway,4016,3676,1200.7993055555555,335.27739583333334,5,20,14.618027771419861,5,198,False
880,D20240420T211721_IFCB206,ar82b_underway,4221,4079,1200.903611111111,352.7632291666667,5,20,14.40679713130389,5,199,False
881,D20240420T214115_IFCB206,ar82b_underway,3502,3408,1201.4794444444444,293.83399305555554,5,20,14.192240024414446,5,200,False
882,D20240420T220509_IFCB206,ar82b_underway,3880,3804,1200.9676388888888,326.8259722222222,5,20,14.085793087663092,5,201,False


In [33]:
timestamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
output_filename = f"HDR_Summaries/underway_ifcb_hdr_summaries/ifcb_underway_hdr_summary_{timestamp}.csv"
underway_IFCB_hdr_output.to_csv(output_filename, index=False)

#### Adding Volume Analyzed (& lookTime, flowRate, and runSampleFast_Int) to the output dataframe

These can also be added via Excel functions rather than with this script.

In [64]:
# Pull date and time from the filename
underway_IFCB_hdr_output['Datetime'] = pd.to_datetime(underway_IFCB_hdr_output['Filename'].str[1:15], format='%Y%m%dT%H%M%S')

# Ensure needed numeric values are numeric and not strings
underway_IFCB_hdr_output['runTime'] = pd.to_numeric(underway_IFCB_hdr_output['runTime'], errors='coerce')
underway_IFCB_hdr_output['inhibitTime'] = pd.to_numeric(underway_IFCB_hdr_output['inhibitTime'], errors='coerce')
underway_IFCB_hdr_output['syringeSamplingSpeed'] = pd.to_numeric(underway_IFCB_hdr_output['syringeSamplingSpeed'], errors='coerce')
underway_IFCB_hdr_output['SyringeSampleVolume'] = pd.to_numeric(underway_IFCB_hdr_output['SyringeSampleVolume'], errors='coerce')
underway_IFCB_hdr_output['RunFastFactor'] = pd.to_numeric(underway_IFCB_hdr_output['RunFastFactor'], errors='coerce')


# Create a column for lookTime
underway_IFCB_hdr_output['lookTime'] = underway_IFCB_hdr_output['runTime'] - underway_IFCB_hdr_output['inhibitTime']


# Create a column for runSampleFast_Int
## if runsamplefast = false then runsamplefast==1
underway_IFCB_hdr_output['runSampleFast_Int'] = (underway_IFCB_hdr_output['runSampleFast'].str.lower() != 'true').astype(int)
underway_IFCB_hdr_output['runSampleFast_Int']

    
# Create a column for flowRate_mins ()
## syringeSamplingSpeed (usually 20 mins), SyringeSampleVolume (usually 5 ml)
underway_IFCB_hdr_output['flowRate_mins'] = underway_IFCB_hdr_output['SyringeSampleVolume'] / underway_IFCB_hdr_output['syringeSamplingSpeed']


# Create a column for volumeAnalyzed
underway_IFCB_hdr_output['volumeAnalyzed'] = (underway_IFCB_hdr_output['RunFastFactor'] * underway_IFCB_hdr_output['runSampleFast_Int']) * underway_IFCB_hdr_output['flowRate_mins'] * (underway_IFCB_hdr_output['lookTime']/60)/5

underway_IFCB_hdr_output

Unnamed: 0,Filename,FileComment,triggerCount,roiCount,runTime,inhibitTime,SyringeSampleVolume,syringeSamplingSpeed,temperature,RunFastFactor,sampleNumber,runSampleFast,Datetime,lookTime,runSampleFast_Int,flowRate_mins,volumeAnalyzed
0,D20240401T151712_IFCB206,ar82a_underway,5811,5748,1200.940972,483.215382,5,20,11.743960479133293,5,1,False,2024-04-01 15:17:01,717.725590,1,0.25,2.990523
1,D20240401T173608_IFCB206,ar82a_underway,7454,7347,1200.950139,620.134792,5,20,10.679491111619754,5,1,False,2024-04-01 17:36:00,580.815347,1,0.25,2.420064
2,D20240401T180004_IFCB206,ar82a_underway,6899,6745,1201.868889,575.436944,5,20,10.5730441748684,5,2,False,2024-04-01 18:00:00,626.431944,1,0.25,2.610133
3,D20240401T182359_IFCB206,ar82a_underway,6622,6448,1202.161944,551.608785,5,20,10.466597238117046,5,3,False,2024-04-01 18:23:05,650.553160,1,0.25,2.710638
4,D20240401T184754_IFCB206,ar82a_underway,6347,6354,1202.031528,528.697569,5,20,10.466597238117046,5,4,False,2024-04-01 18:47:05,673.333958,1,0.25,2.805558
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
879,D20240420T205326_IFCB206,ar82b_underway,4016,3676,1200.799306,335.277396,5,20,14.618027771419861,5,198,False,2024-04-20 20:53:02,865.521910,1,0.25,3.606341
880,D20240420T211721_IFCB206,ar82b_underway,4221,4079,1200.903611,352.763229,5,20,14.40679713130389,5,199,False,2024-04-20 21:17:02,848.140382,1,0.25,3.533918
881,D20240420T214115_IFCB206,ar82b_underway,3502,3408,1201.479444,293.833993,5,20,14.192240024414446,5,200,False,2024-04-20 21:41:01,907.645451,1,0.25,3.781856
882,D20240420T220509_IFCB206,ar82b_underway,3880,3804,1200.967639,326.825972,5,20,14.085793087663092,5,201,False,2024-04-20 22:05:00,874.141667,1,0.25,3.642257


In [18]:
# SAVE THE HDR SUMMARY FILE CONTAINING THESE CALCULATED VALUES
timestamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
output_filename = f"HDR_Summaries/underway_ifcb_hdr_summaries/ifcb_underway_hdr_summary_with_calcd_values_{timestamp}.csv"
underway_IFCB_hdr_output.to_csv(output_filename, index=False)

#### Adding Ship Latitude and Longitude

In [65]:
# This file is created via the Ship underway Lat & Lon Processing.ipynb notebook
underway_file = "Merged_Ship_Underway_Files/mergedunderway_AR82.csv"
#underway_ship_data = pd.read_csv(underway_file, dtype={'Datetime_GMT_underway': 'datetime64[ns]'})
underway_ship_data = pd.read_csv(underway_file, parse_dates=['Datetime_GMT_underway'], low_memory=False)
underway_ship_data['Datetime_GMT_underway'] = pd.to_datetime(underway_ship_data['Datetime_GMT_underway'], errors='coerce')
underway_ship_data['Datetime_GMT_underway'].dtype


# Uncomment these two following lines if you want to look at faulty dates that might be present in the ship data
#invalid_dates = underway_ship_data[underway_ship_data['Datetime_GMT_underway'].isna()]
#print(invalid_dates)

dtype('<M8[ns]')

In [66]:
#underway_ship_data['Datetime_GMT_underway'] = underway_ship_data['Datetime_GMT_underway'].dt.strftime('%Y-%m-%d %H:%M:%S')
underway_IFCB_hdr_output['Datetime']

0     2024-04-01 15:17:01
1     2024-04-01 17:36:00
2     2024-04-01 18:00:00
3     2024-04-01 18:23:05
4     2024-04-01 18:47:05
              ...        
879   2024-04-20 20:53:02
880   2024-04-20 21:17:02
881   2024-04-20 21:41:01
882   2024-04-20 22:05:00
883   2024-04-20 22:29:00
Name: Datetime, Length: 884, dtype: datetime64[ns]

In [67]:
# Sort both dfs by date
underway_IFCB_hdr_output = underway_IFCB_hdr_output.sort_values('Datetime')
underway_ship_data = underway_ship_data.sort_values('Datetime_GMT_underway')

# Drop any rows in the ship data that do not have datetimes
underway_ship_data = underway_ship_data.dropna(subset=['Datetime_GMT_underway'])

# Merge the underway ship data to the IFCB HDR summary df
hdr_summary_with_lat_and_lon = pd.merge_asof(underway_IFCB_hdr_output, underway_ship_data, left_on='Datetime', right_on='Datetime_GMT_underway', direction='nearest')

In [68]:
# Display the df containing all hdr summary information and ship lat and lon
hdr_summary_with_lat_and_lon

Unnamed: 0,Filename,FileComment,triggerCount,roiCount,runTime,inhibitTime,SyringeSampleVolume,syringeSamplingSpeed,temperature,RunFastFactor,...,FLR,FLOW,SSVdslog,Depth12,Depth35,EM122,EM710,COG,Datetime_GMT_underway,Datetime_UTC_underway
0,D20240401T151712_IFCB206,ar82a_underway,5811,5748,1200.940972,483.215382,5,20,11.743960479133293,5,...,124.4,0.0,1438.982,NAN,NAN,NAN,NAN,,2024-04-01 13:13:33.061,2024-04-01 13:13:33.061000+00:00
1,D20240401T173608_IFCB206,ar82a_underway,7454,7347,1200.950139,620.134792,5,20,10.679491111619754,5,...,124.4,0.0,1438.982,NAN,NAN,NAN,NAN,,2024-04-01 13:13:33.061,2024-04-01 13:13:33.061000+00:00
2,D20240401T180004_IFCB206,ar82a_underway,6899,6745,1201.868889,575.436944,5,20,10.5730441748684,5,...,124.4,0.0,1438.982,NAN,NAN,NAN,NAN,,2024-04-01 13:13:33.061,2024-04-01 13:13:33.061000+00:00
3,D20240401T182359_IFCB206,ar82a_underway,6622,6448,1202.161944,551.608785,5,20,10.466597238117046,5,...,124.4,0.0,1438.982,NAN,NAN,NAN,NAN,,2024-04-01 13:13:33.061,2024-04-01 13:13:33.061000+00:00
4,D20240401T184754_IFCB206,ar82a_underway,6347,6354,1202.031528,528.697569,5,20,10.466597238117046,5,...,270.8,30.8,1472.697,NAN,NAN,NAN,NAN,,2024-04-02 00:00:47.596,2024-04-02 00:00:47.596000+00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
879,D20240420T205326_IFCB206,ar82b_underway,4016,3676,1200.799306,335.277396,5,20,14.618027771419861,5,...,679.8,58.9,1477.116,NAN,NAN,NAN,NAN,,2024-04-20 20:52:49.317,2024-04-20 20:52:49.317000+00:00
880,D20240420T211721_IFCB206,ar82b_underway,4221,4079,1200.903611,352.763229,5,20,14.40679713130389,5,...,399.2,59.4,1474.546,NAN,NAN,NAN,NAN,,2024-04-20 21:16:49.319,2024-04-20 21:16:49.319000+00:00
881,D20240420T214115_IFCB206,ar82b_underway,3502,3408,1201.479444,293.833993,5,20,14.192240024414446,5,...,447.6,59.9,1475.68,NAN,NAN,NAN,NAN,,2024-04-20 21:40:49.320,2024-04-20 21:40:49.320000+00:00
882,D20240420T220509_IFCB206,ar82b_underway,3880,3804,1200.967639,326.825972,5,20,14.085793087663092,5,...,416.3,59.9,1475.813,NAN,NAN,NAN,NAN,,2024-04-20 22:04:49.322,2024-04-20 22:04:49.322000+00:00


In [60]:
# Save the df containing all hdr summary information and ship lat and lon
timestamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
output_filename = f"HDR_Summaries/underway_ifcb_hdr_summaries/ifcb_underway_hdr_summary_with_lat_lon_{timestamp}.csv"
hdr_summary_with_lat_and_lon.to_csv(output_filename, index=False)

#### Adding Site (from Ship Latitude and Longitude)

In [69]:
mab_site_centers = {
    'site': ['CP10CNSM', 'CP11NOSM', 'CP11SOSM', 'CP12CNSW', 'CP12WESW', 'CP13EAPM', 
             'CP13NOPM', 'CP13SOPM', 'CP14NEPM', 'CP14SEPM'],
    'lat': [35.95, 36.175, 35.725, 35.95, 35.95, 35.95, 36.175, 35.725, 36.0536, 35.8514],
    'lon': [-75.125, -74.8267, -74.853, -75.125, -75.3333, -74.8457, -74.8267, -74.853, 
            -74.7776, -74.8482]
}

centers_df = pd.DataFrame(mab_site_centers)

In [70]:
hdr_summary_with_lat_and_lon[" Dec_LON"]

0       -70.672
1       -70.672
2       -70.672
3       -70.672
4       -70.882
         ...   
879      -71.07
880     -70.987
881     -70.907
882     -70.885
883     -70.882
Name:  Dec_LON, Length: 884, dtype: object

In [71]:
#ifcb_lat = hdr_summary_with_lat_and_lon[" Dec_LAT"]
#ifcb_lon = hdr_summary_with_lat_and_lon[" Dec_LON"]

hdr_summary_with_lat_and_lon_and_sites = hdr_summary_with_lat_and_lon

def check_within_radius(ifcb_lat, ifcb_lon, centers_df, radius):
    for _, center in centers_df.iterrows():
        center_lat = center['lat']
        center_lon = center['lon']
        center_name = center.get('site', 'Unnamed Site')

        # Calculate distance
        distance = geodesic((ifcb_lat, ifcb_lon), (center_lat, center_lon)).kilometers
        if distance <= radius:
            return True, center_name  # Return True and the site name if within radius
    return False, None  # Return False if not within any radius

# Define a wrapper function to apply row-wise
def apply_check_within_radius(row):
    return check_within_radius(row[' Dec_LAT'], row[' Dec_LON'], centers_df, radius=2)  # radius in kilometers



# Apply function to every row in the DataFrame
hdr_summary_with_lat_and_lon_and_sites[['within_radius', 'site_name']] = hdr_summary_with_lat_and_lon_and_sites.apply(apply_check_within_radius, axis=1, result_type="expand")

In [72]:
hdr_summary_with_lat_and_lon_and_sites

Unnamed: 0,Filename,FileComment,triggerCount,roiCount,runTime,inhibitTime,SyringeSampleVolume,syringeSamplingSpeed,temperature,RunFastFactor,...,SSVdslog,Depth12,Depth35,EM122,EM710,COG,Datetime_GMT_underway,Datetime_UTC_underway,within_radius,site_name
0,D20240401T151712_IFCB206,ar82a_underway,5811,5748,1200.940972,483.215382,5,20,11.743960479133293,5,...,1438.982,NAN,NAN,NAN,NAN,,2024-04-01 13:13:33.061,2024-04-01 13:13:33.061000+00:00,False,
1,D20240401T173608_IFCB206,ar82a_underway,7454,7347,1200.950139,620.134792,5,20,10.679491111619754,5,...,1438.982,NAN,NAN,NAN,NAN,,2024-04-01 13:13:33.061,2024-04-01 13:13:33.061000+00:00,False,
2,D20240401T180004_IFCB206,ar82a_underway,6899,6745,1201.868889,575.436944,5,20,10.5730441748684,5,...,1438.982,NAN,NAN,NAN,NAN,,2024-04-01 13:13:33.061,2024-04-01 13:13:33.061000+00:00,False,
3,D20240401T182359_IFCB206,ar82a_underway,6622,6448,1202.161944,551.608785,5,20,10.466597238117046,5,...,1438.982,NAN,NAN,NAN,NAN,,2024-04-01 13:13:33.061,2024-04-01 13:13:33.061000+00:00,False,
4,D20240401T184754_IFCB206,ar82a_underway,6347,6354,1202.031528,528.697569,5,20,10.466597238117046,5,...,1472.697,NAN,NAN,NAN,NAN,,2024-04-02 00:00:47.596,2024-04-02 00:00:47.596000+00:00,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
879,D20240420T205326_IFCB206,ar82b_underway,4016,3676,1200.799306,335.277396,5,20,14.618027771419861,5,...,1477.116,NAN,NAN,NAN,NAN,,2024-04-20 20:52:49.317,2024-04-20 20:52:49.317000+00:00,False,
880,D20240420T211721_IFCB206,ar82b_underway,4221,4079,1200.903611,352.763229,5,20,14.40679713130389,5,...,1474.546,NAN,NAN,NAN,NAN,,2024-04-20 21:16:49.319,2024-04-20 21:16:49.319000+00:00,False,
881,D20240420T214115_IFCB206,ar82b_underway,3502,3408,1201.479444,293.833993,5,20,14.192240024414446,5,...,1475.68,NAN,NAN,NAN,NAN,,2024-04-20 21:40:49.320,2024-04-20 21:40:49.320000+00:00,False,
882,D20240420T220509_IFCB206,ar82b_underway,3880,3804,1200.967639,326.825972,5,20,14.085793087663092,5,...,1475.813,NAN,NAN,NAN,NAN,,2024-04-20 22:04:49.322,2024-04-20 22:04:49.322000+00:00,False,


In [74]:
# SAVE this VERSION of the DF CONTAINING ALL SUMMARY DATA, LAT, LON, and SITE NAMES
timestamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
output_filename = f"HDR_Summaries/underway_ifcb_hdr_summaries/ifcb_underway_hdr_summary_with_lat_lon_and_sites_{timestamp}.csv"
hdr_summary_with_lat_and_lon_and_sites.to_csv(output_filename, index=False)