In [None]:
#!pip install geopy #run this line if geopy module is not found
#!pip install chardet #run this line if chardet module is not found

import pandas as pd
import re
import os
import pytz
from datetime import datetime
from geopy.distance import geodesic
import chardet #character encoding detector

### Processing ship underway data to obtain lat and lon values

In [None]:
#### USER CONFIGURATION - edit the three values below for each cruise
    # 1) the directory containing leg 1 underway data
    # 2) the directory containing leg 2 underway data
    # 3) the cruise number - this is just used for the csv name saved at the end of the notebook
# NOTE: the IFCB header directory (`dir`, set in the "Processing raw underway IFCB data" section)
#       is configured separately and must also be updated for each cruise.
# NOTE(review): the values here refer to AR82 while the IFCB directory further down points at
#       AR87 data - confirm that the ship data, IFCB data and cruise_name all belong to the same cruise.

#the directories containing the csv files from the underway system
##each file is a day, and the file is data collected every minute
dir_leg_1 = "/AR82-Pioneer20/AR82_armstrong_underway_data/leg1/proc" # change me (1)
dir_leg_2 = "/AR82-Pioneer20/AR82_armstrong_underway_data/leg2/proc" # change me (2)
cruise_name = "AR82" # change me (3) 

In [None]:
#the regex pattern that will match the correct underway files
##the dot is escaped and the pattern is anchored at the end, so only names that end in
##exactly "_0000.csv" are accepted (not e.g. "AR..._0000.csv.bak" or "AR..._0000xcsv")
file_name_pattern = r'AR\d*_0000\.csv$'
#compile the regex pattern (it is used with .match(), which anchors at the start of the name)
pattern_regex = re.compile(file_name_pattern)

In [None]:
def _list_underway_csvs(directory):
    """Return the names of the files in `directory` that match `pattern_regex`, sorted by name.

    Sorting makes the read order reproducible; os.listdir() returns names in arbitrary order.
    """
    return sorted(name for name in os.listdir(directory) if pattern_regex.match(name))


def _read_underway_csvs(directory, file_names):
    """Read each named csv in `directory` and return the dataframes as a list (one per file)."""
    # header=1: the column names are taken from the second line of each file
    return [pd.read_csv(os.path.join(directory, name), header=1) for name in file_names]


#get a list of all csv files in each leg directory matching the pattern above
leg_1_csv_files = _list_underway_csvs(dir_leg_1)
leg_2_csv_files = _list_underway_csvs(dir_leg_2)

#read every file as a df (leg 1 first, then leg 2) and collect them before the merge
dfs = _read_underway_csvs(dir_leg_1, leg_1_csv_files) + _read_underway_csvs(dir_leg_2, leg_2_csv_files)

#fail with a readable message instead of pandas' generic "No objects to concatenate"
if not dfs:
    raise ValueError(
        f"No underway csv files matching {file_name_pattern!r} were found in "
        f"{dir_leg_1!r} or {dir_leg_2!r}"
    )

#concatenate all dfs in the list into a single df
merged_ship_underway_df = pd.concat(dfs, ignore_index=True)

In [None]:
# Display the merged ship underway dataframe (all files from both legs) for a visual check
merged_ship_underway_df

In [None]:
# convert the datetime format
def gmt_to_utc(gmt_datetime_str):
    """Parse a GMT timestamp string and return it as a timezone-aware UTC datetime.

    The string is tried against each supported layout in turn; the first layout that
    parses is used. The parsed value is localized to GMT and then converted to UTC.

    Raises
    ------
    ValueError
        If the string matches none of the supported layouts.
    """
    gmt_zone = pytz.timezone('GMT')
    supported_layouts = (
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y/%m/%d %H:%M:%S.%f',
    )

    for layout in supported_layouts:
        try:
            naive_value = datetime.strptime(gmt_datetime_str, layout)
            return gmt_zone.localize(naive_value).astimezone(pytz.utc)
        except ValueError:
            # this layout did not fit; fall through to the next one
            pass

    raise ValueError(f"Time data '{gmt_datetime_str}' does not match any known format")

def _gmt_to_utc_or_nat(value):
    """Apply gmt_to_utc(), returning NaT for timestamps that are missing or unparseable.

    strptime raises TypeError for non-string input (e.g. NaN) and gmt_to_utc raises
    ValueError for unknown layouts; both are mapped to NaT, mirroring the
    errors='coerce' handling used when this column is parsed later in the notebook.
    """
    try:
        return gmt_to_utc(value)
    except (TypeError, ValueError):
        return pd.NaT


# The ship dataframe built above is named merged_ship_underway_df; the previous name used
# here (merged_underway_df) is not defined anywhere in the notebook and raised a NameError.
# NOTE(review): 'Datetime_GMT_underway' is not created in any cell shown above - presumably
# it comes straight from the csv header; confirm.
merged_ship_underway_df['Datetime_UTC_underway'] = merged_ship_underway_df['Datetime_GMT_underway'].apply(_gmt_to_utc_or_nat)
merged_ship_underway_df

### Processing raw underway IFCB data

In [None]:
# Folder where the underway IFCB data (.hdr files) is stored
# NOTE: the name `dir` shadows the Python builtin dir(); it is kept because later cells read it
dir = "/Volumes/My Passport/IFCB_PLIMS/IFCB_Cruise_data/Pioneer21_AR87_IFCB_data/AR87_shipboard_underway_ifcb200"

# Parameters to pull out of each header file (these are the columns needed in the IFCB log)
file_parameters_from_hdr_files = [
    "FileComment",
    "triggerCount",
    "roiCount",
    "runTime",
    "inhibitTime",
    "SyringeSampleVolume",
    "syringeSamplingSpeed",
    "temperature",
    "RunFastFactor",
    "sampleNumber",
    "runSampleFast",
]

# Accumulator list: one dict per header file, holding the filename/sample id and its parameters
all_fileparam_dicts = list()

In [None]:
#### DECLARE the FUNCTION USED for PATTERN MATCHING within the HEADER FILE

def gather_values(text_content, param_list, sample_id=None):
    """Extract "<param>: <value>" entries from the text of one header file.

    Parameters
    ----------
    text_content : str
        Full text of a single header file.
    param_list : list of str
        Parameter names to look for; each name is matched literally.
    sample_id : str, optional
        Value stored under the "Filename" key. When omitted, the notebook-level
        variable `filename` is used, which is how the existing calling cell works.

    Returns
    -------
    dict
        {"Filename": ..., <param>: <value as str>, ...} containing only the parameters
        that were found. The same dict is also appended to the notebook-level list
        `all_fileparam_dicts`.
    """
    if sample_id is None:
        sample_id = filename  # fall back to the notebook-level name set by the caller

    file_dict = {"Filename": sample_id}
    for p in param_list:
        pattern = re.compile(r'{}:\s*(.+)'.format(re.escape(p)))
        # Search the text that was passed in. Previously this searched the global
        # `hdr_content`, so the `text_content` argument was silently ignored.
        dynamic_match = pattern.search(text_content)

        if dynamic_match:
            file_dict[p] = dynamic_match.group(1)
        else:
            # the parameter is simply left out of the dict when it is not present
            print(sample_id, "No match found.")

    all_fileparam_dicts.append(file_dict)
    return file_dict

In [None]:
#### APPLY gather_values() to ALL HDR FILES in the ESTABLISHED dir

# Start from an empty list so that re-running this cell does not append duplicate rows
all_fileparam_dicts.clear()

# sorted() gives a reproducible processing order (os.listdir() order is arbitrary)
for file in sorted(os.listdir(dir)):
    # only header files; names starting with "._" are skipped
    # (presumably macOS sidecar files on the external drive - confirm)
    if file.endswith(".hdr") and not file.startswith("._"):
        # gather_values() reads this notebook-level name to label the row
        filename = os.path.splitext(file)[0]
        filepath = os.path.join(dir, file)
        with open(filepath, "r", encoding="iso-8859-1") as f:
            hdr_content = f.read()
        # the file is already closed here; only the text is needed for matching
        gather_values(hdr_content, file_parameters_from_hdr_files)

In [None]:
# Build one dataframe row per header file from the collected dicts
# (a parameter that was not found in a file is absent from its dict and shows up as NaN)
underway_IFCB_hdr_output = pd.DataFrame(all_fileparam_dicts)
# Display the resulting header summary
underway_IFCB_hdr_output

### Adding volume analyzed (& lookTime, flowRate, and runSampleFast_Int) to the output dataframe

In [None]:
# Work on a copy so the raw header summary (underway_IFCB_hdr_output) is left untouched
underway_IFCB_output_with_calcs = underway_IFCB_hdr_output.copy()

# Pull date and time from the filename
## the timestamp 'YYYYmmddTHHMMSS' is 15 characters long and starts after the 1-character
## prefix, so the slice must be [1:16]; the previous [1:15] dropped the last seconds digit
underway_IFCB_output_with_calcs['Datetime'] = pd.to_datetime(
    underway_IFCB_output_with_calcs['Filename'].str[1:16], format='%Y%m%dT%H%M%S'
)

# Ensure needed numeric values are numeric and not strings (unparseable values become NaN)
numeric_hdr_columns = ['runTime', 'inhibitTime', 'syringeSamplingSpeed', 'SyringeSampleVolume', 'RunFastFactor']
for column_name in numeric_hdr_columns:
    underway_IFCB_output_with_calcs[column_name] = pd.to_numeric(
        underway_IFCB_output_with_calcs[column_name], errors='coerce'
    )

# Create a column for lookTime
underway_IFCB_output_with_calcs['lookTime'] = underway_IFCB_output_with_calcs['runTime'] - underway_IFCB_output_with_calcs['inhibitTime']

# Create a column for runSampleFast_Int
## 1 when runSampleFast is anything other than 'true' (case-insensitive), 0 when it is 'true'
underway_IFCB_output_with_calcs['runSampleFast_Int'] = (underway_IFCB_output_with_calcs['runSampleFast'].str.lower() != 'true').astype(int)

# Create a column for flowRate_mins ()
## syringeSamplingSpeed (usually 20 mins), SyringeSampleVolume (usually 5 ml)
underway_IFCB_output_with_calcs['flowRate_mins'] = underway_IFCB_output_with_calcs['SyringeSampleVolume'] / underway_IFCB_output_with_calcs['syringeSamplingSpeed']

# Create a column for volumeAnalyzed
## NOTE(review): runSampleFast_Int is 0 when runSampleFast is 'true', so volumeAnalyzed
## evaluates to 0 for those rows, and the trailing /5 is not explained anywhere in this
## notebook - confirm both against the intended volume formula.
underway_IFCB_output_with_calcs['volumeAnalyzed'] = (underway_IFCB_output_with_calcs['RunFastFactor'] * underway_IFCB_output_with_calcs['runSampleFast_Int']) * underway_IFCB_output_with_calcs['flowRate_mins'] * (underway_IFCB_output_with_calcs['lookTime']/60)/5


In [None]:
# Display the header summary with the added Datetime, lookTime, runSampleFast_Int,
# flowRate_mins and volumeAnalyzed columns
underway_IFCB_output_with_calcs

### Adding ship latitude and longitude values to IFCB output

In [1]:
# this is the name of the underway ship data df created above within this notebook: merged_ship_underway_df

# Parse the timestamp column into datetimes; values that cannot be parsed become NaT
merged_ship_underway_df['Datetime_GMT_underway'] = pd.to_datetime(merged_ship_underway_df['Datetime_GMT_underway'], errors='coerce')

# Uncomment these two following lines if you want to look at faulty/missing dates that might be present in the ship data, these can appear during GPS outages
#invalid_dates = merged_ship_underway_df[merged_ship_underway_df['Datetime_GMT_underway'].isna()]
#print(invalid_dates)

# Display the ship dataframe (only the last expression of a cell is rendered)
merged_ship_underway_df

### Join the lat/lon values from the ship system data to the HDR plus calcs data

In [None]:
# Put the IFCB samples in chronological order
underway_IFCB_output_with_calcs = underway_IFCB_output_with_calcs.sort_values(by='Datetime')

# Ship data: order by timestamp, then discard the rows that have no timestamp
merged_ship_underway_df = (
    merged_ship_underway_df
    .sort_values(by='Datetime_GMT_underway')
    .dropna(subset=['Datetime_GMT_underway'])
)

# For every IFCB sample, attach the ship record whose timestamp is nearest to the sample time
hdr_summary_with_lat_and_lon = pd.merge_asof(
    underway_IFCB_output_with_calcs,
    merged_ship_underway_df,
    left_on='Datetime',
    right_on='Datetime_GMT_underway',
    direction='nearest',
)

In [None]:
# Display the df containing all hdr summary information and ship lat and lon
# (one row per IFCB sample, joined to the ship record nearest in time)
hdr_summary_with_lat_and_lon

### Identifying and adding OOI sites to the df using ship lat/lon values

In [None]:
# (site, latitude, longitude) for each site centre, one site per row
# NOTE(review): several sites share identical coordinates (e.g. CP10CNSM and CP12CNSW);
# confirm that is intended, since a position lookup cannot tell such sites apart.
_site_center_rows = [
    ('CP10CNSM', 35.95, -75.125),
    ('CP11NOSM', 36.175, -74.8267),
    ('CP11SOSM', 35.725, -74.853),
    ('CP12CNSW', 35.95, -75.125),
    ('CP12WESW', 35.95, -75.3333),
    ('CP13EAPM', 35.95, -74.8457),
    ('CP13NOPM', 36.175, -74.8267),
    ('CP13SOPM', 35.725, -74.853),
    ('CP14NEPM', 36.0536, -74.7776),
    ('CP14SEPM', 35.8514, -74.8482),
]

# Column-oriented form of the table above
mab_site_centers = {
    'site': [name for name, _, _ in _site_center_rows],
    'lat': [lat for _, lat, _ in _site_center_rows],
    'lon': [lon for _, _, lon in _site_center_rows],
}

centers_df = pd.DataFrame(mab_site_centers)

In [None]:
# Quick look at the merged column names and the ship position columns.
# NOTE: only the last expression of a cell is rendered, so only ' CNAV_LAT' is shown here;
# the two expressions before it are evaluated and their results discarded.
# The column names begin with a space, exactly as they appear in the merged dataframe.
hdr_summary_with_lat_and_lon.columns.tolist()
#hdr_summary_with_lat_and_lon[" Dec_LON"] - this was used for AR82, but this col is not in the AR87 files
hdr_summary_with_lat_and_lon[' CNAV_LON']
hdr_summary_with_lat_and_lon[' CNAV_LAT']

In [None]:
# Ship latitude / longitude columns of the merged dataframe
# (the functions below take per-row values through their own parameters of the same name)
ifcb_lat = hdr_summary_with_lat_and_lon[" CNAV_LAT"]
ifcb_lon = hdr_summary_with_lat_and_lon[" CNAV_LON"]

# Copy rather than alias, so that adding the site columns does not also modify
# hdr_summary_with_lat_and_lon
hdr_summary_with_lat_and_lon_and_sites = hdr_summary_with_lat_and_lon.copy()

def check_within_radius(ifcb_lat, ifcb_lon, centers_df, radius):
    """Find the first site centre that lies within `radius` kilometres of a position.

    Parameters
    ----------
    ifcb_lat, ifcb_lon : float
        Position to test. A missing value (NaN / None) is treated as "not at any site".
    centers_df : DataFrame
        One row per site with 'lat' and 'lon' columns and, optionally, a 'site' column
        ('Unnamed Site' is used when that column is absent).
    radius : float
        Search radius in kilometres.

    Returns
    -------
    tuple
        (True, site_name) for the first row of `centers_df` within the radius,
        otherwise (False, None).
    """
    # A position with a missing coordinate cannot be matched to a site; return early
    # instead of passing NaN/None on to the distance calculation.
    if pd.isna(ifcb_lat) or pd.isna(ifcb_lon):
        return False, None

    for _, center in centers_df.iterrows():
        center_lat = center['lat']
        center_lon = center['lon']
        center_name = center.get('site', 'Unnamed Site')

        # Calculate distance
        distance = geodesic((ifcb_lat, ifcb_lon), (center_lat, center_lon)).kilometers
        if distance <= radius:
            return True, center_name  # Return True and the site name if within radius
    return False, None  # Return False if not within any radius

# Define a wrapper function to apply row-wise
def apply_check_within_radius(row, radius=2, lat_col=' CNAV_LAT', lon_col=' CNAV_LON'):
    """Run check_within_radius() for one dataframe row against the notebook-level centers_df.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must contain the latitude and longitude columns named below.
    radius : float, default 2
        Search radius in kilometres.
    lat_col, lon_col : str
        Names of the latitude / longitude columns. The defaults are the columns this
        notebook uses; pass other names when a cruise's files label them differently.

    Returns
    -------
    tuple
        Whatever check_within_radius() returns for this row's position.
    """
    return check_within_radius(row[lat_col], row[lon_col], centers_df, radius=radius)  # radius in kilometers



# Run the site check on every row; result_type="expand" spreads each returned
# (within_radius, site_name) tuple across two columns
site_check_results = hdr_summary_with_lat_and_lon_and_sites.apply(
    apply_check_within_radius,
    axis=1,
    result_type="expand",
)

# Store the two result columns on the dataframe
hdr_summary_with_lat_and_lon_and_sites[['within_radius', 'site_name']] = site_check_results

In [None]:
# Display the summary with the added 'within_radius' and 'site_name' columns
hdr_summary_with_lat_and_lon_and_sites

In [None]:
# SAVE this VERSION of the DF CONTAINING ALL SUMMARY DATA, LAT, LON, and SITE NAMES
## the file name carries the cruise name and a timestamp, so earlier exports are not overwritten
timestamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
output_dir = "HDR_Summaries/underway_ifcb_hdr_summaries"
## create the output folder if it is missing; to_csv does not create directories itself
os.makedirs(output_dir, exist_ok=True)
output_filename = f"{output_dir}/{cruise_name}_ifcb_underway_hdr_summary_with_lat_lon_and_sites_{timestamp}.csv"
hdr_summary_with_lat_and_lon_and_sites.to_csv(output_filename, index=False)