In [1]:
import os
import pandas as pd
from datetime import datetime

In [10]:
# Read in and standardize Underway data
processed_underway_ifcb_data = pd.read_excel('IFCB_Logs/Pioneer-21_AR87_IFCB_Log_2025-06-05_Ver_1-00.xlsx', sheet_name=1, header=3)
processed_underway_ifcb_data = processed_underway_ifcb_data.fillna('')
processed_underway_ifcb_data.columns = processed_underway_ifcb_data.columns.str.strip()


# Read in and standardize Discrete data
processed_discrete_ifcb_data = pd.read_excel('IFCB_Logs/Pioneer-21_AR87_IFCB_Log_2025-06-05_Ver_1-00.xlsx', sheet_name=2, header=3)
processed_discrete_ifcb_data = processed_discrete_ifcb_data.fillna('')
processed_discrete_ifcb_data.columns = processed_discrete_ifcb_data.columns.str.strip()

# Define the folder with your .bl files
bottle_folder_path = '/Users/sawyer/Documents/GitHub/ifcb-parsing-processing-master/IFCB_Log_Notebooks/Bottle_file_copies/AR87/'

cruise = 'AR87' # format example: AR82
ifcb_instrument_num = 'IFCB200' # format: IFCBXXX

#this is the only block that contains variables that need to be updated
processed_discrete_ifcb_data.columns.tolist()

['Filename',
 'HDR Comment',
 'Sample Type',
 'Cruise Leg',
 'Site',
 'Cast',
 'Niskin',
 'IFCB Bottle',
 'Target Cast Depth',
 'Trip Depth',
 '# Triggers',
 '# ROIs',
 'Run time',
 'Inhibit time',
 'Sample time',
 'Volume Analyzed',
 'ROIs/ml',
 'Cast Start Latitude',
 'Cast Start Longitude',
 'Notes']

In [11]:
# Get datetimes from bottle data files


# Get list of all .bl files
bl_files = [os.path.join(bottle_folder_path, f) for f in os.listdir(bottle_folder_path) if f.lower().endswith('.bl')]

# Initialize list to store all rows
all_bottle_data = []

for filepath in bl_files:
    filename = os.path.basename(filepath)
    
    # Extract cruise and cast from the filename
    try:
        cruise_portion = filename.split('_')[0]      # e.g., 'ar87a'
        cast_portion = filename.split('_')[1].split('.')[0]  # e.g., '023'
    except IndexError:
        print(f"Filename format unexpected: {filename}")
        continue

    with open(filepath, 'r') as file:
        lines = file.readlines()
    
    for line in lines:
        line = line.strip()
        
        # Skip lines that don't start with data (e.g., path or RESET lines)
        if not line or not line[0].isdigit():
            continue
        
        parts = [p.strip() for p in line.split(',')]
        if len(parts) != 5:
            continue  # Skip malformed lines

        try:
            niskin = int(parts[0])
            bottle = int(parts[1])
            dt_str = parts[2]
            start_byte = int(parts[3])
            end_byte = int(parts[4])

            # Convert to datetime
            dt = datetime.strptime(dt_str, '%b %d %Y %H:%M:%S')
            
            all_bottle_data.append({
                'cruise': cruise_portion,
                'Cast': cast_portion,
                'Niskin': niskin,
                'bottle': bottle,
                'date': dt.date(),
                'time': dt.time(),
                'datetime': dt,
                'start_byte': start_byte,
                'end_byte': end_byte,
                'source_file': filename
            })
        except Exception as e:
            print(f"Error parsing line in {filename}: {line} — {e}")

# Create DataFrame
df = pd.DataFrame(all_bottle_data)

# Preview result
print(f"Parsed {len(df)} rows from {len(bl_files)} files.")
df.head()

output_path = '/Users/sawyer/Documents/GitHub/ifcb-parsing-processing-master/IFCB_Log_Notebooks/Bottle_file_copies/merged_bottle_data_csvs/AR87_merged_bottle_data.csv'
df.to_csv(output_path, index=False)

print(f"DataFrame saved to {output_path}")


Parsed 215 rows from 58 files.
DataFrame saved to /Users/sawyer/Documents/GitHub/ifcb-parsing-processing-master/IFCB_Log_Notebooks/Bottle_file_copies/merged_bottle_data_csvs/AR87_merged_bottle_data.csv


In [12]:
processed_discrete_ifcb_data.head()

Unnamed: 0,Filename,HDR Comment,Sample Type,Cruise Leg,Site,Cast,Niskin,IFCB Bottle,Target Cast Depth,Trip Depth,# Triggers,# ROIs,Run time,Inhibit time,Sample time,Volume Analyzed,ROIs/ml,Cast Start Latitude,Cast Start Longitude,Notes
0,D20250330T214132_IFCB200,ar87a_c4n14_b1_GL_surface,discrete_water_sample,a,CP15MOAS,4,14,1,surface,2.6,5185,4008,1201.078333,429.41625,771.662083,3.215259,1246.556,35.849728,-74.80819,LTER IFCB
1,D20250330T220653_IFCB200,ar87a_c4n14_b1_GL_surface,discrete_water_sample,a,CP15MOAS,4,14,1,surface,2.6,4366,3240,1201.776111,362.114826,839.661285,3.498589,926.087714,35.849728,-74.80819,LTER IFCB
2,D20250330T223214_IFCB200,ar87a_c4n14_b1_GL_surface,discrete_water_sample,a,CP15MOAS,4,14,1,surface,2.6,4216,3120,1201.585417,349.005868,852.579549,3.552415,878.275817,35.849728,-74.80819,LTER IFCB
3,D20250330T231708_IFCB200,ar87a_c4n13_b5_gl_surface,discrete_water_sample,a,CP15MOAS,4,13,5,surface,2.9,4887,3634,1201.723889,404.575521,797.148368,3.321452,1094.099963,35.849728,-74.80819,
4,D20250330T234229_IFCB200,ar87a_c4n13_b5_gl_surface,discrete_water_sample,a,CP15MOAS,4,13,5,surface,2.9,4777,3453,1201.717222,395.546354,806.170868,3.359045,1027.970661,35.849728,-74.80819,


In [13]:
df

Unnamed: 0,cruise,Cast,Niskin,bottle,date,time,datetime,start_byte,end_byte,source_file
0,ar87b,034,1,1,2025-04-20,00:09:32,2025-04-20 00:09:32,10306,10342,ar87b_034.bl
1,ar87b,034,2,2,2025-04-20,00:11:08,2025-04-20 00:11:08,12610,12646,ar87b_034.bl
2,ar87b,010,1,1,2025-04-13,15:58:50,2025-04-13 15:58:50,4516,4552,ar87b_010.bl
3,ar87b,010,2,2,2025-04-13,16:00:56,2025-04-13 16:00:56,7556,7592,ar87b_010.bl
4,ar87b,024,1,1,2025-04-16,16:51:44,2025-04-16 16:51:44,9133,9169,ar87b_024.bl
...,...,...,...,...,...,...,...,...,...,...
210,ar87b,023,6,6,2025-04-15,14:39:26,2025-04-15 14:39:26,22413,22449,ar87b_023.bl
211,ar87b,023,7,7,2025-04-15,14:42:23,2025-04-15 14:42:23,26680,26716,ar87b_023.bl
212,ar87b,023,8,8,2025-04-15,14:43:03,2025-04-15 14:43:03,27632,27668,ar87b_023.bl
213,ar87b,023,9,9,2025-04-15,14:46:02,2025-04-15 14:46:02,31937,31973,ar87b_023.bl


In [14]:
# Make sure both are strings and pad the processed_discrete_ifcb_data to match the 3-digit format in df
processed_discrete_ifcb_data['Cast'] = processed_discrete_ifcb_data['Cast'].astype(int).astype(str).str.zfill(3)
df['Cast'] = df['Cast'].astype(str)  # Just to be safe
# Parse out leg from cruise number
df['Cruise Leg'] = df['cruise'].str.strip().str[-1]

# Now merge
merged = processed_discrete_ifcb_data.merge(
    df[['Cruise Leg', 'Cast', 'Niskin', 'datetime']],
    on=['Cruise Leg', 'Cast', 'Niskin'],
    how='left'
)

# Optional: rename for clarity
merged = merged.rename(columns={'datetime': 'bottle_datetime'})

# Preview
merged.head()



Unnamed: 0,Filename,HDR Comment,Sample Type,Cruise Leg,Site,Cast,Niskin,IFCB Bottle,Target Cast Depth,Trip Depth,...,# ROIs,Run time,Inhibit time,Sample time,Volume Analyzed,ROIs/ml,Cast Start Latitude,Cast Start Longitude,Notes,bottle_datetime
0,D20250330T214132_IFCB200,ar87a_c4n14_b1_GL_surface,discrete_water_sample,a,CP15MOAS,4,14,1,surface,2.6,...,4008,1201.078333,429.41625,771.662083,3.215259,1246.556,35.849728,-74.80819,LTER IFCB,2025-03-30 19:39:04
1,D20250330T220653_IFCB200,ar87a_c4n14_b1_GL_surface,discrete_water_sample,a,CP15MOAS,4,14,1,surface,2.6,...,3240,1201.776111,362.114826,839.661285,3.498589,926.087714,35.849728,-74.80819,LTER IFCB,2025-03-30 19:39:04
2,D20250330T223214_IFCB200,ar87a_c4n14_b1_GL_surface,discrete_water_sample,a,CP15MOAS,4,14,1,surface,2.6,...,3120,1201.585417,349.005868,852.579549,3.552415,878.275817,35.849728,-74.80819,LTER IFCB,2025-03-30 19:39:04
3,D20250330T231708_IFCB200,ar87a_c4n13_b5_gl_surface,discrete_water_sample,a,CP15MOAS,4,13,5,surface,2.9,...,3634,1201.723889,404.575521,797.148368,3.321452,1094.099963,35.849728,-74.80819,,2025-03-30 19:38:46
4,D20250330T234229_IFCB200,ar87a_c4n13_b5_gl_surface,discrete_water_sample,a,CP15MOAS,4,13,5,surface,2.9,...,3453,1201.717222,395.546354,806.170868,3.359045,1027.970661,35.849728,-74.80819,,2025-03-30 19:38:46


In [15]:
processed_discrete_ifcb_data_merged = merged
processed_discrete_ifcb_data_merged

Unnamed: 0,Filename,HDR Comment,Sample Type,Cruise Leg,Site,Cast,Niskin,IFCB Bottle,Target Cast Depth,Trip Depth,...,# ROIs,Run time,Inhibit time,Sample time,Volume Analyzed,ROIs/ml,Cast Start Latitude,Cast Start Longitude,Notes,bottle_datetime
0,D20250330T214132_IFCB200,ar87a_c4n14_b1_GL_surface,discrete_water_sample,a,CP15MOAS,004,14,1,surface,2.6,...,4008,1201.078333,429.416250,771.662083,3.215259,1246.556000,35.849728,-74.808190,LTER IFCB,2025-03-30 19:39:04
1,D20250330T220653_IFCB200,ar87a_c4n14_b1_GL_surface,discrete_water_sample,a,CP15MOAS,004,14,1,surface,2.6,...,3240,1201.776111,362.114826,839.661285,3.498589,926.087714,35.849728,-74.808190,LTER IFCB,2025-03-30 19:39:04
2,D20250330T223214_IFCB200,ar87a_c4n14_b1_GL_surface,discrete_water_sample,a,CP15MOAS,004,14,1,surface,2.6,...,3120,1201.585417,349.005868,852.579549,3.552415,878.275817,35.849728,-74.808190,LTER IFCB,2025-03-30 19:39:04
3,D20250330T231708_IFCB200,ar87a_c4n13_b5_gl_surface,discrete_water_sample,a,CP15MOAS,004,13,5,surface,2.9,...,3634,1201.723889,404.575521,797.148368,3.321452,1094.099963,35.849728,-74.808190,,2025-03-30 19:38:46
4,D20250330T234229_IFCB200,ar87a_c4n13_b5_gl_surface,discrete_water_sample,a,CP15MOAS,004,13,5,surface,2.9,...,3453,1201.717222,395.546354,806.170868,3.359045,1027.970661,35.849728,-74.808190,,2025-03-30 19:38:46
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,D20250417T200345_IFCB200,ar87b_c26n10_b2_ne_chl_max,discrete_water_sample,b,CP14NEPM,026,10,2,chl max,29.0,...,4438,1200.774306,374.131979,826.642326,3.444343,1288.489551,36.053037,-74.779623,,2025-04-17 14:55:52
95,D20250417T202905_IFCB200,ar87b_c26n10_b2_ne_chl_max,discrete_water_sample,b,CP14NEPM,026,10,2,chl max,29.0,...,4258,1201.926944,360.320729,841.606215,3.506693,1214.249588,36.053037,-74.779623,,2025-04-17 14:55:52
96,D20250417T211015_IFCB200,ar87b_c26n6_b3_ne_75m,discrete_water_sample,b,CP14NEPM,026,6,3,75,77.0,...,1764,1200.878333,158.175434,1042.702899,4.344595,406.021696,36.053037,-74.779623,LTER IFCB,2025-04-17 14:48:06
97,D20250417T213536_IFCB200,ar87b_c26n6_b3_ne_75m,discrete_water_sample,b,CP14NEPM,026,6,3,75,77.0,...,1722,1200.816806,164.729340,1036.087465,4.317031,398.885243,36.053037,-74.779623,LTER IFCB,2025-04-17 14:48:06


In [16]:
columns_in_metadata_csv = ['filename', 'Latitude', 'Longitude', 'Depth', 'sample_type', 'Cruise', 'Instrument', 'tag1', 'tag2', 'tag3', 'datetime']

processed_underway_ifcb_data_mapped = {
    'filename': processed_underway_ifcb_data['Filename'], 
    'Latitude': processed_underway_ifcb_data['Ship Latitude'],
    'Longitude': processed_underway_ifcb_data['Ship Longitude'],
    'Depth': 2.13, #for R/V Armstrong Aft Diaphram Pump
    'sample_type': 'underway',
    'Cruise': cruise,
    'Instrument': ifcb_instrument_num,
    'tag1': processed_underway_ifcb_data['Site'].apply(lambda x: 'site_' + str(x) if pd.notna(x) and x != '' else x), 
    'tag2': 'targetdepth_surface',
    'tag3': '',
    'datetime': ''
}

processed_discrete_ifcb_data_mapped = {
    'filename': processed_discrete_ifcb_data_merged['Filename'], 
    'Latitude': processed_discrete_ifcb_data_merged['Cast Start Latitude'],
    'Longitude': processed_discrete_ifcb_data_merged['Cast Start Longitude'],
    'Depth': processed_discrete_ifcb_data_merged['Trip Depth'],
    'sample_type': processed_discrete_ifcb_data_merged['Sample Type'].apply(
        lambda x: 'discrete' if 'discrete' in str(x).lower() and 'test' not in str(x).lower() and 'beads' not in str(x).lower() else x
    ),
    'Cruise': cruise,
    'Instrument':ifcb_instrument_num,
    'tag1': processed_discrete_ifcb_data_merged['Site'].apply(lambda x: 'site_' + str(x) if pd.notna(x) and x != '' else x), 
    'tag2': processed_discrete_ifcb_data_merged['Target Cast Depth'].apply(lambda x: 'site_' + str(x) if pd.notna(x) and x != '' else x), 
    'tag3': processed_discrete_ifcb_data_merged.apply(
        lambda row: 'qc_' + str(row['Sample Type']) if (
            (pd.notna(row['Sample Type']) and 'test' in str(row['Sample Type']).lower()) or 
            (pd.notna(row['HDR Comment']) and 'test' in str(row['HDR Comment']).lower()) or
            (pd.notna(row['Sample Type']) and 'beads' in str(row['Sample Type']).lower()) or 
            (pd.notna(row['HDR Comment']) and 'beads' in str(row['HDR Comment']).lower())
        ) else '', axis=1
    ),
    'datetime': processed_discrete_ifcb_data_merged['bottle_datetime']
}


underway_new = pd.DataFrame(processed_underway_ifcb_data_mapped)
discrete_new = pd.DataFrame(processed_discrete_ifcb_data_mapped)


metadata_df = pd.concat([underway_new, discrete_new], ignore_index=True)


metadata_df

Unnamed: 0,filename,Latitude,Longitude,Depth,sample_type,Cruise,Instrument,tag1,tag2,tag3,datetime
0,D20250328T171555_IFCB200,41.145000,-70.891000,2.13,underway,AR87,IFCB200,,targetdepth_surface,,
1,D20250328T173939_IFCB200,41.081000,-70.885000,2.13,underway,AR87,IFCB200,,targetdepth_surface,,
2,D20250328T180323_IFCB200,41.016000,-70.883000,2.13,underway,AR87,IFCB200,,targetdepth_surface,,
3,D20250328T182707_IFCB200,40.951000,-70.883000,2.13,underway,AR87,IFCB200,,targetdepth_surface,,
4,D20250328T185052_IFCB200,40.948000,-70.883000,2.13,underway,AR87,IFCB200,,targetdepth_surface,,
...,...,...,...,...,...,...,...,...,...,...,...
1135,D20250417T200345_IFCB200,36.053037,-74.779623,29.0,discrete,AR87,IFCB200,site_CP14NEPM,site_chl max,,2025-04-17 14:55:52
1136,D20250417T202905_IFCB200,36.053037,-74.779623,29.0,discrete,AR87,IFCB200,site_CP14NEPM,site_chl max,,2025-04-17 14:55:52
1137,D20250417T211015_IFCB200,36.053037,-74.779623,77.0,discrete,AR87,IFCB200,site_CP14NEPM,site_75,,2025-04-17 14:48:06
1138,D20250417T213536_IFCB200,36.053037,-74.779623,77.0,discrete,AR87,IFCB200,site_CP14NEPM,site_75,,2025-04-17 14:48:06


In [17]:
timestamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
output_filename = f"Metadata_CSVs/AR87_shipboard_ifcb_dashboard_metadata_{timestamp}.csv"
metadata_df.to_csv(output_filename, index=False)