# Introduction 

- The following code reads FVCOM particle-tracking outputs and gets the group_number
- Counts the number of occurrences of each group_id in each group
- Adds a new column to the data called group_number that holds the number of particles in each group_id
- The order is: the first number is the group_id, the following numbers are the particle number


## Import Libraries

In [2]:
# Importing libraries
import os
import glob
import numpy as np
import pandas as pd
import geopandas as gpd
import xarray as xr

# Input Files

In [3]:
# Directory that holds the FVCOM model outputs
FVCOM_dir = '/home/abolmaal/modelling/FVCOM/Huron/output'
# Collect every file in that directory whose name matches FVCOM_Huron_2424_*.nc
nc_pattern = os.path.join(FVCOM_dir, 'FVCOM_Huron_2424_*.nc')
files = glob.glob(nc_pattern)

# Output Files

# Helper Functions 

In [4]:
# Define the function to sort the files based on the time
def sort_key(file):
    """Return the integer sequence number embedded at the end of a file name.

    The number is the text between the last underscore of the base name and
    the first dot that follows it, e.g. ``FVCOM_Huron_2424_JanFeb_1.nc``
    gives ``1``. Names without a parsable number get ``float('inf')`` so
    that they sort after every numbered file.
    """
    name = os.path.basename(file)
    # Text after the final "_" (the whole name when there is none) ...
    tail = name.rpartition('_')[2]
    # ... cut at the first "." so the extension is dropped
    digits = tail.partition('.')[0]
    try:
        return int(digits)
    except ValueError:
        # Not a number: push this file to the end of the ordering
        return float('inf')

# Updated Function: particle_id = group_id (3 digits) + group_index (2 digits) + release_time (4 digits, e.g. 2301)

### Example Output
group_id = 1, group_index = 5, release_time = 2301 → particle_id = 001052301

In [5]:
import xarray as xr
import pandas as pd
import numpy as np
import os

def write_particle_id(files, data_dir, release_times=None):
    """
    Write a copy of each NetCDF file with an added ``particle_id`` variable.

    particle_id = group_id (zero-padded to 3 digits)
                + group_index (zero-padded to 2 digits)
                + release_time (label taken from ``release_times``)

    ``group_index`` is the 0-based running count of a particle within its
    ``group_id``, in the order the particles appear in the file.
    Example: group_id = 1, group_index = 5, release_time = "2401"
    gives particle_id = "001052401".

    Parameters
    ----------
    files : sequence of str
        Paths of the input NetCDF files. The i-th file is paired with the
        i-th entry of ``release_times``, so the order of ``files`` matters.
    data_dir : str
        Directory that receives the ``updated_<original name>`` copies.
    release_times : sequence of str, optional
        One release-time label per file. Defaults to the twelve labels
        "2401" ... "2412"; pass another list (e.g. "2301" ... "2312") to
        label a different set of releases.

    Raises
    ------
    IndexError
        If there are more files than release-time labels. Raised before any
        file is opened, so no partial output is produced.
    ValueError
        If the number of generated ids differs from the size of the
        ``particles`` dimension.

    Notes
    -----
    The padding widths are minimums: a group_id above 999 or a group_index
    above 99 yields a longer id. Ids are stored as ``U15`` strings, so any
    characters beyond the 15th are cut off.
    """
    if release_times is None:
        release_times = [
            "2401", "2402", "2403", "2404",
            "2405", "2406", "2407", "2408",
            "2409", "2410", "2411", "2412",
        ]

    files = list(files)
    if len(files) > len(release_times):
        raise IndexError(
            f"{len(files)} files but only {len(release_times)} release_times; "
            "each file needs its own release time"
        )

    for file, release_time in zip(files, release_times):
        print(f"🔄 Processing: {os.path.basename(file)}")

        # The context manager releases the input file handle even if an
        # error occurs; the copy is written while the file is still open.
        with xr.open_dataset(file) as ds:
            num_particles = ds.sizes['particles']

            # Running 0-based position of every particle inside its group
            df = pd.DataFrame({'group_id': ds['group_id'].values})
            df['group_index'] = df.groupby('group_id').cumcount()

            # Build the padded ids in one pass instead of a row-wise apply
            particle_ids = [
                f"{int(gid):03}{int(gidx):02}{release_time}"
                for gid, gidx in zip(df['group_id'], df['group_index'])
            ]

            # Safety check
            if len(particle_ids) != num_particles:
                raise ValueError(f"❌ Length mismatch in {file}: {len(particle_ids)} particle_ids vs {num_particles} particles")

            # Attach the ids along the 'particles' dimension
            ds['particle_id'] = (('particles',), np.array(particle_ids, dtype='U15'))

            # Save the updated copy next to the requested output directory
            output_path = os.path.join(data_dir, f"updated_{os.path.basename(file)}")
            ds.to_netcdf(output_path)

        print(f"✅ Saved: {output_path}\n")




# Main Functions

In [6]:
# Gather the FVCOM output files, ordered by the sequence number at the end of each name
files = sorted(glob.glob(FVCOM_dir + "/FVCOM_Huron_2424*.nc"), key=sort_key)
print(files)


['/home/abolmaal/modelling/FVCOM/Huron/output/FVCOM_Huron_2424_JanFeb_1.nc', '/home/abolmaal/modelling/FVCOM/Huron/output/FVCOM_Huron_2424_FebMar_2.nc', '/home/abolmaal/modelling/FVCOM/Huron/output/FVCOM_Huron_2424_MarApr_3.nc', '/home/abolmaal/modelling/FVCOM/Huron/output/FVCOM_Huron_2424_AprMay_4.nc', '/home/abolmaal/modelling/FVCOM/Huron/output/FVCOM_Huron_2424_MayJun_5.nc', '/home/abolmaal/modelling/FVCOM/Huron/output/FVCOM_Huron_2424_JunJul_6.nc', '/home/abolmaal/modelling/FVCOM/Huron/output/FVCOM_Huron_2424_JulAug_7.nc', '/home/abolmaal/modelling/FVCOM/Huron/output/FVCOM_Huron_2424_AugSep_8.nc']


In [7]:
# Write an updated copy of every NetCDF file with the particle_id variable added
write_particle_id(files=files, data_dir=FVCOM_dir)


🔄 Processing: FVCOM_Huron_2424_JanFeb_1.nc
✅ Saved: /home/abolmaal/modelling/FVCOM/Huron/output/updated_FVCOM_Huron_2424_JanFeb_1.nc

🔄 Processing: FVCOM_Huron_2424_FebMar_2.nc
✅ Saved: /home/abolmaal/modelling/FVCOM/Huron/output/updated_FVCOM_Huron_2424_FebMar_2.nc

🔄 Processing: FVCOM_Huron_2424_MarApr_3.nc
✅ Saved: /home/abolmaal/modelling/FVCOM/Huron/output/updated_FVCOM_Huron_2424_MarApr_3.nc

🔄 Processing: FVCOM_Huron_2424_AprMay_4.nc
✅ Saved: /home/abolmaal/modelling/FVCOM/Huron/output/updated_FVCOM_Huron_2424_AprMay_4.nc

🔄 Processing: FVCOM_Huron_2424_MayJun_5.nc
✅ Saved: /home/abolmaal/modelling/FVCOM/Huron/output/updated_FVCOM_Huron_2424_MayJun_5.nc

🔄 Processing: FVCOM_Huron_2424_JunJul_6.nc
✅ Saved: /home/abolmaal/modelling/FVCOM/Huron/output/updated_FVCOM_Huron_2424_JunJul_6.nc

🔄 Processing: FVCOM_Huron_2424_JulAug_7.nc
✅ Saved: /home/abolmaal/modelling/FVCOM/Huron/output/updated_FVCOM_Huron_2424_JulAug_7.nc

🔄 Processing: FVCOM_Huron_2424_AugSep_8.nc
✅ Saved: /home/abol

In [8]:
# Check that particle_id was written to the updated NetCDF files and is in the expected order
updated_files = glob.glob(FVCOM_dir + "/updated_FVCOM_Huron_2424_*.nc")
updated_files.sort(key=sort_key)
if not updated_files:
    # Fail with a clear message instead of an IndexError on updated_files[0]
    raise FileNotFoundError(f"No updated_FVCOM_Huron_2424_*.nc files found in {FVCOM_dir}")

# Open the first file; `ds` is left open because later cells read from it
ds = xr.open_dataset(updated_files[0])

# Print the leading particle_id values. The label is built from the same
# number as the slice so the two cannot disagree.
n_preview = 152
print(f"🔎 First {n_preview} particle_id values:")
print(ds['particle_id'].values[:n_preview])
# # read files usimg xarray
# ds = xr.open_mfdataset(updated_files, combine='nested', concat_dim='time', parallel=True)

# # read the firs file
# #ds = xr.open_dataset(updated_files[:])
# # print the updated group_number variable values
# print(ds['group_number'].values)

🔎 First 10 particle_id values:
['000002401' '000012401' '000022401' '000032401' '000042401' '000052401'
 '000062401' '000072401' '000082401' '000092401' '000102401' '000112401'
 '000122401' '000132401' '000142401' '000152401' '000162401' '000172401'
 '000182401' '000192401' '000202401' '000212401' '000222401' '000232401'
 '000242401' '000252401' '000262401' '000272401' '000282401' '000292401'
 '000302401' '000312401' '000322401' '000332401' '000342401' '000352401'
 '000362401' '000372401' '000382401' '000392401' '000402401' '000412401'
 '000422401' '000432401' '000442401' '000452401' '000462401' '000472401'
 '000482401' '000492401' '000502401' '000512401' '000522401' '000532401'
 '000542401' '000552401' '000562401' '000572401' '000582401' '000592401'
 '000602401' '000612401' '000622401' '000632401' '000642401' '000652401'
 '000662401' '000672401' '000682401' '000692401' '000702401' '000712401'
 '000722401' '000732401' '000742401' '003002401' '003012401' '003022401'
 '003032401' '003042

In [9]:
# Function that, for each file:
# Step 1: Converts time to datetime (safe for large data)
# Step 2: Extracts the day and month
# Step 3: Counts unique days per month (memory-efficient)
# Step 4: Creates a DataFrame with the results
def count_unique_days_per_month(files, output_dir=None):
    """
    Count the distinct days present per month in each file and total them.

    For every file the ``time`` variable is converted to timestamps and the
    number of distinct day-of-month values is counted for each month number.
    The per-file counts are then summed per month, which means:

    * a day that occurs in two files is counted twice, and
    * the year is ignored (only the month number is used as the key).

    The totals are also written to ``days_per_releasetime_24.csv``.

    Parameters
    ----------
    files : sequence of str
        Paths of the NetCDF files to scan.
    output_dir : str, optional
        Directory for the CSV file. Defaults to the module-level
        ``FVCOM_dir``.

    Returns
    -------
    pandas.DataFrame
        Columns ``month`` and ``day`` (summed count of distinct days).

    Raises
    ------
    ValueError
        If ``files`` is empty.
    """
    files = list(files)
    if not files:
        # pd.concat would fail later with a far less helpful message
        raise ValueError("No input files given; there is nothing to count.")

    results = []

    for file in files:
        print(f"🔄 Processing: {os.path.basename(file)}")
        with xr.open_dataset(file) as ds:  # Use context manager
            # Convert time to datetime
            # NOTE(review): unit='s' is only relevant if 'time' holds plain
            # numbers rather than already-decoded datetimes; confirm which
            time = pd.to_datetime(ds['time'].values, unit='s', origin='unix')

        # Distinct day-of-month values per month number for this file
        df = pd.DataFrame({'day': time.day, 'month': time.month})
        results.append(df.groupby('month')['day'].nunique().reset_index())

    # Stack the per-file tables and add up the counts of each month
    final_df = pd.concat(results, ignore_index=True)
    final_counts = final_df.groupby('month')['day'].sum().reset_index()

    # Save the final counts to a CSV file
    if output_dir is None:
        output_dir = FVCOM_dir
    output_csv = os.path.join(output_dir, 'days_per_releasetime_24.csv')
    final_counts.to_csv(output_csv, index=False)
    return final_counts

In [10]:
# Run the per-month day count over the updated files and keep the resulting table
unique_days_per_month = count_unique_days_per_month(files=updated_files)

🔄 Processing: updated_FVCOM_Huron_2424_JanFeb_1.nc
🔄 Processing: updated_FVCOM_Huron_2424_FebMar_2.nc
🔄 Processing: updated_FVCOM_Huron_2424_MarApr_3.nc
🔄 Processing: updated_FVCOM_Huron_2424_AprMay_4.nc
🔄 Processing: updated_FVCOM_Huron_2424_MayJun_5.nc
🔄 Processing: updated_FVCOM_Huron_2424_JunJul_6.nc
🔄 Processing: updated_FVCOM_Huron_2424_JulAug_7.nc
🔄 Processing: updated_FVCOM_Huron_2424_AugSep_8.nc


In [None]:
# Count how many particles carry each group_id and print the table.
# value_counts() tallies every distinct group_id (missing values are skipped);
# a plain .count(dim='particles') would only give the number of non-missing
# entries, not one figure per group.
group_id_counts = (
    pd.Series(ds['group_id'].values.ravel(), name='group_id')
    .value_counts()
    .sort_index()
)
print(group_id_counts)

In [None]:
# Label every time step with its calendar month, formatted as 'YYYY-MM'
month_labels = ds['time'].dt.strftime('%Y-%m')
ds['month'] = month_labels

# Flatten the dataset into a table so pandas can do the grouping
df = ds.to_dataframe()

# Number of rows for every (month, group_id) combination
pair_sizes = df.groupby(['month', 'group_id']).size()
group_id_counts_df = pair_sizes.reset_index(name='group_id_count')

# Reshape to one row per month and one column per group_id;
# combinations that never occur are filled with 0
wide_counts = group_id_counts_df.pivot(
    index='month',
    columns='group_id',
    values='group_id_count',
)
pivot_df = wide_counts.fillna(0)

# Show the table, then store it as CSV in the FVCOM output directory
print(pivot_df)
output_csv_path = os.path.join(FVCOM_dir, 'group_id_counts_by_month.csv')
pivot_df.to_csv(output_csv_path)

In [None]:
# Flatten the dataset into a table; reset_index() turns 'time' into a regular column
df = ds.to_dataframe().reset_index()

# Check the first few rows to confirm the structure of the DataFrame
print(df.head())

# Month and day to inspect.
# NOTE(review): the release labels used in this notebook start with "24";
# confirm that 2023 is the intended year
target_month = '2023-01'
target_day = '2023-01-01'

# Keep the rows of group_id 0 that fall in the target month
# (relies on the 'month' variable having been added to ds beforehand)
filtered_df = df[(df['month'] == target_month) & (df['group_id'] == 0)]

# Count the rows on the target day. Timestamps are normalized to midnight
# first so that every time step of that day matches, not only the one at
# exactly 00:00:00.
on_target_day = filtered_df['time'].dt.normalize() == pd.Timestamp(target_day)
occurrences = filtered_df[on_target_day].shape[0]

# Print the result, naming the day that was actually counted
print(f"Occurrences of group_id=0 on {target_day}: {occurrences}")

In [None]:
# Flatten the dataset into a table; reset_index() turns 'time' into a regular column
df = ds.to_dataframe().reset_index()

# Check the first few rows to confirm the structure of the DataFrame
print(df.head())

# Filter the data for group_id = 0
filtered_df = df[df['group_id'] == 0].copy()  # copy() so the new columns below do not touch df

# Calendar date of every row (time of day dropped)
filtered_df['date'] = filtered_df['time'].dt.date

# Monthly period (year + month) of every row
filtered_df['month'] = filtered_df['time'].dt.to_period('M')

# Month to count. The filter and the printed label both derive from this one
# value so they cannot drift apart.
# NOTE(review): the filter used to read '2023-02' while the variable name,
# comments and message all said January 2023; January is used here. Confirm.
month_of_interest = pd.Period('2023-01', freq='M')
filtered_january = filtered_df[filtered_df['month'] == month_of_interest]

# Total number of group_id = 0 rows in that month
total_occurrences = filtered_january.shape[0]

# Print the result
print(f"Total occurrences of group_id=0 in {month_of_interest.strftime('%B %Y')}: {total_occurrences}")

In [None]:
# Build a table from the NetCDF 'group_id' variable; its index becomes regular columns
netcdf_df = ds['group_id'].to_dataframe().reset_index()

# Attach 'group_number' from the dataset, row for row
# NOTE(review): assumes ds already holds a 'group_number' variable aligned
# with 'group_id'; confirm that it is written upstream
group_number_values = ds['group_number'].values
netcdf_df['group_number'] = group_number_values


def _format_group_number(x):
    # Cast to int and right-align in an 11-character field.
    # NOTE(review): the padding is spaces, not leading zeros; switch to a
    # zero-fill format if leading zeros are wanted
    return f"{int(x):11}"


# Step 1: Replace every group_number by its formatted string
netcdf_df['group_number'] = netcdf_df['group_number'].apply(_format_group_number)

# Step 2: Print values for debugging
print("First few rows of netcdf_df:")
print(netcdf_df.head())

# Step 3: Keep only the two columns that go into the CSV
csv_columns = ['group_id', 'group_number']
netcdf_df = netcdf_df[csv_columns]

# Save the result as a CSV file in the FVCOM output directory
csv_path = os.path.join(FVCOM_dir, 'group_id_group_number.csv')
netcdf_df.to_csv(csv_path, index=False)

print("CSV file saved with formatted group numbers")

