I have this netcdf files one the variable this file has is group_id, and I have a csv that also has group_id coulmn and another coulmn called WetLoad_TN_kg2 and WetLoad_TP_kg2 how I can add the values of WetLoad to nercdf file with the corresponding group_id in python

To assign the values of WetLoad_TN_kg2 and WetLoad_TP_kg2 from your CSV to the NetCDF files, while distributing them equally across the occurrences of each group_id, you can do the following:

For each group_id, count how many times it appears in the NetCDF dataset.
Divide the corresponding WetLoad_TN_kg2 and WetLoad_TP_kg2 values from the CSV by the number of occurrences of that group_id.
Assign the divided values back to the NetCDF file for each occurrence of the group_id

In [1]:
# Importing libraries
import os
import glob
import numpy as np
import pandas as pd
import geopandas as gpd
import xarray as xr
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cm
import matplotlib.dates as mdates
from matplotlib import rcParams
from matplotlib.colors import LinearSegmentedColormap, LogNorm
import re 
import pylag
from shapely.geometry import Point
import contextily as ctx
import datetime

In [4]:
# define the path to the data directory
data_dir = '/home/abolmaal/data/FVCOME_OUTPUT/Simulations/Huron/output'
# Define the sort_key function to sort files by number in the filename
def sort_key(file):
    filename = os.path.basename(file)
    number = int(filename.split('__')[1].split('.')[0])
    return number

csv_file_path  = '/mnt/d/Users/abolmaal/Arcgis/NASAOceanProject/ZonalStats/StreamWatresheds_total_N_P.csv'
csv_data = pd.read_csv(csv_file_path)
# Ensure the CSV contains the necessary columns
if 'Group_id' not in csv_data.columns or 'WetLoad_TN_kg2' not in csv_data.columns or 'WetLoad_TP_kg2' not in csv_data.columns:
    raise ValueError("CSV file is missing required columns: 'Group_id', 'WetLoad_TN_kg2', 'WetLoad_TP_kg2'")

# Rename 'Group_id' to 'group_id' in the CSV to match the NetCDF data
csv_data.rename(columns={'Group_id': 'group_id'}, inplace=True)


In [3]:
files = glob.glob(os.path.join(data_dir, 'Fvcome_huron_estuary_2023_Winter_*.nc'))
files.sort(key=sort_key)

# Loop through each file, load it, merge the WetLoad data, and save the updated NetCDF file
for file in files:
    # Read the NetCDF file
    ds = xr.open_dataset(file)

    # Convert the NetCDF 'group_id' variable to a DataFrame for processing
    netcdf_df = ds['group_id'].to_dataframe().reset_index()

    # Step 1: Count occurrences of each group_id in the NetCDF file
    group_counts = netcdf_df['group_id'].value_counts().to_dict()

    # Step 2: Merge the CSV data with the NetCDF data on 'group_id'
    merged_df = pd.merge(netcdf_df, csv_data, on='group_id', how='left')

    # Step 3: For each group_id, divide the WetLoad_TN_kg2 and WetLoad_TP_kg2 values by the count of that group_id
    merged_df['WetLoad_TN_kg2'] = merged_df.apply(
        lambda row: row['WetLoad_TN_kg2'] / group_counts[row['group_id']] if pd.notnull(row['WetLoad_TN_kg2']) else None,
        axis=1
    )

    merged_df['WetLoad_TP_kg2'] = merged_df.apply(
        lambda row: row['WetLoad_TP_kg2'] / group_counts[row['group_id']] if pd.notnull(row['WetLoad_TP_kg2']) else None,
        axis=1
    )

    # Step 4: Add the updated WetLoad_TN_kg2 and WetLoad_TP_kg2 columns back into the NetCDF dataset
    ds['WetLoad_TN_kg2'] = (('dim_0'), merged_df['WetLoad_TN_kg2'].values)  # Replace 'dim_0' with the correct dimension
    ds['WetLoad_TP_kg2'] = (('dim_0'), merged_df['WetLoad_TP_kg2'].values)

    # Step 5: Save the updated NetCDF file
    output_file_path = os.path.join(data_dir, f"updated_{os.path.basename(file)}")
    ds.to_netcdf(output_file_path)

    print(f"Updated NetCDF file saved: {output_file_path}")

Updated NetCDF file saved: /home/abolmaal/data/FVCOME_OUTPUT/Simulations/Huron/output/updated_Fvcome_huron_estuary_2023_Winter_Jan__1.nc
Updated NetCDF file saved: /home/abolmaal/data/FVCOME_OUTPUT/Simulations/Huron/output/updated_Fvcome_huron_estuary_2023_Winter_Feb__2.nc
Updated NetCDF file saved: /home/abolmaal/data/FVCOME_OUTPUT/Simulations/Huron/output/updated_Fvcome_huron_estuary_2023_Winter_Mar__3.nc
Updated NetCDF file saved: /home/abolmaal/data/FVCOME_OUTPUT/Simulations/Huron/output/updated_Fvcome_huron_estuary_2023_Winter_Apr__4.nc
Updated NetCDF file saved: /home/abolmaal/data/FVCOME_OUTPUT/Simulations/Huron/output/updated_Fvcome_huron_estuary_2023_Winter_May__5.nc
Updated NetCDF file saved: /home/abolmaal/data/FVCOME_OUTPUT/Simulations/Huron/output/updated_Fvcome_huron_estuary_2023_Winter_Jun__6.nc
Updated NetCDF file saved: /home/abolmaal/data/FVCOME_OUTPUT/Simulations/Huron/output/updated_Fvcome_huron_estuary_2023_Winter_July__7.nc
Updated NetCDF file saved: /home/abolmaa

In [5]:
# Load the updated NetCDF files
updated_files = glob.glob(os.path.join(data_dir, 'updated_Fvcome_huron_estuary_2023_Winter_*.nc'))
updated_files.sort(key=sort_key)

# read all the files and concatenate them into a single xarray datase
updated_ds = [xr.open_dataset(file) for file in updated_files]
print(updated_ds)


[<xarray.Dataset>
Dimensions:                   (time: 121, particles: 4950, dim_0: 4950)
Coordinates:
  * time                      (time) datetime64[ns] 2023-01-01 ... 2023-01-31
Dimensions without coordinates: particles, dim_0
Data variables: (12/13)
    group_id                  (particles) int32 ...
    longitude                 (time, particles) float32 ...
    latitude                  (time, particles) float32 ...
    depth                     (time, particles) float32 ...
    host_fvcom                (time, particles) int32 ...
    error_status              (time, particles) int32 ...
    ...                        ...
    is_beached                (time, particles) int32 ...
    land_boundary_encounters  (time, particles) int32 ...
    thetao                    (time, particles) float32 ...
    so                        (time, particles) float32 ...
    WetLoad_TN_kg2            (dim_0) float64 ...
    WetLoad_TP_kg2            (dim_0) float64 ...
Attributes:
    title:    P

In [1]:
ds = updated_ds[0]
# Check if group_id exists
if 'group_id' in ds:
    # Step 1: Filter the dataset for group_id == 1 before converting to a DataFrame
    ds_group_1 = ds.where(ds['group_id'] == 0, drop=True)

    # Step 2: Convert filtered data to DataFrame
    netcdf_df = ds_group_1.to_dataframe().reset_index()

    # Step 3: Extract the WetLoad_TN_kg2 and WetLoad_TP_kg2 values for group_id == 1
    wetload_tn = netcdf_df['WetLoad_TN_kg2'].values
    wetload_tp = netcdf_df['WetLoad_TP_kg2'].values

    # Step 4: Create the plot for WetLoad_TN_kg2 and WetLoad_TP_kg2
    plt.figure(figsize=(10, 6))

    # Plot WetLoad_TN_kg2
    plt.plot(wetload_tn, label='WetLoad_TN_kg2')

    # Plot WetLoad_TP_kg2
    plt.plot(wetload_tp, label='WetLoad_TP_kg2')

    # Add titles and labels
    plt.title('WetLoad TN and TP for group_id = 1')
    plt.xlabel('Index')
    plt.ylabel('WetLoad (kg2)')
    plt.legend()

    # Show the plot
    plt.grid(True)
    plt.show()

else:
     print("group_id not found in dataset.")

NameError: name 'updated_ds' is not defined

In [6]:
# count the number of group_id that is equal to 1
group_count = ds['group_id'].where(ds['group_id'] == 0).count()
print(f"Number of group_id == 1: {group_count}")

Number of group_id == 1: <xarray.DataArray 'group_id' ()>
array(42)


In [7]:
a= 5.20382055035444E-07/42
print(a)

1.2390048929415335e-08


# Now we want to only keep the values that return to Coastal wetlands 