## Classifying FishSOOP data into upcasts and downcasts ##

In [3]:
# Import libs #
import pandas as pd
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt

from fishsoop.io import load_data
#from fishsoop.example_scripts import cast_time_QC


ModuleNotFoundError: No module named 'fishsoop'

### Select data based on region of interest ###

In [None]:
# Alternative local (Windows) copy of the same parquet file:
#path = 'C:/Users/z5493451/OneDrive - UNSW/Documents/Data/FishSOOP/IMOS_SOOP-FishSOOP_TP_FV01.parquet'
path = "/g/data/yj27/data/obs/fishsoop/IMOS_SOOP-FishSOOP_TP_FV01.parquet"

# Time window and bounding box of the region of interest.
# NOTE(review): the date strings are ambiguous (DD-MM-YYYY vs MM-DD-YYYY);
# confirm which format load_data expects.
query = dict(
    date_start="01-08-2023",
    date_end="01-08-2024",
    longitude_min=147.1,
    longitude_max=162.218,
    latitude_min=-41.545,
    latitude_max=-25.117,
)

# Remaining selectors are passed as None.
# Presumably None means "do not filter on this field" - verify against load_data.
query.update(
    depth_min=None,
    depth_max=None,
    sensor_serial=None,
    gear_type=None,
)

df = load_data(path_to_file=path, **query)

df

### Load the moana sensor data ###

In [None]:
## Select the first cast
# The first row's index tuple starts with the outermost-level key; use it to
# take a cross-section of the frame (which drops that index level).
first_cast = df.index[0][0]
df_first = df.xs(key=first_cast, level=0)
#df_first

### Separate time series into upcast, downcast, and stationary (everything in between) ###

Now that we have a large sample of moana sensor cast data, we will test an algorithm to sort each cast into different cast components.
Once the algorithm is working, we will be able to add it to the accessors in the fishsoop package so that this is done automatically for any cast on the fly.

The function we will use is in the 'example_scripts' file in the fishsoop library and is called 'classify_casts'. It requires a series of timedeltas (generally in seconds) and an array of depths to work. It is designed to do any sorting automatically without user intervention (which is why it must be tested rigorously!) by adopting a 'flipper' strategy (flips the series to pinpoint the end of the upcast), a clustering process to determine if depth is increasing, decreasing, or remaining constant, and a spline smoothing process to account for noise. For more information, see the function documentation.

In [None]:
# EXAMPLE USAGE FOR 1 DATAFRAME --------------------------------------------------------------- #
# NOTE: cast_time_QC is defined in a later cell of this notebook (the package
# import is commented out), so that cell has to be run before this one.

# Elapsed time since the first sample, in whole seconds
time = (df_first.TIME.values - df_first.TIME.values[0]).astype('timedelta64[s]').view('int64')
depth = df_first.index.values
temp = df_first.TEMPERATURE.values

# Integer flag per sample: 1 = downcast, 2 = upcast, 3 = stationary
labels = cast_time_QC(time, depth, ref_depth=4, time_limit=15)

# EXAMPLE PLOT ----------------------------------------------------------------- #
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(14, 10))

# Color mapping
colors = {1: 'dodgerblue', 2: 'crimson', 3: 'gainsboro'}
cast_names = {1: 'Downcast', 2: 'Upcast', 3: 'Stationary'}
label_colors = [colors[label] for label in labels]

# Plot 1: Depth vs Time
ax1.scatter(time, depth, c=label_colors, alpha=0.6, s=10)
ax1.set_xlabel('Time (s)')
ax1.set_ylabel('Depth (m)')
ax1.set_title('Depth vs Time (Classified)')
ax1.invert_yaxis()
ax1.grid(True, alpha=0.3)

# Add legend (empty scatters act as colour swatches)
for label, color in colors.items():
    ax1.scatter([], [], c=color, label=f'{cast_names[label]} ({label})', s=50)
ax1.legend()

# Plot 2: Temperature vs Depth
ax2.scatter(temp, depth, c=label_colors, alpha=0.6, s=10)
ax2.set_xlabel('Temperature (°C)')
ax2.set_ylabel('Depth (m)')
ax2.set_title('Temperature vs Depth (Classified)')
ax2.invert_yaxis()
ax2.grid(True, alpha=0.3)

# Plot 3: Classification segments
unique_labels = np.unique(labels)  # not used by the plot; kept for interactive inspection
y_positions = {1: 1, 3: 0, 2: -1}

# One vectorised scatter call instead of one call per sample
ax3.scatter(
    time,
    [y_positions[label] for label in labels],
    c=label_colors,
    alpha=0.6,
    s=20,
)

ax3.set_xlabel('Time (s)')
ax3.set_ylabel('Cast Type')
ax3.set_yticks([-1, 0, 1])
ax3.set_yticklabels(['Upcast', 'Stationary', 'Downcast'])
ax3.set_title('Classification Timeline')
ax3.grid(True, alpha=0.3)

# The fourth panel of the 2x2 grid is unused; hide its empty frame
ax4.axis('off')

plt.tight_layout()
plt.show()


When the classify cast function is used, it returns a list of labels corresponding to the state of the sensor (i.e., 'downcast', 'upcast', or 'stationary'), which can be assigned to the dataset. This is how the function can eventually be implemented as an xarray accessor - by assigning the values directly to the dataset when ds.upcast/downcast is called.

Now, we will go through some of the datasets iteratively to test the function...

In [None]:
# PLOT FUNCTION FOR TESTS --------------------------------------------------------------- #
def castplot(time, depth, labels):
    """
    Scatter depth against time, colouring each sample by its cast label.

    Parameters
    ----------
    time : array-like
        x-values of the samples (the axis is labelled in seconds).
    depth : array-like
        y-values of the samples (the axis is labelled in metres and inverted
        so that depth increases downward).
    labels : iterable
        One label per sample. Either the names 'downcast', 'upcast' and
        'stationary', or the integer codes 1, 2 and 3 that cast_time_QC
        returns for those same states.

    Returns
    -------
    matplotlib.figure.Figure
        The newly created figure.

    Raises
    ------
    KeyError
        If a label is neither a known name nor a known integer code.
    """
    fig, ax = plt.subplots(1, 1, figsize=(3, 3), dpi=200)

    # Color mapping
    colors = {'downcast': 'dodgerblue', 'upcast': 'crimson', 'stationary': 'gainsboro'}
    # Integer codes are translated to names first so both label styles work;
    # anything that is not a code is looked up as a name unchanged.
    code_names = {1: 'downcast', 2: 'upcast', 3: 'stationary'}
    label_colors = [colors[code_names.get(label, label)] for label in labels]

    # Plot 1: Depth vs Time
    ax.scatter(time, depth, c=label_colors, alpha=0.6, s=10)
    ax.set_xlabel('Time (s)')
    ax.set_ylabel('Depth (m)')
    ax.set_title('Depth vs Time (Classified)')
    ax.invert_yaxis()
    ax.grid(True, alpha=0.3)

    return fig

In [None]:
# LOCAL VERSION OF THE FUNCTION JUST FOR TESTING ----------------------------------------------- #
def cast_time_QC(time, depth, time_limit=15, ref_depth=4):
    """
    Label every sample of a trajectory as downcast (1), upcast (2) or stationary (3).

    A sample counts as part of a cast when it lies within ``dt`` minutes of any
    sample recorded at or above ``ref_depth``; every other sample is stationary.
    If no sample ends up stationary, the series is split at its midpoint into a
    downcast half and an upcast half separated by a short stationary gap.
    Otherwise the contiguous cast segments are labelled alternately
    downcast, upcast, downcast, ... in time order.

    The window ``dt`` (minutes) is chosen from the trajectory duration ``T``:

    * ``T <= time_limit``                       -> ``dt = T``
    * ``time_limit + 15 <= T < time_limit + 30`` -> ``dt = 15``
    * ``time_limit + 30 <= T < time_limit + 45`` -> ``dt = 30``
    * ``time_limit + 45 <= T < time_limit + 60`` -> ``dt = 45``
    * anything else                              -> ``dt = time_limit``

    and is then widened by 5 minutes if the maximum depth exceeds 225, and by
    another 5 minutes if it exceeds 325.

    Parameters
    ----------
    time : array-like
        1-D sample times, either ``datetime64`` values or integers that are
        interpreted as seconds. Only the offset from the first sample is used.
    depth : array-like
        1-D sample depths, same length as ``time``.
    time_limit : float, default 15
        Base window length in minutes (see the table above).
    ref_depth : float, default 4
        Samples with ``depth <= ref_depth`` act as reference points.

    Returns
    -------
    np.ndarray
        Integer array of the same length as ``time`` holding 1, 2 or 3.

    Raises
    ------
    ValueError
        If the input is empty or no sample lies at or above ``ref_depth``.
    """

    def datetime_to_minutes(t):
        # Offset from the first sample, truncated to whole seconds, in minutes.
        return (t - t[0]).astype("timedelta64[s]").astype(int) / 60

    def split_updown(labels, trim: int = 1):
        # First half -> 1, last half -> 2, with `trim` samples on each side of
        # the midpoint marked 3. The slice start is clamped so that a very
        # short series cannot turn it into a negative (wrap-around) index.
        midpoint = len(labels) // 2
        gap_start = max(midpoint - trim, 0)
        new_labs = np.full_like(labels, 2)
        new_labs[0:gap_start] = 1
        new_labs[gap_start:midpoint + trim] = 3
        return new_labs

    def assign_cast_directions(flags):
        """
        Assigns 1=downcast, 2=upcast, 3=stationary flags along an array.
        Alternates between downcast and upcast segments, starting with downcast.

        Parameters
        ----------
        flags : np.ndarray
            Array containing 1 (cast) and 3 (stationary) values.

        Returns
        -------
        np.ndarray
            Array of same shape, with upcast (2) assigned appropriately.
        """
        flags = np.asarray(flags, dtype=int)
        new_flags = flags.copy()

        # A segment starts where a non-cast sample is followed by a cast
        # sample and ends where a cast sample is followed by a non-cast one.
        is_cast = flags == 1
        cast_starts = np.where((~is_cast[:-1]) & is_cast[1:])[0] + 1
        cast_ends = np.where((is_cast[:-1]) & (~is_cast[1:]))[0] + 1

        # Segments touching either end of the array have no such transition.
        if is_cast[0]:
            cast_starts = np.r_[0, cast_starts]
        if is_cast[-1]:
            cast_ends = np.r_[cast_ends, len(flags)]

        # Alternate between downcast (1) and upcast (2)
        cast_direction = 1  # start with downcast
        for start, end in zip(cast_starts, cast_ends):
            new_flags[start:end] = cast_direction
            cast_direction = 2 if cast_direction == 1 else 1  # toggle direction

        return new_flags

    # Convert to arrays
    time = np.asarray(time)
    depth = np.asarray(depth)
    if time.size == 0:
        raise ValueError("time and depth must contain at least one sample.")
    t_min = datetime_to_minutes(time)

    # Determine time window (dt)
    # NOTE(review): durations in (time_limit, time_limit + 15) and at or above
    # time_limit + 60 both fall through to dt = time_limit, and the 15/30/45
    # steps do not scale with time_limit - confirm this is intended.
    max_time = np.max(t_min)
    if max_time <= time_limit:
        dt = max_time
    elif time_limit + 15 <= max_time < time_limit + 30:
        dt = 15
    elif time_limit + 30 <= max_time < time_limit + 45:
        dt = 30
    elif time_limit + 45 <= max_time < time_limit + 60:
        dt = 45
    else:
        dt = time_limit

    # Deeper trajectories get a wider window
    if np.max(depth) > 225:
        dt = dt + 5
    if np.max(depth) > 325:
        dt = dt + 5

    # Reference crossings (depth <= ref_depth)
    ref_times = np.sort(t_min[depth <= ref_depth])
    if ref_times.size == 0:
        raise ValueError("No reference depth crossings found.")

    # A sample is within dt of *some* reference time exactly when it is within
    # dt of its nearest one, and the nearest one is always a neighbour in the
    # sorted reference list. This avoids looping over every reference time.
    nxt = np.searchsorted(ref_times, t_min)
    last = ref_times.size - 1
    gap_before = np.abs(t_min - ref_times[np.clip(nxt - 1, 0, last)])
    gap_after = np.abs(t_min - ref_times[np.clip(nxt, 0, last)])
    in_window = np.minimum(gap_before, gap_after) <= dt

    cast_labels = np.full(time.size, 3, dtype=int)  # default: stationary
    cast_labels[in_window] = 1

    # Split short trajectories into down and up parts with a short stationary middle
    if not np.any(cast_labels == 3):
        return split_updown(cast_labels, trim=1)

    # Otherwise assign the up and downcast pairs for each ref time
    return assign_cast_directions(cast_labels)



In [None]:
# Classify every trajectory (outermost index level) and write the flags back
# by integer row position. This keeps each flag on the row it was computed
# from regardless of how the frame is ordered, instead of relying on the
# order in which groupby.apply concatenates its per-group results.
times = df['TIME'].to_numpy()
depths = df.index.get_level_values('DEPTH').to_numpy()
cast_flag = np.empty(len(df), dtype=int)

# GroupBy.indices maps each group key to the row positions of its members;
# dropna=False makes sure every row belongs to some group and gets a flag.
for rows in df.groupby(level=0, sort=False, dropna=False).indices.values():
    cast_flag[rows] = cast_time_QC(times[rows], depths[rows], ref_depth=4, time_limit=15)

# Same flags as a Series keyed by the outermost index level
labels = pd.Series(cast_flag, index=df.index.get_level_values(0))

df = df.copy()
df['CAST_FLAG'] = cast_flag

In [None]:
# Tally how many samples received each CAST_FLAG value
df['CAST_FLAG'].value_counts()

In [None]:
# PLOT A BUNCH OF CASTS COLOURED BY THEIR TIME_QC CLASSIFICATION ----------------------------------------------- #
# NOTE: This will plot a lot of plots so be ready bucko...
df_casts = df.copy()
trajectory = df_casts.index.get_level_values('TRAJECTORY')  # looked up once, reused in the loop
cast_ids = trajectory.unique()

# Choose up to 100 random casts to plot. Sampling without replacement avoids
# drawing the same cast twice, and the size is capped at the number of casts.
N = np.random.choice(len(cast_ids), size=min(100, len(cast_ids)), replace=False)

for i in N:
    df_casts_i = df_casts[trajectory == cast_ids[i]]
    # Elapsed time since the first sample of this cast, in minutes
    time = (df_casts_i.TIME.values - df_casts_i.TIME.values[0]).astype('timedelta64[s]').view('int64')/60
    fig, ax = plt.subplots(1, 1, figsize=(6, 3), dpi=200)
    # Scatter plot; vmin/vmax pin the colour scale to the flag range 1..3 so
    # a given flag has the same colour in every figure
    sc = ax.scatter(
        time,
        df_casts_i.index.get_level_values('DEPTH'),
        c=df_casts_i['CAST_FLAG'],
        s=3,
        cmap='coolwarm',
        vmin=1,
        vmax=3,
    )
    # Add legend entries for each flag (empty scatters act as colour swatches)
    for flag_val, label in [(1, 'Down (1)'), (2, 'Up (2)'), (3, 'Stationary (3)')]:
        ax.scatter([], [], c=[sc.cmap(sc.norm(flag_val))], s=15, label=label)
    # Labels, title, and axis orientation
    ax.set_xlabel('Time (min)')
    ax.set_ylabel('Depth (m)')
    ax.set_title(f'Cast: {cast_ids[i]}')
    ax.invert_yaxis()        # Flip so depth increases downward
    ax.legend(title="CAST_FLAG", loc='best', markerscale=2)
    # Render and release each figure as we go rather than holding them all open
    plt.show()
    plt.close(fig)