**Pre-Processing** 

Loop through the raw daily .csv files in DBFS and align them with the master event file so that the fish events are matched to the resistivity data. Save the labelled data to a new folder called `labeled`. 

Start off by aligning the raw resistivity with the fish events. 

In [0]:
# Get packages
import pandas as pd
import glob
import os

# Load the master events file, which lists all of the known fish events
events_path = "/dbfs/FileStore/rachlenn/Thr 20 process/test_KMThu16_2021_07_eventonly.csv"
events = pd.read_csv(events_path, header=0)

# Parse the event timestamps as timezone-aware UTC datetimes
events['Time'] = pd.to_datetime(events['Time'], utc=True)

# Half-width of the labelling window placed around each event timestamp.
# Keeping it in one named value stops the comment and the arithmetic drifting apart.
# NOTE(review): the previous comment described this as 2 seconds either side
# "as per AF parameters", but the code applies 2.5 seconds - confirm which is
# intended. We might also want to truncate this window.
EVENT_HALF_WINDOW = pd.Timedelta(seconds=2.5)

# Window bounds for every event: [Time - half window, Time + half window]
events['start_time'] = events['Time'] - EVENT_HALF_WINDOW
events['end_time'] = events['Time'] + EVENT_HALF_WINDOW


# Function to label fish presence from the timestamps in the raw df 
def label_fish_presence(df, events_df):
    """Flag every sample that falls inside any fish-event window.

    Args:
        df: Daily readings with a ``Time`` column that ``pd.to_datetime`` can
            convert to UTC.
        events_df: Event table with ``start_time`` and ``end_time`` columns
            giving the inclusive bounds of each event window.

    Returns:
        A new DataFrame holding the columns of ``df`` with ``Time`` converted
        to timezone-aware UTC datetimes and an integer ``fish_present`` column
        (1 inside at least one window, 0 otherwise). The frame passed in is
        left unmodified.
    """
    times = pd.to_datetime(df['Time'], utc=True)

    # Accumulate a single boolean mask over all events and write the label
    # column once, instead of rewriting the frame for every event.
    in_any_window = pd.Series(False, index=df.index)
    for window_start, window_end in zip(events_df['start_time'], events_df['end_time']):
        in_any_window |= (times >= window_start) & (times <= window_end)

    # assign() builds a new frame, so the caller's DataFrame is not mutated
    return df.assign(Time=times, fish_present=in_any_window.astype('int64'))

# Raw daily inputs (already de-duplicated) and the destination for labelled output
input_pattern = "/dbfs/FileStore/rachlenn/DuplicateFree/*_no_duplicate"
output_folder = "/dbfs/FileStore/rachlenn/labeled"  # where to put the labelled dailies

os.makedirs(output_folder, exist_ok=True)

# Label each daily dataset in turn and write it back out
for file_path in glob.glob(input_pattern):
    print(f"Processing {file_path}...")

    # Read the daily file and give its columns fixed names
    df = pd.read_csv(file_path, header=0)
    df.columns = ["timestamp", "upstream", "downstream"]

    # Swap the millisecond timestamp column for a timezone-aware UTC datetime column
    df["Time"] = pd.to_datetime(df.pop("timestamp"), unit="ms", utc=True)

    # Differential conductance: half of (downstream - upstream)
    df["differential_conductance"] = (df["downstream"] - df["upstream"]) / 2

    # Mark the rows that fall inside a known fish-event window
    labeled_df = label_fish_presence(df, events)

    # Name the output after the input, swapping the suffix, and save it
    filename = os.path.basename(file_path).replace("_no_duplicate", "_labelled")
    save_path = os.path.join(output_folder, filename)
    labeled_df.to_csv(save_path, index=False)

print("All datasets labelled and saved.")


Look at the df to make sure it is structured as expected.

In [0]:
# Load one labelled daily file and render it for a visual check of its structure
df = pd.read_csv(
    "/dbfs/FileStore/rachlenn/labeled/test_KMThu16_2021_07_17_15_36_26Z_labelled"
)
display(df)

Check the average sampling rate.

In [0]:
# Every labelled daily file
input_pattern = "/dbfs/FileStore/rachlenn/labeled/*Z_labelled"

for file_path in glob.glob(input_pattern):
    print(f"Processing {file_path}...")

    # Read the file, parse the ISO 8601 timestamps and put the rows in time order
    df = (
        pd.read_csv(file_path)
        .assign(Time=lambda frame: pd.to_datetime(frame['Time'], format='ISO8601'))
        .sort_values('Time')
    )

    # Gap between consecutive samples, in seconds
    df['delta_t'] = df['Time'].diff().dt.total_seconds()

    # The first row has no predecessor, so its NaN gap is dropped before averaging
    delta_t = df['delta_t'].dropna()

    # Average sampling rate is the reciprocal of the mean gap
    avg_sampling_rate = 1 / delta_t.mean()

    print(f"Average sampling rate for {file_path}: {avg_sampling_rate:.2f} Hz")




**Re-Sampling**

The sample rate varies, so we need to resample the data onto a uniform rate. This is done using polyphase filtering with an automatically derived FIR (Finite Impulse Response) filter to avoid aliasing of the signal.

In [0]:
import pandas as pd
from fractions import Fraction
from scipy.signal import resample_poly
import numpy as np

# Load one labelled daily file, parse its ISO 8601 timestamps and sort by time.
# Everything below works on the rows in this time order.
df = pd.read_csv("/dbfs/FileStore/rachlenn/labeled/test_KMThu16_2021_07_17_15_36_26Z_labelled")
df['Time'] = pd.to_datetime(df['Time'], format = "ISO8601")
df = df.sort_values('Time')

# Extract signal as a plain array
signal = df['differential_conductance'].values

# Original sampling rate: reciprocal of the mean gap between consecutive samples
# (dropna() removes the NaN that diff() produces for the first row)
delta_t = df['Time'].diff().dt.total_seconds().dropna()
fs_original = 1 / delta_t.mean()
fs_target = 100  # target sampling rate in Hz

# Compute up/down ratio
# resample_poly needs integer factors, so fs_target / fs_original is expressed as
# a fraction. limit_denominator() is called with its default bound on the
# denominator, giving the closest fraction within that bound.
ratio = Fraction(str(fs_target)) / Fraction(str(fs_original))
ratio = ratio.limit_denominator()
up, down = ratio.numerator, ratio.denominator

# Resample signal
# NOTE(review): only the sample values are passed in (no timestamps), so this
# presumably treats the input as evenly spaced at the mean rate computed above -
# confirm that is acceptable given the varying sample rate.
signal_resampled = resample_poly(signal, up, down)

# Build new uniform time index: one timestamp per resampled sample, starting at
# the first original timestamp and stepping by the target sample period in ms
n_samples = len(signal_resampled)
start_time = df['Time'].iloc[0]
new_time_index = pd.date_range(start=start_time, periods=n_samples, freq=f"{1000/fs_target}ms")

# Convert times to seconds since start for binning
# bin_edges_sec holds n_samples + 1 edges spaced 1 / fs_target apart, i.e. one
# interval per resampled sample
original_times_sec = (df['Time'] - start_time).dt.total_seconds()
bin_edges_sec = np.append(np.arange(n_samples) / fs_target, n_samples / fs_target)

# Get bin index for each original row
# (the -1 converts digitize's result into a 0-based interval index)
bin_indices = np.digitize(original_times_sec, bin_edges_sec) - 1

# Vectorized: group fish_present by bin index and take max (1 if any fish present)
# reindex() then restricts the result to indices 0..n_samples-1 and fills
# intervals that received no original row with 0
labels_resampled = (
    pd.Series(df['fish_present'].values)
    .groupby(bin_indices)
    .max()
    .reindex(range(n_samples), fill_value=0)
    .astype(int)
    .values
)

# Build resampled DataFrame: uniform timestamps, resampled signal and the
# per-sample labels derived above
df_resampled = pd.DataFrame({
    'Time': new_time_index,
    'differential_conductance': signal_resampled,
    'fish_present': labels_resampled
})

print(df_resampled.head(20))





**Bin into 10 second chunks**


We need fixed-length windows of time to feed into the UNet; as an initial approach, use 10-second windows (i.e. a window rate of 1/10 Hz). 

In [0]:
import numpy as np

# Window length in seconds and the matching number of samples per window
window_size_seconds = 10
samples_per_window = fs_target * window_size_seconds  # e.g., 100 Hz * 10 s = 1000 samples

# Only whole windows are kept; a trailing partial window is discarded
n_complete_chunks = len(df_resampled) // samples_per_window
n_usable_samples = n_complete_chunks * samples_per_window
chunk_shape = (n_complete_chunks, samples_per_window)

# One row per window, for the signal and for the per-sample labels
signal_chunks = signal_resampled[:n_usable_samples].reshape(chunk_shape)
label_chunks = labels_resampled[:n_usable_samples].reshape(chunk_shape)

# Window-level label: the maximum per-sample label in the window
# (1 if fish present anywhere in the window)
window_labels = label_chunks.max(axis=1).astype(int)

# DataFrame with one row per 10-second chunk; 'signal' holds that chunk's samples
df_chunks = pd.DataFrame({
    'signal': list(signal_chunks),
    'fish_present': window_labels
})

print(df_chunks.head())


**Scale**

Individually scale each window to a median of 0 and an interquartile range of 1 to provide a common baseline.

In [0]:
import numpy as np

def scale_chunk(chunk):
    """Centre a 1D array on its median and divide by its interquartile range.

    If the interquartile range is zero, the division is skipped and only the
    median is subtracted.
    """
    centred = chunk - np.median(chunk)
    lower_quartile, upper_quartile = np.percentile(chunk, [25, 75])
    spread = upper_quartile - lower_quartile
    # A zero spread would mean dividing by zero, so return the centred values as-is
    return centred if spread == 0 else centred / spread

# Scale every window independently and store the result next to the raw signal
scaled_signals = df_chunks['signal'].apply(scale_chunk)
df_chunks['signal_scaled'] = scaled_signals

display(df_chunks)


Databricks filtered table. Run in Databricks to view.

**Balance training dataset**

The noise (0/no fish) to signal (1/fish) ratio is out of whack. We need to balance it so the ratio is more 1:1 and can be used as a training dataset. Otherwise it could label everything as 0 and we would still get 99% accuracy. 

In [0]:
# Get packages
import pandas as pd
import glob
import os

# Where the labelled daily datasets live and where the balanced output goes
input_folder = "/dbfs/FileStore/rachlenn/labeled/"
output_folder = "/dbfs/FileStore/rachlenn/balanced/"

os.makedirs(output_folder, exist_ok=True)

# The labelling step writes extension-less files whose names end in
# "Z_labelled", so match that suffix (a bare "*Z" pattern does not match them).
# sorted() makes the processing order, and so the merged output, repeatable.
csv_files = sorted(glob.glob(os.path.join(input_folder, "*Z_labelled")))

# Fail early with a clear message instead of letting pd.concat([]) raise later
if not csv_files:
    raise FileNotFoundError(f"No labelled files found in {input_folder}")

# Balanced frames collected for the final merge
balanced_dfs = []

# Balance the no-fish (0) rows against the fish (1) rows in each labelled daily file
# NOTE(review): rows are sampled individually, so the balanced output is no
# longer a contiguous time series - confirm this suits the windowed model input.
for file in csv_files:
    print(f"Processing {file}...")
    df = pd.read_csv(file)

    # Split fish vs no-fish
    fish_df = df[df['fish_present'] == 1]
    no_fish_df = df[df['fish_present'] == 0]

    # Down-sample the no-fish rows to the fish count, capped at the rows
    # available so sample() cannot be asked for more rows than exist
    n_no_fish = min(len(fish_df), len(no_fish_df))
    no_fish_sample = no_fish_df.sample(n_no_fish, random_state=42)

    # Combine and shuffle
    balanced_df = pd.concat([fish_df, no_fish_sample]).sample(frac=1, random_state=42)

    # Output name: "<input name>_balanced.csv". The inputs normally have no
    # ".csv" extension, so strip one only when it is actually present.
    stem = os.path.basename(file)
    if stem.endswith(".csv"):
        stem = stem[:-len(".csv")]
    filename = f"{stem}_balanced.csv"
    balanced_path = os.path.join(output_folder, filename)
    balanced_df.to_csv(balanced_path, index=False)

    balanced_dfs.append(balanced_df)

# Merge into one big dataset and shuffle again
merged_df = pd.concat(balanced_dfs).sample(frac=1, random_state=42)
merged_df.to_csv(os.path.join(output_folder, "all_balanced.csv"), index=False)

print(" All balanced CSVs and master df saved.")
