**Generate labelled resistivity data** 

Loop through the raw daily .csv files in DBFS and label each one against the master event file (the dailies are too big to combine and label in one go). Save the labelled data to a new folder called `labeled`.

In [0]:
# Get packages
import pandas as pd
import glob
import os

# Read the master events file, which lists every known fish event
events_path = "/dbfs/FileStore/rachlenn/Thr 20 process/test_KMThu16_2021_07.csv"
events = pd.read_csv(events_path, header=0)

# Parse the event timestamps as timezone-aware (UTC) datetimes
events["Time"] = pd.to_datetime(events["Time"], utc=True)

# Attach a window to each event: from 2 seconds before to 8 seconds after,
# as per AF parameters - we might want to truncate this?
events = events.assign(
    start_time=events["Time"] - pd.Timedelta(seconds=2),
    end_time=events["Time"] + pd.Timedelta(seconds=8),
)


# Function to label fish presence from the timestamps in the raw df 
def label_fish_presence(df, events_df):
    """Flag every row of ``df`` whose time falls inside any event window.

    Args:
        df: DataFrame with a ``Time`` column that ``pd.to_datetime`` can
            parse. It is modified in place: ``Time`` is converted to
            timezone-aware UTC and a ``fish_present`` column is (re)written.
        events_df: DataFrame with ``start_time`` and ``end_time`` columns
            giving the inclusive bounds of each event window.

    Returns:
        The same ``df`` object, with ``fish_present`` set to 1 for rows whose
        ``Time`` lies within at least one window and 0 otherwise.
    """
    # Local import: the cell that defines this function only imports pandas.
    import numpy as np

    df['Time'] = pd.to_datetime(df['Time'], utc=True)
    df['fish_present'] = 0

    if len(df) == 0 or len(events_df) == 0:
        return df

    starts = pd.to_datetime(events_df['start_time'], utc=True)
    ends = pd.to_datetime(events_df['end_time'], utc=True)

    # Windows with a missing bound or with start > end can never match a row,
    # and would break the counting trick below, so leave them out.
    usable = (starts <= ends).to_numpy()
    start_vals = np.sort(starts.to_numpy(dtype='datetime64[ns]')[usable])
    end_vals = np.sort(ends.to_numpy(dtype='datetime64[ns]')[usable])

    sample_vals = df['Time'].to_numpy(dtype='datetime64[ns]')

    # For each sample time t:
    #   opened = number of windows with start <= t
    #   closed = number of windows with end   <  t
    # Every closed window was also opened, so opened - closed is the number
    # of windows containing t (bounds inclusive). This replaces one
    # full-length mask per event with two binary searches per sample.
    opened = np.searchsorted(start_vals, sample_vals, side='right')
    closed = np.searchsorted(end_vals, sample_vals, side='left')

    # Rows with a missing time are never labelled as fish.
    inside = (opened > closed) & df['Time'].notna().to_numpy()
    df['fish_present'] = inside.astype('int64')

    return df

# Where the raw dailies live (names ending in "Z") and where the labelled copies go
input_pattern = "/dbfs/FileStore/rachlenn/Thr 20 process/*Z"  # dailies
output_folder = "/dbfs/FileStore/rachlenn/labeled"  # labelled dailies

os.makedirs(output_folder, exist_ok=True)

# Label each daily dataset in turn
for file_path in glob.glob(input_pattern):
    print(f"Processing {file_path}...")

    # Read the daily file and give its three columns fixed names
    df = pd.read_csv(file_path, header=0)
    df.columns = ["timestamp", "upstream", "downstream"]

    # Swap the millisecond timestamp column for a timezone-aware Time column
    df["Time"] = pd.to_datetime(df.pop("timestamp"), unit="ms", utc=True)

    # Differential conductance: half the downstream-minus-upstream difference
    df["differential_conductance"] = df["downstream"].sub(df["upstream"]).div(2)

    # Mark the rows that fall inside a known fish event window
    labeled_df = label_fish_presence(df, events)

    # Write the labelled daily into the output folder
    filename = os.path.basename(file_path).replace(".csv", "_labeled.csv")
    save_path = os.path.join(output_folder, filename)
    labeled_df.to_csv(save_path, index=False)

print("All datasets labelled and saved.")


**Extract Windows**

The UNet model should be fed windows of the event, so we need to generate these before balancing the data stream.

In [0]:
import numpy as np
import pandas as pd
import glob

input_folder = "/dbfs/FileStore/rachlenn/labeled/"
window_duration_seconds = 10

sampling_rates = []

# First pass: compute the sampling rate of every labelled daily file
for file_path in glob.glob(input_folder + "*Z"):
    # Only the Time column is needed here, so avoid loading the rest
    times = pd.to_datetime(
        pd.read_csv(file_path, usecols=["Time"])["Time"], utc=True
    )
    num_samples = len(times)
    if num_samples < 2:
        print(f"Skipping {file_path}: fewer than 2 samples")
        continue

    # max - min rather than last - first, so row order does not matter
    time_delta = (times.max() - times.min()).total_seconds()
    if not time_delta > 0:
        print(f"Skipping {file_path}: samples span no time")
        continue

    sps = num_samples / time_delta
    sampling_rates.append(sps)

if not sampling_rates:
    # Without this, the median of an empty list is NaN and int(NaN) fails
    # below with an unhelpful message.
    raise ValueError(f"No usable labelled files found in {input_folder}")

median_sps = np.median(sampling_rates)
print(f"Median sampling rate: {median_sps} Hz")

# Number of samples in one window of window_duration_seconds at the median rate
fixed_window_size = int(median_sps * window_duration_seconds)
print(f"Fixed window size: {fixed_window_size} samples per window")


In [0]:
import numpy as np
import pandas as pd
import glob
import os

# Columns fed to the model, and the column holding the 0/1 fish label
feature_cols = ["upstream", "downstream", "differential_conductance"]
label_col = ["fish_present"]

# Read labelled dailies from here; write balanced windows there
input_folder = "/dbfs/FileStore/rachlenn/labeled/"
output_folder = "/dbfs/FileStore/rachlenn/windows_balanced/"
os.makedirs(output_folder, exist_ok=True)

def create_windows_fixed(df, window_size, feature_columns=None, label_column=None):
    """Cut a labelled DataFrame into consecutive, non-overlapping windows.

    Rows are sorted by ``Time`` first. Trailing rows that do not fill a whole
    window are dropped.

    Args:
        df: DataFrame with a ``Time`` column plus the feature and label columns.
        window_size: Number of rows per window; must be positive.
        feature_columns: List of feature column names. Defaults to the
            module-level ``feature_cols``.
        label_column: Label column name (or one-element list). Defaults to the
            module-level ``label_col``.

    Returns:
        Tuple ``(X, y)``. ``X`` has shape
        ``(n_windows, window_size, n_features)``. ``y`` has shape
        ``(n_windows,)`` and is 1 where any row in the window is labelled 1,
        else 0. When ``df`` is shorter than one window, ``n_windows`` is 0 but
        ``X`` keeps its three dimensions so it can still be concatenated with
        the windows from other files.

    Raises:
        ValueError: If ``window_size`` is not positive.
    """
    if window_size <= 0:
        raise ValueError(f"window_size must be positive, got {window_size}")

    if feature_columns is None:
        feature_columns = feature_cols
    if label_column is None:
        label_column = label_col

    df = df.sort_values("Time").reset_index(drop=True)
    data = df[feature_columns].to_numpy()
    labels = df[label_column].to_numpy().ravel()

    # No overlap: window i covers rows [i * window_size, (i + 1) * window_size)
    n_windows = len(df) // window_size
    usable_rows = n_windows * window_size

    X = data[:usable_rows].reshape(n_windows, window_size, data.shape[1])
    window_labels = labels[:usable_rows].reshape(n_windows, window_size)
    y = (window_labels == 1).any(axis=1).astype(int)
    return X, y


def balance_windows(X, y):
    """Downsample non-fish windows so they do not outnumber fish windows.

    Args:
        X: Array of windows, indexed along its first axis.
        y: 1-D array of window labels (1 = fish, 0 = no fish), aligned with X.

    Returns:
        Tuple ``(X_balanced, y_balanced)`` in shuffled order. All fish windows
        are kept, together with an equally sized random sample of non-fish
        windows. If there are fewer non-fish than fish windows, all non-fish
        windows are kept instead of raising. If there are no fish windows,
        ``X`` and ``y`` are returned unchanged.
    """
    fish_indices = np.flatnonzero(y == 1)
    nonfish_indices = np.flatnonzero(y == 0)
    num_fish = len(fish_indices)
    if num_fish == 0:
        # NOTE(review): a fish-free file keeps ALL of its non-fish windows
        # here - confirm that is intended for the combined training set.
        return X, y  # no fish windows, return as-is

    # Sampling without replacement cannot draw more than the population,
    # so cap the request at the number of non-fish windows available.
    num_nonfish = min(num_fish, len(nonfish_indices))
    if num_nonfish:
        sampled_nonfish_indices = np.random.choice(
            nonfish_indices, size=num_nonfish, replace=False
        )
    else:
        sampled_nonfish_indices = nonfish_indices  # empty

    balanced_indices = np.concatenate([fish_indices, sampled_nonfish_indices])
    np.random.shuffle(balanced_indices)

    return X[balanced_indices], y[balanced_indices]

all_X = []
all_y = []

for file_path in glob.glob(os.path.join(input_folder, "*Z")):
    print(f"Processing {file_path}...")
    df = pd.read_csv(file_path)
    df["Time"] = pd.to_datetime(df["Time"], utc=True)

    # create_windows_fixed requires the window size; use the
    # fixed_window_size computed from the median sampling rate.
    X, y = create_windows_fixed(df, fixed_window_size)
    if len(X) == 0:
        # A file shorter than one window yields nothing to save or combine
        print(f"Skipping {file_path}: shorter than one window")
        continue
    X_balanced, y_balanced = balance_windows(X, y)

    # Drop only the trailing "Z" (str.replace would rewrite every "Z" in the name)
    base_name = os.path.basename(file_path)
    stem = base_name[:-1] if base_name.endswith("Z") else base_name

    # Save balanced windows per file
    np.save(os.path.join(output_folder, stem + "_X_balanced.npy"), X_balanced)
    np.save(os.path.join(output_folder, stem + "_y_balanced.npy"), y_balanced)

    all_X.append(X_balanced)
    all_y.append(y_balanced)

if not all_X:
    # np.concatenate on an empty list raises a much less helpful error
    raise ValueError(f"No windows were produced from files in {input_folder}")

# Combine all balanced windows
all_X = np.concatenate(all_X)
all_y = np.concatenate(all_y)

print(f"Combined balanced windows shape: {all_X.shape}, labels shape: {all_y.shape}")


**Balance training dataset**

The ratio of noise (0/no fish) to signal (1/fish) is heavily skewed towards noise. We need to balance it so the ratio is closer to 1:1 and the data can be used as a training dataset. Otherwise a model could label everything as 0 and still get ~99% accuracy.

In [0]:
# Get packages
import pandas as pd
import glob
import os

# Get the paths to the right folders
input_folder = "/dbfs/FileStore/rachlenn/labeled/"  # Where the labelled daily datasets are
output_folder = "/dbfs/FileStore/rachlenn/balanced/"  # Where to put the balanced df

os.makedirs(output_folder, exist_ok=True)

# Get all labelled daily files in the input folder (names ending in Z)
csv_files = glob.glob(os.path.join(input_folder, "*Z"))

# Collect the balanced df of each daily file
balanced_dfs = []

# Run through all of the labelled daily df and balance the noise to signal
for file in csv_files:
    print(f"Processing {file}...")
    df = pd.read_csv(file)

    # Split fish vs no-fish
    fish_df = df[df['fish_present'] == 1]
    no_fish_df = df[df['fish_present'] == 0]

    # Match counts so equalised. DataFrame.sample cannot draw more rows than
    # exist, so cap at the number of no-fish rows instead of raising when
    # fish rows are the majority.
    n_no_fish = min(len(fish_df), len(no_fish_df))
    no_fish_sample = no_fish_df.sample(n_no_fish, random_state=42)

    # Combine and shuffle
    balanced_df = pd.concat([fish_df, no_fish_sample]).sample(frac=1, random_state=42)

    # Save balanced CSV
    filename = os.path.basename(file).replace(".csv", "_balanced.csv")
    balanced_path = os.path.join(output_folder, filename)
    balanced_df.to_csv(balanced_path, index=False)

    balanced_dfs.append(balanced_df)

if not balanced_dfs:
    # pd.concat on an empty list raises a less helpful error
    raise ValueError(f"No labelled files found in {input_folder}")

# Merge into one big dataset
merged_df = pd.concat(balanced_dfs).sample(frac=1, random_state=42)
merged_df.to_csv(os.path.join(output_folder, "all_balanced.csv"), index=False)

print(" All balanced CSVs and master df saved.")
