**Generate labelled resistivity data** 

Loop through the raw daily .csv files in DBFS and align each one with the master event file, so that every resistivity sample is labelled with whether a known fish event was in progress. Save the labelled data to a new folder called `labeled`. 

In [0]:
# Get packages
import pandas as pd
import glob
import os

# Load and prepare events file with all of the known fish events
events_path = "/dbfs/FileStore/rachlenn/Thr 20 process/test_KMThu16_2021_07_eventonly.csv"
events = pd.read_csv(events_path, header=0)

# Ensure datetime is timezone-aware
events['Time'] = pd.to_datetime(events['Time'], utc=True)

# Half-width of the labelling window placed around each event timestamp.
# NOTE(review): the previous comment described this as +/- 2 seconds "as per AF
# parameters", but the value in use is 2.5 seconds - confirm which is intended.
# We might want to truncate this window.
event_half_window = pd.Timedelta(seconds=2.5)

# Create the start/end bounds of each event window
events['start_time'] = events['Time'] - event_half_window
events['end_time'] = events['Time'] + event_half_window


# Function to label fish presence from the timestamps in the raw df 
def label_fish_presence(df, events_df):
    """Flag every sample whose timestamp falls inside any event window.

    Args:
        df: Daily readings with a ``Time`` column. Modified in place: ``Time``
            is converted to timezone-aware UTC datetimes and a
            ``fish_present`` column is added.
        events_df: Event table with ``start_time`` and ``end_time`` columns
            bounding each window; both bounds are inclusive.

    Returns:
        The same ``df`` object, with ``fish_present`` equal to 1 for rows
        inside at least one event window and 0 everywhere else.
    """
    timestamps = pd.to_datetime(df['Time'], utc=True)
    df['Time'] = timestamps
    df['fish_present'] = 0

    # Walk the window bounds pairwise; overlapping windows simply re-mark rows.
    for window_start, window_end in zip(events_df['start_time'], events_df['end_time']):
        inside_window = timestamps.between(window_start, window_end)
        df.loc[inside_window, 'fish_present'] = 1

    return df

# Where the de-duplicated daily files live, and where the labelled copies go
input_pattern = "/dbfs/FileStore/rachlenn/DuplicateFree/*_no_duplicate"
output_folder = "/dbfs/FileStore/rachlenn/labeled"

os.makedirs(output_folder, exist_ok=True)

# Label each daily file in turn and write it out under a "_labelled" name
for file_path in glob.glob(input_pattern):
    print(f"Processing {file_path}...")

    # Load the daily data and give its three columns fixed names
    df = pd.read_csv(file_path, header=0)
    df.columns = ["timestamp", "upstream", "downstream"]

    # Swap the millisecond timestamp column for a timezone-aware UTC datetime
    df["Time"] = pd.to_datetime(df.pop("timestamp"), unit="ms", utc=True)

    # Differential conductance: half the downstream-minus-upstream difference
    df["differential_conductance"] = df["downstream"].sub(df["upstream"]).div(2)

    # Mark the rows that fall inside a known fish event window
    labeled_df = label_fish_presence(df, events)

    # Write the labelled day next to the others in the output folder
    filename = os.path.basename(file_path).replace("_no_duplicate", "_labelled")
    save_path = os.path.join(output_folder, filename)
    labeled_df.to_csv(save_path, index=False)

print("All datasets labelled and saved.")


**Extract Windows**

The ML models should be fed windows of the signal rather than individual samples, so we need to generate these windows before balancing the data stream. 

In [0]:
import pandas as pd
import glob
import os

input_pattern = "/dbfs/FileStore/rachlenn/labeled/*_labelled"
output_file = "/dbfs/FileStore/rachlenn/windows_10s.csv"

window_size_seconds = 10

# Fixed output schema, so the CSV keeps its header even when no windows are found
window_columns = [
    "start_time",
    "end_time",
    "upstream_values",
    "downstream_values",
    "diff_values",
    "label",
]


def extract_windows(df, window_size_seconds):
    """Cut one labelled daily frame into consecutive fixed-length windows.

    Windows are half-open intervals ``[start, start + window)`` laid end to
    end from the earliest timestamp in ``df``. Only windows that contain at
    least one sample are returned.

    Args:
        df: Labelled readings with ``Time``, ``upstream``, ``downstream``,
            ``differential_conductance`` and ``fish_present`` columns.
        window_size_seconds: Length of each window in seconds.

    Returns:
        A list of dicts, one per non-empty window, in chronological order.
        ``label`` is 1 if any sample in the window has ``fish_present`` set,
        otherwise 0.
    """
    window_length = pd.Timedelta(seconds=window_size_seconds)

    df = df.assign(Time=pd.to_datetime(df['Time'], utc=True))
    # Rows without a usable timestamp cannot be placed in any window.
    df = df.dropna(subset=['Time']).sort_values('Time')
    if df.empty:
        return []

    first_time = df['Time'].iloc[0]

    # Number of whole windows elapsed since the first sample. Binning this way
    # assigns every row to exactly one window (including a final sample that
    # lands exactly on a window boundary) and needs a single pass over the data
    # instead of re-scanning the whole frame for each window.
    window_index = ((df['Time'] - first_time) // window_length).to_numpy()

    windows = []
    for index, window_df in df.groupby(window_index):
        current_start = first_time + int(index) * window_length
        windows.append({
            "start_time": current_start,
            "end_time": current_start + window_length,
            "upstream_values": window_df['upstream'].tolist(),
            "downstream_values": window_df['downstream'].tolist(),
            "diff_values": window_df['differential_conductance'].tolist(),
            "label": 1 if window_df['fish_present'].any() else 0,
        })
    return windows


all_windows = []

for file in glob.glob(input_pattern):
    df = pd.read_csv(file)
    all_windows.extend(extract_windows(df, window_size_seconds))

# Combine into one DataFrame
windows_df = pd.DataFrame(all_windows, columns=window_columns)

# Save for training.
# NOTE: the *_values columns hold Python lists, which to_csv writes as their
# string form; they need parsing back into lists when this file is loaded.
windows_df.to_csv(output_file, index=False)
print(f"Saved {len(windows_df)} windows to {output_file}")


**Balance training dataset**

The ratio of noise (0 / no fish) to signal (1 / fish) is heavily skewed towards noise. We need to balance it so the ratio is closer to 1:1 before using the data for training. Otherwise a model could label everything as 0 and still reach around 99% accuracy. 

In [0]:
# Get packages
import pandas as pd
import glob
import os

# Get the paths to the right folders
input_folder = "/dbfs/FileStore/rachlenn/labeled/" # Where the labelled daily datasets are
output_folder = "/dbfs/FileStore/rachlenn/balanced/" # Where to put the balanced df 

os.makedirs(output_folder, exist_ok=True)

# Get all labelled daily files. The labelling step saves them with a
# "_labelled" suffix (no extension), so match on that suffix.
csv_files = glob.glob(os.path.join(input_folder, "*_labelled"))
if not csv_files:
    # Fail with a clear message instead of an opaque pd.concat error later on.
    raise FileNotFoundError(f"No labelled files found in {input_folder}")

# Collect the balanced daily frames for the merged dataset
balanced_dfs = []

# NOTE(review): this balances individual samples, not windows. If the models
# are to be trained on windows, the balancing probably needs to be applied to
# the extracted windows instead - confirm the intended order of the steps.

# Run through all of the labelled daily df and balance the noise to signal
for file in csv_files:
    print(f"Processing {file}...")
    df = pd.read_csv(file)
    
    # Split fish vs no-fish
    fish_df = df[df['fish_present'] == 1]
    no_fish_df = df[df['fish_present'] == 0]
    
    # Match counts so equalised. Sampling more rows than a class contains
    # raises, so cap at the size of the smaller class.
    n_per_class = min(len(fish_df), len(no_fish_df))
    no_fish_sample = no_fish_df.sample(n_per_class, random_state=42)
    if len(fish_df) > n_per_class:
        # Rare case: more fish than no-fish rows, so downsample fish instead.
        fish_df = fish_df.sample(n_per_class, random_state=42)
    
    # Combine and shuffle
    balanced_df = pd.concat([fish_df, no_fish_sample]).sample(frac=1, random_state=42)
    
    # Save balanced CSV, following the "_no_duplicate" -> "_labelled" naming
    filename = os.path.basename(file).replace("_labelled", "_balanced")
    balanced_path = os.path.join(output_folder, filename)
    balanced_df.to_csv(balanced_path, index=False)
    
    balanced_dfs.append(balanced_df)

# Merge into one big dataset
merged_df = pd.concat(balanced_dfs).sample(frac=1, random_state=42)
merged_df.to_csv(os.path.join(output_folder, "all_balanced.csv"), index=False)

print(" All balanced CSVs and master df saved.")
