**Generate labelled resistivity data** 

Loop through the raw daily .csv files in DBFS and align each one with the master event file (the data are too big to combine and process in one pass). Save the labelled data to a new folder called `labeled`.

In [0]:
# Get packages
import pandas as pd
import glob
import os

# Read the events file and derive a time window around each event

events_path = "/dbfs/FileStore/rachlenn/Thr 20 process/test_KMThu16_2021_07.csv"
events = pd.read_csv(events_path, header=0)

# Parse the event times once as timezone-aware (UTC) datetimes
event_times = pd.to_datetime(events['Time'], utc=True)
events['Time'] = event_times

# Each window opens 2 s before the event time and closes 8 s after it
events['start_time'] = event_times - pd.Timedelta(seconds=2)
events['end_time'] = event_times + pd.Timedelta(seconds=8)


# Function to label fish presence
def label_fish_presence(df, events_df):
    """Flag every sample whose time falls inside any event window.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Time' column that ``pd.to_datetime`` can parse.
        Modified in place: 'Time' is converted to UTC datetimes and an
        integer 'fish_present' column (0 or 1) is added.
    events_df : pandas.DataFrame
        Must contain 'start_time' and 'end_time' columns. Both bounds are
        inclusive.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    df['Time'] = pd.to_datetime(df['Time'], utc=True)
    df['fish_present'] = 0

    if events_df.empty:
        return df

    sample_times = df['Time']
    first_sample = sample_times.min()
    last_sample = sample_times.max()

    # Coerce the window bounds so string or naive inputs compare cleanly
    # against the UTC sample times (a no-op for UTC datetimes).
    window_starts = pd.to_datetime(events_df['start_time'], utc=True)
    window_ends = pd.to_datetime(events_df['end_time'], utc=True)

    # Only windows that overlap this frame's time span can label a row, so
    # skip the rest up front. Comparisons against NaT are False, so an
    # empty/all-NaT frame or a window with a missing bound selects nothing.
    overlaps = (window_ends >= first_sample) & (window_starts <= last_sample)

    for window_start, window_end in zip(window_starts[overlaps], window_ends[overlaps]):
        in_window = (sample_times >= window_start) & (sample_times <= window_end)
        df.loc[in_window, 'fish_present'] = 1

    return df


# Label every daily file and write the result to the output folder
input_pattern = "/dbfs/FileStore/rachlenn/Thr 20 process/*Z"  # match your naming
output_folder = "/dbfs/FileStore/rachlenn/labeled"

os.makedirs(output_folder, exist_ok=True)

for file_path in glob.glob(input_pattern):
    print(f"Processing {file_path}...")

    # Read the day's data; the header row is replaced by fixed column names
    df = pd.read_csv(file_path, header=0)
    df.columns = ["timestamp", "upstream", "downstream"]

    # Swap the millisecond timestamp column for a UTC datetime column
    df["Time"] = pd.to_datetime(df.pop("timestamp"), unit="ms", utc=True)

    # Extra feature: half the downstream-minus-upstream difference
    df["differential_conductance"] = df["downstream"].sub(df["upstream"]).div(2)

    # Mark the rows that fall inside an event window
    labeled_df = label_fish_presence(df, events)

    # Write the labelled data out.
    # NOTE(review): the glob above matches names ending in "Z", so unless a
    # name also contains ".csv" this replace leaves it unchanged.
    filename = os.path.basename(file_path).replace(".csv", "_labeled.csv")
    save_path = os.path.join(output_folder, filename)
    labeled_df.to_csv(save_path, index=False)

print("✅ All datasets processed and saved.")


In [0]:
import pandas as pd
import glob
import os

# Folder holding the labelled files, and folder for the balanced copies
input_folder = "/dbfs/FileStore/rachlenn/labeled/"
output_folder = "/dbfs/FileStore/rachlenn/balanced/"

os.makedirs(output_folder, exist_ok=True)

# Collect the labelled files; the pattern matches names ending in "Z"
csv_files = glob.glob(os.path.join(input_folder, "*Z"))
if not csv_files:
    # Fail early with a clear message instead of pd.concat's
    # "No objects to concatenate" after the loop.
    raise FileNotFoundError(f"No labelled files matching '*Z' found in {input_folder}")

balanced_dfs = []

for file in csv_files:
    print(f"Processing {file}...")
    df = pd.read_csv(file)

    # Split fish vs no-fish
    fish_df = df[df['fish_present'] == 1]
    no_fish_df = df[df['fish_present'] == 0]

    # Match counts: downsample whichever class is larger to the size of the
    # smaller one. Sampling without replacement cannot draw more rows than
    # exist, so a file with more fish rows than no-fish rows would otherwise
    # raise ValueError.
    n_per_class = min(len(fish_df), len(no_fish_df))
    if len(fish_df) > n_per_class:
        fish_df = fish_df.sample(n_per_class, random_state=42)
    no_fish_sample = no_fish_df.sample(n_per_class, random_state=42)

    # Combine and shuffle
    balanced_df = pd.concat([fish_df, no_fish_sample]).sample(frac=1, random_state=42)

    # Save balanced CSV.
    # NOTE(review): names matched by "*Z" normally contain no ".csv", so this
    # replace leaves them unchanged and the output keeps the input's name.
    filename = os.path.basename(file).replace(".csv", "_balanced.csv")
    balanced_path = os.path.join(output_folder, filename)
    balanced_df.to_csv(balanced_path, index=False)

    balanced_dfs.append(balanced_df)

# Optional: Merge into one big dataset (shuffled again across files)
merged_df = pd.concat(balanced_dfs).sample(frac=1, random_state=42)
merged_df.to_csv(os.path.join(output_folder, "all_balanced.csv"), index=False)

print("✅ Done — balanced CSVs saved.")


In [0]:
# Show the merged, shuffled balanced dataset in the notebook output
display(merged_df)