**Random Forest Resistivity Approach**

Compute summary features over a sliding window and feed them to a random forest classifier to predict the presence/absence of fish events.

In [0]:
import pandas as pd

# Peek at a single labeled recording to sanity-check its columns before
# loading the full set.
sample_csv_path = "/dbfs/FileStore/rachlenn/labeled/test_KMThu16_2021_07_15_15_36_24Z"
df = pd.read_csv(sample_csv_path)
display(df.head(n=10))


In [0]:
from pyspark.sql import SparkSession

# Load every labeled recording (files ending in "Z") into one Spark DataFrame,
# letting Spark infer column types, and order the rows chronologically.
labeled_glob = "/FileStore/rachlenn/labeled/*Z"
spark_df = spark.read.csv(labeled_glob, header=True, inferSchema=True).orderBy("Time")

# Collect the ordered rows to the driver as a pandas DataFrame for windowing.
pdf = spark_df.toPandas()

Make a sliding window of ~10 seconds

In [0]:

import pandas as pd

# Estimate the sampling rate of the combined recording and derive how many
# rows make up one analysis window.

# Total number of samples
num_samples = len(pdf)
if num_samples < 2:
    raise ValueError("Need at least two samples in `pdf` to estimate the sampling rate.")

# Parse timestamps on the frame that was actually loaded (`pdf`). The previous
# version referenced `full_pdf`, which is never defined in this notebook.
# pd.to_datetime is a no-op on an already-datetime column, so re-running this
# cell is safe.
pdf["Time"] = pd.to_datetime(pdf["Time"])

# Time difference in seconds between the first and last row (rows are sorted by Time)
time_delta = (pdf["Time"].iloc[-1] - pdf["Time"].iloc[0]).total_seconds()
if time_delta <= 0:
    raise ValueError(
        f"Non-positive recording duration ({time_delta} s); is `pdf` sorted by 'Time'?"
    )

# Samples per second
# NOTE(review): this is an average over the whole span; if the CSVs have gaps
# between them the true per-file rate is higher — confirm.
sps = num_samples / time_delta

print(f"Samples per second: {sps:.2f}")

# Make a window frame of 10 seconds (may need to change this later on but discuss with AF)
window_duration_seconds = 10

window_size = int(sps * window_duration_seconds)

print(f"Rows per {window_duration_seconds} sec window: {window_size}")

Engineer summary features (mean, std, min, max, energy) for each window. A random forest is a traditional ML model rather than a neural network, so it needs hand-crafted features to "learn" from.

In [0]:
## Feature Engineering
# Slide a fixed-size window over `pdf` and summarise each signal column inside
# the window (mean, std, min, max, energy). One output row per window.

if window_size < 1:
    raise ValueError(f"window_size must be >= 1, got {window_size}")

# Step size: half a window, i.e. 50% overlap between consecutive windows.
# Clamped to 1 because range() raises ValueError on a zero step, which
# int(window_size / 2) produces when window_size == 1.
step_size = max(1, window_size // 2)  # 50% overlap of frame - can change?

# Signal columns to summarise in every window
signal_cols = ["upstream", "downstream", "differential_conductance"]

# Collect one dict per window; the DataFrame is built once at the end
feature_rows = []

# Extract features over windows (only full windows; a trailing partial window is dropped)
for start in range(0, len(pdf) - window_size + 1, step_size):
    window = pdf.iloc[start:start + window_size]

    features = {
        "window_start_time": window["Time"].iloc[0],
        "window_end_time": window["Time"].iloc[-1],
    }

    for col in signal_cols:
        signal = window[col]
        features[f"{col}_mean"] = signal.mean()
        features[f"{col}_std"] = signal.std()
        features[f"{col}_min"] = signal.min()
        features[f"{col}_max"] = signal.max()
        # Energy = sum of squared samples within the window
        features[f"{col}_energy"] = (signal ** 2).sum()

    feature_rows.append(features)

# Convert to new df
features_df = pd.DataFrame(feature_rows)

# Check
display(features_df.head())


In [0]:
## Add fish events: hand-entered (start, end) timestamps in UTC
# NOTE(review): two entries were inverted (end before start) in the original list
# and have been corrected to follow the start + 3 s pattern used by most
# other events — please confirm against the source log:
#   * 2021-07-17T07:25:29Z ended at 07:25:22Z  -> now 07:25:32Z
#   * 2021-07-19T14:14:20Z ended on 2021-07-18 -> now 2021-07-19T14:14:23Z
# The last entry has identical start and end times (zero-length event); it is
# left unchanged and still matches any window containing that instant.
fish_events = [
    ("2021-07-16T22:20:00Z", "2021-07-16T22:22:03Z"),
    ("2021-07-17T05:53:21Z", "2021-07-17T05:53:24Z"),
    ("2021-07-17T07:25:29Z", "2021-07-17T07:25:32Z"),
    ("2021-07-17T14:10:23Z", "2021-07-17T14:10:26Z"),
    ("2021-07-17T14:48:28Z", "2021-07-17T14:48:31Z"),
    ("2021-07-18T06:22:04Z", "2021-07-18T06:22:07Z"),
    ("2021-07-18T18:52:47Z", "2021-07-18T18:53:00Z"),
    ("2021-07-18T22:28:41Z", "2021-07-18T22:28:44Z"),
    ("2021-07-18T22:28:45Z", "2021-07-18T22:28:48Z"),
    ("2021-07-19T06:31:20Z", "2021-07-19T06:31:23Z"),
    ("2021-07-19T06:34:17Z", "2021-07-19T06:34:20Z"),
    ("2021-07-19T06:57:29Z", "2021-07-19T06:57:32Z"),
    ("2021-07-19T08:44:38Z", "2021-07-19T08:44:41Z"),
    ("2021-07-19T14:14:20Z", "2021-07-19T14:14:23Z"),
    ("2021-07-19T16:36:06Z", "2021-07-19T16:36:09Z"),
    ("2021-07-19T16:41:30Z", "2021-07-19T16:41:33Z"),
    ("2021-07-19T16:55:35Z", "2021-07-19T16:55:38Z"),
    ("2021-07-19T17:05:45Z", "2021-07-19T17:05:48Z"),
    ("2021-07-19T17:40:18Z", "2021-07-19T17:40:21Z"),
    ("2021-07-19T19:08:54Z", "2021-07-19T19:08:57Z"),
    ("2021-07-19T19:26:00Z", "2021-07-19T19:26:03Z"),
    ("2021-07-20T07:08:02Z", "2021-07-20T07:08:05Z"),
    ("2021-07-20T07:12:20Z", "2021-07-20T07:12:23Z"),
    ("2021-07-20T07:14:19Z", "2021-07-20T07:14:22Z"),
    ("2021-07-20T07:58:45Z", "2021-07-20T07:58:48Z"),
    ("2021-07-20T09:31:34Z", "2021-07-20T09:31:34Z"),
]


def _as_utc(ts):
    """Return ``ts`` as a timezone-aware UTC ``pd.Timestamp``.

    Timezone-naive inputs are assumed to already be in UTC and are localized;
    timezone-aware inputs are converted. This lets window times and event
    times be compared even when one side is naive and the other aware
    (pandas raises TypeError on such mixed comparisons).
    """
    ts = pd.Timestamp(ts)
    return ts.tz_localize("UTC") if ts.tzinfo is None else ts.tz_convert("UTC")


# Convert to datetime (UTC-aware)
fish_events = [(_as_utc(start), _as_utc(end)) for start, end in fish_events]

# Fail loudly on inverted intervals: with the overlap test below, an event whose
# end precedes its start silently matches (almost) no window.
_inverted_events = [(start, end) for start, end in fish_events if end < start]
if _inverted_events:
    raise ValueError(f"Fish events with end before start: {_inverted_events}")


def label_window(window_start, window_end):
    """Label a window 1 if it overlaps any fish event, else 0.

    Parameters
    ----------
    window_start, window_end : timestamp-like
        Bounds of the window (inclusive). Naive values are treated as UTC.

    Returns
    -------
    int
        1 when ``[window_start, window_end]`` intersects at least one interval
        in ``fish_events`` (endpoints inclusive), otherwise 0.
    """
    win_start = _as_utc(window_start)
    win_end = _as_utc(window_end)
    # Two closed intervals overlap iff each one starts before the other ends.
    return int(any(
        win_start <= fish_end and win_end >= fish_start
        for fish_start, fish_end in fish_events
    ))

# Label each window by checking its start/end bounds against the fish events.
window_bounds = zip(features_df["window_start_time"], features_df["window_end_time"])
features_df["label"] = [label_window(w_start, w_end) for w_start, w_end in window_bounds]

# Class balance, then the labeled feature table
label_counts = features_df["label"].value_counts()
print(label_counts)
display(features_df)


**RF Model: Trial #1**

In [0]:
## Set up train/test split
from sklearn.model_selection import train_test_split

# Model inputs are every column except the target and the window bookkeeping
# timestamps.
non_feature_cols = {"label", "window_start_time", "window_end_time"}
feature_cols = [c for c in features_df.columns if c not in non_feature_cols]

x = features_df.loc[:, feature_cols]
y = features_df.loc[:, "label"]

# Stratified 70/30 split so both sides keep the same 0/1 ratio.
# NOTE(review): consecutive windows overlap by 50% and this split shuffles
# rows, so near-duplicate neighbours can land on both sides of the split;
# consider a time- or group-based split before trusting the test metrics.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

print(f"Train size: {len(x_train)}, Test size: {len(x_test)}")

In [0]:
## Run the RF model
from sklearn.ensemble import RandomForestClassifier


# Hyper-parameters for trial #1. class_weight="balanced" takes into account
# that the 0/1 label ratio is off.
rf_params = {
    "n_estimators": 100,
    "class_weight": "balanced",
    "random_state": 42,
}
clf = RandomForestClassifier(**rf_params)
clf.fit(x_train, y_train)

In [0]:
## Evaluate model
from sklearn.metrics import classification_report, confusion_matrix

# Score the held-out windows and report per-class performance.
y_pred = clf.predict(x_test)

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:", cm, sep="\n")

report = classification_report(y_test, y_pred)
print("\nClassification Report:", report, sep="\n")


Try downsampling the negative (no-fish) windows instead of using class weights

In [0]:
## Balance the counts (Downsample)
# Keep every positive window and a random subset of negatives so the classes
# end up at roughly 1:2 (positives:negatives).

# separate into events vs non events
positives = features_df[features_df["label"] == 1]
negatives = features_df[features_df["label"] == 0]

print("Original counts:")
print("Positives:", len(positives))
print("Negatives:", len(negatives))

# to get about 1:2 ratio; clamp to the number of negatives available because
# DataFrame.sample(n=...) without replacement raises ValueError when n exceeds
# the number of rows.
n_negatives_to_keep = min(len(negatives), len(positives) * 2)

# Randomly sample negatives (reproducible)
negatives_sampled = negatives.sample(n=n_negatives_to_keep, random_state=42)

# Combine back into a balanced df, shuffle the rows, and reset the index
balanced_df = (
    pd.concat([positives, negatives_sampled])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print("Balanced counts:")
print(balanced_df["label"].value_counts())


In [0]:
## Set up train/test split (Downsample)
from sklearn.model_selection import train_test_split

# Model inputs: all columns of the balanced table except the target and the
# window bookkeeping timestamps.
excluded_cols = ("label", "window_start_time", "window_end_time")
feature_cols = [name for name in balanced_df.columns if name not in excluded_cols]

x = balanced_df.loc[:, feature_cols]
y = balanced_df.loc[:, "label"]

# Stratified 70/30 split of the downsampled data. This overwrites the
# x_train/x_test/y_train/y_test produced for trial #1.
split_parts = train_test_split(x, y, test_size=0.3, random_state=42, stratify=y)
x_train, x_test, y_train, y_test = split_parts

print(f"Train size: {len(x_train)}, Test size: {len(x_test)}")


In [0]:
## Run the RF model (DOWNSAMPLED)
from sklearn.ensemble import RandomForestClassifier


# Same forest as trial #1 but without class weighting: the classes were
# already rebalanced by downsampling the negatives.
rf_params_downsampled = {"n_estimators": 100, "random_state": 42}
clf = RandomForestClassifier(**rf_params_downsampled)
clf.fit(x_train, y_train)


In [0]:
## Evaluate DS model
from sklearn.metrics import classification_report, confusion_matrix

# Predict on the held-out downsampled windows. The split cell names the test
# features `x_test` (lowercase); the previous `X_test` raised NameError.
y_pred = clf.predict(x_test)

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

