**Basic 1D CNN Model for small sample size**

In [0]:
import pandas as pd

# Load the balanced dataset from the mounted lab storage and preview the first ten rows
df = pd.read_csv(
    "/dbfs/mnt/lab/unrestricted/rachel.lennon@defra.gov.uk/cleaned/all_balanced.csv"
)
display(df.head(n=10))


**Data Prep**

In [0]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Turn one cell of the signal column into a float32 vector
def parse_signal(x):
    """Convert a stored signal into a 1-D ``float32`` NumPy array.

    Parameters
    ----------
    x : list, numpy.ndarray or str
        Either an already-numeric sequence, or its text form such as
        ``"[ -1.63 -1.53 ... ]"`` where values are separated by whitespace,
        commas and/or newlines.

    Returns
    -------
    numpy.ndarray
        A new array with dtype ``float32``.
    """
    if not isinstance(x, (list, np.ndarray)):
        # Text form: turn newlines/commas into spaces, then drop the outer brackets
        separators_to_space = str.maketrans({"\n": " ", ",": " "})
        cleaned = x.strip().translate(separators_to_space).strip("[]")
        tokens = cleaned.split()
        return np.array(tokens, dtype=np.float32)

    # Already a numeric sequence: just copy/cast it
    return np.array(x, dtype=np.float32)

# Convert every stored signal into a float32 vector
signals = df["signal_scaled"].apply(parse_signal)

# Pad/truncate all signals to a fixed length (e.g., 10s * 100Hz = 1000).
# Shorter signals are zero-padded at the end; longer ones are cut at the end.
max_len = 1000
X = pad_sequences(signals, maxlen=max_len, dtype="float32",
                  padding="post", truncating="post")

# Binary labels as float32 (already numeric after this cast, so no later re-cast is needed)
y = df["fish_present"].astype(np.float32).to_numpy()

# Fixed seed so the stratified split -- and therefore every metric and the
# tuned decision threshold further down -- is the same on each Restart & Run All.
SEED = 42

# Stratified 80/20 train/validation split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=SEED
)

# Add the trailing channel dimension Conv1D expects: (samples, timesteps, 1)
X_train = X_train[..., np.newaxis]
X_val = X_val[..., np.newaxis]

# Check shapes
print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)


**Define Model**

In [0]:
from tensorflow.keras import layers, models, callbacks

# Simple Conv1D baseline
def simple_conv1d(input_length):
    """Build and compile a small 1D-CNN binary classifier.

    The network stacks three Conv1D(kernel size 5, ReLU) + MaxPooling1D(2)
    stages with 16, 32 and 64 filters, followed by global average pooling and
    a single sigmoid output unit.

    Parameters
    ----------
    input_length : int
        Number of timesteps per sample; the model input is
        ``(input_length, 1)``.

    Returns
    -------
    A compiled Keras ``Sequential`` model (Adam optimizer, binary
    cross-entropy loss, accuracy metric).
    """
    model = models.Sequential([
        # Declare the input explicitly rather than via `input_shape=` on the
        # first layer, which newer Keras releases warn against.
        layers.Input(shape=(input_length, 1)),

        layers.Conv1D(16, 5, activation="relu"),
        layers.MaxPooling1D(2),

        layers.Conv1D(32, 5, activation="relu"),
        layers.MaxPooling1D(2),

        layers.Conv1D(64, 5, activation="relu"),
        layers.MaxPooling1D(2),

        # Collapse the time axis so the head is independent of sequence length
        layers.GlobalAveragePooling1D(),
        layers.Dense(1, activation="sigmoid")
    ])

    model.compile(
        optimizer="adam",
        loss="binary_crossentropy",
        metrics=["accuracy"]
    )
    return model



In [0]:
# `model` is only created in the training cell below, so calling model.summary()
# here fails on a fresh kernel. Build a throwaway instance purely to print the
# architecture; the model that gets trained is constructed in the training cell.
simple_conv1d(max_len).summary()

**Train Model**

In [0]:
# Class weighting: mistakes on the fish class (1) cost twice as much as on class 0
class_weights = {0: 1.0, 1: 2.0}

# Callbacks, both driven by validation loss:
#  - stop after 5 epochs without improvement and roll back to the best weights
#  - halve the learning rate after 3 epochs without improvement
early_stop = callbacks.EarlyStopping(
    monitor="val_loss",
    patience=5,
    restore_best_weights=True,
)
reduce_lr = callbacks.ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.5,
    patience=3,
)

# Fresh model sized to the padded signal length, then fit
model = simple_conv1d(max_len)
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=16,
    epochs=50,
    verbose=1,
    validation_data=(X_val, y_val),
    class_weight=class_weights,
    callbacks=[early_stop, reduce_lr],
)

**Evaluate**

In [0]:
from sklearn.metrics import classification_report

# Loss and compiled metrics on the validation split, keyed by metric name
results = model.evaluate(X_val, y_val, verbose=0)
print({name: value for name, value in zip(model.metrics_names, results)})

# Sigmoid outputs -> hard 0/1 labels using the default 0.5 cut-off
y_pred_proba = model.predict(X_val)
y_pred = (y_pred_proba > 0.5).astype(int).flatten()

# Per-class precision / recall / F1
print(classification_report(y_val, y_pred, digits=3))

**Hyperparameter tuning**

Find the decision threshold that best separates fish from no-fish predictions, using Youden's J statistic on the ROC curve.

In [0]:
from sklearn.metrics import roc_curve, auc, precision_recall_curve

# ROC operating points for the validation predictions; Youden's J = TPR - FPR,
# and the threshold at its maximum is reported as the best cut-off.
fpr, tpr, thr = roc_curve(y_val, y_pred_proba)
print("Best threshold (Youden's J):", thr[(tpr - fpr).argmax()])


In [0]:
# Apply the ROC-derived threshold. It is recomputed here rather than pasted in
# as a literal, because a number copied from one run goes stale whenever the
# split or the trained weights change.
from sklearn.metrics import classification_report, confusion_matrix, roc_curve

fpr, tpr, thr = roc_curve(y_val, y_pred_proba)

# thr[0] is roc_curve's sentinel for "nothing predicted positive" (the (0, 0)
# point), so it is skipped when looking for the maximum of Youden's J = TPR - FPR.
best_idx = int(np.argmax((tpr - fpr)[1:])) + 1
best_threshold = float(thr[best_idx])
print(f"Applying threshold: {best_threshold:.3f}")

# roc_curve counts scores >= threshold as positive, so use >= here as well
y_pred = (y_pred_proba >= best_threshold).astype(int).flatten()

# NOTE(review): the threshold is chosen and scored on the same validation
# split, so these numbers are likely optimistic -- confirm on held-out data.
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred, digits=3))


In [0]:
# Per-epoch loss recorded by Keras during fit(), for the training and validation data
train_loss, val_loss = (history.history[key] for key in ("loss", "val_loss"))
print("Training loss is", train_loss)
print("Validation loss is", val_loss)

# Per-epoch accuracy for the same two splits
train_acc, val_acc = (history.history[key] for key in ("accuracy", "val_accuracy"))

In [0]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Confusion matrix of validation labels vs. the current thresholded predictions
cm = confusion_matrix(y_val, y_pred)

# Annotated heatmap; rows are true classes, columns are predicted classes
class_names = ["No Fish", "Fish"]
ax = sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
plt.show()

In [0]:
from sklearn.metrics import roc_curve, auc

# ROC curve and its area for the validation predictions
fpr, tpr, thresholds = roc_curve(y_val, y_pred_proba)
roc_auc = auc(fpr, tpr)

# Draw on the current axes: model curve first, then the chance diagonal
ax = plt.gca()
ax.plot(fpr, tpr, label=f"ROC curve (AUC = {roc_auc:.3f})")
ax.plot([0, 1], [0, 1], "k--")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve")
ax.legend()
plt.show()

This is a bit better, but the model is still basically flipping a coin. Possible next steps: collect more data, or try data augmentation?