**Try Augmenting the Data to Boost the Signal**

Data preparation

In [0]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

import pandas as pd

# Load the dataset from its CSV file into a dataframe.
# The location lives in a named constant so it can be re-pointed in one place.
DATA_PATH = "/dbfs/mnt/lab/unrestricted/rachel.lennon@defra.gov.uk/cleaned/all_balanced.csv"
df = pd.read_csv(DATA_PATH)

# Parse df into numpy arrays
def parse_signal(x):
    """Turn one stored signal into a float32 numpy vector.

    Lists and numpy arrays are converted directly. Anything else is treated
    as text such as "[0.1, 0.2\n 0.3]": commas and newlines become spaces,
    the enclosing square brackets are stripped, and the remaining
    whitespace-separated tokens are parsed as numbers.
    """
    if not isinstance(x, (list, np.ndarray)):
        text = x.strip()
        for separator in ("\n", ","):
            text = text.replace(separator, " ")
        # From here on `x` holds the list of numeric tokens
        x = text.strip("[]").split()
    return np.array(x, dtype=np.float32)

# Convert every stored signal into a float32 vector
signals = df["signal_scaled"].apply(parse_signal)

# Bring all signals to one common length: longer ones are cut at the end,
# shorter ones are padded at the end
max_len = 1000  #10s at 100Hz
X = pad_sequences(
    signals,
    maxlen=max_len,
    dtype="float32",
    padding="post",
    truncating="post",
)

# Binary target as a float32 vector
y = df["fish_present"].astype(np.float32).to_numpy()

# Hold out 20% for validation, keeping the class ratio the same in both parts
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

Augment

In [0]:

# Augmentation functions
def jitter(x, sigma=0.01):
    """Return `x` plus zero-mean Gaussian noise (std `sigma`), drawn independently per element."""
    noise = np.random.normal(loc=0, scale=sigma, size=x.shape)
    return x + noise

def scaling(x, sigma=0.1):
    """Multiply the whole signal by a single random gain drawn from a normal with mean 1 and std `sigma`."""
    return x * np.random.normal(loc=1.0, scale=sigma)

def time_shift(x, shift_max=50):
    """Shift a 1-D signal in time by a random offset, zero-filling the gap.

    Args:
        x: 1-D array-like signal.
        shift_max: Largest shift in samples, in either direction. The offset
            is drawn uniformly from the inclusive range
            [-shift_max, shift_max]; 0 disables shifting.

    Returns:
        An array with the same length and dtype as `x`. A positive offset
        delays the signal (zeros at the start, tail dropped); a negative
        offset advances it (head dropped, zeros at the end). An offset of 0
        returns the input unchanged.
    """
    x = np.asarray(x)
    # randint's upper bound is exclusive, so +1 is needed to make +shift_max reachable
    # (and to make shift_max=0 valid instead of raising ValueError).
    shift = int(np.random.randint(-shift_max, shift_max + 1))
    # Never shift further than the signal is long, so the output keeps x's length
    n = len(x)
    shift = max(-n, min(shift, n))
    if shift == 0:
        return x
    # Pad with x's own dtype so the result dtype does not depend on whether a shift happened
    pad = np.zeros(abs(shift), dtype=x.dtype)
    if shift > 0:
        return np.concatenate([pad, x[:n - shift]])
    return np.concatenate([x[-shift:], pad])

def augment_positive_samples(X_pos, N_aug=5):
    """Create `N_aug` randomly perturbed copies of every signal in `X_pos`.

    Each copy independently receives, each with probability 0.5 and in this
    order: jitter (sigma 0.02), amplitude scaling (sigma 0.1) and a time
    shift of up to 50 samples.

    Args:
        X_pos: Iterable of equal-length 1-D signals (e.g. an (n, length) array).
        N_aug: Number of augmented copies to generate per input signal.

    Returns:
        float32 array of shape (len(X_pos) * N_aug, length). When nothing is
        generated (no inputs or N_aug == 0) the result has zero rows but keeps
        the trailing dimensions of `X_pos`.
    """
    aug_list = []
    for x in X_pos:
        for _ in range(N_aug):
            sample = x.copy()
            if np.random.rand() < 0.5:
                sample = jitter(sample, 0.02)
            if np.random.rand() < 0.5:
                sample = scaling(sample, 0.1)
            if np.random.rand() < 0.5:
                sample = time_shift(sample, shift_max=50)
            aug_list.append(sample)
    if not aug_list:
        # np.array([]) would have shape (0,), which cannot be reshaped to (0, -1) or
        # concatenated with (n, length) data; keep the input's trailing dimensions instead.
        return np.empty((0,) + np.shape(X_pos)[1:], dtype=np.float32)
    return np.array(aug_list, dtype=np.float32)

# Augment positives
# Seed NumPy's global RNG first: the augmentation functions draw from it, so without a
# seed every run would train on a different augmented set.
np.random.seed(42)

X_train_pos = X_train[y_train==1]
X_train_neg = X_train[y_train==0]

N_aug = 5
X_aug = augment_positive_samples(X_train_pos, N_aug)
# Use y_train's dtype so the concatenated labels are not silently upcast
y_aug = np.ones(len(X_aug), dtype=y_train.dtype)

# NOTE(review): only the positive class is augmented, so positives will outnumber
# negatives by roughly (1 + N_aug):1 if the split was balanced — confirm this is
# intended, or compensate (e.g. class weights) when training.

# Ensure 2D for concatenation
X_train_2d = X_train.reshape(X_train.shape[0], -1)
# Explicit width (not -1) so an empty augmentation result can still be reshaped
X_aug_2d = X_aug.reshape(len(X_aug), X_train_2d.shape[1])

# Concatenate and shuffle
X_train_aug = np.concatenate([X_train_2d, X_aug_2d], axis=0)
y_train_aug = np.concatenate([y_train, y_aug], axis=0)
X_train_aug, y_train_aug = shuffle(X_train_aug, y_train_aug, random_state=42)

# Add channel dimension for Conv1D
X_train_aug = X_train_aug[..., np.newaxis]
X_val_exp = X_val[..., np.newaxis]

Define model

In [0]:
# Define model
from tensorflow.keras import layers, models, callbacks

def simple_cnn(input_length):
    """Build (but do not compile) a small 1-D CNN binary classifier.

    The network takes inputs of shape (input_length, 1) and applies, in order:
    Conv1D(32, kernel 5) -> MaxPooling1D(2) -> Conv1D(64, kernel 3) ->
    GlobalAveragePooling1D -> Dropout(0.3) -> Dense(1, sigmoid).

    Returns:
        A Keras functional `Model` mapping the input to a single sigmoid output.
    """
    signal_in = layers.Input(shape=(input_length, 1))
    stack = [
        layers.Conv1D(32, 5, activation='relu', padding='same'),
        layers.MaxPooling1D(2),
        layers.Conv1D(64, 3, activation='relu', padding='same'),
        layers.GlobalAveragePooling1D(),
        layers.Dropout(0.3),
        layers.Dense(1, activation='sigmoid'),
    ]
    # Thread the tensor through the layers in the order listed above
    tensor = signal_in
    for layer in stack:
        tensor = layer(tensor)
    return models.Model(signal_in, tensor)

# Build the network for the padded signal length and configure training
model = simple_cnn(X_train_aug.shape[1])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    # Name the AUC metric explicitly: an unnamed metric gets an auto-generated suffix
    # ("auc_1", "auc_2", ...) when this cell is re-run in the same kernel, which changes
    # the keys reported by fit()/evaluate().
    metrics=['accuracy', tf.keras.metrics.AUC(name='auc')]
)

Train model

In [0]:
# Stop once val_loss has not improved for 5 epochs and roll back to the best weights
early_stop = callbacks.EarlyStopping(
    monitor="val_loss",
    patience=5,
    restore_best_weights=True,
)
# Halve the learning rate after 3 epochs without val_loss improvement
reduce_lr = callbacks.ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.5,
    patience=3,
)

# Train on the augmented set, validating on the untouched hold-out split
history = model.fit(
    x=X_train_aug,
    y=y_train_aug,
    batch_size=32,
    epochs=50,
    validation_data=(X_val_exp, y_val),
    callbacks=[early_stop, reduce_lr],
)


Evaluate

In [0]:
from sklearn.metrics import roc_curve, auc

# Evaluate
results = model.evaluate(X_val_exp, y_val, verbose=0)
print(dict(zip(model.metrics_names, results)))

# Predictions
y_pred_proba = model.predict(X_val_exp)
fpr, tpr, thr = roc_curve(y_val, y_pred_proba)
# Pick the operating point that maximises TPR - FPR (Youden's J statistic).
# NOTE(review): the threshold is tuned on the same validation data that is scored
# below, so the report is optimistic — consider a separate held-out test set.
threshold = thr[np.argmax(tpr - fpr)]

# roc_curve defines each operating point for predictions with `score >= threshold`,
# so the comparison must be inclusive; a strict `>` drops the sample sitting exactly
# on the chosen threshold and no longer reproduces the selected TPR/FPR.
y_pred = (y_pred_proba >= threshold).astype(int).flatten()

from sklearn.metrics import classification_report
print(classification_report(y_val, y_pred, digits=3))

In [0]:
import matplotlib.pyplot as plt

# Training and validation loss curves
train_loss = history.history['loss']
val_loss = history.history['val_loss']

# One entry per curve: (legend label, per-epoch values, colour, annotation offset in points)
loss_curves = [
    ("Training Loss", train_loss, "tab:blue", (10, 10)),
    ("Validation Loss", val_loss, "tab:orange", (10, -15)),
]

# Draw both curves
for curve_label, curve_values, curve_colour, label_offset in loss_curves:
    plt.plot(curve_values, label=curve_label, color=curve_colour)

# Mark the last points
for curve_label, curve_values, curve_colour, label_offset in loss_curves:
    plt.scatter(len(curve_values)-1, curve_values[-1], color=curve_colour, s=60, zorder=3)

# Print each final loss value next to its marker, connected by an arrow
for curve_label, curve_values, curve_colour, label_offset in loss_curves:
    plt.annotate(f"{curve_values[-1]:.4f}",
                 xy=(len(curve_values)-1, curve_values[-1]),
                 xytext=label_offset, textcoords="offset points",
                 ha="left", color=curve_colour,
                 arrowprops=dict(arrowstyle="->", color=curve_colour))

plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.title("Loss Curve")
plt.legend()
plt.tight_layout()
plt.show()

In [0]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc


# ROC operating points on the validation set and the area under that curve
fpr, tpr, thresholds = roc_curve(y_val, y_pred_proba)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f"ROC curve (AUC = {roc_auc:.3f})")
ax.plot([0,1],[0,1],"k--")  # diagonal reference line
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve")
ax.legend()
plt.show()

In [0]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Confusion matrix of the thresholded validation predictions
cm = confusion_matrix(y_val, y_pred)

# Same class names on both axes, in label order 0 then 1
class_names = ["No Fish", "Fish"]
ax = sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
plt.show()