**UNet Resistivity Trial**

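Train and evaluate a 1D U-Net-style classifier on the cleaned dataset to predict, for each scaled signal, whether a fish is present (`fish_present`).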

**Data Prep**

In [0]:
import pandas as pd

# Read the dataset CSV and preview its first 10 rows
DATA_PATH = "/dbfs/mnt/lab/unrestricted/rachel.lennon@defra.gov.uk/cleaned/all_balanced.csv"
df = pd.read_csv(DATA_PATH)
display(df.head(10))


In [0]:
import matplotlib.pyplot as plt
import seaborn as sns

# Samples per class, ordered by label value
class_counts = df["fish_present"].value_counts().sort_index()
print(class_counts)

# Bar chart of the counts, drawn on an explicit Axes object
fig, ax = plt.subplots()
sns.barplot(x=class_counts.index, y=class_counts.values, ax=ax)
ax.set_xticks([0, 1])
ax.set_xticklabels(["No Fish (0)", "Fish (1)"])
ax.set_ylabel("Number of samples")
ax.set_title("Class Balance")
plt.show()


In [0]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Parse signal column into numeric arrays
def parse_signal(x):
    """Convert one signal cell into a 1-D float32 NumPy array.

    Parameters
    ----------
    x : list, tuple, numpy.ndarray or str
        Either an already-numeric sequence, or its text form such as
        ``"[ -1.63 -1.53 0.2 ]"`` or ``"[-1.63, -1.53, 0.2]"``. Values in the
        text form may be separated by whitespace, commas and/or newlines.

    Returns
    -------
    numpy.ndarray
        float32 array of the parsed values (empty for ``"[]"``).

    Raises
    ------
    TypeError
        If ``x`` is neither a sequence nor a string, e.g. the NaN pandas
        produces for an empty CSV cell.
    ValueError
        If the text contains a ``...`` token or any other non-numeric token.
    """
    if isinstance(x, (list, tuple, np.ndarray)):
        return np.array(x, dtype=np.float32)

    if not isinstance(x, str):
        # Fail with a message that names the real problem instead of the
        # AttributeError that calling .strip() on a float NaN would raise.
        raise TypeError(
            f"signal must be a sequence or a string, got {type(x).__name__}: {x!r} "
            "(missing value in the signal column?)"
        )

    # Normalise separators first, then trim whitespace, then drop the brackets.
    # Doing it in this order also handles a trailing comma/newline after "]".
    cleaned = x.replace("\n", " ").replace(",", " ").strip().strip("[]")
    tokens = cleaned.split()

    if "..." in tokens:
        # A "..." token means the stored text does not contain every sample,
        # so the full signal cannot be recovered from it.
        raise ValueError(
            "signal text contains '...' (looks like a truncated array repr); "
            "the full signal cannot be recovered from it"
        )

    return np.array(tokens, dtype=np.float32)

signals = df["signal_scaled"].apply(parse_signal)

# Pad/truncate all signals to fixed length (e.g., 10s * 100Hz = 1000)
# Shorter signals are zero-padded at the end; longer ones lose their tail.
max_len = 1000
X = pad_sequences(signals, maxlen=max_len, dtype="float32",
                  padding="post", truncating="post")

# Labels
y = df["fish_present"].astype(np.float32).to_numpy()

# Train/validation split, stratified on the label.
# The seed is fixed so the same rows land in the validation set on every run;
# without it, metrics and tuned thresholds are not comparable between runs.
SPLIT_SEED = 42
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=SPLIT_SEED
)

# Add channel dimension for Conv1D/U-Net
X_train = X_train[..., np.newaxis]  # (samples, timesteps, 1)
X_val   = X_val[..., np.newaxis]

# y is already a float32 ndarray and the split only indexes it, so no
# re-cast is needed. Check the invariants the model relies on instead.
assert X_train.shape[1:] == (max_len, 1), X_train.shape
assert X_val.shape[1:] == (max_len, 1), X_val.shape
assert len(X_train) == len(y_train) and len(X_val) == len(y_val)
assert y_train.dtype == np.float32 and y_val.dtype == np.float32

# Check shapes
print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)


**Define the 1D U-Net**

In [0]:
from tensorflow.keras import layers, models, callbacks

# Define function for 1D-Unet
def unet_1d(input_length, num_classes=1):
    """Build an uncompiled 1D U-Net-style sequence classifier.

    The encoder applies three (Conv1D x2 -> MaxPooling1D(2)) stages with
    8/16/32 filters, followed by a 64-filter bottleneck. The decoder upsamples
    three times, concatenating the matching encoder feature map (skip
    connection) before each Conv1D. The decoder output is averaged over time
    and passed to a dense head, so the model emits one prediction per
    sequence, not one per timestep.

    Parameters
    ----------
    input_length : int or None
        Number of timesteps per input sequence; inputs have shape
        ``(input_length, 1)``. It must be a positive multiple of 8, because
        three pool/upsample rounds only return to the original length in that
        case and the skip-connection concatenations would otherwise fail.
        ``None`` is passed through to ``layers.Input`` unchecked.
    num_classes : int, default 1
        Width of the output layer. ``1`` gives a single sigmoid unit (binary
        probability); larger values give a softmax over ``num_classes``.

    Returns
    -------
    keras Model
        Maps ``(batch, input_length, 1)`` to ``(batch, num_classes)``.

    Raises
    ------
    ValueError
        If ``input_length`` is not a positive multiple of 8, or
        ``num_classes`` is smaller than 1.
    """
    n_pool_stages = 3
    length_multiple = 2 ** n_pool_stages

    # Validate before building any layer so the caller gets an actionable
    # message instead of a Concatenate shape-mismatch error.
    if input_length is not None and (
        input_length <= 0 or input_length % length_multiple != 0
    ):
        raise ValueError(
            f"input_length must be a positive multiple of {length_multiple}, "
            f"got {input_length}"
        )
    if num_classes < 1:
        raise ValueError(f"num_classes must be >= 1, got {num_classes}")

    inputs = layers.Input(shape=(input_length, 1))

    # Encoder
    c1 = layers.Conv1D(8, 3, activation='relu', padding='same')(inputs)
    c1 = layers.Conv1D(8, 3, activation='relu', padding='same')(c1)
    p1 = layers.MaxPooling1D(2)(c1)

    c2 = layers.Conv1D(16, 3, activation='relu', padding='same')(p1)
    c2 = layers.Conv1D(16, 3, activation='relu', padding='same')(c2)
    p2 = layers.MaxPooling1D(2)(c2)

    c3 = layers.Conv1D(32, 3, activation='relu', padding='same')(p2)
    c3 = layers.Conv1D(32, 3, activation='relu', padding='same')(c3)
    p3 = layers.MaxPooling1D(2)(c3)

    # Bottleneck
    b = layers.Conv1D(64, 3, activation='relu', padding='same')(p3)
    b = layers.Conv1D(64, 3, activation='relu', padding='same')(b)

    # Decoder: upsample, then merge with the encoder map of the same length
    u3 = layers.UpSampling1D(2)(b)
    u3 = layers.Concatenate()([u3, c3])
    c6 = layers.Conv1D(32, 3, activation='relu', padding='same')(u3)

    u2 = layers.UpSampling1D(2)(c6)
    u2 = layers.Concatenate()([u2, c2])
    c7 = layers.Conv1D(16, 3, activation='relu', padding='same')(u2)

    u1 = layers.UpSampling1D(2)(c7)
    u1 = layers.Concatenate()([u1, c1])
    c8 = layers.Conv1D(8, 3, activation='relu', padding='same')(u1)

    # Output (classification at sequence level): average over time, then dense
    gap = layers.GlobalAveragePooling1D()(c8)
    head_activation = 'sigmoid' if num_classes == 1 else 'softmax'
    output = layers.Dense(num_classes, activation=head_activation)(gap)

    model = models.Model(inputs, output)
    return model

# Build the network for the padded signal length and configure training.
# The model emits one sigmoid probability per sequence, so binary
# cross-entropy is computed per sample against the 0/1 label.
model = unet_1d(input_length=max_len)
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)



In [0]:
# Print Keras' summary of the model built above
model.summary()

**Train the Model**

In [0]:
# Callbacks, both watching validation loss:
#  - stop after 5 epochs without improvement and restore the best weights
#  - halve the learning rate after 3 epochs without improvement
early_stop = callbacks.EarlyStopping(
    monitor="val_loss",
    patience=5,
    restore_best_weights=True,
)
reduce_lr = callbacks.ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.5,
    patience=3,
)
training_callbacks = [early_stop, reduce_lr]

# Fit on the training split, validating on the held-out split each epoch
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=20,
    batch_size=16,
    validation_data=(X_val, y_val),
    callbacks=training_callbacks,
)

In [0]:
# Validation-set metrics, shown as a {metric name: value} mapping
results = model.evaluate(X_val, y_val, verbose=0)
metric_report = {name: value for name, value in zip(model.metrics_names, results)}
print(metric_report)

# Per-epoch loss curves recorded by the training run
epoch_log = history.history
train_loss = epoch_log['loss']
val_loss = epoch_log['val_loss']
for curve in (train_loss, val_loss):
    print(curve)



In [0]:
from sklearn.metrics import roc_curve, auc, precision_recall_curve

# Predicted fish probabilities for the validation split
y_pred_proba = model.predict(X_val)

# Youden's J = TPR - FPR; report the ROC threshold that maximises it
fpr, tpr, thr = roc_curve(y_val, y_pred_proba)
youden_j = tpr - fpr
best_idx = np.argmax(youden_j)
print("Best threshold (Youden's J):", thr[best_idx])

In [0]:
# Predictions
from sklearn.metrics import classification_report, roc_curve

# Derive the decision threshold from the current model's scores instead of
# pasting a number from an earlier run, which goes stale after any retrain.
val_scores = np.asarray(y_pred_proba).ravel()  # 1-D for sklearn
roc_fpr, roc_tpr, roc_thr = roc_curve(y_val, val_scores)
best_threshold = roc_thr[np.argmax(roc_tpr - roc_fpr)]  # maximises Youden's J
print("Decision threshold used:", best_threshold)

# roc_curve defines each operating point as "score >= threshold", so use >=
# here to reproduce exactly the TPR/FPR that was selected.
y_pred = (val_scores >= best_threshold).astype(int)

# NOTE: the threshold is tuned on the same validation data it is scored on,
# so this report is optimistic; an untouched test split would be needed for
# an unbiased estimate.
print(classification_report(y_val, y_pred, digits=3))

In [0]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Confusion matrix of true labels vs. thresholded predictions
cm = confusion_matrix(y_val, y_pred)

class_names = ["No Fish", "Fish"]
fig, ax = plt.subplots()
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
plt.show()

In [0]:
from sklearn.metrics import roc_curve, auc

# ROC curve of the validation scores and the area under it
fpr, tpr, thresholds = roc_curve(y_val, y_pred_proba)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f"ROC curve (AUC = {roc_auc:.3f})")
ax.plot([0, 1], [0, 1], "k--")  # diagonal reference line
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve")
ax.legend()
plt.show()

**Model with Class Weights**

In [0]:
# weights
# Errors on class 1 (fish) count twice as much in the loss as errors on class 0
class_weights = {0: 1.0, 1: 2.0}   # makes fish mistakes more costly

# Start from a freshly initialised network. Calling fit() again on the
# existing `model` would continue from the weights learned in the unweighted
# run, so the result would be a fine-tune rather than a class-weighted model
# that can be compared with the baseline.
model = unet_1d(max_len)
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# early stops
early_stop = callbacks.EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)
reduce_lr = callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=3)

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=16,
    epochs=20,
    class_weight=class_weights,
    callbacks=[early_stop, reduce_lr],
)


**Evaluate the Class-Weighted Model**

In [0]:
# Validation-set metrics, shown as a {metric name: value} mapping
results = model.evaluate(X_val, y_val, verbose=0)
metric_report = {name: value for name, value in zip(model.metrics_names, results)}
print(metric_report)

# Per-epoch curves recorded by the most recent fit() call
epoch_log = history.history

# Loss
train_loss = epoch_log['loss']
val_loss = epoch_log['val_loss']
for curve in (train_loss, val_loss):
    print(curve)

# Accuracy (stored for later use; not printed)
train_acc = epoch_log['accuracy']
val_acc = epoch_log['val_accuracy']


In [0]:
from sklearn.metrics import roc_curve, auc, precision_recall_curve

# Predicted fish probabilities for the validation split
y_pred_proba = model.predict(X_val)

# Youden's J = TPR - FPR; report the ROC threshold that maximises it
fpr, tpr, thr = roc_curve(y_val, y_pred_proba)
youden_j = tpr - fpr
best_idx = np.argmax(youden_j)
print("Best threshold (Youden's J):", thr[best_idx])


In [0]:
# Predictions
from sklearn.metrics import classification_report, roc_curve

# Derive the decision threshold from the current model's scores instead of
# pasting a number from an earlier run, which goes stale after any retrain.
val_scores = np.asarray(y_pred_proba).ravel()  # 1-D for sklearn
roc_fpr, roc_tpr, roc_thr = roc_curve(y_val, val_scores)
best_threshold = roc_thr[np.argmax(roc_tpr - roc_fpr)]  # maximises Youden's J
print("Decision threshold used:", best_threshold)

# roc_curve defines each operating point as "score >= threshold", so use >=
# here to reproduce exactly the TPR/FPR that was selected.
y_pred = (val_scores >= best_threshold).astype(int)

# NOTE: the threshold is tuned on the same validation data it is scored on,
# so this report is optimistic; an untouched test split would be needed for
# an unbiased estimate.
print(classification_report(y_val, y_pred, digits=3))


In [0]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Confusion matrix of true labels vs. thresholded predictions
cm = confusion_matrix(y_val, y_pred)

class_names = ["No Fish", "Fish"]
fig, ax = plt.subplots()
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
plt.show()



In [0]:
from sklearn.metrics import roc_curve, auc

# ROC curve of the validation scores and the area under it
fpr, tpr, thresholds = roc_curve(y_val, y_pred_proba)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f"ROC curve (AUC = {roc_auc:.3f})")
ax.plot([0, 1], [0, 1], "k--")  # diagonal reference line
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve")
ax.legend()
plt.show()
