**Random Forest Analysis**

Inspect the dataframe

In [0]:
import pandas as pd

# Read the source CSV into df and preview its first ten rows
df = pd.read_csv(
    filepath_or_buffer="/dbfs/mnt/lab/unrestricted/rachel.lennon@defra.gov.uk/cleaned/all_balanced.csv"
)
display(df.head(n=10))

Data preparation

In [0]:
import numpy as np
import pandas as pd
from scipy import stats
from numpy.fft import rfft, rfftfreq

# Parse signal into numeric arrays
def parse_signal(x):
    """Convert one raw signal cell into a float32 NumPy array.

    Lists and arrays are converted directly. Any other value is turned into
    a string, newlines and commas are treated as separators, surrounding
    square brackets are removed, and the remaining whitespace-separated
    tokens are parsed as float32 values.
    """
    if not isinstance(x, (list, np.ndarray)):
        # Textual form such as "[0.1, 0.2\n 0.3]": map separators to spaces,
        # drop the outer brackets, then tokenise on whitespace.
        separators = {ord("\n"): " ", ord(","): " "}
        text = str(x).strip().translate(separators).strip("[]")
        x = text.split()
    return np.array(x, dtype=np.float32)

# Convert every raw "signal_scaled" cell into a float32 array via parse_signal
signals = df["signal_scaled"].apply(func=parse_signal)

# Feature extractor
def extract_features(signal, fs=100, bands=None):
    """Summarise a 1-D signal as a flat dict of scalar features.

    Parameters
    ----------
    signal : array-like
        The samples of one signal. Lists/tuples are accepted and converted
        to an ndarray; the dtype of an existing array is left untouched.
    fs : float, default 100
        Sampling frequency used to build the frequency axis of the spectrum.
    bands : iterable of (lo, hi) pairs, optional
        Frequency bands, each covering ``lo <= f < hi``. Defaults to
        ``(0.5, 2), (2, 5), (5, 20), (20, 50)``. Band ``i`` is reported
        under the key ``band_{i}_power``.

    Returns
    -------
    dict
        Keys, in insertion order: ``mean, std, min, max, median, skew,
        kurtosis, rms, abs_mean, ptp, diff_mean, diff_std, num_peaks``,
        one ``band_{i}_power`` per band, and ``dom_freq``.

    Raises
    ------
    ValueError
        If ``signal`` is empty.
    """
    # Coerce once so list input works with the array arithmetic below
    # (e.g. ``signal**2``); no dtype is forced, so float32 stays float32.
    signal = np.asarray(signal)
    if signal.size == 0:
        raise ValueError("extract_features requires a non-empty signal")
    if bands is None:
        bands = ((0.5, 2), (2, 5), (5, 20), (20, 50))

    # time domain: amplitude statistics
    feats = {
        "mean": np.mean(signal),
        "std": np.std(signal),
        "min": np.min(signal),
        "max": np.max(signal),
        "median": np.median(signal),
        "skew": stats.skew(signal),
        "kurtosis": stats.kurtosis(signal),
        "rms": np.sqrt(np.mean(signal**2)),
        "abs_mean": np.mean(np.abs(signal)),
        "ptp": np.ptp(signal)  # peak-to-peak
    }

    # first-difference statistics
    diff = np.diff(signal)
    feats.update({
        "diff_mean": np.mean(diff),
        "diff_std": np.std(diff),
        # a peak is a rising step immediately followed by a non-rising step
        "num_peaks": np.sum((diff[:-1] > 0) & (diff[1:] <= 0)),
    })

    # frequency domain
    spectrum = np.abs(rfft(signal))
    freqs = rfftfreq(len(signal), 1/fs)
    power = spectrum**2
    # The epsilon keeps the ratio finite for an all-zero signal. The total
    # covers every bin, including the 0 Hz (DC) bin.
    total_power = np.sum(power) + 1e-9
    for i, (lo, hi) in enumerate(bands):
        in_band = (freqs >= lo) & (freqs < hi)
        feats[f"band_{i}_power"] = np.sum(power[in_band]) / total_power

    # Frequency of the largest-magnitude bin; the DC bin is not excluded.
    feats["dom_freq"] = freqs[np.argmax(spectrum)]
    return feats

# Build the feature table: one row of extracted features per signal,
# reusing df's index so each row stays tied to its source record.
features = [extract_features(sig) for sig in signals]
df_features = pd.DataFrame(features, index=df.index)

# Attach the target as integers. Assignment is positional (plain array),
# which is safe because df_features was built in df's row order.
df_features["fish_present"] = df["fish_present"].astype(int).to_numpy()

# Preview with rich display (as already done for df) instead of print(),
# whose plain-text rendering can elide columns of a wide frame.
display(df_features.head())
print("Shape:", df_features.shape)


**Train the Model**

In [0]:
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Split features and labels
X = df_features.drop("fish_present", axis=1).values
y = df_features["fish_present"].values

# Hold out 20% for validation; stratify so both splits keep the class ratio.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Build pipeline: standardise -> PCA -> Random Forest
# PCA follows directions of largest variance, so without standardisation the
# components are dominated by whichever features have the largest numeric
# range. The scaler sits inside the pipeline so that, during cross-validation,
# it is fitted on each training fold only (no leakage from held-out data).
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=min(30, X_train.shape[1]), random_state=42)),  # optional
    ("rf", RandomForestClassifier(
        n_estimators=300,
        max_depth=10,
        class_weight="balanced",
        random_state=42,
        n_jobs=-1
    ))
])

# Cross-validation (optional): 5-fold stratified, scored on the training split only
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring="f1_macro")
print("5-fold F1-macro (train):", cv_scores)
print("Mean F1-macro:", cv_scores.mean())

# Train on full training set
pipeline.fit(X_train, y_train)

# Predict on validation set
y_pred = pipeline.predict(X_val)
# Column 1 of predict_proba is the probability of the second class in
# classes_ (the positive label when the labels are 0/1).
y_pred_proba = pipeline.predict_proba(X_val)[:, 1]

# Evaluation
print("\nValidation Metrics:")
print("Accuracy:", accuracy_score(y_val, y_pred))
print("ROC-AUC:", roc_auc_score(y_val, y_pred_proba))
print("\nClassification Report:")
print(classification_report(y_val, y_pred, digits=3))
