In [None]:
#Last Edited: 2025/10/29
#File changed more recently (2025/11/18) to commit (with Cell outputs deleted)

In [None]:
#I created this ipynb file, and many of the .py files using the help of chatGPT.  
#This was my initial prompt:

'''
So, I'd like to train an ML model.  I have a data set consisting of 4000 h5-format files.  Each file contains 24 hours of data.  There is a data group called '/data' with many multi channel variables.  There is also a set of annotations that indicate the classication of the data when anomalies are present.  Generally the class-imbalance for anomalous data is very large.  Probably <1 day with anomalous data per 300 days anomaly free. I want to do several things:

1) Do pre-training on the "good"/"anomaly free" data
2) Train on classified data.

Things I will need to do:
1) Have a data loader that loads the data
- the data is for an ADCP with 3 beams.  Each beam collects data for beam velocity, beam backscatter, and beam correlation.  Since the beams are kind of independant, it probably makes sense to organize the data such that there are 3 input channels [velocity, backscatter, correlation], and each beam can be fed through the model independantly.  
-The "anomalous" data may only occur for 6 hours within a 24 hour period, and I'd like the model to identify specifically which data is anomalous (not just binary yes/no for the entire 24 hour period).  I don't know if it makes sense to create a layer mask for the data, or if some kind of time-series label would be more appropriate (or even possible)

2) Define a model architecture / type
 - I want this to be set up so that I can experiment with differnet options for this, and have this be pretty modular.  Any suggestions would be appreciated 
 - I also want to have the option to be able to randomly tile the inputs, to add more variance for training, but this might not be needed

3) Have a loss function that is weighted to account for class-imbalance.  I've used graduated dice loss in the past but not sure what other options are available

'''

#It then gave me a bunch of info, but no code, and I responded with 

'''
so each input channel is 2D (time, range), and with multiple channels is 3D.  I guess I don't need to do anything fancy like making a mask for the annotations since it's not semantic segmentation, so the output could probably by 1D (time), with values like 0 (normal), 1, 2, 3, to indicate the class. Right now the annotations are start/end time and index, so I'll need to create a time-series label with the data loader.  

 I prefer pytorch.  I think a CNN is the minimum I'll want to use.  I'll also need a normalization step.  I like F1-score for an evaluation metric, and maybe a combo of cross entropy and dice loss, 

Can you give me code for all of this please?  Maybe start with the data loader. If there is a limit on tokens, do just the data loader, then ask me to say continue for each subsequent section of code
'''

#And from there it pretty much gave me all of this.  Now I have the task of testing and debugging the code and actually making it work for my data.

In [None]:

'''
To set up the environment:

conda create -n adcp_anomaly_env python=3.10
conda activate adcp_anomaly_env
pip install -r requirements.txt

OR

python -m venv adcp_anomaly_env
adcp_anomaly_env\Scripts\activate
pip install -r requirements.txt

'''

In [1]:
# Cell 1: Imports & Setup

import os
import torch
from torch.utils.data import DataLoader, random_split

# Add repo root to Python path - Needed to import from src folder
import sys
from pathlib import Path
repo_root = Path().resolve().parent  # notebooks/ → ADCP-CNN-QAQC
sys.path.append(str(repo_root))

from src.dataset_loader import ADCPDataset  # your custom dataset
from src.resnet_temporal import ResNetTemporalClassifier # CNNClassifier  # your model
# from model import TemporalCNN # CNNClassifier  # your model
from src.utils import seed_everything, get_class_weights, combined_loss, train_model

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
seed_everything(42)

USE_WANDB = False  # Set to True to enable logging - Keep false while in development/debugging
if USE_WANDB:
    import wandb

In [2]:
# Cell 2: Initialize wandb
from types import SimpleNamespace

# Hyperparameters common to both logging modes. The wandb run uses the full
# batch size; the local (non-wandb) debugging run overrides it below.
_hyperparams = {
    "model": "ResNetTemporalClassifier",
    "epochs": 20,
    "batch_size": 16,
    "lr": 1e-3,
    "loss_alpha": 0.5,
    "optimizer": "Adam",
}

if not USE_WANDB:
    # Local development: tiny batches, attribute-style access like wandb.config
    _hyperparams["batch_size"] = 2 #16,
    config = SimpleNamespace(**_hyperparams)
else:
    wandb.init(
        project="adcp-anomaly-detection",
        dir=r"F:\Documents\GitHub\ml_development\ADCP_ML\wandb_runs",
        config=_hyperparams,
    )
    config = wandb.config

In [15]:
#DEBUG

#Folder that holds the 24-hour h5 files:
data_folder= r"F:\Documents\Projects\ML\ADCP_ML\h5_24h_files\\"
#annotation_file = "/path/to/annotations.json"
file_list = os.listdir(data_folder)

#Keep only the .h5 entries, in alphabetical order
h5_files = sorted({name for name in file_list if os.path.splitext(name)[1] == ".h5"})

#Prefix each file name with the folder to get the full path
h5_paths = [data_folder + name for name in h5_files]

#WHILE DEBUGGING, ONLY USE A FEW FILES
#CHANGE THIS LATER
file_idx = 3567
print(h5_paths[file_idx])

F:\Documents\Projects\ML\ADCP_ML\h5_24h_files\\20240406T000000_20240406T235959.h5


In [3]:
# Cell 3: Load Dataset

#Get a list of files from the directory:
data_folder= r"F:\Documents\Projects\ML\ADCP_ML\h5_24h_files\\"
#annotation_file = "/path/to/annotations.json"
file_list = os.listdir(data_folder)
#os.listdir gives no ordering guarantee, so sort the names to get a stable,
#alphabetical (= chronological for these file names) order:
h5_files = sorted(name for name in file_list if os.path.splitext(name)[1] == ".h5")
h5_paths = [data_folder + name for name in h5_files]

#WHILE DEBUGGING, ONLY USE A FEW FILES
#CHANGE THIS LATER
file_idx = 3567
#Clamp the lower bound so a small file_idx cannot turn into a negative (wrap-around) slice start
h5_paths = h5_paths[max(file_idx - 5, 0) : file_idx + 5]

#num_files = 4
#h5_paths = h5_paths[:num_files]                                             # <===== ELIMINATE THIS LATER

#Fail early with a readable message instead of a cryptic DataLoader error further down
if not h5_paths:
    raise RuntimeError(
        "No .h5 files selected from {} (found {} files, file_idx={})".format(
            data_folder, len(h5_files), file_idx))

full_dataset = ADCPDataset(h5_paths) # (data_dir)
#full_dataset = ADCPDataset(data_dir, annotation_file)
train_size = int(0.8 * len(full_dataset))
val_size = len(full_dataset) - train_size

#Use a dedicated, seeded generator so that re-running this cell always gives the
#same train/val split (otherwise the split depends on how much of the global RNG
#state was consumed since seed_everything was called).
#NOTE(review): the next cell's comment says each file contributes 3 samples (one per beam);
#a sample-level random split can put beams of the same day in both train and val - confirm
#whether a file-level split is wanted.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    full_dataset, [train_size, val_size], generator=split_generator)

train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=config.batch_size, shuffle=False, num_workers=4)


In [4]:
#Example:
#if num_files is 20, then len(full_dataset) will be 60 (*3) because of the 3 beams

print(train_size)
print(val_size)

24
6


In [5]:
# Cell 4: Initialize Model and Loss
num_classes = 6 # full_dataset.num_classes

#Backbone configuration for the ResNet-based temporal classifier
_model_settings = dict(
    num_classes=num_classes,
    pretrained=True,         # set False if you want to train from scratch
    variant='resnet50',      # options: 'resnet50', 'resnet101', 'resnet152'
    resize=(224, 224),       # input size for ResNet
)
model = ResNetTemporalClassifier(**_model_settings)
# model = TemporalCNN(input_channels=3, num_classes=num_classes)

#Per-class loss weights, defined manually for now (the last two classes get zero weight)
_manual_class_weights = [1.6761e-01, 3.8812e+01, 1.4015e+02, 9.8140e+02, 0.0000e+00, 0.0000e+00]
class_weights = torch.tensor(_manual_class_weights)
#class_weights = torch.tensor([1/300, 1, 1, 1, 1, 1])                 # FIX THIS LATER -  TEMPORARY SETTING - DEFINE MANUALLY
#class_weights = get_class_weights(train_dataset)                    # FIX THIS LATER -  UNCOMMENT THIS OR DEFINE MANUALLY BUT CORRECT WEIGHTS

#Earlier weight sets, kept for reference (original with 6 classes):
#tensor([1.8522e-01, 2.0864e+00, 2.3713e+01, 9.4286e+01, 2.3760e+04, 1.4505e+01]) # For 200 files, is inverse of [5.3990, 0.4793, 0.0422, 0.0106, 0.0000,0.0689]
# tensor([2.2450e-01, 3.8812e+01, 1.4015e+02, 9.8140e+02, 1.9692e+00, 9.9600e-01]) # For full dataset (or 70% anyways)

#Loss (alpha comes from the run config) and optimizer
loss_fn = combined_loss(class_weights, alpha=config.loss_alpha)
optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)




In [55]:
#DEBUG: reload train_model (and the other helpers) after editing src/utils.py

import importlib
#The helpers are imported from the package path `src.utils` in Cell 1 (only the repo
#root is added to sys.path), so that is the module that has to be reloaded - a bare
#`import utils` would either fail or load a second, unrelated copy.
import src.utils
importlib.reload(src.utils)

#Re-bind the names so this notebook picks up the reloaded definitions.
#Objects created earlier (e.g. an existing loss_fn) still reference the old code.
from src.utils import seed_everything, get_class_weights, combined_loss, train_model

In [6]:
# DEBUG:  One batch from your DataLoader

#Work-around used while debugging a "failed creating a primitive" error (see Cell 5)
import os
os.environ["TORCH_DISABLE_MKL"] = "1"  # optional: disables MKL
os.environ["ONEDNN_VERBOSE"] = "0"
os.environ["DNNL_VERBOSE"] = "0"
torch.backends.mkldnn.enabled = False


x_batch, y_batch, meta_batch = next(iter(train_loader))

print(x_batch.shape)  # should be [B, 3, T, R]  (batch, channels, time, range)

print(type(x_batch))             # Should be torch.Tensor
print(x_batch.dtype)             # Should be torch.float32
print(torch.isnan(x_batch).any())  # Should be False
print(x_batch.shape)             # Should be [B, 3, T, R]

#Inputs, labels and model must all live on the same device for the loss below
x_batch = x_batch.to(device)
y_batch = y_batch.to(device)
model = model.to(device)

# Run forward pass
with torch.no_grad():
   outputs = model(x_batch) # (B, T, num_classes)

#optimizer.zero_grad()
#outputs = model(x_batch)
print("output shape:", outputs.shape)
print("y shape:", y_batch.shape)

#Need to reshape the outputs, and y_batch for loss_fn to work properly.
#reshape (rather than view) also works if a tensor is not contiguous.
outputs = outputs.reshape(-1, outputs.shape[-1])  # (B*T, num_classes)
y_batch = y_batch.reshape(-1)                     # (B*T, )

#Run the loss function
loss_fn = combined_loss(class_weights, alpha=config.loss_alpha)
loss = loss_fn(outputs, y_batch)

#Check the meta data contents
print(meta_batch)

torch.Size([2, 3, 288, 102])
<class 'torch.Tensor'>
torch.float32
tensor(False)
torch.Size([2, 3, 288, 102])
output shape: torch.Size([2, 288, 6])
y shape: torch.Size([2, 288])
{'time': tensor([[1713225750, 1713226050, 1713226350, 1713226650, 1713226950, 1713227250,
         1713227550, 1713227850, 1713228150, 1713228450, 1713228750, 1713229050,
         1713229350, 1713229650, 1713229950, 1713230250, 1713230550, 1713230850,
         1713231150, 1713231450, 1713231750, 1713232050, 1713232350, 1713232650,
         1713232950, 1713233250, 1713233550, 1713233850, 1713234150, 1713234450,
         1713234750, 1713235050, 1713235350, 1713235650, 1713235950, 1713236250,
         1713236550, 1713236850, 1713237150, 1713237450, 1713237750, 1713238050,
         1713238350, 1713238650, 1713238950, 1713239250, 1713239550, 1713239850,
         1713240150, 1713240450, 1713240750, 1713241050, 1713241350, 1713241650,
         1713241950, 1713242250, 1713242550, 1713242850, 1713243150, 1713243450,
    

In [19]:
print(loss)

tensor(1.4480)


In [None]:
#

In [7]:
# Cell 5: Train

#Work-around for a "failed creating a primitive" error seen during debugging.
#It solved the issue, but should not be used for proper training/running.
import os
for _env_name, _env_value in (
    ("TORCH_DISABLE_MKL", "1"),   # optional: disables MKL
    ("ONEDNN_VERBOSE", "0"),
    ("DNNL_VERBOSE", "0"),
):
    os.environ[_env_name] = _env_value
torch.backends.mkldnn.enabled = False

#Run the training loop once; how the result is unpacked depends on the logging mode
_training_result = train_model(
    model, train_loader, val_loader, optimizer, loss_fn, device,
    num_epochs=config.epochs, patience=5,
)
if USE_WANDB:
    model = _training_result
else:
    model, history = _training_result

# ([2, 3, 288, 102])
# => [batch, channels, time, range]
    
#Best result using ce and dice-loss:
#Epoch 7/20 | Train Loss: 0.7003 | Val Loss: 0.9850 | Val F1: 0.4931
#⏹️ Early stopping triggered.
    
# With ce and graduated dice-loss:
# Starting Validation on Epoch #  6
# Epoch 7/20 | Train Loss: 0.7778 | Val Loss: 0.9043 | Val F1: 0.4931
# #
# Starting Validation on Epoch #  7
# Epoch 8/20 | Train Loss: 0.7598 | Val Loss: 0.9866 | Val F1: 0.4931
# ⏹️ Early stopping triggered.

#It's weird that the val F1 is identical for all of these. I wonder if something is going on, but hard to say

Starting Training on Epoch #  0
Loss (& total) on Batch #1: 1.4090192317962646 (1.4090192317962646)
Loss (& total) on Batch #2: 0.8244938254356384 (2.233513057231903)
Loss (& total) on Batch #3: 0.5441804528236389 (2.777693510055542)
Loss (& total) on Batch #4: 0.5083618760108948 (3.2860553860664368)
Loss (& total) on Batch #5: 0.5015065670013428 (3.7875619530677795)
Loss (& total) on Batch #6: 0.5005192756652832 (4.288081228733063)
Loss (& total) on Batch #7: 0.5002608895301819 (4.788342118263245)
Loss (& total) on Batch #8: 0.500089704990387 (5.288431823253632)
Loss (& total) on Batch #9: 0.5000426769256592 (5.788474500179291)
Loss (& total) on Batch #10: 0.5000271201133728 (6.288501620292664)
Loss (& total) on Batch #11: 0.5000137686729431 (6.788515388965607)
Loss (& total) on Batch #12: 0.5000052452087402 (7.288520634174347)
Starting Validation on Epoch #  0
Epoch 1/20 | Train Loss: 0.6074 | Val Loss: 0.5000 | Val F1: 1.0000
✅ New best model saved.
Starting Training on Epoch #  1
L

In [8]:
# Cell 6: Load Best Model and Evaluate
from sklearn.metrics import classification_report

#map_location lets a checkpoint saved on one device (e.g. GPU) be loaded on another (e.g. CPU)
model.load_state_dict(torch.load("best_model.pt", map_location=device))
model = model.to(device)
model.eval()

all_preds = []
all_labels = []

#No gradients are needed anywhere in the evaluation loop
with torch.no_grad():
    for x, y, _ in val_loader:
        out = model(x.to(device))
        #Flatten the time axis so predictions and labels line up sample by sample:
        out = out.reshape(-1, out.shape[-1])  # (B*T, num_classes)
        #The prediction is the class with largest score per sample
        preds = torch.argmax(out, dim=1)

        #Collect everything on the CPU so it can be handed to numpy / sklearn
        all_preds.append(preds.cpu())
        all_labels.append(y.reshape(-1).cpu())  # (B*T, )

y_pred = torch.cat(all_preds).numpy()
y_true = torch.cat(all_labels).numpy()

print(classification_report(y_true, y_pred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1728

    accuracy                           1.00      1728
   macro avg       1.00      1.00      1.00      1728
weighted avg       1.00      1.00      1.00      1728



In [39]:
print(y_pred.shape)
print(y_true.shape)

(1728,)
(1728,)


In [48]:
import numpy as np
print(np.where(y_true==0))
print(np.all(y_true==0))


(array([   0,    1,    2, ..., 1725, 1726, 1727], shape=(1728,)),)
True


In [6]:
# NOTE(review): h5_test_file is only assigned further down (in the "Cell 7" test-plot cell),
# so this cell raises NameError when the notebook is run top to bottom on a fresh kernel.
print(h5_test_file) 

F:\Documents\Projects\ML\ADCP_ML\h5_24h_files\\20240406T000000_20240406T235959.h5


In [10]:
#Cell 7: Make test plots.  
#I want to push a file through the algorithm, and see how it performs

#IMPORT EVERYTHING:
import os
import torch
from torch.utils.data import DataLoader, random_split

from src.dataset_loader import ADCPDataset  # your custom dataset
from src.model import TemporalCNN # CNNClassifier  # your model
from src.utils import seed_everything, get_class_weights, combined_loss, train_model

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#DEBUG:
#I was having bugs in this, where it was saying it failed creating a primitive. This was found to solve the issue (but shouldn't be used when proper training/running)
import os
os.environ["TORCH_DISABLE_MKL"] = "1"  # optional: disables MKL
os.environ["ONEDNN_VERBOSE"] = "0"
os.environ["DNNL_VERBOSE"] = "0"
torch.backends.mkldnn.enabled = False



# INITIALIZE THE MODEL
model_path = r"F:\Documents\GitHub\ml_development\ADCP_ML\\"
num_classes = 6 
#from resnet_temporal import ResNetTemporalClassifier # CNNClassifier  # your model
_test_model_settings = dict(
    num_classes=num_classes,
    pretrained=True,         # set False if you want to train from scratch
    variant='resnet50',      # options: 'resnet50', 'resnet101', 'resnet152'
    resize=(224, 224),       # input size for ResNet
)
model = ResNetTemporalClassifier(**_test_model_settings)
#model = TemporalCNN(input_channels=3, num_classes=num_classes)

#Load the saved weights onto the CPU, then switch to inference mode
_checkpoint_file = model_path + "best_model_20251028_resnet50.pt"
#Older checkpoints: "best_model_20250508.pt", "best_model_20250502.pt"
_state_dict = torch.load(_checkpoint_file, map_location=torch.device('cpu'))
model.load_state_dict(_state_dict)
model.eval()


#LOAD THE TEST FILE
file_path = r"F:\Documents\Projects\ML\ADCP_ML\h5_24h_files\\"
h5_filename = '20240406T000000_20240406T235959.h5'
#The dataset expects a list of paths, even for a single file
h5_test_file = [file_path + h5_filename]


#Create a generic function for classifying files
def classify_test_data(model, h5_test_file, batch_size=3, num_workers=4):
    """Run the model over the given h5 file(s) and return inputs, labels and predictions.

    Parameters
    ----------
    model : torch.nn.Module
        Trained classifier; called as ``model(x)`` and expected to return scores
        whose last axis is the class axis.
    h5_test_file : list of str
        Paths handed to ``ADCPDataset``.
    batch_size : int, optional
        DataLoader batch size (default 3, as before).
    num_workers : int, optional
        DataLoader worker processes (default 4, as before).

    Returns
    -------
    x : torch.Tensor
        All input samples, concatenated along the first axis, on the CPU.
    y : torch.Tensor
        Flattened labels for all samples, on the CPU.
    preds : torch.Tensor
        Flattened predicted class per time step (argmax over the class axis), on the CPU.
    meta : dict
        Metadata of the LAST batch only (identical to the full metadata when
        everything fits in a single batch, e.g. one file with batch_size=3).

    Raises
    ------
    ValueError
        If the dataset yields no samples.
    """
    test_file_dataset = ADCPDataset(h5_test_file)
    test_loader = DataLoader(test_file_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

    #Move the model once, not on every batch
    model = model.to(device)

    all_inputs = []
    all_preds = []
    all_labels = []
    meta = None

    with torch.no_grad():
        for x, y, meta in test_loader:
            out = model(x.to(device))
            #Flatten the time axis so predictions and labels line up sample by sample:
            out = out.reshape(-1, out.shape[-1])  # (B*T, num_classes)
            #The prediction is the class with largest score per sample
            preds = torch.argmax(out, dim=1)

            #Keep every batch (previously only the last batch was returned)
            all_inputs.append(x.cpu())
            all_preds.append(preds.cpu())
            all_labels.append(y.reshape(-1).cpu())  # (B*T, )

    if not all_inputs:
        raise ValueError("No samples were loaded from: {}".format(h5_test_file))

    return torch.cat(all_inputs), torch.cat(all_labels), torch.cat(all_preds), meta


#Run the classification
x, y, preds, meta = classify_test_data(model, h5_test_file)



In [7]:
#DEBUG
#meta["time"][0].size()
print(meta["time"][0,0])
print(meta["time"][1,0])
print(meta["time"][2,0])
#So, I just need meta["time"][0]

print(meta["channels"][0][0])
print(meta["channels"][1][0])



tensor(1712361750)
tensor(1712361750)
tensor(1712361750)
velocity
backscatter


In [11]:
from sklearn.metrics import classification_report
#Hand sklearn plain CPU numpy arrays (preds can live on the GPU).
#zero_division=0 keeps the reported 0.00 for classes that are never predicted,
#but silences the repeated "undefined metric" warnings.
print(classification_report(y.cpu().numpy(), preds.cpu().numpy(), zero_division=0))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97       817
           1       0.00      0.00      0.00        47

    accuracy                           0.95       864
   macro avg       0.47      0.50      0.49       864
weighted avg       0.89      0.95      0.92       864



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [12]:
print(x.shape)
print(y.shape)

torch.Size([3, 3, 288, 102])
torch.Size([864])


In [13]:
#print(test_file_dataset[1])
#print(x[2])
print(preds.shape)

torch.Size([864])


In [14]:
#I think this is a bug! I think annotations are being applied to all beams instead of just one!
# => RESOLVED
annotations = y.view(3,288)
for an in annotations:
    print(an)

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

# NEW

In [36]:
x.shape[0]

3

In [15]:
import h5py
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import datetime


def get_segments(annotations, ann):
    """Return the (start, end) index pairs of the labelled runs in a 1-D label series.

    Parameters
    ----------
    annotations : array-like, shape (T,)
        Class label per time step; 0 means "nothing flagged".
    ann : int
        1 -> treat the input as annotations (may be multiclass): every run of
             consecutive samples sharing the same non-zero class is one segment,
             so two different classes that touch are split into two segments.
        0 -> treat the input as predictions: every run of consecutive non-zero
             samples is one segment, regardless of class.

    Returns
    -------
    list of (start, end) tuples
        ``start`` is the first index of the run, ``end`` is one past its last
        index (so ``end`` can equal ``len(annotations)``).

    Raises
    ------
    ValueError
        If ``ann`` is neither 0 nor 1.
    """
    labels = np.asarray(annotations).ravel()

    if ann == 1: #For annotations (may be multiclass)
        run_keys = labels
    elif ann == 0: #For predictions
        run_keys = (labels != 0).astype(int)
    else:
        raise ValueError("ann must be 0 (predictions) or 1 (annotations), got {!r}".format(ann))

    if labels.size == 0:
        return []

    #Indices where a new run begins (the value differs from the previous sample)
    change_points = np.flatnonzero(np.diff(run_keys) != 0) + 1
    start_indices = np.r_[0, change_points]
    end_indices = np.r_[change_points, labels.size]

    #Only runs of non-zero labels are segments of interest
    is_flagged = run_keys[start_indices] != 0
    return list(zip(start_indices[is_flagged], end_indices[is_flagged]))


def plot_results(x, annotations, predictions, filename, meta) :
    """Plot every channel of every beam, with annotations shaded and predictions marked.

    One figure is produced per entry along the first axis of ``x`` (one per beam),
    with one image subplot per channel. Annotated segments are shaded; the start
    and end of each predicted segment is drawn as a dashed vertical line.

    Parameters
    ----------
    x : torch.Tensor
        Input data indexed as [beam, channel, time, range].
    annotations : torch.Tensor
        Labels indexed as [beam, time].
    predictions : torch.Tensor
        Predicted classes indexed as [beam, time].
    filename : str
        Currently unused (the file-name title is commented out); kept for callers.
    meta : dict
        ``meta['time'][0]`` must iterate over epoch-second tensors (one per time
        step) and ``meta['channels'][ch][0]`` must give the name of channel ``ch``.
    """
    x = x.cpu()
    annotations = annotations.cpu().numpy()
    predictions = predictions.cpu().numpy()

    n_beams = x.shape[0]
    n_channels = x.shape[1]
    n_times = x.shape[2]

    #Naive UTC datetimes (same values as the deprecated datetime.utcfromtimestamp)
    time_data = [
        datetime.datetime.fromtimestamp(t.item(), tz=datetime.timezone.utc).replace(tzinfo=None)
        for t in meta['time'][0]
    ]
    last_time_idx = len(time_data) - 1

    for beam in range(n_beams):
        fig, axs = plt.subplots(n_channels, 1, sharex=True, figsize=(12, 2.5*n_channels))
        if n_channels == 1:
            axs = [axs]

        #Get the annotation / prediction segments for this beam
        beam_labels = annotations[beam]
        anomaly_segments = get_segments(beam_labels, ann = 1)
        pred_segments = get_segments(predictions[beam], ann = 0)

        #Determine if any annotations present in this beam:
        cls_str = '' #Initialize as nothing
        if np.any(beam_labels>0):
            cls = np.median(beam_labels[beam_labels>0])
            cls_str = ', class: {}'.format(int(cls))

        #Plot Velocity, backscatter, and correlation, for each beam
        for ch in range(n_channels):
            #Plot the data as an image (time on x, range bin on y)
            im = axs[ch].imshow(
                x[beam,ch,:,:].T, aspect='auto', origin='lower',
                extent=[time_data[0], time_data[-1], 0, x.shape[3]-1],
                interpolation='nearest',
                cmap='jet',
            )

            #Set the figure title
            if beam == 0 and ch == 0:
                fig.suptitle('Annotions = Shaded, Predictions = --')
                #fig.suptitle('File: {}'.format(filename))

            #Set the subplot titles
            if ch == 0:
                axs[ch].set_title('Beam #{} {}'.format(beam+1, cls_str))

            #Add dashed vertical lines for predictions
            for start, end in pred_segments:
                if 0 <= start < n_times:
                    axs[ch].axvline(x=time_data[start], color='black', linestyle='dashed', alpha=0.7)
                if 0 <= end < n_times:
                    axs[ch].axvline(x=time_data[end], color='black', linestyle='dashed', alpha=0.7)

            #Add shading for annotations
            for start, end in anomaly_segments:
                if 0 <= start < n_times and 0 <= end <= n_times:
                    #`end` is exclusive and may equal the number of time steps;
                    #clamp it so a segment that runs to the end of the file can be drawn
                    span_end = time_data[min(end, last_time_idx)]
                    axs[ch].axvspan(time_data[start], span_end, color='black', alpha=0.3)

            #Add a colorbar
            fig.colorbar(im, ax=axs[ch], label=meta['channels'][ch][0])

            #Add y-axis label
            #NOTE(review): the image extent above is in range-bin index, not metres - confirm the label
            axs[ch].set_ylabel('{}'.format('Range [m]') )


        # -- Date formatting for X --
        axs[-1].xaxis_date()  # tells matplotlib to interpret x as dates
        axs[-1].xaxis.set_major_formatter(mdates.DateFormatter('%H:%M'))
        fig.autofmt_xdate()  # Makes dates pretty (auto-rotates, etc.)

        #Add x-axis label
        axs[-1].set_xlabel('{} [HH:MM] UTC'.format(time_data[0].strftime('%Y-%m-%d')) )

        plt.tight_layout()
        plt.show()
        #Release the figure so looping over many files does not accumulate open figures
        plt.close(fig)

        
#

In [None]:
#Reshape the flattened labels/predictions back to one row per beam.
#The row count is taken from x (first axis = beams) instead of hard-coding (3, 288),
#so files with a different number of time steps still work.
annotations = y.view(x.shape[0], -1)
predictions = preds.view(x.shape[0], -1)

plot_results(x, annotations, predictions, h5_filename, meta)

In [None]:
#Can I apply this to data from a DIFFERENT SITE? From another device??

#This was trained/tested on data from BACAX, 

#I want to make an example figure for Drew, for BACUS, 2024-08-08

data_parent = r'F:\Documents\Projects\ADCP\scan_for_data\BACUS\ADCP2MHZ\\'
folder_list = ['20240801']
#folder_list = os.listdir(data_parent)

#Define folders
h5_monthly_folder = r'F:\Documents\Projects\ML\ADCP_ML\BACUS\h5_files\\' # Define output folder -  Monthly h5
h5_24h_folder = r'F:\Documents\Projects\ML\ADCP_ML\BACUS\h5_24h_files\\'  #Output folder - 24hr h5:


from src import convert_monthly_mat_to_h5
from src import split_h5_to_24hr_files
annotations_file = '' #This is purely classification. No annotation exists. 
#annotations_file = r'F:\Documents\Projects\ML\ADCP_ML\annotations_table_ed05_revised.mat'

for folder in folder_list:
    ######################################
    # PART #1: Convert monthly Mat to monthly h5
    ######################################

    #Collect the .mat files of this folder. Sorted, so that the processing order is
    #the same on every run (iterating over a set gives an arbitrary order).
    file_list = os.listdir(data_parent + folder)
    mat_files = sorted(k for k in file_list if os.path.splitext(k)[1] == ".mat")
    print(mat_files)

    #Combine the filenames into a proper path:
    mat_paths = [data_parent + folder + '\\' + filename for filename in mat_files]

    #Run the extraction
    for mat_path in mat_paths:
        #print(mat_path)
        convert_monthly_mat_to_h5.extract_mat_to_h5(mat_path, h5_monthly_folder)


    ######################################
    # PART #2: Split to 24 hours, embed annotations and save the time in python format
    ######################################

    #Paths to month(ish) HDF5 source file(s): same base name as the .mat file, with .h5 extension
    for mat_path in mat_paths:
        filename_mat = os.path.basename(mat_path)
        filename_h5 = os.path.splitext(filename_mat)[0] + '.h5'
        input_file = h5_monthly_folder + filename_h5

        print('Splitting files from folder {}'.format(folder))

        split_h5_to_24hr_files.split_h5_to_24hr_files_with_ann(
            input_file,             # your big HDF5 source (created with import_monthly_mat_to_h5.py)
            h5_24h_folder,          # output dir for 24hr files
            annotations_file,        # your .mat annotations file
        )

#PART 3:

#Load, classify, and plot another test file, 

#LOAD THE TEST FILE
file_path = r"F:\Documents\Projects\ML\ADCP_ML\BACUS\h5_24h_files\\"
h5_filename = '20240808T000000_20240808T235959.h5'
#The dataset expects a list of paths, even for a single file
h5_test_file = [file_path + h5_filename]

#Run the classification
x, y, preds, meta = classify_test_data(model, h5_test_file)

#Reshape the flattened labels/predictions back to (beam, time)
annotations = y.view(3,288)
predictions = preds.view(3,288)

plot_results(x, annotations, predictions, h5_filename, meta)

In [None]:
#Look for Any positives!

#Push all files through. If a file has no annotations in a beam but more than
#n_samples time steps of that beam are predicted as anomalous, make a plot.
#NOTE(review): the original intent was "more than 6 in a row (half an hour)"; the check
#below counts the TOTAL number of flagged samples per beam, not consecutive ones.

#Get a list of files from the directory:
#data_folder = "/scratch/slonimer/ML_ADCP/BACAX_24hr_h5/"
data_folder= r"F:\Documents\Projects\ML\ADCP_ML\BACUS\h5_24h_files\\"
file_list = os.listdir(data_folder)
#os.listdir gives no ordering guarantee, so sort the names alphabetically:
h5_files = sorted(k for k in file_list if os.path.splitext(k)[1] == ".h5")
h5_paths = [data_folder + filename for filename in h5_files]

#Minimum number of flagged samples (per beam, per day) that triggers a plot.
#With 288 samples per 24 h file (5-minute steps), 24 samples = 2 hours and 12 samples = 1 hour.
n_samples = 24
#n_samples = 12

#Run the classification
for file_path in h5_paths:
    print(file_path)
    #Predict the class
    x, y, preds, meta = classify_test_data(model, [file_path])

    #Reshape the flattened labels/predictions back to (beam, time)
    annotations = y.view(3,288)
    predictions = preds.view(3,288)

    #Determine if any beam has un-annotated data with enough predicted anomalies:
    do_plot = 0
    for beam in range(3):
        ann_beam = annotations[beam].cpu().numpy()
        pred_beam = predictions[beam].cpu().numpy()

        if np.all(ann_beam==0) and np.sum(pred_beam>0)>n_samples:
            do_plot = 1

    if do_plot == 1:
        #Plot the results (plot_results requires the batch metadata for the time axis and channel names)
        plot_results(x, annotations, predictions, os.path.basename(file_path), meta)

In [17]:
# List the sub-folders of the BACUS ADCP data directory.
data_parent = r'F:\Documents\Projects\ADCP\scan_for_data\BACUS\ADCP2MHZ\\'
child_folders = os.listdir(data_parent)
# The slice [-12:-1] shows the 11 entries before the last one; the final entry is excluded.
# NOTE(review): os.listdir does not guarantee any ordering - sort first if "most recent" is intended.
print(child_folders[-12:-1])

['20240701', '20240801', '20240901', '20241001', '20241101', '20241201', '20250101', '20250201', '20250301', '20250401', '20250501']


In [None]:

#I moved the code from this cell to "ADCP_Anomaly_Detection.ipynb"
