In [4]:
import torch
import wandb
import argparse
import os
import sys
import numpy as np

from tqdm import tqdm
from torch import nn
import numpy as np
from torch.utils.data import DataLoader
from sktime.regression.base import BaseRegressor
from sktime.regression.dummy import DummyRegressor
from sktime.regression.kernel_based import RocketRegressor
from sktime.utils import mlflow_sktime

from util.utils import set_all_seeds
from data.dataloader import Platoon

import warnings
from sklearn.exceptions import DataConversionWarning
# Ignore specific warning
warnings.filterwarnings(action='ignore', category=UserWarning, module='sktime.base._base_panel')



In [5]:
model_name = "dummy_regressor"
project_name = "Version 0.1"
seed = 42
num_epochs = 2
batch_size = 10
window_size = 10
verbose = False
only_iri = False

In [6]:
data_path = 'D:/sc_ml_and_road_conditions/CrackDetect/data/processed/segments.hdf5'
trainset = Platoon(data_type='train', data_path=data_path)
train_loader = DataLoader(trainset, batch_size=None, shuffle=True, num_workers=0)
valset = Platoon(data_type='val', data_path=data_path)
val_loader = DataLoader(valset, batch_size=None, shuffle=False, num_workers=0)

## Testing different tsai models

In [7]:
from tsai.basics import *
import sktime
import sklearn
my_setup(sktime, sklearn)
from tsai.models.MINIROCKET import *
from sklearn.metrics import mean_squared_error, make_scorer

def create_batches(data, targets, batch_size):
    num_batches = len(data) // batch_size
    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = start_idx + batch_size
        yield data[start_idx:end_idx], targets[start_idx:end_idx]

    """
    NOTE - this is a generator, so the last batch will be smaller, but for certain models 
    we have to have fixed batch size so we can't just yield the last batch (thus, at times we will lose some data)
    """    
    if len(data) % batch_size != 0:
        start_idx = num_batches * batch_size
        yield data[start_idx:], targets[start_idx:]


os              : Windows-10-10.0.19045-SP0
python          : 3.10.11
tsai            : 0.3.9
fastai          : 2.7.14
fastcore        : 1.5.29
sktime          : 0.27.0
sklearn         : 1.4.1.post1
torch           : 2.2.1+cpu
device          : cpu
cpu cores       : 6
threads per cpu : 2
RAM             : 15.93 GB
GPU memory      : [8.0] GB


In [12]:

rmse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
model = MiniRocketRegressor(scoring=rmse_scorer)
num_epochs = 2

total_loss = []
for epoch in range(num_epochs):
    train_iterator = tqdm(train_loader, unit="batch", position=0, leave=False)
    train_loss = []
    train_iterator.set_description(f"Epoch {epoch+1}/{num_epochs}")
    for data_segment, target_segment in train_iterator:
        for data, target in create_batches(data_segment, target_segment, batch_size=batch_size):
            model.fit(data.numpy(), target.numpy())
            output = model.predict(data.numpy())
            loss = mean_squared_error(target, output)
            train_loss.append(loss)
            train_iterator.set_postfix_str(f"train_loss: {loss:.6f}")
    total_loss.append(np.mean(train_loss))
    # train_iterator.set_postfix_str(f"train_loss: {total_loss[-1]:.6f}")

                                                                                    

ValueError: Input contains NaN.

In [6]:
val_iterator = tqdm(val_loader, unit="batch", position=0, leave=False)
RMSs = []
for data_segment, target_segment in val_iterator:
    for data, target in create_batches(data_segment, target_segment, batch_size=batch_size):
        y_pred = model.predict(data)
        rmse = mean_squared_error(target, y_pred, squared=False)
        RMSs.append(rmse)
        val_iterator.set_postfix_str(f"val_loss: {rmse:.6f}")

print(f"Mean RMSE: {np.mean(RMSs):.6f}")

                                                                      

Mean RMSE: 0.394083




# HDF5 LOADER SANDBOX

In [23]:
import h5py
from sklearn.model_selection import train_test_split

In [24]:
data_path='../data/processed/segments.hdf5'
segments = h5py.File(data_path, 'r')
print(1)

In [25]:
random_state = 42
keys = sorted([int(i) for i in list(segments.keys())])
train_indices, test_indices, _, _ = train_test_split(keys, keys, test_size=0.2, random_state=random_state)
train_indices, val_indices, _, _ = train_test_split(train_indices, train_indices, test_size=0.1, random_state=random_state)
        

In [27]:
len(test_indices)

44

In [28]:
len(val_indices)

18

In [29]:
len(train_indices)

157

In [41]:
segments['2']

<HDF5 group "/2" (112 members)>