In [None]:
import os
import shutil
from tqdm import tqdm

# Source tree that is read and destination tree that is built; these are the
# default arguments of restructure_dataset.
OLD_ROOT = "/data/TimeSeriesResearch/datasets/Satya"
NEW_ROOT = "/data/TimeSeriesResearch/datasets/Satya_New"

def _split_label_and_index(file_name):
    """Derive ``(label, index)`` from a CSV name shaped ``<prefix>_<index>_<label>.csv``.

    The caller guarantees that ``file_name`` ends with ``.csv`` in any letter
    case, so the extension is removed by length. This keeps the extension out
    of both the label and the index.
    """
    stem = file_name[:-len(".csv")]
    parts = stem.split('_')
    if len(parts) >= 3:
        # An empty label would drop the file directly into the dataset folder
        # instead of a label folder, so fall back to the generic label.
        return '_'.join(parts[2:]) or "Unknown", parts[1]
    return "Unknown", (parts[1] if len(parts) > 1 else "0")


def restructure_dataset(old_root=OLD_ROOT, new_root=NEW_ROOT):
    """Copy CSV files into a ``<new_root>/<folder>/<label>/file_<index>.csv`` layout.

    Every direct sub-folder of ``old_root`` is scanned for files whose name ends
    with ``.csv`` (case-insensitive). The index is the second
    underscore-separated part of the file name and the label is everything
    after it; names with fewer than three parts get the label ``"Unknown"``,
    and names without any underscore get the index ``"0"``.

    Source files are copied, never moved or deleted. Folders and files are
    visited in sorted order so repeated runs behave identically. When two
    source files of the same run map to the same destination, the later one
    gets a ``_<n>`` suffix instead of silently overwriting the earlier copy.

    Args:
        old_root: Directory whose sub-folders contain the source CSV files.
        new_root: Directory that receives the restructured copy; created if
            missing.

    Copy failures are reported with ``print`` and do not stop the run.
    """
    os.makedirs(new_root, exist_ok=True)

    # Destinations already assigned during this run; used to detect name clashes.
    used_destinations = set()

    for folder in tqdm(sorted(os.listdir(old_root)), desc="Processing Folders"):
        folder_path = os.path.join(old_root, folder)

        if not os.path.isdir(folder_path):
            continue

        for file_name in sorted(os.listdir(folder_path)):
            if not file_name.lower().endswith(".csv"):
                continue

            label, index = _split_label_and_index(file_name)

            label_folder = os.path.join(new_root, folder, label)
            os.makedirs(label_folder, exist_ok=True)

            src_path = os.path.join(folder_path, file_name)
            dst_path = os.path.join(label_folder, f"file_{index}.csv")

            # Disambiguate instead of overwriting a file copied earlier in this run.
            duplicate = 1
            while dst_path in used_destinations:
                dst_path = os.path.join(label_folder, f"file_{index}_{duplicate}.csv")
                duplicate += 1
            used_destinations.add(dst_path)

            try:
                shutil.copy(src_path, dst_path)
                print(f"Copied: {src_path} -> {dst_path}")
            except Exception as e:
                # Best effort: report the failure and continue with the next file.
                print(f"Error copying {src_path} to {dst_path}: {e}")

    print("Restructuring complete.")

# Run the restructuring with the default OLD_ROOT / NEW_ROOT paths.
restructure_dataset()


In [None]:
### File movement
# Re-write the SKAB CSV files into one folder per label, without the "Label"
# column. The label is the text after the last underscore of the file name.
import os
import shutil
import pandas as pd
from tqdm import tqdm

# Paths
source_dir = "/data/TimeSeriesResearch/datasets/kaggle/processed/processed_datasets_labelled_separated/SKAB - Skoltech Anomaly Benchmark"
dest_dir = "/data/TimeSeriesResearch/datasets/Satya_New/SKAB - Skoltech Anomaly Benchmark"

# Ensure the destination directory exists
os.makedirs(dest_dir, exist_ok=True)

# Number of files written so far for each label
label_count = {}

# Source files that could not be converted
failed_files = []

# Process the files in sorted order so the file_<n> numbering is reproducible
# (os.listdir returns entries in arbitrary order).
for file_name in tqdm(sorted(os.listdir(source_dir)), desc="Processing files"):
    # Check if it's a CSV file (case-insensitive, like the other cells)
    if not file_name.lower().endswith(".csv"):
        continue

    # The label is the last underscore-separated part, without the extension.
    stem = file_name[:-len(".csv")]
    # An empty label would write into dest_dir itself rather than a label
    # folder, so fall back to a generic label.
    label = stem.split('_')[-1].strip() or "Unknown"

    # Create the label directory if it doesn't exist
    label_dir = os.path.join(dest_dir, label)
    os.makedirs(label_dir, exist_ok=True)

    # Next free index for this label
    file_index = label_count.get(label, 0)

    # Full paths for reading and writing
    src_path = os.path.join(source_dir, file_name)
    dest_path = os.path.join(label_dir, f"file_{file_index}.csv")

    try:
        # Load the CSV and drop the label column when present
        df = pd.read_csv(src_path)
        df = df.drop(columns=["Label"], errors='ignore')

        # Save the modified file in the new location
        df.to_csv(dest_path, index=False)

    except Exception as e:
        print(f"Error processing {file_name}: {e}")
        failed_files.append(file_name)

    else:
        # Only consume the index once the file was actually written, so a
        # failure does not leave a gap in the numbering.
        label_count[label] = file_index + 1

if failed_files:
    print(f"Finished with {len(failed_files)} file(s) that could not be processed: {failed_files}")
else:
    print("Files processed and moved successfully!")


In [45]:
from collections import defaultdict
from ftse.data.Dataset import UnwindowedDataset
from tqdm.auto import tqdm
import os
import h5py
import numpy as np
import pandas as pd
import torch

# Ask the HDF5 library not to use file locking.
# NOTE(review): this is assigned after `import h5py`; confirm the installed
# HDF5/h5py build still honours the variable when set at this point.
os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"


# Root directories scanned by create_h5py_dataset_from_root_dirs (its default).
ROOT_DIRS = [
    '/data/TimeSeriesResearch/datasets/Satya_New'
]

# HDF5 file written by create_h5py_dataset_from_root_dirs (its default).
H5_FILE = '/data/TimeSeriesResearch/datasets/satya_data2.h5'

def create_h5py_dataset_from_root_dirs(root_dirs=ROOT_DIRS, h5_file=H5_FILE):
    """Pack a ``<root>/<dataset>/<label>/*.csv`` tree into a single HDF5 file.

    For every CSV file a dataset is created at
    ``<basename of root>/<dataset folder>/<label folder>/file_<n>``, where
    ``<n>`` is the number of items already in that group. The remaining column
    names are stored in the ``descriptions`` attribute of the dataset.

    Directory listings are sorted, so the mapping from CSV file to ``file_<n>``
    is the same on every run (``os.listdir`` alone returns arbitrary order).
    The order is lexicographic, e.g. ``file_10.csv`` sorts before
    ``file_2.csv``.

    Args:
        root_dirs: Iterable of root directories to scan.
        h5_file: Output path; opened in ``"w"`` mode, so an existing file is
            replaced.

    Entries that are not directories at the dataset or label level are
    skipped, as are files whose name does not end with ``.csv``
    (case-insensitive). Errors from ``pandas`` or ``h5py`` are not caught.

    NOTE(review): a later cell of this notebook defines a function with the
    same name; whichever cell ran last is the one that gets called.
    """
    # Bookkeeping columns that must not end up in the stored array.
    drop_columns = ['Unnamed: 0', 'time_sec', 'Time', 'Label']

    with h5py.File(h5_file, "w") as h5f:
        for root_dir in root_dirs:
            root_prefix = os.path.basename(os.path.normpath(root_dir))

            for dataset_folder in tqdm(sorted(os.listdir(root_dir)), desc=f"Datasets in {root_dir}"):
                dataset_path = os.path.join(root_dir, dataset_folder)
                if not os.path.isdir(dataset_path):
                    print("skipping", dataset_path)
                    continue

                # One sub-folder per label
                for subfolder in sorted(os.listdir(dataset_path)):
                    subfolder_path = os.path.join(dataset_path, subfolder)
                    if not os.path.isdir(subfolder_path):
                        continue

                    csv_files = sorted(
                        entry for entry in os.listdir(subfolder_path)
                        if entry.lower().endswith(".csv")
                    )
                    if not csv_files:
                        # Keep the original behaviour: no group for a label
                        # folder that holds no CSV file.
                        continue

                    group_path = f"{root_prefix}/{dataset_folder}/{subfolder}"
                    group = h5f.require_group(group_path)

                    for csv_file in csv_files:
                        df = pd.read_csv(os.path.join(subfolder_path, csv_file))
                        df = df.drop(columns=drop_columns, errors='ignore')

                        dataset_name = f"file_{len(group)}"
                        dset = group.create_dataset(dataset_name, data=df.to_numpy())
                        dset.attrs["descriptions"] = df.columns.tolist()
    print("Flat HDF5 file created successfully:", h5_file)

Processing /data/TimeSeriesResearch/datasets/Satya_New:   0%|                                                                                                                                                                                                             | 0/18 [00:00<?, ?it/s]

Created dataset at: Satya_New/SKAB - Skoltech Anomaly Benchmark/valve1/file_0
Created dataset at: Satya_New/SKAB - Skoltech Anomaly Benchmark/valve1/file_1
Created dataset at: Satya_New/SKAB - Skoltech Anomaly Benchmark/valve1/file_2
Created dataset at: Satya_New/SKAB - Skoltech Anomaly Benchmark/valve1/file_3
Created dataset at: Satya_New/SKAB - Skoltech Anomaly Benchmark/valve1/file_4
Created dataset at: Satya_New/SKAB - Skoltech Anomaly Benchmark/valve1/file_5
Created dataset at: Satya_New/SKAB - Skoltech Anomaly Benchmark/valve1/file_6
Created dataset at: Satya_New/SKAB - Skoltech Anomaly Benchmark/valve1/file_7
Created dataset at: Satya_New/SKAB - Skoltech Anomaly Benchmark/valve1/file_8
Created dataset at: Satya_New/SKAB - Skoltech Anomaly Benchmark/valve1/file_9
Created dataset at: Satya_New/SKAB - Skoltech Anomaly Benchmark/valve1/file_10
Created dataset at: Satya_New/SKAB - Skoltech Anomaly Benchmark/valve1/file_11
Created dataset at: Satya_New/SKAB - Skoltech Anomaly Benchmar

Processing /data/TimeSeriesResearch/datasets/Satya_New:   6%|██████████▉                                                                                                                                                                                          | 1/18 [00:00<00:04,  3.48it/s]

Created dataset at: Satya_New/SKAB - Skoltech Anomaly Benchmark/Normal Operation/file_38
Created dataset at: Satya_New/SKAB - Skoltech Anomaly Benchmark/Normal Operation/file_39
Created dataset at: Satya_New/SKAB - Skoltech Anomaly Benchmark/Normal Operation/file_40
Created dataset at: Satya_New/SKAB - Skoltech Anomaly Benchmark/Normal Operation/file_41
Created dataset at: Satya_New/SKAB - Skoltech Anomaly Benchmark/Normal Operation/file_42
Created dataset at: Satya_New/SKAB - Skoltech Anomaly Benchmark/Normal Operation/file_43
Created dataset at: Satya_New/SKAB - Skoltech Anomaly Benchmark/Normal Operation/file_44
Created dataset at: Satya_New/SKAB - Skoltech Anomaly Benchmark/Normal Operation/file_45
Created dataset at: Satya_New/SKAB - Skoltech Anomaly Benchmark/Normal Operation/file_46
Created dataset at: Satya_New/SKAB - Skoltech Anomaly Benchmark/Normal Operation/file_47
Created dataset at: Satya_New/SKAB - Skoltech Anomaly Benchmark/Normal Operation/file_48
Created dataset at: S

Processing /data/TimeSeriesResearch/datasets/Satya_New:  17%|████████████████████████████████▊                                                                                                                                                                    | 3/18 [00:01<00:07,  1.93it/s]

Created dataset at: Satya_New/BCSV/Ball/file_6
Created dataset at: Satya_New/BCSV/Ball/file_7
Created dataset at: Satya_New/Wyoming/SO2/file_0
Created dataset at: Satya_New/Wyoming/SO2/file_1
Created dataset at: Satya_New/Wyoming/SO2/file_2
Created dataset at: Satya_New/Wyoming/SO2/file_3
Created dataset at: Satya_New/Wyoming/SO2/file_4
Created dataset at: Satya_New/Wyoming/SO2/file_5
Created dataset at: Satya_New/Wyoming/SO2/file_6
Created dataset at: Satya_New/Wyoming/SO2/file_7
Created dataset at: Satya_New/Wyoming/SO2/file_8
Created dataset at: Satya_New/Wyoming/SO2/file_9
Created dataset at: Satya_New/Wyoming/SO2/file_10
Created dataset at: Satya_New/Wyoming/SO2/file_11
Created dataset at: Satya_New/Wyoming/SO2/file_12
Created dataset at: Satya_New/Wyoming/SO2/file_13
Created dataset at: Satya_New/Wyoming/SO2/file_14
Created dataset at: Satya_New/Maryland/Ozone/file_0
Created dataset at: Satya_New/Maryland/Ozone/file_1
Created dataset at: Satya_New/Maryland/Ozone/file_2
Created da

Processing /data/TimeSeriesResearch/datasets/Satya_New:  50%|██████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                  | 9/18 [00:01<00:01,  6.26it/s]

Created dataset at: Satya_New/California/CO/file_100
Created dataset at: Satya_New/California/CO/file_101
Created dataset at: Satya_New/California/CO/file_102
Created dataset at: Satya_New/California/CO/file_103
Created dataset at: Satya_New/California/CO/file_104
Created dataset at: Satya_New/California/CO/file_105
Created dataset at: Satya_New/California/CO/file_106
Created dataset at: Satya_New/California/CO/file_107
Created dataset at: Satya_New/California/CO/file_108
Created dataset at: Satya_New/California/CO/file_109
Created dataset at: Satya_New/Delaware/SO2/file_0
Created dataset at: Satya_New/Delaware/SO2/file_1
Created dataset at: Satya_New/Delaware/SO2/file_2
Created dataset at: Satya_New/Delaware/SO2/file_3
Created dataset at: Satya_New/Delaware/SO2/file_4
Created dataset at: Satya_New/Airquality_pattern/Unknown/file_0
Created dataset at: Satya_New/Airquality_pattern/Unknown/file_1
Created dataset at: Satya_New/Florida/PM25/file_0
Created dataset at: Satya_New/Florida/PM25

Processing /data/TimeSeriesResearch/datasets/Satya_New:  67%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                 | 12/18 [00:03<00:01,  3.65it/s]

Created dataset at: Satya_New/OCSV/Outer_and_Ball/file_8
Created dataset at: Satya_New/turbofan/Unknown/file_0
Created dataset at: Satya_New/turbofan/Unknown/file_1


Processing /data/TimeSeriesResearch/datasets/Satya_New:  72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                      | 13/18 [00:03<00:01,  3.37it/s]

Created dataset at: Satya_New/turbofan/Unknown/file_2
Created dataset at: Satya_New/turbofan/Unknown/file_3
Created dataset at: Satya_New/NewYork/PM1/file_0
Created dataset at: Satya_New/NewYork/PM1/file_1
Created dataset at: Satya_New/NewYork/PM1/file_2
Created dataset at: Satya_New/NewYork/PM1/file_3
Created dataset at: Satya_New/NewYork/PM1/file_4
Created dataset at: Satya_New/IOCSV/Inner_and_Outer/file_0
Created dataset at: Satya_New/IOCSV/Inner_and_Outer/file_1
Created dataset at: Satya_New/IOCSV/Inner_and_Outer/file_2
Created dataset at: Satya_New/IOCSV/Inner_and_Outer/file_3


Processing /data/TimeSeriesResearch/datasets/Satya_New:  83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                | 15/18 [00:04<00:00,  3.02it/s]

Created dataset at: Satya_New/IOCSV/Inner_and_Outer/file_4
Created dataset at: Satya_New/Illinois/NO/file_0
Created dataset at: Satya_New/Illinois/NO/file_1
Created dataset at: Satya_New/Illinois/NO/file_2
Created dataset at: Satya_New/Illinois/NO/file_3
Created dataset at: Satya_New/Illinois/NO/file_4
Created dataset at: Satya_New/Illinois/NO/file_5
Created dataset at: Satya_New/Illinois/NO/file_6
Created dataset at: Satya_New/Illinois/NO/file_7
Created dataset at: Satya_New/Gyrometer/BuildingStairMove/file_0
Created dataset at: Satya_New/Gyrometer/BuildingStairMove/file_1
Created dataset at: Satya_New/Gyrometer/OutdoorMove/file_0
Created dataset at: Satya_New/Gyrometer/OutdoorMove/file_1
Created dataset at: Satya_New/Gyrometer/EscalatorMove/file_0
Created dataset at: Satya_New/Gyrometer/EscalatorMove/file_1
Created dataset at: Satya_New/Gyrometer/BuildingElevatorMove/file_0
Created dataset at: Satya_New/Gyrometer/BuildingElevatorMove/file_1
Created dataset at: Satya_New/Gyrometer/Flo

Processing /data/TimeSeriesResearch/datasets/Satya_New: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18/18 [00:05<00:00,  3.46it/s]

Created dataset at: Satya_New/IBCSV/Inner_and_Ball/file_2
Created dataset at: Satya_New/IBCSV/Inner_and_Ball/file_3





In [200]:
# HDF5 file read by load_h5py_dataset (its default `h5_file`).
# NOTE(review): H5_FILE is also assigned in the HDF5-writing cells of this
# notebook, so its value depends on which cell ran last.
H5_FILE = '/data/TimeSeriesResearch/datasets/satya_data_New0522.h5'

def load_h5py_dataset(h5_file=H5_FILE, window_size=None, stride=None, concat=False):
    """Collect every dataset stored in an HDF5 file, grouped by dataset name.

    The file is walked recursively. Each ``h5py.Dataset`` found is wrapped in
    an ``UnwindowedDataset``. The name of the group that holds it is used as
    the label, and the name of that group's parent is used as the dataset name,
    which is also the key in the returned mapping.

    Args:
        h5_file: Path of the HDF5 file; opened read-only.
        window_size: Passed to ``.window(...)`` of every wrapped dataset.
        stride: Passed to ``.window(...)`` of every wrapped dataset. Windowing
            only happens when both ``window_size`` and ``stride`` are truthy.
        concat: When true, every per-name list is replaced by a
            ``torch.utils.data.ConcatDataset`` built from it.

    Returns:
        The ``defaultdict(list)`` keyed by dataset name.

    Raises:
        IndexError: If a dataset sits fewer than two groups below the file
            root, because no parent group name is available.

    NOTE(review): the ``h5py.File`` opened here is never closed. The wrapped
    objects receive the live ``h5py.Dataset`` handles, so closing the file may
    invalidate them - confirm against ``UnwindowedDataset`` before changing.
    """
    by_name = defaultdict(list)

    def _walk(node, prefix=""):
        # `prefix` is the slash-joined chain of group names leading to `node`.
        for child_name in node.keys():
            child = node[child_name]

            if isinstance(child, h5py.Group):
                _walk(child, prefix + "/" + child_name)
                continue
            if not isinstance(child, h5py.Dataset):
                continue

            label = prefix.split("/")[-1]
            owner = prefix.strip("/").split("/")[-2]
            wrapped = UnwindowedDataset(
                data=child,
                dataset_name=owner,
                descriptions=child.attrs.get("descriptions", []),
                label=label
            )
            by_name[owner].append(wrapped)

    handle = h5py.File(h5_file, 'r')
    _walk(handle)

    if window_size and stride:
        # Only existing keys are reassigned, so iterating over items() is safe.
        for owner, members in by_name.items():
            by_name[owner] = [
                member.window(window_size=window_size, stride=stride)
                for member in members
            ]

    if concat:
        for owner, members in by_name.items():
            by_name[owner] = torch.utils.data.ConcatDataset(members)

    return by_name

In [201]:
# H5_FILE = '/data/TimeSeriesResearch/datasets/satya_new_data2.h5'
from collections import defaultdict
from ftse.data.Dataset import UnwindowedDataset
from tqdm.auto import tqdm
import os
import h5py
import numpy as np
import pandas as pd
import torch
# Load everything from H5_FILE, apply .window(window_size=12, stride=1) to each
# wrapped dataset and concatenate the results per dataset name.
datasets = load_h5py_dataset(H5_FILE, window_size=12, stride=1, concat=True)

In [202]:
# Show the dataset names that were loaded.
datasets.keys()

dict_keys(['Accelerometer', 'Airquality_pattern', 'Appliances energy prediction Data Set', 'BCSV', 'California', 'Delaware', 'Florida', 'Gas sensor array temperature modulation', 'Gyrometer', 'Household Electric Power Consumption', 'IBCSV', 'IOCSV', 'Illinois', 'Machinery Fault Diagnosis', 'Maryland', 'MetroPT-3 Dataset', 'Michigan', 'NewYork', 'Nightly', 'OCSV', 'Predictive Maintenance Of Hydraulics System', 'SKAB - Skoltech Anomaly Benchmark', 'Texas', 'Unleashing the Power of Wearables', 'Wyoming', 'c3server', 'liu', 'turbofan'])

In [203]:
# Inspect one item (index 10) of a single loaded dataset.
datasets['Predictive Maintenance Of Hydraulics System'][10]

Data(data=tensor([[1.8778e+01, 1.4780e+00, 2.3162e+03, 1.0137e+01, 8.9575e+00, 1.4543e+02,
         1.1852e+02, 2.7281e+00, 0.0000e+00, 8.4242e+00, 8.3840e+00, 2.9586e+01,
         5.6367e+01, 6.1004e+01, 5.8289e+01, 5.2168e+01, 7.7500e-01],
        [1.8790e+01, 1.4790e+00, 2.1761e+03, 7.8790e+00, 8.9548e+00, 1.4146e+02,
         1.1596e+02, 2.0582e+00, 0.0000e+00, 8.4230e+00, 8.3815e+00, 6.9977e+01,
         5.6270e+01, 6.1016e+01, 5.8312e+01, 5.2156e+01, 7.0600e-01],
        [1.8838e+01, 1.4850e+00, 2.1733e+03, 7.8253e+00, 8.9383e+00, 1.4132e+02,
         1.1589e+02, 2.0231e+00, 0.0000e+00, 8.4181e+00, 8.3774e+00, 6.9977e+01,
         5.6277e+01, 6.0988e+01, 5.8312e+01, 5.2168e+01, 6.6100e-01],
        [1.8761e+01, 1.4790e+00, 2.1723e+03, 7.7977e+00, 8.9390e+00, 1.4133e+02,
         1.1586e+02, 2.0128e+00, 0.0000e+00, 8.4173e+00, 8.3724e+00, 6.9317e+01,
         5.6277e+01, 6.1004e+01, 5.8301e+01, 5.2156e+01, 6.3200e-01],
        [1.8749e+01, 1.4780e+00, 2.1716e+03, 7.7988e+00, 8.939

In [195]:
def get_unique_labels(dataset, max_samples=None):
    """Return the set of distinct ``label`` values among the leading samples.

    Args:
        dataset: Object supporting ``len()`` and integer indexing.
        max_samples: Upper bound on the number of samples inspected, counted
            from index 0. ``None`` inspects all of them; a value below 1
            inspects none.

    Returns:
        A set of the labels seen. A sample without a ``label`` attribute
        contributes ``None``.

    Any exception raised while fetching a sample or storing its label is
    printed and that sample is skipped.
    """
    available = len(dataset)
    limit = available if max_samples is None else min(max_samples, available)

    found = set()
    for position in range(limit):
        try:
            # Samples lacking a `label` attribute are recorded as None.
            found.add(getattr(dataset[position], 'label', None))
        except Exception as err:
            print(f"Error processing sample {position}: {err}")

    return found

# Use the function on one of the loaded datasets
dataset = datasets['Predictive Maintenance Of Hydraulics System']
print(f"Total samples in dataset: {len(dataset)}")

# Scan all but the last 100 samples (adjust max_samples as needed); if the
# dataset has 100 samples or fewer, nothing is scanned.
unique_labels = get_unique_labels(dataset, max_samples=len(dataset)-100)
print(f"Unique labels found: {unique_labels}")

Total samples in dataset: 6960
Unique labels found: {'close to total failure-severe lag-severe leakage-severely reduced pressure', 'close to total failure-optimal switching behavior-weak leakage-slightly reduced pressure', 'full efficiency-close to total failure-severe leakage-severely reduced pressure', 'close to total failure-severe lag-weak leakage-slightly reduced pressure', 'full efficiency-severe lag-weak leakage-slightly reduced pressure', 'close to total failure-close to total failure-no leakage-close to total failure', 'full efficiency-severe lag-no leakage-severely reduced pressure', 'close to total failure-optimal switching behavior-weak leakage-close to total failure', 'reduced efficiency-optimal switching behavior-severe leakage-severely reduced pressure', 'reduced efficiency-small lag-severe leakage-slightly reduced pressure', 'close to total failure-optimal switching behavior-severe leakage-close to total failure', 'close to total failure-close to total failure-severe le

In [185]:
from collections import defaultdict
from ftse.data.Dataset import UnwindowedDataset
from tqdm.auto import tqdm
import os
import h5py
import numpy as np
import pandas as pd
import torch

# Ask the HDF5 library not to use file locking.
# NOTE(review): this is assigned after `import h5py`; confirm the installed
# HDF5/h5py build still honours the variable when set at this point.
os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"


# Root directories scanned by create_h5py_dataset_from_root_dirs (its default).
ROOT_DIRS = [
    '/data/TimeSeriesResearch/datasets/Satya_New'
]

# HDF5 file written by create_h5py_dataset_from_root_dirs (its default).
H5_FILE = '/data/TimeSeriesResearch/datasets/satya_data_New0522.h5'

def create_h5py_dataset_from_root_dirs(root_dirs=ROOT_DIRS, h5_file=H5_FILE):
    """Pack a ``<root>/<dataset>/<label>/*.csv`` tree into a single HDF5 file.

    For every CSV file a dataset is created at
    ``<basename of root>/<dataset folder>/<label folder>/file_<n>``, where
    ``<n>`` is the number of items already in that group. The remaining column
    names are stored in the ``descriptions`` attribute of the dataset. The
    folders visited, skipped non-CSV file names and created dataset names are
    printed as the run progresses.

    Directory listings are sorted, so the mapping from CSV file to ``file_<n>``
    is the same on every run (``os.listdir`` alone returns arbitrary order).
    The order is lexicographic, e.g. ``file_10.csv`` sorts before
    ``file_2.csv``.

    Args:
        root_dirs: Iterable of root directories to scan.
        h5_file: Output path; opened in ``"w"`` mode, so an existing file is
            replaced.

    Entries that are not directories at the dataset or label level are
    skipped. Errors from ``pandas`` or ``h5py`` are not caught.

    NOTE(review): an earlier cell of this notebook defines a function with the
    same name; whichever cell ran last is the one that gets called.
    """
    # Bookkeeping columns that must not end up in the stored array.
    drop_columns = ['Unnamed: 0', 'time_sec', 'Time', 'Label']

    with h5py.File(h5_file, "w") as h5f:
        for root_dir in root_dirs:
            root_prefix = os.path.basename(os.path.normpath(root_dir))

            for dataset_folder in tqdm(sorted(os.listdir(root_dir)), desc=f"Datasets in {root_dir}"):
                dataset_path = os.path.join(root_dir, dataset_folder)
                if not os.path.isdir(dataset_path):
                    continue

                # One sub-folder per label
                print("dataset_path", dataset_path)
                for subfolder in sorted(os.listdir(dataset_path)):
                    subfolder_path = os.path.join(dataset_path, subfolder)
                    if not os.path.isdir(subfolder_path):
                        continue

                    print("subfolder_path", subfolder_path)
                    group_path = f"{root_prefix}/{dataset_folder}/{subfolder}"

                    for csv_file in sorted(os.listdir(subfolder_path)):
                        if not csv_file.lower().endswith(".csv"):
                            # Report the entry that is being skipped.
                            print(csv_file)
                            continue

                        # Created on the first CSV file, reused afterwards.
                        group = h5f.require_group(group_path)

                        df = pd.read_csv(os.path.join(subfolder_path, csv_file))
                        df = df.drop(columns=drop_columns, errors='ignore')

                        dataset_name = f"file_{len(group)}"
                        print(group_path, "--", dataset_name)
                        dset = group.create_dataset(dataset_name, data=df.to_numpy())
                        dset.attrs["descriptions"] = df.columns.tolist()
    print("Flat HDF5 file created successfully:", h5_file)

In [186]:
# Build the HDF5 file from the default ROOT_DIRS into the default H5_FILE.
# NOTE(review): the saved output of this cell mentions ".../Satya_New3", which
# does not match ROOT_DIRS as defined in this notebook - the output looks
# stale; re-run on a fresh kernel to confirm.
create_h5py_dataset_from_root_dirs()

Datasets in /data/TimeSeriesResearch/datasets/Satya_New3:   0%|          | 0/4 [00:00<?, ?it/s]

dataset_path /data/TimeSeriesResearch/datasets/Satya_New3/valve1
file_7.csv
file_8.csv
file_13.csv
file_1.csv
file_11.csv
file_10.csv
file_0.csv
file_2.csv
file_14.csv
file_5.csv
file_12.csv
file_4.csv
file_9.csv
file_15.csv
file_3.csv
file_6.csv
dataset_path /data/TimeSeriesResearch/datasets/Satya_New3/valve2
file_1.csv
file_0.csv
file_2.csv
file_3.csv
dataset_path /data/TimeSeriesResearch/datasets/Satya_New3/other
file_7.csv
file_8.csv
file_13.csv
file_1.csv
file_11.csv
file_10.csv
file_0.csv
file_2.csv
file_5.csv
file_12.csv
file_4.csv
file_9.csv
file_3.csv
file_6.csv
dataset_path /data/TimeSeriesResearch/datasets/Satya_New3/Normal Operation
file_18.csv
file_17.csv
file_25.csv
file_40.csv
file_20.csv
file_67.csv
file_51.csv
file_7.csv
file_55.csv
file_58.csv
file_46.csv
file_8.csv
file_13.csv
file_33.csv
file_31.csv
file_64.csv
file_60.csv
file_34.csv
file_1.csv
file_48.csv
file_11.csv
file_35.csv
file_22.csv
file_57.csv
file_10.csv
file_39.csv
file_26.csv
file_38.csv
file_45.csv
fi