## CSVs to .h5 conversion

In [1]:
from collections import defaultdict
from tqdm.auto import tqdm
import os
os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"
import h5py
import numpy as np
import pandas as pd
import torch
import gc
from ftse.data.Dataset import UnwindowedDataset


In [2]:
ROOT_DIRS = [
    '/workspace/research/ftse/ftse/notebooks/'
]

H5_FILE = '/workspace/research/ftse/ftse/notebooks/testing3.h5'

root_dirs=ROOT_DIRS
h5_file=H5_FILE

In [3]:
with h5py.File(h5_file, "w") as h5f:
    for root_dir in root_dirs:
        root_prefix = os.path.basename(os.path.normpath(root_dir))
        print("root_prefix", root_prefix)
        for dataset_folder in tqdm(os.listdir(root_dir), desc=f"Datasets in {root_dir}"):
            #print("dataset_folder", dataset_folder)
            dataset_path = os.path.join(root_dir, dataset_folder)
            if not os.path.isdir(dataset_path):
                print("skipping", dataset_path)
                continue  # Skip non-directory items
            label_counts = {}

            for csv_file in tqdm(os.listdir(dataset_path), desc=dataset_folder, leave=False):
                if not csv_file.lower().endswith(".csv"):
                    continue

                # Example filename: "file_5_Normal Operation.csv"
                # Extract label: take everything after the second underscore and strip ".csv".
                parts = csv_file.split('_')
                if len(parts) >= 3:
                    label = '_'.join(parts[2:]).replace('.csv', '')
                else:
                    label = "Unknown"

                # Initialize or update the file index counter for this label.
                if label not in label_counts:
                    label_counts[label] = 0
                file_idx = label_counts[label]
                label_counts[label] += 1

                # Read the CSV file.
                csv_path = os.path.join(dataset_path, csv_file)
                df = pd.read_csv(csv_path)
                # Drop unwanted columns.
                cols_to_drop = ['Unnamed: 0', 'time_sec', 'Time', 'Label']
                df = df.drop([col for col in cols_to_drop if col in df.columns], axis=1)
                # Save the column names as metadata.
                column_names = df.columns.tolist()
                data_array = df.to_numpy()
                # Create a flat key in the format:
                # "root_prefix___dataset_folder___label___file_idx"
                key = f"{dataset_folder}___{label}___{file_idx}"
                #print(key)
                dset = h5f.create_dataset(key, data=data_array)
                dset.attrs["descriptions"] = list(column_names)

root_prefix notebooks


Datasets in /workspace/research/ftse/ftse/notebooks/:   0%|          | 0/33 [00:00<?, ?it/s]

Michigan:   0%|          | 0/10 [00:00<?, ?it/s]

NCSV: 0it [00:00, ?it/s]

Gyrometer:   0%|          | 0/18 [00:00<?, ?it/s]

OCSV:   0%|          | 0/9 [00:00<?, ?it/s]

Illinois:   0%|          | 0/9 [00:00<?, ?it/s]

skipping /workspace/research/ftse/ftse/notebooks/indices.ipynb
skipping /workspace/research/ftse/ftse/notebooks/Satya_data.ipynb


Maryland:   0%|          | 0/21 [00:00<?, ?it/s]

California:   0%|          | 0/111 [00:00<?, ?it/s]

skipping /workspace/research/ftse/ftse/notebooks/testing3.h5
skipping /workspace/research/ftse/ftse/notebooks/temp2.h5


.ipynb_checkpoints:   0%|          | 0/7 [00:00<?, ?it/s]

Wyoming:   0%|          | 0/16 [00:00<?, ?it/s]

Delaware:   0%|          | 0/6 [00:00<?, ?it/s]

skipping /workspace/research/ftse/ftse/notebooks/Modeling.ipynb


IOCSV:   0%|          | 0/5 [00:00<?, ?it/s]

skipping /workspace/research/ftse/ftse/notebooks/temp.h5


Florida:   0%|          | 0/54 [00:00<?, ?it/s]

skipping /workspace/research/ftse/ftse/notebooks/Untitled.ipynb


Airquality_pattern:   0%|          | 0/3 [00:00<?, ?it/s]

turbofan:   0%|          | 0/5 [00:00<?, ?it/s]

test:   0%|          | 0/5 [00:00<?, ?it/s]

Accelerometer:   0%|          | 0/18 [00:00<?, ?it/s]

ICSV: 0it [00:00, ?it/s]

Texas:   0%|          | 0/6 [00:00<?, ?it/s]

skipping /workspace/research/ftse/ftse/notebooks/jepa.ipynb
skipping /workspace/research/ftse/ftse/notebooks/evals-classif.ipynb


NewYork:   0%|          | 0/6 [00:00<?, ?it/s]

skipping /workspace/research/ftse/ftse/notebooks/evals.ipynb


IBCSV:   0%|          | 0/4 [00:00<?, ?it/s]

skipping /workspace/research/ftse/ftse/notebooks/log.txt


BCSV:   0%|          | 0/8 [00:00<?, ?it/s]

.git:   0%|          | 0/9 [00:00<?, ?it/s]

In [13]:
h5_file=H5_FILE
window_size=None
stride=1
concat=True
datasets = defaultdict(list)
f = h5py.File(h5_file, 'r')
for key in f.keys():
    dataset_name, label, file_idx = key.split('___')
    datasets[dataset_name].append(
        UnwindowedDataset(
            data=f[key],
            dataset_name=dataset_name,
            descriptions=f[key].attrs["descriptions"],
            label=label
        )
    )

if window_size and stride:
    for dataset_name in datasets:
        datasets[dataset_name] = [
            dataset.window(window_size=window_size, stride=stride)
            for dataset in datasets[dataset_name]
        ]

if concat:
    for dataset_name in datasets:
        datasets[dataset_name] = torch.utils.data.ConcatDataset(
            datasets[dataset_name]
        )



In [14]:
print(list(datasets.keys()))

['Accelerometer', 'Airquality_pattern', 'BCSV', 'California', 'Delaware', 'Florida', 'Gyrometer', 'IBCSV', 'IOCSV', 'Illinois', 'Maryland', 'Michigan', 'NewYork', 'OCSV', 'Texas', 'Wyoming', 'turbofan']


In [17]:
datasets['Gyrometer']

[<ftse.data.Dataset.UnwindowedDataset at 0x7ce9836dd810>,
 <ftse.data.Dataset.UnwindowedDataset at 0x7ce9836c7910>,
 <ftse.data.Dataset.UnwindowedDataset at 0x7ce9836af0d0>,
 <ftse.data.Dataset.UnwindowedDataset at 0x7ce9836b8f50>,
 <ftse.data.Dataset.UnwindowedDataset at 0x7ce9836b9210>,
 <ftse.data.Dataset.UnwindowedDataset at 0x7ce9836b8d90>,
 <ftse.data.Dataset.UnwindowedDataset at 0x7ce9836b9110>,
 <ftse.data.Dataset.UnwindowedDataset at 0x7ce9836b8490>,
 <ftse.data.Dataset.UnwindowedDataset at 0x7ce9836b9d10>,
 <ftse.data.Dataset.UnwindowedDataset at 0x7ce9836b9f10>,
 <ftse.data.Dataset.UnwindowedDataset at 0x7ce9836ba150>,
 <ftse.data.Dataset.UnwindowedDataset at 0x7ce9836ba390>,
 <ftse.data.Dataset.UnwindowedDataset at 0x7ce9836ba5d0>,
 <ftse.data.Dataset.UnwindowedDataset at 0x7ce9836ba810>,
 <ftse.data.Dataset.UnwindowedDataset at 0x7ce9836baa50>,
 <ftse.data.Dataset.UnwindowedDataset at 0x7ce984a58e90>,
 <ftse.data.Dataset.UnwindowedDataset at 0x7ce98439d950>]

### HUST bearing: a practical dataset for ball bearing fault diagnosis
https://data.mendeley.com/datasets/cbv7jyx4p9/3 

In [13]:
from scipy.io import loadmat
import numpy as np
import pandas as pd
import os
from collections import Counter, defaultdict

In [14]:
folder_mapping = {
    'BMat': 'BCSV',
    'IBMat': 'IBCSV',
    'IOMat': 'IOCSV',
    'OBMat': 'OCSV',
    'NMat': 'NCSV',
    'IMat': 'ICSV',
    'OMat': 'OCSV'
}
defect_map = {
    "I": "Inner",
    "O": "Outer",
    "B": "Ball",
    "IO": "Inner_and_Outer",
    "IB": "Inner_and_Ball",
    "OB": "Outer_and_Ball"
}

In [15]:
for source_dir, destination_dir in folder_mapping.items():
    os.makedirs(destination_dir, exist_ok=True)
    source_folder_name = os.path.basename(source_dir)
    folder_path = os.path.join("test", source_dir)
    if not os.path.isdir(folder_path):
        print(f"Skipping {source_dir}: directory 'test/{source_dir}' does not exist.")
        continue
    schema_counter = Counter()
    file_schema_map = defaultdict(list)
    
    for file in os.listdir(folder_path):
        if file.endswith(".mat"):
            file_path = os.path.join(folder_path, file)
            mat_data = loadmat(file_path)
            if 'data' in mat_data:
                if ('ru' in mat_data and 'rpm' in mat_data):
                    schema = ('ru', 'rpm')
                elif 'ru_raw' in mat_data:
                    schema = ('ru_raw',)
                else:
                    continue
    
                schema_counter[schema] += 1
                file_schema_map[schema].append(file)
    print(schema_counter)
    print(file_schema_map)
    dominant_schema = schema_counter.most_common(1)[0][0]
    print(dominant_schema)
    tracker = 0
    for file in file_schema_map[dominant_schema]:
        file_path = os.path.join(folder_path, file)
        key_label_key="None"
        key_label_value="None"
        for key in sorted(defect_map.keys(), key=len, reverse=True):
            if file.startswith(key):
                key_label_key=key
                key_label_value=defect_map[key_label_key]
                break
        if key_label_key == "None":
            continue 
        mat_data = loadmat(file_path)
        fs = float(mat_data['fs'])
        if 'data' in mat_data:
            vibration = mat_data['data'].flatten()
            time = np.arange(len(vibration)) / fs
            if dominant_schema == ('ru', 'rpm'):
                ru = mat_data['ru'].flatten()
                rpm = mat_data['rpm'].flatten()
                min_len = min(len(vibration), len(ru), len(rpm))
                df = pd.DataFrame({
                    'time_sec': time[:min_len],
                    'Vibration signal': vibration[:min_len],
                    'ru: Vibration signal during run-up time': ru[:min_len],
                    'rpm: Shaft Angular velocity in RPM during run-up time': rpm[:min_len]
                })
            elif dominant_schema == ('ru_raw',):
                ru = mat_data['ru_raw'].flatten()
                min_len = min(len(vibration), len(ru))
                df = pd.DataFrame({
                    'time_sec': time[:min_len],
                    'Vibration signal': vibration[:min_len],
                    'ru_raw: Vibration signal during run-up time': ru[:min_len]
                })
            output_filename = f"file_{tracker}_{key_label_value}.csv"
            output_path = os.path.join(destination_dir,output_filename)
            df.to_csv(output_path, index = False)
            tracker+=1

Counter({('ru', 'rpm'): 8, ('ru_raw',): 4})
defaultdict(<class 'list'>, {('ru', 'rpm'): ['B604.mat', 'B800.mat', 'B600.mat', 'B804.mat', 'B700.mat', 'B602.mat', 'B500.mat', 'B802.mat'], ('ru_raw',): ['B502.mat', 'B704.mat', 'B702.mat', 'B504.mat']})
('ru', 'rpm')
Counter({('ru', 'rpm'): 4, ('ru_raw',): 3})
defaultdict(<class 'list'>, {('ru_raw',): ['IB500.mat', 'IB504.mat', 'IB502.mat'], ('ru', 'rpm'): ['IB600.mat', 'IB604.mat', 'IB700.mat', 'IB602.mat']})
('ru', 'rpm')
Counter({('ru', 'rpm'): 5, ('ru_raw',): 3})
defaultdict(<class 'list'>, {('ru', 'rpm'): ['IO404.mat', 'IO402.mat', 'IO602.mat', 'IO600.mat', 'IO400.mat'], ('ru_raw',): ['IO504.mat', 'IO502.mat', 'IO500.mat']})
('ru', 'rpm')
Counter({('ru', 'rpm'): 9})
defaultdict(<class 'list'>, {('ru', 'rpm'): ['OB404.mat', 'OB402.mat', 'OB600.mat', 'OB604.mat', 'OB602.mat', 'OB502.mat', 'OB504.mat', 'OB700.mat', 'OB500.mat']})
('ru', 'rpm')
Skipping NMat: directory 'test/NMat' does not exist.
Skipping IMat: directory 'test/IMat' does 

### Air Data: Air Quality Data Collected at Outdoor Monitors Across the US
https://www.epa.gov/outdoor-air-quality-data/download-daily-data

In [16]:
import pandas as pd
import numpy as np
import os

In [23]:
source_dir = 'Il'
folder_path = os.path.join("test", source_dir)
output_dir = 'Illinois'
os.makedirs(output_dir, exist_ok= True)

In [24]:
for file in os.listdir(folder_path): 
    df = pd.read_csv(os.path.join(folder_path, file))
    df = df.drop(columns=[col for col in df.columns if df[col].dtype == 'object' or pd.api.types.is_string_dtype(df[col])])
    df.rename(columns={'Daily Obs Count': 'Daily Observations Count'}, inplace=True)
    df.drop(columns=['POC', 'AQS Parameter Code', 'Method Code', 'CBSA Code', 'State FIPS Code', 'County FIPS Code', 'Site Latitude', 'Site Longitude'], inplace=True)
    grouped = df.groupby('Site ID')
    for index, (site_id, group) in enumerate(grouped):
        filename = f'file_{index}_NO.csv'
        filepath = os.path.join(output_dir, filename)
        group.drop(columns=['Site ID'], inplace=True)
        group.to_csv(filepath, index=False)

### Air quality pattern data


In [19]:
import pandas as pd
import numpy as np
import os

In [20]:
df=pd.read_csv(os.path.join("test","airquality_pattern.csv"))
df = df.iloc[2:].reset_index(drop=True)

In [21]:
for col in df.columns:
    if df[col].dtype == 'object':
        try:
            df[col] = pd.to_numeric(df[col], errors='raise')
        except ValueError:
            pass

In [22]:
df.dtypes

Date                             object
Time GMT -4                      object
Timestamp                         int64
Ozone - Low Conc.               float64
Hydrogen Sulfide - Low Conc.    float64
Total VOCs (ppm) - PID          float64
Carbon Dioxide - Low Conc.      float64
Particulate Matter 1            float64
Particulate Matter 2.5          float64
Particulate Matter 10           float64
Temperature (Internal)          float64
Humidity (Internal)             float64
Temperature (External)          float64
Humidity (External)             float64
Latitude                        float64
Longitude                       float64
Unnamed: 16                     float64
dtype: object

In [23]:
drop_cols = ["Timestamp", "Latitude", "Longitude", "Unnamed: 16"] + [
    col for col in df.columns if df[col].dtype == 'object' or pd.api.types.is_string_dtype(df[col])
]
df = df.drop(columns=drop_cols)

In [24]:
df.dtypes

Ozone - Low Conc.               float64
Hydrogen Sulfide - Low Conc.    float64
Total VOCs (ppm) - PID          float64
Carbon Dioxide - Low Conc.      float64
Particulate Matter 1            float64
Particulate Matter 2.5          float64
Particulate Matter 10           float64
Temperature (Internal)          float64
Humidity (Internal)             float64
Temperature (External)          float64
Humidity (External)             float64
dtype: object

In [25]:
df.head()

Unnamed: 0,Ozone - Low Conc.,Hydrogen Sulfide - Low Conc.,Total VOCs (ppm) - PID,Carbon Dioxide - Low Conc.,Particulate Matter 1,Particulate Matter 2.5,Particulate Matter 10,Temperature (Internal),Humidity (Internal),Temperature (External),Humidity (External)
0,0.402,0.0,0.191,382.043,2.719,3.164,3.506,25.82,44.41,17.68,68.42
1,0.446,0.0,0.192,379.549,2.652,2.846,2.886,25.88,44.33,17.69,69.42
2,0.412,0.0,0.188,377.549,2.029,2.172,2.197,25.78,44.29,17.75,68.52
3,0.417,0.0,0.183,377.316,3.183,3.645,3.977,25.9,44.44,17.69,68.2
4,0.433,0.0,0.184,381.616,2.995,3.44,3.766,25.91,44.19,17.7,67.68


In [26]:
df.to_csv("file_1.csv")

### NASA Turbofan Jet Engine Data Set
https://www.kaggle.com/datasets/behrad3d/nasa-cmaps/data

In [8]:
column_names = ['engine', 'time', 'op_setting_1', 'op_setting_2', 
                'op_setting_3'] + [f'sm_{i}' for i in range(1, 22)]
Sensor_dictionary={}
dict_list=[ "(Fan inlet temperature) (◦R)",
"(LPC outlet temperature) (◦R)",
"(HPC outlet temperature) (◦R)",
"(LPT outlet temperature) (◦R)",
"(Fan inlet Pressure) (psia)",
"(bypass-duct pressure) (psia)",
"(HPC outlet pressure) (psia)",
"(Physical fan speed) (rpm)",
"(Physical core speed) (rpm)",
"(Engine pressure ratio(P50/P2)",
"(HPC outlet Static pressure) (psia)",
"(Ratio of fuel flow to Ps30) (pps/psia)",
"(Corrected fan speed) (rpm)",
"(Corrected core speed) (rpm)",
"(Bypass Ratio) ",
"(Burner fuel-air ratio)",
"(Bleed Enthalpy)",
"(Required fan speed)",
"(Required fan conversion speed)",
"(High-pressure turbines Cool air flow)",
"(Low-pressure turbines Cool air flow)" ]

i=1
for x in dict_list :
    Sensor_dictionary[f'sm_{i}']=x
    i+=1
Sensor_dictionary

{'sm_1': '(Fan inlet temperature) (◦R)',
 'sm_2': '(LPC outlet temperature) (◦R)',
 'sm_3': '(HPC outlet temperature) (◦R)',
 'sm_4': '(LPT outlet temperature) (◦R)',
 'sm_5': '(Fan inlet Pressure) (psia)',
 'sm_6': '(bypass-duct pressure) (psia)',
 'sm_7': '(HPC outlet pressure) (psia)',
 'sm_8': '(Physical fan speed) (rpm)',
 'sm_9': '(Physical core speed) (rpm)',
 'sm_10': '(Engine pressure ratio(P50/P2)',
 'sm_11': '(HPC outlet Static pressure) (psia)',
 'sm_12': '(Ratio of fuel flow to Ps30) (pps/psia)',
 'sm_13': '(Corrected fan speed) (rpm)',
 'sm_14': '(Corrected core speed) (rpm)',
 'sm_15': '(Bypass Ratio) ',
 'sm_16': '(Burner fuel-air ratio)',
 'sm_17': '(Bleed Enthalpy)',
 'sm_18': '(Required fan speed)',
 'sm_19': '(Required fan conversion speed)',
 'sm_20': '(High-pressure turbines Cool air flow)',
 'sm_21': '(Low-pressure turbines Cool air flow)'}

In [None]:
for txt_file in tqdm(os.listdir("test/turbofan"), leave=False):
    if not txt_file.endswith(".txt"):
        continue
    df = pd.read_csv(os.path.join("test/turbofan",txt_file),sep = ' ',header=None,names=column_names , index_col=False )
    df.rename(columns=Sensor_dictionary, inplace=True)
    csv_filename = txt_file.replace(".txt", ".csv")
    df.drop(columns=["engine", "time"], inplace= True)
    df.to_csv(os.path.join("test/turbofan",csv_filename), index=False)

### HASC (Human Activity Sensing Consortium)

In [None]:
df = pd.read_csv("test/hasc/seg1011.csv")
df.columns = ['timestamp', 'X-axis', 'Y-axis', 'Z-axis']

In [None]:
df.head()

In [None]:
df = df.drop(columns=['timestamp'], errors='ignore')
df.to_csv("Hasc/Gyrometer/file_15_BuildingStairMove.csv", index = False)

In [None]:
with open("test/hasc/hasc-111018-165936.label", "r") as f:
    labels = [line.strip() for line in f]

print(labels[:5]) 

In [None]:
labels

### Sleep-EDF Database

In [None]:
import pandas as pd
import mne

In [None]:
raw = mne.io.read_raw_edf("test/sleepedf/SC4101E0-PSG.edf", preload=True)
print(raw.ch_names)

In [None]:
data, times = raw.get_data(return_times=True)
df = pd.DataFrame(data.T, columns=raw.ch_names)
df["timestamp"] = times

In [None]:
columns_mapping={
    'EEG Fpz-Cz' : 'EEG channel 1 (frontal to central)',
    'EEG Pz-Oz' : 'EEG channel 2 (parietal to occipital)',
    'EOG horizontal' : 'Electrooculogram (eye movements)',
    'EMG submental' : 'Electromyogram (muscle activity)'
}
df.drop(columns=["Event marker", "timestamp"], inplace=True)
df.rename(columns = columns_mapping, inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
df.to_csv("sleepedf/file_14_Healthy.csv")

In [None]:
import os
import pandas as pd

folder_path = "sleepedf"

for file in os.listdir(folder_path):
    if file.endswith(".csv"):
        file_path = os.path.join(folder_path, file)
        
        # Read the CSV
        df = pd.read_csv(file_path, index_col=0)  # This assumes the first column is the index

        # If the index is just 0, 1, 2... reset it
        if df.index.is_monotonic_increasing and df.index.equals(pd.RangeIndex(start=0, stop=len(df))):
            df.reset_index(drop=True, inplace=True)

        # Save back to same file without the index
        df.to_csv(file_path, index=False)


In [18]:
import numpy as np
import pandas as pd
import os

# Base path
base_path = "test/HAR/Inertial Signals"
output_path = "formatted_timeseries/train"
os.makedirs(output_path, exist_ok=True)

sensors = {
    "body_acc": "body_acc",
    "body_gyro": "body_gyro",
    "total_acc": "total_acc"
}

# Axis list
axes = ["x", "y", "z"]

for sensor_name, file_prefix in sensors.items():
    # Load x, y, z axis files
    data_axes = {}
    for axis in axes:
        file_name = f"{file_prefix}_{axis}_train.txt"
        full_path = os.path.join(base_path, file_name)
        data_axes[axis] = np.loadtxt(full_path)  # shape: (n_samples, 128)

    # Now reformat into time-major format: one long time series per axis
    n_samples, n_timesteps = data_axes["x"].shape

    # For each time step across all samples
    rows = []
    for t in range(n_timesteps):
        # Stack all sample values at time t for each axis
        for sample in range(n_samples):
            row = {
                "timestep": sample * n_timesteps + t,
                "x": data_axes["x"][sample, t],
                "y": data_axes["y"][sample, t],
                "z": data_axes["z"][sample, t],
            }
            rows.append(row)

    df = pd.DataFrame(rows)
    df.to_csv(os.path.join(output_path, f"{sensor_name}_timeseries.csv"), index=False)

    print(f"Saved: {sensor_name}_timeseries.csv → shape: {df.shape}")


Saved: body_acc_timeseries.csv → shape: (941056, 4)
Saved: body_gyro_timeseries.csv → shape: (941056, 4)
Saved: total_acc_timeseries.csv → shape: (941056, 4)


In [22]:
output_path = "formatted_timeseries/train"

for file in os.listdir(output_path):
    if file.endswith(".csv"):
        file_path = os.path.join(output_path, file)   
        print(file_path)
        df = pd.read_csv(file_path)
        # df.drop(columns=['timestep'], inplace=True)
        df.rename(columns={"x":"x-axis", "y":"y-axis", "z":"z-axis"}, inplace=True)
        df.to_csv(file_path, index=False)
        

formatted_timeseries/train/body_gyro_timeseries.csv
formatted_timeseries/train/body_acc_timeseries.csv
formatted_timeseries/train/total_acc_timeseries.csv
