In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import torch
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from collections import Counter

# Setup directories
parent_directory = '/content/drive/MyDrive/FYP_Nur_Time_Series_Representation_using_CL-main/Data/HydraulicPump'
output_dir = "/content/drive/MyDrive/FYP_Nur_Time_Series_Representation_using_CL-main/TSTCC/data/HydraulicPump"
os.makedirs(output_dir, exist_ok=True)

# Initialize storage for all data and labels
all_data = []
all_labels = []
label_mapping = {}

# Directory traversal and file processing
print("Checking directories in:", parent_directory)
assert os.path.exists(parent_directory), f"The directory {parent_directory} does not exist."

for idx, class_dir in enumerate(os.listdir(parent_directory)):
    class_path = os.path.join(parent_directory, class_dir)
    if os.path.isdir(class_path):
        print(f"Processing directory: {class_dir}")
        label_mapping[class_dir] = idx
        class_data = []
        for filename in os.listdir(class_path):
            file_path = os.path.join(class_path, filename)
            with open(file_path, 'r') as file:
                # Read and convert all lines to floats, stripping whitespace
                data = [float(line.strip()) for line in file]
                class_data.extend(data)

        if class_data:
            all_data.append(class_data)
            all_labels.append(idx)
            print(f"Processed {len(class_data)} data points from {class_dir}")

# Handle case where no data is read
if not all_data:
    raise ValueError("No data files were read. Please check the file paths and formats.")

# Decide on sample_length based on your requirements
sample_length = 64  # Example value, adjust based on your needs

# Process all data into numpy arrays and corresponding labels
processed_data = []
processed_labels = []
for data, label in zip(all_data, all_labels):
    num_samples = len(data) // sample_length
    data = np.array(data[:num_samples * sample_length]).reshape(-1, sample_length)
    labels = np.full((data.shape[0],), label)
    processed_data.append(data)
    processed_labels.append(labels)

# Concatenate all data and labels
processed_data = np.concatenate(processed_data, axis=0)
processed_labels = np.concatenate(processed_labels, axis=0)

# Normalize the data
scaler = MinMaxScaler()
processed_data = scaler.fit_transform(processed_data)

# Split data into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(processed_data, processed_labels, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Save the datasets as .pt files
datasets = {
    "train": (X_train, y_train),
    "val": (X_val, y_val),
    "test": (X_test, y_test)
}

for set_name, (X, y) in datasets.items():
    torch.save({
        "samples": torch.from_numpy(X).unsqueeze(1),  # Add channel dimension
        "labels": torch.from_numpy(y)
    }, os.path.join(output_dir, f"{set_name}.pt"))

print(f"Data sets saved in {output_dir}")
print(f"Label mapping: {label_mapping}")

label_counts = Counter(processed_labels)
for label, count in label_counts.items():
    print(f"Label {label} (mapped from {list(label_mapping.keys())[list(label_mapping.values()).index(label)]}): {count} samples")


Checking directories in: /content/drive/MyDrive/FYP_Nur_Time_Series_Representation_using_CL-main/Data/HydraulicPump
Processing directory: piston shoes and swashplate wearing
Processed 6144 data points from piston shoes and swashplate wearing
Processing directory: normal
Processed 3072 data points from normal
Processing directory: valve plate wearing
Processed 4096 data points from valve plate wearing
Data sets saved in /content/drive/MyDrive/FYP_Nur_Time_Series_Representation_using_CL-main/TSTCC/data/HydraulicPump
Label mapping: {'piston shoes and swashplate wearing': 0, 'normal': 1, 'valve plate wearing': 2}
Label 0 (mapped from piston shoes and swashplate wearing): 96 samples
Label 1 (mapped from normal): 48 samples
Label 2 (mapped from valve plate wearing): 64 samples
