In [1]:
import json
import math
import os
from random import sample

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import random_split

In [2]:
def NASA_dataset_window_slice_to_dfs(list_df):
    """Turn one window slice (a list of rows) back into a labelled DataFrame.

    Args:
        list_df: Row data accepted by ``pd.DataFrame``; every row must hold
            27 values in the column order built below.

    Returns:
        pd.DataFrame: The rows labelled with "unit", "cycle", the three
        operational settings, the 21 sensor measurements and "RUL".
    """
    header = ["unit", "cycle"]
    header.extend("operational_setting_{}".format(k) for k in range(1, 4))
    header.extend("sensor_measurement_{}".format(k) for k in range(1, 22))
    header.append("RUL")
    return pd.DataFrame(list_df, columns=header)

In [3]:
class NormalityNASADataset(Dataset):
    """Sliding-window samples taken from the first ``normality_length`` rows of each unit.

    The windows are built once from the CSV file and cached as a JSON file in
    ``<root_data_dir_path>/JSON_data``; later constructions with the same
    settings load that cache instead of re-reading the CSV.

    Each stored sample is a list of ``window_length`` rows. The code assumes the
    column layout ``[unit, cycle, <features...>, RUL]``: the first two columns
    and the last one are excluded from the features.

    NOTE(review): the cache file name only uses the first five characters of
    ``dataset_csv_name``, so two CSV files sharing that prefix in the same root
    directory share one cache file.
    """

    def __init__(self,
                 root_data_dir_path: str,
                 dataset_csv_name: str,
                 window_length: int,
                 normality_length: int,
                 normalize: bool = False,
                 DEBUGGING: bool = False):
        """Load the cached samples, or create and cache them if they do not exist.

        Args:
            root_data_dir_path: Directory that contains the CSV file; the
                ``JSON_data`` cache directory is created inside it.
            dataset_csv_name: File name of the CSV dataset.
            window_length: Number of consecutive rows per sample (must be > 0).
            normality_length: Number of leading rows of each unit from which
                windows are taken; must be greater than ``window_length``.
            normalize: If True, standardize every feature column before the
                windows are created.
            DEBUGGING: If True, only the first unit is processed and the cache
                file name is prefixed with "DEBUGGING".

        Raises:
            AssertionError: If ``window_length`` is not less than ``normality_length``.
            RuntimeError: If ``window_length`` is not greater than 0.
        """

        if DEBUGGING:
            print("\nNASADataset object in DEBUGGING mode.")

        assert window_length < normality_length, "window_lenght={} must be less".format(window_length)\
            + " than normality_lenght={}".format(normality_length)

        # Fail before touching the file system instead of after reading the CSV.
        if window_length <= 0:
            raise RuntimeError("window_size must be greater than 0")

        self.__DEBUGGING__ = DEBUGGING
        self.normalize = normalize
        self.window_length = window_length
        self.normality_length = normality_length
        self.path_to_dataset = os.path.join(root_data_dir_path, dataset_csv_name)

        # Creating the windows can be expensive, so they are cached as JSON.
        # Only the dataset name "FD00X" (first five characters) goes into the file name.
        dataset_name = "normalized_{}".format(dataset_csv_name[:5]) if self.normalize else dataset_csv_name[:5]

        if self.__DEBUGGING__:
            JSON_file_name = "{}_{}_window{}_normalityLen{}.json".format("DEBUGGING",
                                                                         dataset_name,
                                                                         self.window_length,
                                                                         self.normality_length)
        else:
            JSON_file_name = "{}_window{}_normalityLen{}.json".format(dataset_name,
                                                                      self.window_length,
                                                                      self.normality_length)

        # Directory where all JSON cache files are stored.
        all_JSONs_files_dir_path = os.path.join(root_data_dir_path, "JSON_data")
        JSON_file_path = os.path.join(all_JSONs_files_dir_path, JSON_file_name)

        if not os.path.exists(JSON_file_path):
            # Cache miss: read the CSV (only now, since it can be large) and build the samples.
            os.makedirs(all_JSONs_files_dir_path, exist_ok=True)
            print("JSON file '{}' does not exist at {}. \n\nCreating JSON file...".format(JSON_file_name,
                                                                                          JSON_file_path))
            df_dataset: pd.DataFrame = pd.read_csv(self.path_to_dataset)

            if self.normalize:
                # Cast the feature columns to float before standardizing so that
                # integer-valued columns are not assigned float values in place.
                # NOTE(review): the denominator is std + 1, not a plain z-score;
                # it is kept as-is because changing it would alter the data
                # stored in already-written cache files.
                feature_columns = df_dataset.iloc[:, 2:-1].astype("float64")
                normalized_columns = (feature_columns - feature_columns.mean()) / \
                                     (feature_columns.std() + 1)
                df_dataset = pd.concat(
                    [df_dataset.iloc[:, :2], normalized_columns, df_dataset.iloc[:, -1:]],
                    axis=1)

            # Column in the DataFrame that identifies every unit in the dataset
            unit_identifier_col_name = "unit"
            dict_all_samples_of_window_length = self.__create_JSON_from_window_size__(
                df_dataset=df_dataset,
                unit_identifier_col_name=unit_identifier_col_name,
                path_to_json_file=JSON_file_path
            )
        else:
            # Cache hit: load the previously generated samples.
            print("Reading {} JSON file at: \n{}\n".format(JSON_file_name, all_JSONs_files_dir_path))
            with open(JSON_file_path, 'r') as file:
                dict_all_samples_of_window_length = json.load(file)

        self.data = dict_all_samples_of_window_length["samples"]
        self.size = len(self.data)

        print("{} total samples generated with normality_length={} and window_length={}."\
              .format(self.size,
                      self.normality_length,
                      self.window_length))

    def __create_JSON_from_window_size__(self,
                                         df_dataset: pd.DataFrame,
                                         unit_identifier_col_name: str,
                                         path_to_json_file: str):
        """Build the window samples of every unit and write them to a JSON file.

        Args:
            df_dataset: The full dataset.
            unit_identifier_col_name: Name of the column holding the unit id.
            path_to_json_file: Destination of the JSON cache file.

        Returns:
            dict: ``{"samples": [...]}`` with the windows of all processed units
            (only the first unit when the object is in DEBUGGING mode).
        """

        # Unique unit ids, in order of first appearance
        all_unique_units_list: list = df_dataset[unit_identifier_col_name].unique().tolist()

        all_samples_list: list = []
        for i, unit in enumerate(all_unique_units_list):
            unit_samples_list = self.__get_samples_for_unit__(df_dataset,
                                                              unit=unit,
                                                              unit_identifier_col_name=unit_identifier_col_name)
            all_samples_list += unit_samples_list

            print("Number of samples in unit {}: {}".format(unit, len(unit_samples_list)))
            if self.__DEBUGGING__ and i == 0: break

        dict_all_samples_of_window_length: dict = {
            "samples": all_samples_list,
        }

        # Serialize first, write to a temporary file, then rename: a failed or
        # interrupted write must not leave a partial file at the cache path,
        # because the constructor treats any existing file as a valid cache.
        json_object = json.dumps(dict_all_samples_of_window_length)
        temporary_path = path_to_json_file + ".tmp"
        with open(temporary_path, 'w') as file:
            file.write(json_object)
        os.replace(temporary_path, path_to_json_file)

        print("\nFinished writing JSON file to: \n{}.\n".format(path_to_json_file))

        return dict_all_samples_of_window_length

    def __get_samples_for_unit__(self,
                                 df_dataset: pd.DataFrame,
                                 unit,
                                 unit_identifier_col_name: str):
        """Return all full-length sliding windows inside one unit's normality range.

        Windows start at consecutive rows and never extend past the first
        ``normality_length`` rows of the unit. If the unit has fewer rows than
        ``normality_length``, only the rows that exist are used, so every
        returned window has exactly ``window_length`` rows; a unit with fewer
        than ``window_length`` rows yields no samples.

        Args:
            df_dataset: The full dataset.
            unit: Value of the unit id to select.
            unit_identifier_col_name: Name of the column holding the unit id.

        Returns:
            list: One entry per window; each entry is a list of rows.

        Raises:
            RuntimeError: If ``window_length`` exceeds ``normality_length`` or
                is not greater than 0.
        """

        # Rows of this unit, selected with a boolean mask so the unit id does
        # not have to be embedded in a query string.
        df_dataset_for_unit = df_dataset[df_dataset[unit_identifier_col_name] == unit]

        # Perform basic error checking
        if self.window_length > self.normality_length:
            error = "Window (window_size={}) must  be less than the number of total samples " \
                    "(unit_number_total_samples={}) for unit={}"\
                .format(self.window_length, self.normality_length, unit)
            raise RuntimeError(error)
        if self.window_length <= 0:
            error = "window_size must be greater than 0"
            raise RuntimeError(error)

        # Never count more rows than the unit actually has: slicing past the
        # end would silently produce windows shorter than window_length.
        tot_num_samples = min(self.normality_length, len(df_dataset_for_unit))

        # Sliding-window count; zero or negative when the unit is too short.
        number_sliding_window_samples = tot_num_samples - self.window_length + 1

        # Convert the usable rows once, then slice plain lists for each window.
        unit_rows = df_dataset_for_unit.iloc[:tot_num_samples, :].values.tolist()
        return [unit_rows[start_cycle_index:start_cycle_index + self.window_length]
                for start_cycle_index in range(0, number_sliding_window_samples)]

    def __len__(self):
        """Return the number of window samples."""
        return self.size

    def __getitem__(self, idx):
        """Return ``(unit, features, rul)`` for the window at position ``idx``.

        Returns:
            tuple: the unit id taken from the first row of the window (as an
            integer), the float32 feature array holding every column except
            the first two and the last one, and the last-column value (RUL)
            of the window's last row.
        """
        # Every row of a sample is laid out as [unit, cycle, <features...>, RUL].
        window = np.array(self.data[idx], dtype=np.float32)

        unit = window[:, 0][0].astype(int)
        features = window[:, 2:-1]
        rul = window[:, -1][-1]

        return unit, features, rul

In [4]:
# Dataset location and windowing parameters.
# The data directory can be overridden with the NASA_TURBOFAN_DATA_DIR
# environment variable so the notebook is not tied to one machine; when the
# variable is not set, the original local path is used.
root_data_dir_path = os.environ.get(
    "NASA_TURBOFAN_DATA_DIR",
    "/Users/rafaeltoche/Documents/School/Research/Rainwaters_Lab/"
    "DART-LP2/NASA_Turbofan/NASA_turbofan_data/train",
)
csv_dataset_name = "FD001_train.csv"
window_length = 10      # rows per sample
normality_length = 20   # leading rows of each unit from which windows are taken
print("Reading data from: \n{}".format(os.path.join(root_data_dir_path, csv_dataset_name)))

Reading data from: 
/Users/rafaeltoche/Documents/School/Research/Rainwaters_Lab/DART-LP2/NASA_Turbofan/NASA_turbofan_data/train/FD001_train.csv


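### Testing with only one unit
Pass `DEBUGGING=True` to the dataset constructor.

In debugging mode the loop in `__create_JSON_from_window_size__` stops after the first unit (`i == 0`), so the generated JSON contains samples from only **one** unit.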

In [5]:
# NASA_dataset = NormalityNASADataset(root_data_dir_path=root_data_dir_path, 
#                                     dataset_csv_name=csv_dataset_name, 
#                                     window_length=window_length, 
#                                     normality_length=normality_length,
#                                     DEBUGGING=True)

In [6]:
# Define the DataLoader
# dataloader = DataLoader(NASA_dataset, shuffle=False)

View the first instance of the window data for this one unit

In [7]:
# NASA_dataset_window_slice_to_dfs(dataloader.dataset.data[0])

View the last instance of the window data for this one unit

In [8]:
# NASA_dataset_window_slice_to_dfs(dataloader.dataset.data[len(dataloader.dataset.data) -1])

In [9]:
# train_unit_ids, train_features, train_labels = next(iter(dataloader))

In [10]:
# print("Unit IDs Shape: {}".format(train_unit_ids.shape))
# print("Features Shape: {}".format(train_features.shape))
# print("Labels Shape: {}\n".format(train_labels.shape))

# print("Unit IDs Dtype: {}".format(train_unit_ids.dtype))
# print("Features Dtype: {}".format(train_features.dtype))
# print("Labels Dtype: {}".format(train_labels.dtype))

### Testing with more than one unit

In [11]:
# Full (non-debugging) dataset: the constructor loads the cached JSON file if
# it exists and otherwise creates it from the CSV.
NASA_dataset = NormalityNASADataset(
    root_data_dir_path=root_data_dir_path,
    dataset_csv_name=csv_dataset_name,
    normality_length=normality_length,
    window_length=window_length,
    DEBUGGING=False,
)

JSON file 'FD001_window10_normalityLen20.json' does not exist at /Users/rafaeltoche/Documents/School/Research/Rainwaters_Lab/DART-LP2/NASA_Turbofan/NASA_turbofan_data/train/JSON_data/FD001_window10_normalityLen20.json. 

Creating JSON file...
Number of samples in unit 1: 11
Number of samples in unit 2: 11
Number of samples in unit 3: 11
Number of samples in unit 4: 11
Number of samples in unit 5: 11
Number of samples in unit 6: 11
Number of samples in unit 7: 11
Number of samples in unit 8: 11
Number of samples in unit 9: 11
Number of samples in unit 10: 11
Number of samples in unit 11: 11
Number of samples in unit 12: 11
Number of samples in unit 13: 11
Number of samples in unit 14: 11
Number of samples in unit 15: 11
Number of samples in unit 16: 11
Number of samples in unit 17: 11
Number of samples in unit 18: 11
Number of samples in unit 19: 11
Number of samples in unit 20: 11
Number of samples in unit 21: 11
Number of samples in unit 22: 11
Number of samples in unit 23: 11
Number 

In [12]:
# Sizes of the train/test partitions; the train share is rounded up so the two
# sizes always add up to the dataset length.
split = 0.80
train_size = math.ceil(split * len(NASA_dataset))
test_size = len(NASA_dataset) - train_size

print("Train Size: {}".format(train_size),
      "Test Size: {}".format(test_size),
      sep="\n")

Train Size: 880
Test Size: 220


In [13]:
# Define train and test split.
# A seeded generator makes the random partition identical on every run, so
# results obtained from the two subsets are reproducible.
SPLIT_SEED = 42
train_dataset, test_dataset = random_split(
    NASA_dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [14]:
# One DataLoader per subset; both iterate in the subset's stored order
# (shuffle=False) and yield batches of `batch_size` samples.
batch_size = 5
train_dataloader = DataLoader(
    train_dataset,
    shuffle=False,
    batch_size=batch_size,
)
test_dataloader = DataLoader(
    test_dataset,
    shuffle=False,
    batch_size=batch_size,
)

In [15]:
# Pull the first training batch and report the shape and dtype of each part.
train_unit_ids, train_features, train_RUL_labels = next(iter(train_dataloader))

print("Unit IDs Shape: {}".format(train_unit_ids.shape),
      "Features Shape: {}".format(train_features.shape),
      "Labels Shape: {}\n".format(train_RUL_labels.shape),
      sep="\n")

print("Unit IDs Dtype: {}".format(train_unit_ids.dtype),
      "Features Dtype: {}".format(train_features.dtype),
      "Labels Dtype: {}".format(train_RUL_labels.dtype),
      sep="\n")

Unit IDs Shape: torch.Size([5])
Features Shape: torch.Size([5, 10, 24])
Labels Shape: torch.Size([5])

Unit IDs Dtype: torch.int64
Features Dtype: torch.float32
Labels Dtype: torch.float32
