In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import h5py

from sklearn.preprocessing import MinMaxScaler, StandardScaler

import torch
from torch.utils.data import Dataset, DataLoader

In [2]:
# filename = "N-CMAPSS_DS04.h5"
filename = "N-CMAPSS_DS01-005.h5"
filename = f"../data/17. Turbofan Engine Degradation Simulation Data Set 2/data_set/{filename}"

# Load the raw arrays. Datasets are read with hdf[...] rather than hdf.get(...):
# .get() returns None for a missing key, which np.array() would silently turn
# into a 0-d object array; indexing raises KeyError right here instead.
with h5py.File(filename, 'r') as hdf:
    # Development set
    W_dev = np.array(hdf['W_dev'])             # W
    X_s_dev = np.array(hdf['X_s_dev'])         # X_s
    X_v_dev = np.array(hdf['X_v_dev'])         # X_v
    T_dev = np.array(hdf['T_dev'])             # T
    Y_dev = np.array(hdf['Y_dev'])             # RUL
    A_dev = np.array(hdf['A_dev'])             # Auxiliary

    # Test set
    W_test = np.array(hdf['W_test'])           # W
    X_s_test = np.array(hdf['X_s_test'])       # X_s
    X_v_test = np.array(hdf['X_v_test'])       # X_v
    T_test = np.array(hdf['T_test'])           # T
    Y_test = np.array(hdf['Y_test'])           # RUL
    A_test = np.array(hdf['A_test'])           # Auxiliary

    # Variable (column) names, one array per group
    W_var = np.array(hdf['W_var'])
    X_s_var = np.array(hdf['X_s_var'])
    X_v_var = np.array(hdf['X_v_var'])
    T_var = np.array(hdf['T_var'])
    A_var = np.array(hdf['A_var'])

    # Convert the stored names to unicode (at most 20 characters) and to lists
    W_var = list(np.array(W_var, dtype='U20'))
    X_s_var = list(np.array(X_s_var, dtype='U20'))
    X_v_var = list(np.array(X_v_var, dtype='U20'))
    T_var = list(np.array(T_var, dtype='U20'))
    A_var = list(np.array(A_var, dtype='U20'))

# Stack development and test rows; these combined arrays are only used for the
# shape report below.
W = np.concatenate((W_dev, W_test), axis=0)
X_s = np.concatenate((X_s_dev, X_s_test), axis=0)
X_v = np.concatenate((X_v_dev, X_v_test), axis=0)
T = np.concatenate((T_dev, T_test), axis=0)
Y = np.concatenate((Y_dev, Y_test), axis=0)
A = np.concatenate((A_dev, A_test), axis=0)

print('')
print('')
print ("W shape: " + str(W.shape))
print ("X_s shape: " + str(X_s.shape))
print ("X_v shape: " + str(X_v.shape))
print ("T shape: " + str(T.shape))
print ("A shape: " + str(A.shape))



W shape: (7641868, 4)
X_s shape: (7641868, 14)
X_v shape: (7641868, 14)
T shape: (7641868, 10)
A shape: (7641868, 4)


In [4]:
# Feature frames stack the A, W and X_s arrays column-wise (in that order);
# the target frames hold the single 'RUL' column.
feature_columns = A_var + W_var + X_s_var

X_train = pd.DataFrame(data=np.hstack((A_dev, W_dev, X_s_dev)), columns=feature_columns)
y_train = pd.DataFrame(data=Y_dev, columns=['RUL'])
X_test = pd.DataFrame(data=np.hstack((A_test, W_test, X_s_test)), columns=feature_columns)
y_test = pd.DataFrame(data=Y_test, columns=['RUL'])

In [10]:
# Display the training feature frame (unit, cycle and the feature columns).
X_train

Unnamed: 0,unit,cycle,Fc,hs,alt,Mach,TRA,T2,T24,T30,...,P15,P2,P21,P24,Ps30,P40,P50,Nf,Nc,Wf
0,1.0,1.0,1.0,1.0,3013.0,0.376362,70.311996,522.314770,618.288596,1470.469798,...,19.432070,14.484611,19.727990,24.410990,394.701872,401.205188,15.974771,2142.253462,8693.176503,4.621622
1,1.0,1.0,1.0,1.0,3020.0,0.376866,70.311996,522.327145,618.296355,1470.415593,...,19.431385,14.484683,19.727295,24.410483,394.629899,401.132851,15.970518,2142.218596,8693.000298,4.620561
2,1.0,1.0,1.0,1.0,3025.0,0.377685,70.311996,522.371840,618.336514,1470.453853,...,19.435163,14.488224,19.731130,24.415476,394.667850,401.171401,15.969419,2142.257956,8693.106262,4.621064
3,1.0,1.0,1.0,1.0,3035.0,0.376992,70.399887,522.282418,618.302173,1470.650929,...,19.426003,14.477632,19.721830,24.406544,394.773533,401.272707,15.967762,2142.794673,8693.680367,4.624305
4,1.0,1.0,1.0,1.0,3043.0,0.377622,70.399887,522.300605,618.345228,1470.640421,...,19.427484,14.478114,19.723334,24.410159,394.732158,401.234620,15.962571,2143.015150,8693.714825,4.622768
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4906631,6.0,94.0,2.0,0.0,3030.0,0.201159,31.728291,511.974677,570.015066,1285.435213,...,16.109128,13.492444,16.354444,18.654150,241.261497,245.973498,14.369619,1669.228868,8065.817536,2.410657
4906632,6.0,94.0,2.0,0.0,3021.0,0.203238,31.728291,512.092657,570.111968,1285.592440,...,16.121885,13.504848,16.367396,18.668551,241.413982,246.127004,14.375768,1669.215512,8066.219755,2.413073
4906633,6.0,94.0,2.0,0.0,3015.0,0.202986,31.728291,512.103385,570.136953,1285.638575,...,16.124832,13.506797,16.370388,18.672528,241.458997,246.175087,14.378145,1669.388929,8066.452994,2.412957
4906634,6.0,94.0,2.0,0.0,3007.0,0.203301,31.728291,512.145375,570.176550,1285.711680,...,16.130621,13.512026,16.376265,18.679114,241.536966,246.253701,14.382804,1669.386855,8066.648461,2.414061


In [8]:
# 'unit' and 'cycle' identify rows and are left unscaled; every other column is scaled.
features = [column for column in X_train.columns if column not in ('unit', 'cycle')]

# Fit on the training rows only, then apply the same transform to the test rows.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train[features])
X_test_scaled = scaler.transform(X_test[features])

# Write the scaled values back into the frames (overwrites the raw values).
X_train[features] = X_train_scaled
X_test[features] = X_test_scaled

In [9]:
# Summary statistics of the training frame after scaling ('unit' and 'cycle' were not scaled).
X_train.describe()

Unnamed: 0,unit,cycle,Fc,hs,alt,Mach,TRA,T2,T24,T30,T48,T50,P15,P2,P21,P24,Ps30,P40,P50,Nf,Nc,Wf
count,4906636.0,4906636.0,4906636.0,4906636.0,4906636.0,4906636.0,4906636.0,4906636.0,4906636.0,4906636.0,4906636.0,4906636.0,4906636.0,4906636.0,4906636.0,4906636.0,4906636.0,4906636.0,4906636.0,4906636.0,4906636.0,4906636.0
mean,3.727439,45.50336,0.6509933,0.2601379,0.3959388,0.7184791,0.567992,0.6090522,0.5705035,0.5718437,0.6649639,0.6539532,0.482751,0.5053093,0.482751,0.4630213,0.4243299,0.4250496,0.4721482,0.5989384,0.5799216,0.419582
std,1.643444,26.63813,0.3729075,0.4387097,0.2499784,0.1594253,0.2822063,0.1734946,0.1392049,0.149446,0.120049,0.09441606,0.195961,0.2116838,0.195961,0.1746408,0.1590165,0.1590313,0.2166531,0.2299465,0.1513161,0.1485953
min,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,23.0,0.5,0.0,0.1937125,0.5924462,0.3571429,0.4699906,0.4718546,0.4722076,0.583215,0.5879551,0.3169714,0.3203334,0.3169714,0.3235311,0.3057654,0.306462,0.2814286,0.4548756,0.4791827,0.3137243
50%,4.0,45.0,0.5,0.0,0.3637925,0.7314098,0.6415094,0.6428302,0.5546819,0.5624459,0.6729608,0.6386895,0.4942887,0.5246644,0.4942887,0.4632415,0.3905425,0.391917,0.4772938,0.6563468,0.5748787,0.3796691
75%,5.0,67.0,1.0,1.0,0.6040834,0.8518674,0.8207547,0.7518765,0.6604,0.657782,0.7408467,0.7064675,0.6368866,0.6718656,0.6368866,0.5866766,0.5139076,0.5149418,0.6374008,0.7946279,0.6691359,0.4936186
max,6.0,100.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
# Column order of the feature frame; the labels are numpy string scalars.
list(X_train.columns)

[np.str_('unit'),
 np.str_('cycle'),
 np.str_('Fc'),
 np.str_('hs'),
 np.str_('alt'),
 np.str_('Mach'),
 np.str_('TRA'),
 np.str_('T2'),
 np.str_('T24'),
 np.str_('T30'),
 np.str_('T48'),
 np.str_('T50'),
 np.str_('P15'),
 np.str_('P2'),
 np.str_('P21'),
 np.str_('P24'),
 np.str_('Ps30'),
 np.str_('P40'),
 np.str_('P50'),
 np.str_('Nf'),
 np.str_('Nc'),
 np.str_('Wf')]

In [11]:
filename = '../data/processed/data.h5'
with h5py.File(filename, 'w') as f:
    # Numeric tables, each stored as a gzip-compressed dataset
    for dataset_name, frame in (
        ('X_train', X_train),
        ('y_train', y_train),
        ('X_test', X_test),
        ('y_test', y_test),
    ):
        f.create_dataset(dataset_name, data=frame, compression='gzip')

    # Column names are stored alongside as variable-length string datasets
    text_dtype = h5py.string_dtype()
    f.create_dataset('X_columns', data=X_train.columns.tolist(), dtype=text_dtype)
    f.create_dataset('y_columns', data=['RUL'], dtype=text_dtype)

In [12]:
# Read back only the stored column-name datasets, converted to str arrays
with h5py.File(filename, 'r') as f:
    X_columns, y_columns = [
        f[key][:].astype(str) for key in ('X_columns', 'y_columns')
    ]

In [13]:
# Show the feature column names as read back from the processed file.
X_columns

array(['unit', 'cycle', 'Fc', 'hs', 'alt', 'Mach', 'TRA', 'T2', 'T24',
       'T30', 'T48', 'T50', 'P15', 'P2', 'P21', 'P24', 'Ps30', 'P40',
       'P50', 'Nf', 'Nc', 'Wf'], dtype='<U5')

In [None]:
def create_sequences(X, y, sequence_lenght=30):
    """Build fixed-length sliding windows per unit.

    Args:
        X: DataFrame with a 'unit' column; every column other than 'unit' and
            'cycle' is used as a feature. Rows of a unit are used in the order
            they appear in the frame.
        y: DataFrame with a 'RUL' column, sharing X's index.
        sequence_lenght: Number of consecutive rows per window (the parameter
            name keeps its original spelling so keyword callers keep working).

    Returns:
        (sequences, targets): ``sequences`` stacks the windows as
        (n_windows, sequence_lenght, n_features); ``targets`` holds the RUL of
        the last row of each window. Units with fewer rows than
        ``sequence_lenght`` contribute no windows.
    """
    features = [col for col in X.columns if col not in ('unit', 'cycle')]

    sequences = []
    targets = []

    # sort=False keeps units in order of first appearance
    for _, unit_X in X.groupby('unit', sort=False):
        # Convert once per unit so every window is a cheap numpy slice
        values = unit_X[features].to_numpy()
        rul = y.loc[unit_X.index, 'RUL'].to_numpy()

        for end in range(sequence_lenght, len(values) + 1):
            sequences.append(values[end - sequence_lenght:end])
            targets.append(rul[end - 1])

    return np.array(sequences), np.array(targets)

In [3]:
class NCMAPSSPreprocessor:
    """Load, scale and window N-CMAPSS data stored in HDF5 files for LSTM models."""

    # Columns that identify a row; they are never scaled nor used as model features
    ID_COLUMNS = ('unit', 'cycle')

    def __init__(self, sequence_length=30, scaler_type='minmax'):
        """
        CMAPSS data preprocessor for LSTM models using H5 files

        Args:
            sequence_length: Length (in rows) of sequences for LSTM input
            scaler_type: 'minmax' selects MinMaxScaler; any other value
                selects StandardScaler
        """
        self.sequence_length = sequence_length
        self.scaler_type = scaler_type
        self.feature_scaler = None  # set by preprocess_and_save()

    def _feature_columns(self, X_df):
        """Return the columns of X_df that are model features (everything but ID_COLUMNS)."""
        return [col for col in X_df.columns if col not in self.ID_COLUMNS]

    def load_raw_h5_data(self, filename):
        """Load the raw dataset from an H5 file.

        Only the A, W, X_s and Y groups are read. Datasets are accessed with
        ``hdf[key]`` so a missing dataset raises KeyError instead of silently
        producing an empty object array.

        Returns:
            (X_train, y_train, X_test, y_test) DataFrames. The feature frames
            stack A, W and X_s column-wise; the target frames have one 'RUL'
            column.
        """
        with h5py.File(filename, 'r') as hdf:
            # Development set
            A_dev, W_dev, X_s_dev, Y_dev = (
                np.array(hdf[key]) for key in ('A_dev', 'W_dev', 'X_s_dev', 'Y_dev')
            )
            # Test set
            A_test, W_test, X_s_test, Y_test = (
                np.array(hdf[key]) for key in ('A_test', 'W_test', 'X_s_test', 'Y_test')
            )
            # Variable names, converted to lists of plain Python strings
            A_var, W_var, X_s_var = (
                np.array(np.array(hdf[key]), dtype='U20').tolist()
                for key in ('A_var', 'W_var', 'X_s_var')
            )

        columns = A_var + W_var + X_s_var

        # Create DataFrames
        X_train = pd.DataFrame(data=np.hstack((A_dev, W_dev, X_s_dev)), columns=columns)
        y_train = pd.DataFrame(data=Y_dev, columns=['RUL'])

        X_test = pd.DataFrame(data=np.hstack((A_test, W_test, X_s_test)), columns=columns)
        y_test = pd.DataFrame(data=Y_test, columns=['RUL'])

        return X_train, y_train, X_test, y_test

    def load_processed_h5_data(self, filename):
        """Load preprocessed data (as written by preprocess_and_save) from an H5 file.

        Returns:
            (X_train, y_train, X_test, y_test) DataFrames labelled with the
            column names stored in the 'X_columns' / 'y_columns' datasets.
        """
        with h5py.File(filename, 'r') as f:
            X_train = pd.DataFrame(np.array(f['X_train']))
            y_train = pd.DataFrame(np.array(f['y_train']))
            X_test = pd.DataFrame(np.array(f['X_test']))
            y_test = pd.DataFrame(np.array(f['y_test']))

            # Get column names
            X_columns = f['X_columns'][:].astype(str)
            y_columns = f['y_columns'][:].astype(str)

        X_train.columns = X_columns
        y_train.columns = y_columns
        X_test.columns = X_columns
        y_test.columns = y_columns

        return X_train, y_train, X_test, y_test

    def preprocess_and_save(self, raw_filename, processed_filename):
        """Complete preprocessing pipeline from raw to processed H5 file.

        Fits the scaler on the training features only, applies it to both
        splits, writes the result to ``processed_filename`` and returns the
        scaled (X_train, y_train, X_test, y_test) frames. The fitted scaler is
        kept in ``self.feature_scaler``.
        """
        print("Loading raw data...")
        X_train, y_train, X_test, y_test = self.load_raw_h5_data(raw_filename)

        print("Scaling features...")
        # Get feature columns (exclude unit and cycle)
        features = self._feature_columns(X_train)

        # Initialize and fit scaler on training data
        if self.scaler_type == 'minmax':
            self.feature_scaler = MinMaxScaler()
        else:
            self.feature_scaler = StandardScaler()

        X_train_scaled = self.feature_scaler.fit_transform(X_train[features])
        X_test_scaled = self.feature_scaler.transform(X_test[features])

        # Update DataFrames with scaled features
        X_train[features] = X_train_scaled
        X_test[features] = X_test_scaled

        print("Saving processed data...")
        # Save processed data
        with h5py.File(processed_filename, 'w') as f:
            f.create_dataset('X_train', data=X_train.values, compression='gzip')
            f.create_dataset('y_train', data=y_train.values, compression='gzip')
            f.create_dataset('X_test', data=X_test.values, compression='gzip')
            f.create_dataset('y_test', data=y_test.values, compression='gzip')

            f.create_dataset('X_columns', data=X_train.columns.tolist(),
                             dtype=h5py.string_dtype())
            f.create_dataset('y_columns', data=['RUL'],
                             dtype=h5py.string_dtype())

        print(f"Processed data saved to {processed_filename}")
        return X_train, y_train, X_test, y_test

    def create_sequences_from_dataframe(self, X_df, y_df):
        """Create every sliding window of ``sequence_length`` rows, unit by unit.

        Args:
            X_df: Feature frame containing a 'unit' column. Rows of a unit are
                used in the order they appear in the frame.
            y_df: Target frame with a 'RUL' column, sharing X_df's index.

        Returns:
            (sequences, targets, unit_ids) numpy arrays; the target of a
            window is the RUL of its last row.
        """
        features = self._feature_columns(X_df)

        sequences = []
        targets = []
        unit_ids = []

        for unit_id in X_df['unit'].unique():
            unit_mask = X_df['unit'] == unit_id
            # Convert once per unit so every window is a cheap numpy slice
            unit_values = X_df.loc[unit_mask, features].to_numpy()
            unit_rul = y_df.loc[unit_mask, 'RUL'].to_numpy()

            for start in range(len(unit_values) - self.sequence_length + 1):
                end = start + self.sequence_length
                sequences.append(unit_values[start:end])
                # Target is RUL at the end of sequence
                targets.append(unit_rul[end - 1])
                unit_ids.append(unit_id)

        return np.array(sequences), np.array(targets), np.array(unit_ids)

    def create_test_sequences_from_dataframe(self, X_df, y_df=None):
        """Create test sequences (last ``sequence_length`` rows of each unit).

        Units with fewer than ``sequence_length`` rows are skipped. Rows are
        taken in frame order; no re-sorting is performed.

        Args:
            X_df: Feature frame containing a 'unit' column.
            y_df: Optional target frame with a 'RUL' column, sharing X_df's index.

        Returns:
            (sequences, targets, unit_ids); ``targets`` is None when ``y_df``
            is None or no unit is long enough.
        """
        features = self._feature_columns(X_df)

        sequences = []
        targets = []
        unit_ids = []

        for unit_id in X_df['unit'].unique():
            unit_mask = X_df['unit'] == unit_id
            unit_X = X_df.loc[unit_mask, features]

            if len(unit_X) < self.sequence_length:
                continue

            # Take the last sequence_length rows of the unit
            sequences.append(unit_X.iloc[-self.sequence_length:].to_numpy())
            unit_ids.append(unit_id)

            if y_df is not None:
                # RUL of the unit's last row, i.e. the row that ends the window.
                # (The previous sort_values(<row index>) call raised, because
                # sort_values expects column labels.)
                targets.append(y_df.loc[unit_mask, 'RUL'].iloc[-1])

        sequences = np.array(sequences)
        targets = np.array(targets) if targets else None

        return sequences, targets, np.array(unit_ids)

In [4]:
class NCMAPSSDataset(Dataset):
    """Eager PyTorch Dataset: holds all sequences (and optional targets) as float tensors."""

    def __init__(self, sequences, targets=None):
        # Targets are optional; without them items are bare sequences
        self.sequences = torch.FloatTensor(sequences)
        if targets is None:
            self.targets = None
        else:
            self.targets = torch.FloatTensor(targets)

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence = self.sequences[idx]
        if self.targets is None:
            return sequence
        return sequence, self.targets[idx]

In [8]:
class NCMAPSSDatasetLazy(Dataset):
    """Lazy PyTorch Dataset that yields one sliding window per item.

    Features and targets are kept as float32 numpy arrays with the rows of
    each unit stored contiguously, so a window is a plain array slice and only
    one integer (its start row) is stored per window. Items are ordered unit by
    unit (units in order of first appearance), window start ascending.
    """

    def __init__(self, X_df, y_df, sequence_length, features):
        """
        Args:
            X_df: Feature DataFrame containing a 'unit' column. It is read but
                not modified.
            y_df: Target DataFrame with a 'RUL' column; rows must correspond
                one-to-one to the rows of X_df.
            sequence_length: Length (in rows) of sequences for LSTM
            features: List of feature column names

        Raises:
            ValueError: if X_df and y_df have different numbers of rows.
        """
        if len(X_df) != len(y_df):
            raise ValueError("X_df and y_df must have the same number of rows")

        self.X_df = X_df
        self.y_df = y_df
        self.sequence_length = sequence_length
        self.features = features

        # float32 copies save memory and leave the caller's frames untouched
        feature_values = X_df[features].to_numpy(dtype=np.float32)
        target_values = y_df['RUL'].to_numpy(dtype=np.float32)

        # Integer code per row, numbered by first appearance of the unit
        # (rows whose unit is NaN get code -1 and are dropped below).
        codes, uniques = pd.factorize(X_df['unit'].to_numpy())

        # A stable sort groups each unit's rows together while preserving their
        # order, so windows never straddle two units.
        order = np.argsort(codes, kind='stable')
        order = order[codes[order] >= 0]
        if not np.array_equal(order, np.arange(len(codes))):
            feature_values = feature_values[order]
            target_values = target_values[order]

        self._feature_values = feature_values
        self._target_values = target_values
        self._unit_counts = np.bincount(codes[codes >= 0], minlength=len(uniques))

        # Pre-calculate sequence start rows (one integer per window)
        self.sequence_indices = self._calculate_sequence_indices()

        print(f"Lazy dataset created with {len(self.sequence_indices)} sequences")

    def _calculate_sequence_indices(self):
        """Return the start row (into the grouped arrays) of every valid window."""
        window = self.sequence_length
        # First row of each unit inside the grouped arrays
        offsets = np.cumsum(self._unit_counts) - self._unit_counts
        # A unit shorter than the window yields an empty range
        starts = [
            np.arange(offset, offset + count - window + 1)
            for offset, count in zip(offsets, self._unit_counts)
        ]
        if not starts:
            return np.empty(0, dtype=np.int64)
        return np.concatenate(starts)

    def __len__(self):
        return len(self.sequence_indices)

    def __getitem__(self, idx):
        """Return (sequence, target): a (sequence_length, n_features) float32
        tensor and a 1-element float32 tensor holding the RUL of the window's
        last row."""
        start = int(self.sequence_indices[idx])
        end = start + self.sequence_length

        # torch.tensor copies, so returned items never alias the stored arrays
        sequence = torch.tensor(self._feature_values[start:end])
        target = torch.tensor(self._target_values[end - 1:end])
        return sequence, target

class NCMAPSSDatasetTestLazy(Dataset):
    """Lazy loading for test data: one item per unit, built from the unit's
    last ``sequence_length`` rows (rows taken in frame order)."""

    def __init__(self, X_df, y_df, sequence_length, features):
        """
        Args:
            X_df: Feature DataFrame containing a 'unit' column. It is read but
                not modified.
            y_df: Target DataFrame with a 'RUL' column whose rows correspond
                one-to-one to X_df, or None when targets are unavailable.
            sequence_length: Length (in rows) of each sequence; expected >= 1.
            features: List of feature column names
        """
        self.X_df = X_df
        self.y_df = y_df
        self.sequence_length = sequence_length
        self.features = features

        # float32 copies; the caller's frames are left untouched
        self._feature_values = X_df[features].to_numpy(dtype=np.float32)
        if y_df is not None:
            self._target_values = y_df['RUL'].to_numpy(dtype=np.float32)
        else:
            self._target_values = None

        # Calculate test sequence indices (one per unit)
        self.test_indices = self._calculate_test_indices()

        print(f"Lazy test dataset created with {len(self.test_indices)} sequences")

    def _calculate_test_indices(self):
        """Locate the last window of every unit that has enough rows.

        Each entry stores the unit id, the window's start position within the
        unit, and the positional row numbers of the window, so __getitem__
        never has to rescan the frame.
        """
        indices = []
        unit_values = self.X_df['unit'].to_numpy()

        for unit_id in pd.unique(unit_values):
            unit_rows = np.flatnonzero(unit_values == unit_id)
            unit_length = len(unit_rows)

            # Only include units with enough data
            if unit_length >= self.sequence_length:
                start_position = unit_length - self.sequence_length
                indices.append({
                    'unit_id': unit_id,
                    'start_position': start_position,
                    'rows': unit_rows[start_position:],
                    'last_row': int(unit_rows[-1]),
                })

        return indices

    def __len__(self):
        return len(self.test_indices)

    def __getitem__(self, idx):
        """Return the unit's last sequence, plus its last RUL value (as a
        1-element tensor) when targets were supplied."""
        test_info = self.test_indices[idx]

        # Indexing with a row array copies, so the tensor owns its data
        sequence = torch.tensor(self._feature_values[test_info['rows']])

        if self._target_values is None:
            return sequence

        last_rul = float(self._target_values[test_info['last_row']])
        return sequence, torch.tensor([last_rul], dtype=torch.float32)

# Updated preprocessing functions
def create_lazy_dataloaders_from_h5(processed_filename, sequence_length=30, batch_size=32, num_workers=0):
    """Build lazy train/test DataLoaders from a processed H5 file.

    Args:
        processed_filename: H5 file holding the X_train/y_train/X_test/y_test
            datasets and their X_columns/y_columns name datasets.
        sequence_length: Rows per sequence.
        batch_size: Batch size used by both loaders.
        num_workers: DataLoader worker processes (0 loads in the main process).

    Returns:
        (train_loader, test_loader, features) where ``features`` lists the
        feature column names (every column except 'unit' and 'cycle').
    """
    print("Loading processed data...")
    with h5py.File(processed_filename, 'r') as f:
        # Column labels first, then the four tables labelled with them
        X_columns = f['X_columns'][:].astype(str)
        y_columns = f['y_columns'][:].astype(str)

        X_train = pd.DataFrame(np.array(f['X_train']), columns=X_columns)
        y_train = pd.DataFrame(np.array(f['y_train']), columns=y_columns)
        X_test = pd.DataFrame(np.array(f['X_test']), columns=X_columns)
        y_test = pd.DataFrame(np.array(f['y_test']), columns=y_columns)

    print("Creating lazy datasets...")
    features = [name for name in X_train.columns if name not in ('unit', 'cycle')]

    train_dataset = NCMAPSSDatasetLazy(X_train, y_train, sequence_length, features)
    test_dataset = NCMAPSSDatasetTestLazy(X_test, y_test, sequence_length, features)

    print(f"Features: {len(features)}")
    print(f"Training sequences available: {len(train_dataset)}")
    print(f"Test sequences available: {len(test_dataset)}")

    # Options shared by both loaders; memory is pinned only when CUDA is available
    loader_options = dict(
        batch_size=batch_size,
        num_workers=num_workers,
        pin_memory=torch.cuda.is_available(),
    )
    train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
    test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

    return train_loader, test_loader, features

# Example usage showing memory monitoring
def demonstrate_lazy_loading():
    """Report process memory before loading, after building the lazy loaders,
    and after pulling a few training batches.

    Returns:
        (train_loader, test_loader, features) as produced by
        create_lazy_dataloaders_from_h5.
    """
    import psutil
    import os

    process = psutil.Process(os.getpid())

    def resident_mb():
        # Resident set size of this process, in MB
        return process.memory_info().rss / 1024 / 1024

    print("=== Lazy Loading Demonstration ===")

    initial_memory = resident_mb()
    print(f"Initial memory: {initial_memory:.1f} MB")

    processed_filename = '../data/processed/data.h5'
    train_loader, test_loader, features = create_lazy_dataloaders_from_h5(
        processed_filename,
        sequence_length=30,
        batch_size=32,
        num_workers=2,
    )

    after_loading_memory = resident_mb()
    print(f"Memory after loading: {after_loading_memory:.1f} MB")
    print(f"Memory increase: {after_loading_memory - initial_memory:.1f} MB")

    print("\nTesting data loading...")
    for batch_number, (batch_x, batch_y) in enumerate(train_loader):
        if batch_number == 0:
            # Shapes are reported for the first batch only
            print(f"Batch shape: {batch_x.shape}")
            print(f"Target shape: {batch_y.shape}")
        elif batch_number >= 5:
            # Stop after the first few batches
            break

    final_memory = resident_mb()
    print(f"Memory after iteration: {final_memory:.1f} MB")
    print(f"Total memory increase: {final_memory - initial_memory:.1f} MB")

    return train_loader, test_loader, features

In [6]:
def preprocess_cmapss_h5(raw_filename, processed_filename, sequence_length=30, scaler_type='minmax'):
    """Run the raw-to-processed pipeline and return the preprocessor that ran it.

    The scaled frames returned by ``preprocess_and_save`` are discarded; they
    are available from ``processed_filename`` afterwards.
    """
    preprocessor = NCMAPSSPreprocessor(
        sequence_length=sequence_length,
        scaler_type=scaler_type,
    )
    preprocessor.preprocess_and_save(raw_filename, processed_filename)
    return preprocessor

def create_lstm_dataloaders_from_h5(processed_filename, sequence_length=30, batch_size=32, scaler_type='minmax'):
    """Create eager (fully materialised) LSTM dataloaders from a processed H5 file.

    Every training window is built up front and held in memory; use
    create_lazy_dataloaders_from_h5 when that does not fit.

    Args:
        processed_filename: H5 file written by the preprocessing step.
        sequence_length: Rows per sequence.
        batch_size: Batch size used by both loaders.
        scaler_type: Passed through to NCMAPSSPreprocessor.

    Returns:
        (train_loader, test_loader, preprocessor, y_test_seq) where
        ``y_test_seq`` holds the test targets (None if no test unit is long
        enough to form a sequence).
    """
    preprocessor = NCMAPSSPreprocessor(sequence_length=sequence_length, scaler_type=scaler_type)

    print("Loading processed data...")
    X_train, y_train, X_test, y_test = preprocessor.load_processed_h5_data(processed_filename)

    print("Creating sequences...")
    # Create training sequences
    X_train_seq, y_train_seq, train_units = preprocessor.create_sequences_from_dataframe(X_train, y_train)

    # Create test sequences (last sequence per unit)
    X_test_seq, y_test_seq, test_units = preprocessor.create_test_sequences_from_dataframe(X_test, y_test)

    print(f"Training sequences shape: {X_train_seq.shape}")
    print(f"Training targets shape: {y_train_seq.shape}")
    print(f"Test sequences shape: {X_test_seq.shape}")
    # y_test_seq is None when no test unit qualified; report that instead of crashing
    print(f"Test targets shape: {None if y_test_seq is None else y_test_seq.shape}")
    print(f"Number of features: {X_train_seq.shape[2]}")

    # The sequences are already materialised arrays, so they go into the eager
    # NCMAPSSDataset. NCMAPSSDatasetLazy expects (X_df, y_df, sequence_length,
    # features) and raised TypeError when called with two arrays.
    train_dataset = NCMAPSSDataset(X_train_seq, y_train_seq)
    test_dataset = NCMAPSSDataset(X_test_seq, y_test_seq)

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return train_loader, test_loader, preprocessor, y_test_seq

In [None]:
filename = "N-CMAPSS_DS01-005.h5"
raw_filename = "../data/17. Turbofan Engine Degradation Simulation Data Set 2/data_set/" + filename
processed_filename = '../data/processed/data.h5'

# Optional step, currently disabled: rebuild the processed file from the raw data.
# preprocessor = preprocess_cmapss_h5(raw_filename, processed_filename, sequence_length=30)

# Note: y_test is rebound here to the per-unit test target array.
lstm_outputs = create_lstm_dataloaders_from_h5(
    processed_filename,
    sequence_length=30,
    batch_size=32,
)
train_loader, test_loader, preprocessor, y_test = lstm_outputs

Loading processed data...
Creating sequences...


In [9]:
processed_filename = '../data/processed/data.h5'

# Build the lazy loaders with two worker processes
lazy_outputs = create_lazy_dataloaders_from_h5(
    processed_filename,
    sequence_length=30,
    batch_size=32,
    num_workers=2,
)
train_loader, test_loader, features = lazy_outputs

Loading processed data...
Creating lazy datasets...
Lazy dataset created with 4906462 sequences
Lazy test dataset created with 4 sequences
Features: 20
Training sequences available: 4906462
Test sequences available: 4
