In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
rng = np.random.RandomState(42)
column_names = ['engine_id', 'cycle', 'setting1', 'setting2', 'setting3'] + [f'sensor{i}' for i in range(1, 27)]
train_df = pd.read_csv('./dataset/train_FD001.txt', sep='\s+', header=None, names=column_names)
test_df = pd.read_csv('./dataset/test_FD001.txt', sep='\s+', header=None, names=column_names)
true_rul = pd.read_csv('./dataset/RUL_FD001.txt', header=None)

In [2]:
# Feature extraction with TSFRESH
from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import impute

In [3]:
train_df = train_df.dropna(axis=1, how="all")

In [4]:
columns_to_drop = ["setting3", "sensor1", "sensor5", "sensor6", "sensor10", "sensor16", "sensor18", "sensor19"]
train_df_dropped = train_df.drop(columns=columns_to_drop)

In [5]:
# Normalization
from sklearn.preprocessing import MinMaxScaler

# Initialize the scaler
scaler = MinMaxScaler()

# Separate the columns to normalize and the columns to skip
columns_to_skip = train_df_dropped.columns[:2]
columns_to_normalize = train_df_dropped.columns[2:]

# Normalize only the selected columns
normalized_data = scaler.fit_transform(train_df_dropped[columns_to_normalize])

# Combine the normalized and unnormalized columns
train_df_normalized = pd.DataFrame(train_df_dropped[columns_to_skip].values, columns=columns_to_skip)
train_df_normalized = pd.concat([train_df_normalized, pd.DataFrame(normalized_data, columns=columns_to_normalize)], axis=1)

train_df_normalized['RUL'] = train_df_normalized.groupby('engine_id')['cycle'].transform(lambda x: x.max() - x)

In [22]:
# from tsfresh.utilities.dataframe_functions import roll_time_series
# df_rolled = roll_time_series(train_df_normalized, column_id="engine_id", column_sort="cycle", max_timeshift=30 - 1, min_timeshift=30 - 1)

Rolling: 100%|██████████| 28/28 [00:03<00:00,  7.90it/s]


In [6]:
# Melt the DataFrame to stack sensor data for feature extraction
sensor_columns = [col for col in train_df_normalized.columns if 'sensor' in col]
melted_df = train_df_normalized.melt(
    id_vars=["engine_id", "cycle"],
    value_vars=sensor_columns,
    var_name="kind",
    value_name="value"
)

In [7]:
extracted_features = extract_features(
    melted_df,
    column_id="engine_id",
    column_sort="cycle",
    column_kind="kind",
    column_value="value",
    # disable_progressbar=False
)

Feature Extraction: 100%|██████████| 30/30 [00:26<00:00,  1.14it/s]


In [8]:
extracted_features.shape

(100, 10962)

In [14]:
# # Feature extraction with TSFRESH
# extracted_features = extract_features(train_df_normalized, column_id="engine_id", column_sort="cycle")

Feature Extraction:   0%|          | 0/30 [00:00<?, ?it/s]

Feature Extraction: 100%|██████████| 30/30 [00:33<00:00,  1.11s/it]


In [16]:
extracted_features.shape

(100, 13311)

In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_c)

In [7]:
# Filling up NaN as recommended by TSFRESH
impute(extracted_features)

 'setting2__friedrich_coefficients__coeff_1__m_3__r_30'
 'setting2__friedrich_coefficients__coeff_2__m_3__r_30'
 'setting2__friedrich_coefficients__coeff_3__m_3__r_30'
 'setting2__max_langevin_fixed_point__m_3__r_30'
 'setting2__query_similarity_count__query_None__threshold_0.0'
 'sensor2__query_similarity_count__query_None__threshold_0.0'
 'sensor3__query_similarity_count__query_None__threshold_0.0'
 'sensor4__query_similarity_count__query_None__threshold_0.0'
 'sensor7__query_similarity_count__query_None__threshold_0.0'
 'sensor8__friedrich_coefficients__coeff_0__m_3__r_30'
 'sensor8__friedrich_coefficients__coeff_1__m_3__r_30'
 'sensor8__friedrich_coefficients__coeff_2__m_3__r_30'
 'sensor8__friedrich_coefficients__coeff_3__m_3__r_30'
 'sensor8__max_langevin_fixed_point__m_3__r_30'
 'sensor8__query_similarity_count__query_None__threshold_0.0'
 'sensor9__query_similarity_count__query_None__threshold_0.0'
 'sensor11__query_similarity_count__query_None__threshold_0.0'
 'sensor12__query

Unnamed: 0,setting2__variance_larger_than_standard_deviation,setting2__has_duplicate_max,setting2__has_duplicate_min,setting2__has_duplicate,setting2__sum_values,setting2__abs_energy,setting2__mean_abs_change,setting2__mean_change,setting2__mean_second_derivative_central,setting2__median,...,setting1__fourier_entropy__bins_5,setting1__fourier_entropy__bins_10,setting1__fourier_entropy__bins_100,setting1__permutation_entropy__dimension_3__tau_1,setting1__permutation_entropy__dimension_4__tau_1,setting1__permutation_entropy__dimension_5__tau_1,setting1__permutation_entropy__dimension_6__tau_1,setting1__permutation_entropy__dimension_7__tau_1,setting1__query_similarity_count__query_None__threshold_0.0,setting1__mean_n_absolute_max__number_of_maxima_7
1,0.0,1.0,1.0,1.0,103.750000,66.631944,0.277487,0.001745,0.000658,0.583333,...,1.282133,1.906408,3.743622,1.786637,3.136431,4.423680,5.050390,5.203387,0.0,0.707718
2,0.0,1.0,1.0,1.0,164.583333,111.993056,0.276807,0.000000,0.002632,0.583333,...,0.911323,1.506294,3.493835,1.789825,3.137994,4.536881,5.384423,5.593954,0.0,0.831691
3,0.0,1.0,1.0,1.0,87.416667,53.062500,0.261704,-0.002341,0.000942,0.500000,...,1.131835,1.737235,3.568428,1.787387,3.138004,4.447641,5.007678,5.153292,0.0,0.792282
4,0.0,1.0,1.0,1.0,105.083333,69.076389,0.266401,-0.003103,0.001337,0.583333,...,0.959535,1.579948,3.562688,1.785316,3.144221,4.474631,5.084010,5.186760,0.0,0.771757
5,0.0,1.0,1.0,1.0,120.916667,68.840278,0.263371,0.000311,0.000624,0.416667,...,1.012771,1.669208,3.687892,1.786792,3.145695,4.562154,5.331026,5.514172,0.0,0.774220
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,0.0,1.0,1.0,1.0,175.583333,109.631944,0.261940,-0.000249,-0.001497,0.500000,...,0.665889,1.261253,3.272664,1.790514,3.159163,4.637228,5.592154,5.773887,0.0,0.760263
97,0.0,1.0,1.0,1.0,113.916667,75.854167,0.259536,-0.000829,0.001250,0.583333,...,0.961462,1.609686,3.579486,1.791210,3.132879,4.453631,5.139807,5.256896,0.0,0.737274
98,0.0,1.0,1.0,1.0,69.416667,39.006944,0.248925,-0.000538,-0.000541,0.416667,...,1.148980,1.787885,3.576245,1.788247,3.140119,4.382408,4.903646,4.992151,0.0,0.800493
99,0.0,1.0,1.0,1.0,95.083333,58.451389,0.262228,-0.004076,0.000228,0.500000,...,0.945631,1.537181,3.355833,1.785488,3.127782,4.473574,5.040812,5.164152,0.0,0.752053


In [10]:
y = train_df_normalized["RUL"].reset_index(drop=True)

In [11]:
features_filtered = select_features(extracted_features, y)

AssertionError: X and y must contain the same number of samples.

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from tsfresh import extract_features
from tsfresh.utilities.dataframe_functions import impute

# Load the CMAPSS dataset
column_names = ['engine_id', 'cycle', 'setting1', 'setting2', 'setting3'] + [f'sensor{i}' for i in range(1, 27)]
train_df = pd.read_csv('./dataset/train_FD001.txt', sep='\s+', header=None, names=column_names)

# Drop columns with NaNs and uninformative columns
columns_to_drop = ["setting3", "sensor1", "sensor5", "sensor6", "sensor10", "sensor16", "sensor18", "sensor19"]
train_df_dropped = train_df.drop(columns=columns_to_drop)

# Normalize the data
scaler = MinMaxScaler()
columns_to_skip = train_df_dropped.columns[:2]  # Skip 'engine_id' and 'cycle'
columns_to_normalize = train_df_dropped.columns[2:]
normalized_data = scaler.fit_transform(train_df_dropped[columns_to_normalize])
train_df_normalized = pd.concat([train_df_dropped[columns_to_skip].reset_index(drop=True), 
                                 pd.DataFrame(normalized_data, columns=columns_to_normalize)], axis=1)

# Calculate RUL and add it as a target column
train_df_normalized['RUL'] = train_df_normalized.groupby('engine_id')['cycle'].transform(lambda x: x.max() - x)

# Create a unique ID for each cycle by combining engine_id and cycle
train_df_normalized['unique_id'] = train_df_normalized['engine_id'].astype(str) + '_' + train_df_normalized['cycle'].astype(str)

# Melt the DataFrame to stack sensor data for feature extraction at each cycle
sensor_columns = [col for col in train_df_normalized.columns if 'sensor' in col]
melted_df = train_df_normalized.melt(
    id_vars=["unique_id", "engine_id", "cycle"],
    value_vars=sensor_columns,
    var_name="kind",   # Identifies the sensor type
    value_name="value" # Contains the sensor reading values
)

# Step 1: Check for missing values in the melted DataFrame
print(melted_df.isnull().sum())

# Step 2: Impute or drop missing values
# Option 1: Fill NaNs with the mean of each sensor's values
melted_df['value'].fillna(melted_df['value'].mean(), inplace=True)

# Option 2: Drop rows with NaN values in 'value' (use only if dropping is acceptable)
# melted_df = melted_df.dropna(subset=['value'])
# Extract features for each cycle using the unique_id as the identifier
extracted_features = extract_features(
    melted_df,
    column_id="unique_id",
    column_sort="cycle",
    column_kind="kind",
    column_value="value",
    disable_progressbar=False
)

# Impute any missing values in the features
impute(extracted_features)

# Display extracted features
print(extracted_features.head())

  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  melted_df['value'].fillna(melted_df['value'].mean(), inplace=True)


unique_id         0
engine_id         0
cycle             0
kind              0
value        103155
dtype: int64


Feature Extraction:   0%|          | 0/30 [00:00<?, ?it/s]

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from tsfresh import extract_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import MinimalFCParameters
import logging

# Set up logging for debugging purposes
logging.basicConfig(level=logging.INFO)  # Set logging level to show details

# Load and preprocess the data
column_names = ['engine_id', 'cycle', 'setting1', 'setting2', 'setting3'] + [f'sensor{i}' for i in range(1, 27)]
train_df = pd.read_csv('./dataset/train_FD001.txt', sep='\s+', header=None, names=column_names)

# Drop uninformative columns
columns_to_drop = ["setting3", "sensor1", "sensor5", "sensor6", "sensor10", "sensor16", "sensor18", "sensor19"]
train_df_dropped = train_df.drop(columns=columns_to_drop)

# Normalize data
scaler = MinMaxScaler()
columns_to_skip = train_df_dropped.columns[:2]
columns_to_normalize = train_df_dropped.columns[2:]
normalized_data = scaler.fit_transform(train_df_dropped[columns_to_normalize])
train_df_normalized = pd.concat([train_df_dropped[columns_to_skip].reset_index(drop=True), 
                                 pd.DataFrame(normalized_data, columns=columns_to_normalize)], axis=1)

# Calculate RUL
train_df_normalized['RUL'] = train_df_normalized.groupby('engine_id')['cycle'].transform(lambda x: x.max() - x)

# Sensor columns for interpolation
sensor_columns = [col for col in train_df_normalized.columns if 'sensor' in col]

# Step 1: Interpolate missing values for each sensor column (row-wise interpolation)
train_df_normalized[sensor_columns] = train_df_normalized[sensor_columns].interpolate(method='linear', limit_direction='both')

# Step 2: For any remaining NaNs, fill with the median of each column as a fallback
train_df_normalized[sensor_columns] = train_df_normalized[sensor_columns].fillna(train_df_normalized[sensor_columns].median())

# Step 3: Ensure no NaN values remain by filling any final NaNs with 0 as a last resort
train_df_normalized[sensor_columns] = train_df_normalized[sensor_columns].fillna(0)

# Create a unique ID for each cycle (for row-by-row extraction)
train_df_normalized['unique_id'] = train_df_normalized['engine_id'].astype(str) + '_' + train_df_normalized['cycle'].astype(str)

# Melt the DataFrame for feature extraction with tsfresh
melted_df = train_df_normalized.melt(
    id_vars=["unique_id", "engine_id", "cycle"],
    value_vars=sensor_columns,
    var_name="kind",
    value_name="value"
)

# Verify that there are no NaN values in melted_df before extraction
assert melted_df['value'].isnull().sum() == 0, "NaN values found in 'value' column after cleaning!"

# Minimal feature extraction for debugging
extracted_features = extract_features(
    melted_df,
    column_id="unique_id",
    column_sort="cycle",
    column_kind="kind",
    column_value="value",
    default_fc_parameters=MinimalFCParameters(),  # Minimal feature set for testing
    disable_progressbar=False,
    show_warnings=True
)

# Impute any remaining missing values in extracted features
impute(extracted_features)

# Display the extracted features
print(extracted_features.head())


  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
Feature Extraction: 100%|██████████| 30/30 [00:56<00:00,  1.87s/it]


         sensor11__sum_values  sensor11__median  sensor11__mean  \
100_1                0.303571          0.303571        0.303571   
100_10               0.244048          0.244048        0.244048   
100_100              0.494048          0.494048        0.494048   
100_101              0.476190          0.476190        0.476190   
100_102              0.440476          0.440476        0.440476   

         sensor11__length  sensor11__standard_deviation  sensor11__variance  \
100_1                 1.0                           0.0                 0.0   
100_10                1.0                           0.0                 0.0   
100_100               1.0                           0.0                 0.0   
100_101               1.0                           0.0                 0.0   
100_102               1.0                           0.0                 0.0   

         sensor11__root_mean_square  sensor11__maximum  \
100_1                      0.303571           0.303571   
100_10

In [2]:
extracted_features.shape

(20631, 190)

In [4]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from tsfresh import extract_features
from tsfresh.utilities.dataframe_functions import roll_time_series, impute
from tsfresh.feature_extraction import EfficientFCParameters
import logging

# Set up logging for debugging
logging.basicConfig(level=logging.INFO)

# Load the data
column_names = ['engine_id', 'cycle', 'setting1', 'setting2', 'setting3'] + [f'sensor{i}' for i in range(1, 27)]
train_df = pd.read_csv('./dataset/train_FD001.txt', sep='\s+', header=None, names=column_names)

# Drop columns that aren't informative
columns_to_drop = ["setting3", "sensor1", "sensor5", "sensor6", "sensor10", "sensor16", "sensor18", "sensor19"]
train_df_dropped = train_df.drop(columns=columns_to_drop)

# Normalize the data
scaler = MinMaxScaler()
columns_to_skip = train_df_dropped.columns[:2]
columns_to_normalize = train_df_dropped.columns[2:]
normalized_data = scaler.fit_transform(train_df_dropped[columns_to_normalize])
train_df_normalized = pd.concat([train_df_dropped[columns_to_skip].reset_index(drop=True), 
                                 pd.DataFrame(normalized_data, columns=columns_to_normalize)], axis=1)

# Calculate RUL for each cycle
train_df_normalized['RUL'] = train_df_normalized.groupby('engine_id')['cycle'].transform(lambda x: x.max() - x)

# Sensor columns for later processing
sensor_columns = [col for col in train_df_normalized.columns if 'sensor' in col]

# Step 1: Apply rolling window on each engine's data
# This generates a DataFrame where each row corresponds to a 30-cycle window for each engine
df_rolled = roll_time_series(
    train_df_normalized,
    column_id="engine_id",
    column_sort="cycle",
    max_timeshift=29,  # Window of 30 cycles
    min_timeshift=29
)

# Step 2: Melt the rolled DataFrame to use with tsfresh
melted_df = df_rolled.melt(
    id_vars=["engine_id", "cycle"],
    value_vars=sensor_columns,
    var_name="kind",
    value_name="value"
)

# Verify that there are no NaN values
if melted_df['value'].isnull().sum() > 0:
    melted_df['value'].fillna(0, inplace=True)

# Step 3: Feature extraction on each rolling window
try:
    extracted_features = extract_features(
        melted_df,
        column_id="engine_id",
        column_sort="cycle",
        column_kind="kind",
        column_value="value",
        default_fc_parameters=EfficientFCParameters(),
        disable_progressbar=False,
        show_warnings=True
    )
except Exception as e:
    logging.error(f"Error during feature extraction: {e}")
    raise

# Step 4: Impute any remaining missing values in extracted features
impute(extracted_features)

# Display the extracted features
print(extracted_features.head())


  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
Rolling: 100%|██████████| 28/28 [00:03<00:00,  7.93it/s]
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  melted_df['value'].fillna(0, inplace=True)
Feature Extraction: 100%|██████████| 30/30 [02:39<00:00,  5.33s/it]
 'sensor13__friedrich_coefficients__coeff_1__m_3__r_30'
 'sensor13__friedrich_coefficients__coeff_2__m_3__r_30'
 'sensor13__friedrich_coefficients__coeff_3__m_3__r_30'
 'sensor13__max_langevin_fixed_point__m_3__r_30'
 'sensor13__query_similarity_count__query_None__threshold_0.0'
 'sensor14__query_similarity_count__query_None__threshold_0.0'
 'sensor1

   sensor13__variance_larger_than_standard_deviation  \
1                                                0.0   
2                                                0.0   
3                                                0.0   
4                                                0.0   
5                                                0.0   

   sensor13__has_duplicate_max  sensor13__has_duplicate_min  \
1                          1.0                          1.0   
2                          1.0                          1.0   
3                          0.0                          1.0   
4                          1.0                          1.0   
5                          1.0                          1.0   

   sensor13__has_duplicate  sensor13__sum_values  sensor13__abs_energy  \
1                      1.0           1607.338235            573.411548   
2                      1.0           1927.529412            531.705882   
3                      1.0           1076.838235            26

In [5]:
extracted_features.shape

(100, 14763)

In [6]:
df_rolled.shape

(531930, 25)

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from tsfresh import extract_features
from tsfresh.utilities.dataframe_functions import roll_time_series, impute
from tsfresh.feature_extraction import ComprehensiveFCParameters
import logging

# Set up logging for debugging
logging.basicConfig(level=logging.INFO)

# Load the data
column_names = ['engine_id', 'cycle', 'setting1', 'setting2', 'setting3'] + [f'sensor{i}' for i in range(1, 27)]
train_df = pd.read_csv('./dataset/train_FD001.txt', sep='\s+', header=None, names=column_names)

# Drop columns that aren't informative
columns_to_drop = ["setting3", "sensor1", "sensor5", "sensor6", "sensor10", "sensor16", "sensor18", "sensor19"]
train_df_dropped = train_df.drop(columns=columns_to_drop)

# Normalize the data
scaler = MinMaxScaler()
columns_to_skip = train_df_dropped.columns[:2]
columns_to_normalize = train_df_dropped.columns[2:]
normalized_data = scaler.fit_transform(train_df_dropped[columns_to_normalize])
train_df_normalized = pd.concat([train_df_dropped[columns_to_skip].reset_index(drop=True), 
                                 pd.DataFrame(normalized_data, columns=columns_to_normalize)], axis=1)

# Calculate RUL for each cycle
train_df_normalized['RUL'] = train_df_normalized.groupby('engine_id')['cycle'].transform(lambda x: x.max() - x)

# Sensor columns for later processing
sensor_columns = [col for col in train_df_normalized.columns if 'sensor' in col]

# Step 1: Apply rolling window on each engine's data
df_rolled = roll_time_series(
    train_df_normalized,
    column_id="engine_id",
    column_sort="cycle",
    max_timeshift=29,  # Window of 30 cycles
    min_timeshift=29
)

# Step 2: Melt the rolled DataFrame to use with tsfresh
melted_df = df_rolled.melt(
    id_vars=["engine_id", "cycle"],
    value_vars=sensor_columns,
    var_name="kind",
    value_name="value"
)

# Verify that there are no NaN values
melted_df['value'].fillna(0, inplace=True)

# Use a comprehensive feature set to ensure all relevant features are captured
feature_params = ComprehensiveFCParameters()

# Step 3: Extract features on each rolling window
try:
    extracted_features = extract_features(
        melted_df,
        column_id="engine_id",
        column_sort="cycle",
        column_kind="kind",
        column_value="value",
        default_fc_parameters=feature_params,
        disable_progressbar=False,
        show_warnings=True
    )
except Exception as e:
    logging.error(f"Error during feature extraction: {e}")
    raise

# Step 4: Impute any remaining missing values in extracted features
impute(extracted_features)

# Check the feature count
print("Extracted feature shape:", extracted_features.shape)
print("Extracted features head:\n", extracted_features.head())



  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
Rolling: 100%|██████████| 28/28 [00:03<00:00,  7.72it/s]
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  melted_df['value'].fillna(0, inplace=True)
Feature Extraction:  20%|██        | 6/30 [22:08<53:51, 134.65s/it]  

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from tsfresh import extract_features
from tsfresh.utilities.dataframe_functions import roll_time_series, impute
from tsfresh.feature_extraction import MinimalFCParameters
import logging

# Set up logging for debugging
logging.basicConfig(level=logging.INFO)

# Load the data
column_names = ['engine_id', 'cycle', 'setting1', 'setting2', 'setting3'] + [f'sensor{i}' for i in range(1, 27)]
train_df = pd.read_csv('./dataset/train_FD001.txt', sep='\s+', header=None, names=column_names)

# Drop columns that aren't informative
columns_to_drop = ["setting3", "sensor1", "sensor5", "sensor6", "sensor10", "sensor16", "sensor18", "sensor19"]
train_df_dropped = train_df.drop(columns=columns_to_drop)

# Normalize the data
scaler = MinMaxScaler()
columns_to_skip = train_df_dropped.columns[:2]
columns_to_normalize = train_df_dropped.columns[2:]
normalized_data = scaler.fit_transform(train_df_dropped[columns_to_normalize])
train_df_normalized = pd.concat([train_df_dropped[columns_to_skip].reset_index(drop=True), 
                                 pd.DataFrame(normalized_data, columns=columns_to_normalize)], axis=1)

# Calculate RUL for each cycle
train_df_normalized['RUL'] = train_df_normalized.groupby('engine_id')['cycle'].transform(lambda x: x.max() - x)

# Sensor columns for later processing
sensor_columns = [col for col in train_df_normalized.columns if 'sensor' in col]

# Step 1: Apply a smaller rolling window to reduce data size
df_rolled = roll_time_series(
    train_df_normalized,
    column_id="engine_id",
    column_sort="cycle",
    max_timeshift=9,  # Smaller window size of 10 cycles
    min_timeshift=9
)

# Step 2: Melt the rolled DataFrame to use with tsfresh
melted_df = df_rolled.melt(
    id_vars=["engine_id", "cycle"],
    value_vars=sensor_columns,
    var_name="kind",
    value_name="value"
)

# Verify that there are no NaN values
melted_df['value'].fillna(0, inplace=True)

# Define a minimal or custom feature set for efficiency
feature_params = MinimalFCParameters()

# Step 3: Efficient feature extraction on each rolling window
try:
    extracted_features = extract_features(
        melted_df,
        column_id="engine_id",
        column_sort="cycle",
        column_kind="kind",
        column_value="value",
        default_fc_parameters=feature_params,
        disable_progressbar=False,
        show_warnings=True,
        n_jobs=4  # Parallel processing to speed up extraction
    )
except Exception as e:
    logging.error(f"Error during feature extraction: {e}")
    raise

# Step 4: Impute any remaining missing values in extracted features
impute(extracted_features)

# Display the extracted features
print("Extracted feature shape:", extracted_features.shape)
print("Extracted features head:\n", extracted_features.head())


  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
Rolling: 100%|██████████| 28/28 [00:03<00:00,  8.00it/s]
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  melted_df['value'].fillna(0, inplace=True)
Feature Extraction: 100%|██████████| 20/20 [00:00<00:00, 21.20it/s]


Extracted feature shape: (100, 190)
Extracted features head:
    sensor11__sum_values  sensor11__median  sensor11__mean  sensor11__length  \
1            710.654762          0.357143        0.388336            1830.0   
2            878.571429          0.267857        0.316033            2780.0   
3            572.369048          0.309524        0.336688            1700.0   
4            701.297619          0.363095        0.389610            1800.0   
5            840.648810          0.285714        0.323326            2600.0   

   sensor11__standard_deviation  sensor11__variance  \
1                      0.149070            0.022222   
2                      0.165276            0.027316   
3                      0.128815            0.016593   
4                      0.120972            0.014634   
5                      0.160058            0.025619   

   sensor11__root_mean_square  sensor11__maximum  sensor11__absolute_maximum  \
1                    0.415965           0.880952    

In [2]:
extracted_features.sha






pe

(100, 190)

In [5]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from tsfresh import extract_features
from tsfresh.utilities.dataframe_functions import roll_time_series, impute
import logging

# Set up logging for debugging
logging.basicConfig(level=logging.INFO)

# Load the data
column_names = ['engine_id', 'cycle', 'setting1', 'setting2', 'setting3'] + [f'sensor{i}' for i in range(1, 27)]
train_df = pd.read_csv('./dataset/train_FD001.txt', sep='\s+', header=None, names=column_names)

# Drop columns that aren't informative
columns_to_drop = ["setting3", "sensor1", "sensor5", "sensor6", "sensor10", "sensor16", "sensor18", "sensor19"]
train_df_dropped = train_df.drop(columns=columns_to_drop)

# Normalize the data
scaler = MinMaxScaler()
columns_to_skip = train_df_dropped.columns[:2]
columns_to_normalize = train_df_dropped.columns[2:]
normalized_data = scaler.fit_transform(train_df_dropped[columns_to_normalize])
train_df_normalized = pd.concat([train_df_dropped[columns_to_skip].reset_index(drop=True), 
                                 pd.DataFrame(normalized_data, columns=columns_to_normalize)], axis=1)

# Calculate RUL for each cycle
train_df_normalized['RUL'] = train_df_normalized.groupby('engine_id')['cycle'].transform(lambda x: x.max() - x)

# Sensor columns for later processing
sensor_columns = [col for col in train_df_normalized.columns if 'sensor' in col]

# Updated custom feature parameters (replacing 'trend' with 'linear_trend')
custom_fc_parameters = {
    "mean": None,
    "standard_deviation": None,
    "minimum": None,
    "maximum": None,
    "variance": None,
    "skewness": None,
    "kurtosis": None,
    "absolute_sum_of_changes": None,
    "longest_strike_below_mean": None,
    "longest_strike_above_mean": None,
    "count_above_mean": None,
    "count_below_mean": None,
    "linear_trend": [{"attr": "slope"}, {"attr": "intercept"}],  # Replaces 'trend'
}

# Step 1: Apply a rolling window of size 10 cycles for feature extraction
df_rolled = roll_time_series(
    train_df_normalized,
    column_id="engine_id",
    column_sort="cycle",
    max_timeshift=9,  # Window of 10 cycles
    min_timeshift=9
)

# Step 2: Melt the rolled DataFrame to use with tsfresh
melted_df = df_rolled.melt(
    id_vars=["engine_id", "cycle"],
    value_vars=sensor_columns,
    var_name="kind",
    value_name="value"
)

# Verify that there are no NaN values
melted_df['value'].fillna(0, inplace=True)

# Step 3: Feature extraction using custom feature set
try:
    extracted_features = extract_features(
        melted_df,
        column_id="engine_id",
        column_sort="cycle",
        column_kind="kind",
        column_value="value",
        default_fc_parameters=custom_fc_parameters,  # Using custom feature parameters
        disable_progressbar=False,
        show_warnings=True,
        n_jobs=4  # Use parallel processing for efficiency
    )
except Exception as e:
    logging.error(f"Error during feature extraction: {e}")
    raise

# Step 4: Impute any remaining missing values in extracted features
impute(extracted_features)

# Display the extracted features
print("Extracted feature shape:", extracted_features.shape)
print("Extracted features head:\n", extracted_features.head())


  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
Rolling: 100%|██████████| 28/28 [00:03<00:00,  7.73it/s]
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  melted_df['value'].fillna(0, inplace=True)
Feature Extraction: 100%|██████████| 20/20 [00:01<00:00, 17.26it/s]

Extracted feature shape: (100, 266)
Extracted features head:
    sensor11__mean  sensor11__standard_deviation  sensor11__minimum  \
1        0.388336                      0.149070           0.107143   
2        0.316033                      0.165276           0.047619   
3        0.336688                      0.128815           0.053571   
4        0.389610                      0.120972           0.178571   
5        0.323326                      0.160058           0.071429   

   sensor11__maximum  sensor11__variance  sensor11__skewness  \
1           0.880952            0.022222            0.742803   
2           0.845238            0.027316            1.071960   
3           0.910714            0.016593            1.155786   
4           0.809524            0.014634            0.893333   
5           0.845238            0.025619            1.007237   

   sensor11__kurtosis  sensor11__absolute_sum_of_changes  \
1            0.034417                          13.619048   
2           




In [7]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from tsfresh import extract_features
from tsfresh.utilities.dataframe_functions import roll_time_series, impute
from tsfresh.feature_extraction import ComprehensiveFCParameters
import logging

# Set up logging for debugging
logging.basicConfig(level=logging.INFO)

# Load the data
column_names = ['engine_id', 'cycle', 'setting1', 'setting2', 'setting3'] + [f'sensor{i}' for i in range(1, 27)]
train_df = pd.read_csv('./dataset/train_FD001.txt', sep='\s+', header=None, names=column_names)

# Drop columns that aren't informative
columns_to_drop = ["setting3", "sensor1", "sensor5", "sensor6", "sensor10", "sensor16", "sensor18", "sensor19"]
train_df_dropped = train_df.drop(columns=columns_to_drop)

# Normalize the data
scaler = MinMaxScaler()
columns_to_skip = train_df_dropped.columns[:2]
columns_to_normalize = train_df_dropped.columns[2:]
normalized_data = scaler.fit_transform(train_df_dropped[columns_to_normalize])
train_df_normalized = pd.concat([train_df_dropped[columns_to_skip].reset_index(drop=True), 
                                 pd.DataFrame(normalized_data, columns=columns_to_normalize)], axis=1)

# Calculate RUL for each cycle
train_df_normalized['RUL'] = train_df_normalized.groupby('engine_id')['cycle'].transform(lambda x: x.max() - x)

# Sensor columns for later processing
sensor_columns = [col for col in train_df_normalized.columns if 'sensor' in col]

# Use comprehensive feature parameters for a rich set of features
comprehensive_fc_parameters = ComprehensiveFCParameters()

# Step 1: Apply a rolling window of size 10 cycles for feature extraction
df_rolled = roll_time_series(
    train_df_normalized,
    column_id="engine_id",
    column_sort="cycle",
    max_timeshift=9,  # Window of 10 cycles
    min_timeshift=9
)

# Step 2: Melt the rolled DataFrame to use with tsfresh
melted_df = df_rolled.melt(
    id_vars=["engine_id", "cycle"],
    value_vars=sensor_columns,
    var_name="kind",
    value_name="value"
)

# Verify that there are no NaN values
melted_df['value'].fillna(0, inplace=True)

# Step 3: Feature extraction using comprehensive feature set
try:
    extracted_features = extract_features(
        melted_df,
        column_id="engine_id",
        column_sort="cycle",
        column_kind="kind",
        column_value="value",
        default_fc_parameters=comprehensive_fc_parameters,  # Comprehensive feature set
        disable_progressbar=False,
        show_warnings=True,
        n_jobs=4  # Use parallel processing for efficiency
    )
except Exception as e:
    logging.error(f"Error during feature extraction: {e}")
    raise

# Step 4: Impute any remaining missing values in extracted features
impute(extracted_features)

# Display the extracted features
print("Extracted feature shape:", extracted_features.shape)
print("Extracted features head:\n", extracted_features.head())


  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
Rolling: 100%|██████████| 28/28 [00:03<00:00,  7.60it/s]
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  melted_df['value'].fillna(0, inplace=True)
Feature Extraction: 100%|██████████| 20/20 [1:45:21<00:00, 316.06s/it]   
 'sensor12__query_similarity_count__query_None__threshold_0.0'
 'sensor13__friedrich_coefficients__coeff_0__m_3__r_30'
 'sensor13__friedrich_coefficients__coeff_1__m_3__r_30'
 'sensor13__friedrich_coefficients__coeff_2__m_3__r_30'
 'sensor13__friedrich_coefficients__coeff_3__m_3__r_30'
 'sensor13__max_langevin_fixed_point__m_3__r_30'
 'sensor13

Extracted feature shape: (100, 14877)
Extracted features head:
    sensor11__variance_larger_than_standard_deviation  \
1                                                0.0   
2                                                0.0   
3                                                0.0   
4                                                0.0   
5                                                0.0   

   sensor11__has_duplicate_max  sensor11__has_duplicate_min  \
1                          1.0                          1.0   
2                          1.0                          0.0   
3                          1.0                          1.0   
4                          1.0                          1.0   
5                          1.0                          1.0   

   sensor11__has_duplicate  sensor11__sum_values  sensor11__abs_energy  \
1                      1.0            710.654762            316.639031   
2                      1.0            878.571429            353.596088  

In [8]:
extracted_features.shape

(100, 14877)