In [6]:
import sys
from pathlib import Path
project_root = Path("..").resolve()
sys.path.append(str(project_root))

import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_regression
from src.features import standardize_per_engine, create_rolling_features

In [7]:
# Load train with RUL
train_df = pd.read_csv('../data/processed/train_with_rul.csv')

# Drop low-var/constant
drop_cols = ['s1', 's5', 's6', 's10', 's16', 's18', 's19', 'op3', 'max_cycle']
train_df = train_df.drop(columns=drop_cols, errors='ignore')

# Key sensors from EDA
key_sensors = ['s2', 's3', 's4', 's7', 's8', 's9', 's11', 's12', 's13', 's14', 's15', 's17', 's20', 's21']

print("Train shape after drops:", train_df.shape)

Train shape after drops: (20631, 21)


In [8]:
train_df = standardize_per_engine(train_df, key_sensors)
print("Standardized sensors per engine.")

Standardized sensors per engine.


In [9]:
for sensor in key_sensors:
    train_df[f'{sensor}_cum_delta'] = train_df.groupby('engine_id')[sensor].cumsum()

In [10]:
train_df = create_rolling_features(train_df, key_sensors)

  df[f'{sensor}_std_{w}'] = grouped.transform(lambda x: x.rolling(w, min_periods=1).std())
  df[f'{sensor}_slope_{w}'] = grouped.transform(lambda x: (x - x.shift(w)) / w).fillna(0)
  df[f'{sensor}_mean_{w}'] = grouped.transform(lambda x: x.rolling(w, min_periods=1).mean())
  df[f'{sensor}_std_{w}'] = grouped.transform(lambda x: x.rolling(w, min_periods=1).std())
  df[f'{sensor}_slope_{w}'] = grouped.transform(lambda x: (x - x.shift(w)) / w).fillna(0)
  df[f'{sensor}_mean_{w}'] = grouped.transform(lambda x: x.rolling(w, min_periods=1).mean())
  df[f'{sensor}_std_{w}'] = grouped.transform(lambda x: x.rolling(w, min_periods=1).std())
  df[f'{sensor}_slope_{w}'] = grouped.transform(lambda x: (x - x.shift(w)) / w).fillna(0)
  df[f'{sensor}_mean_{w}'] = grouped.transform(lambda x: x.rolling(w, min_periods=1).mean())
  df[f'{sensor}_std_{w}'] = grouped.transform(lambda x: x.rolling(w, min_periods=1).std())
  df[f'{sensor}_slope_{w}'] = grouped.transform(lambda x: (x - x.shift(w)) / w).fillna(

In [11]:
for sensor in key_sensors:
    train_df[f'{sensor}_rate'] = train_df.groupby('engine_id')[sensor].diff().fillna(0)

# Limited interactions
high_corr_pairs = [('s4', 's11'), ('s2', 's15')]
for pair in high_corr_pairs:
    train_df[f'{pair[0]}_{pair[1]}_interact'] = train_df[pair[0]] * train_df[pair[1]]

  train_df[f'{sensor}_rate'] = train_df.groupby('engine_id')[sensor].diff().fillna(0)
  train_df[f'{sensor}_rate'] = train_df.groupby('engine_id')[sensor].diff().fillna(0)
  train_df[f'{sensor}_rate'] = train_df.groupby('engine_id')[sensor].diff().fillna(0)
  train_df[f'{sensor}_rate'] = train_df.groupby('engine_id')[sensor].diff().fillna(0)
  train_df[f'{sensor}_rate'] = train_df.groupby('engine_id')[sensor].diff().fillna(0)
  train_df[f'{sensor}_rate'] = train_df.groupby('engine_id')[sensor].diff().fillna(0)
  train_df[f'{sensor}_rate'] = train_df.groupby('engine_id')[sensor].diff().fillna(0)
  train_df[f'{sensor}_rate'] = train_df.groupby('engine_id')[sensor].diff().fillna(0)
  train_df[f'{sensor}_rate'] = train_df.groupby('engine_id')[sensor].diff().fillna(0)
  train_df[f'{sensor}_rate'] = train_df.groupby('engine_id')[sensor].diff().fillna(0)
  train_df[f'{sensor}_rate'] = train_df.groupby('engine_id')[sensor].diff().fillna(0)
  train_df[f'{sensor}_rate'] = train_df.groupby('engin

In [12]:
train_df['cycle_norm'] = train_df['cycle'] / train_df.groupby('engine_id')['cycle'].transform('max')

  train_df['cycle_norm'] = train_df['cycle'] / train_df.groupby('engine_id')['cycle'].transform('max')


In [None]:
# Endpoint df
endpoint_df = train_df.groupby('engine_id').tail(1)

X_end = endpoint_df.drop(['engine_id', 'rul', 'cycle'], axis=1, errors='ignore')
X_end = X_end.select_dtypes(include=[np.number])
y_end = endpoint_df['rul']

mi_scores = mutual_info_regression(X_end, y_end)
mi_df = pd.Series(mi_scores, index=X_end.columns).sort_values(ascending=False)

# Robust selection
min_features = 50
max_features = 200

top_features = mi_df[mi_df > 0.1].index.tolist()
if len(top_features) < min_features:
    top_features = mi_df.head(max_features).index.tolist()
elif len(top_features) > max_features:
    top_features = mi_df.head(max_features).index.tolist()

print("Final feature count:", len(top_features))

pd.Series(top_features).to_csv(
    '../data/processed/selected_features.txt',
    index=False,
    header=False
)

train_df = train_df[['engine_id', 'rul', 'cycle'] + top_features]

Final feature count: 174


In [None]:
# Load test
test_df = pd.read_csv('../data/raw/test_FD001.txt', sep='\s+', header=None, engine='python')
test_df = test_df.iloc[:, :26]
columns = ['engine_id', 'cycle', 'op1', 'op2', 'op3'] + [f's{i}' for i in range(1, 22)]
test_df.columns = columns

# Drop same
test_df = test_df.drop(columns=drop_cols[:-1], errors='ignore')

# Standardize
test_df = standardize_per_engine(test_df, key_sensors)

# Features
for sensor in key_sensors:
    test_df[f'{sensor}_cum_delta'] = test_df.groupby('engine_id')[sensor].cumsum()
test_df = create_rolling_features(test_df, key_sensors)
for sensor in key_sensors:
    test_df[f'{sensor}_rate'] = test_df.groupby('engine_id')[sensor].diff().fillna(0)
for pair in high_corr_pairs:
    test_df[f'{pair[0]}_{pair[1]}_interact'] = test_df[pair[0]] * test_df[pair[1]]

# Normalized cycle
test_df['cycle_norm'] = test_df['cycle'] / test_df.groupby('engine_id')['cycle'].transform('max')

# Select same top 
test_df = test_df[['engine_id', 'cycle'] + [col for col in top_features if col in test_df.columns]]

# Add RUL
rul_test = pd.read_csv('../data/raw/RUL_FD001.txt', header=None, names=['rul'])
test_df['max_cycle'] = test_df.groupby('engine_id')['cycle'].transform('max')
test_df = test_df.merge(rul_test.reset_index().rename(columns={'index': 'engine_id'}), on='engine_id', how='left')
test_df['rul'] = test_df['rul'] + test_df['max_cycle'] - test_df['cycle']
test_df = test_df.drop('max_cycle', axis=1)

# Final cleanup
test_df = test_df.fillna(0)
test_df = test_df.copy()

test_df.to_csv('../data/processed/engineered_test.csv', index=False)
print("Processed test set.")

  df[f'{sensor}_slope_{w}'] = grouped.transform(lambda x: (x - x.shift(w)) / w).fillna(0)
  df[f'{sensor}_mean_{w}'] = grouped.transform(lambda x: x.rolling(w, min_periods=1).mean())
  df[f'{sensor}_std_{w}'] = grouped.transform(lambda x: x.rolling(w, min_periods=1).std())
  df[f'{sensor}_slope_{w}'] = grouped.transform(lambda x: (x - x.shift(w)) / w).fillna(0)
  df[f'{sensor}_mean_{w}'] = grouped.transform(lambda x: x.rolling(w, min_periods=1).mean())
  df[f'{sensor}_std_{w}'] = grouped.transform(lambda x: x.rolling(w, min_periods=1).std())
  df[f'{sensor}_slope_{w}'] = grouped.transform(lambda x: (x - x.shift(w)) / w).fillna(0)
  df[f'{sensor}_mean_{w}'] = grouped.transform(lambda x: x.rolling(w, min_periods=1).mean())
  df[f'{sensor}_std_{w}'] = grouped.transform(lambda x: x.rolling(w, min_periods=1).std())
  df[f'{sensor}_slope_{w}'] = grouped.transform(lambda x: (x - x.shift(w)) / w).fillna(0)
  df[f'{sensor}_mean_{w}'] = grouped.transform(lambda x: x.rolling(w, min_periods=1).mea

Processed test set.


In [15]:
train_df.to_csv('../data/processed/engineered_train.csv', index=False)