# NASA C-MAPSS Dataset Exploration

This notebook explores the NASA Commercial Modular Aero-Propulsion System Simulation (C-MAPSS) turbofan engine degradation dataset.

## Dataset Description

The C-MAPSS dataset contains run-to-failure simulations of turbofan engines under different operating conditions and fault modes:

- **FD001**: Single operating condition, single fault mode (HPC degradation)
- **FD002**: Six operating conditions, single fault mode
- **FD003**: Single operating condition, two fault modes (HPC + Fan degradation)
- **FD004**: Six operating conditions, two fault modes

Each dataset contains:
- **21 sensor measurements**: Temperatures, pressures, speeds, etc.
- **3 operational settings**: Flight conditions
- **Time series data**: Multiple cycles until failure

## Objectives

1. Load and understand the data structure
2. Visualize sensor readings over time
3. Analyze RUL distributions
4. Identify correlations between sensors
5. Detect degradation patterns

In [None]:
# Import libraries
import sys
# Make the project's local modules under ../src importable. The path is
# relative, so it resolves against the kernel's working directory
# (assumes the notebook is run from its own folder — TODO confirm).
sys.path.append('../src')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Set style: seaborn "whitegrid" theme plus a wide default figure size.
# Most plotting cells below pass their own figsize and override this default.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 6)
# Render matplotlib figures inline in the notebook output
%matplotlib inline

# Import custom modules
# NOTE(review): data_loader is presumably found via the ../src path entry
# appended above — confirm the module lives there.
from data_loader import CMAPSSDataLoader

## 1. Load Dataset

In [None]:
# Which C-MAPSS subset to explore: 'FD001', 'FD002', 'FD003' or 'FD004'
dataset_name = 'FD001'

# Build the loader for the selected subset
loader = CMAPSSDataLoader(data_dir='../data/CMAPSS', dataset_name=dataset_name)

# Read the raw training frame, the raw test frame, and the RUL values
# that accompany the test set
train_df, test_df, test_rul = loader.load_raw_data()

In [None]:
# Report sample and unit counts for both splits in a single print call
# (one argument per output line)
print(
    f"Dataset: {dataset_name}",
    f"Training samples: {len(train_df)}",
    f"Training units: {train_df['unit_id'].nunique()}",
    f"Test samples: {len(test_df)}",
    f"Test units: {test_df['unit_id'].nunique()}",
    f"\nColumns: {train_df.shape[1]}",
    sep="\n",
)

# Preview the first rows (rendered as the cell's rich output)
train_df.head()

## 2. Add RUL Labels

In [None]:
# Attach the RUL target column to the training frame (labels limited by max_rul)
train_df = loader.add_rul_labels(train_df, max_rul=125)

# Numeric summary of the new RUL column
print("RUL Statistics:")
print(train_df['RUL'].describe())

# Two side-by-side histograms: per-sample RUL (left) and per-unit lifespan (right)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: RUL over every training row
ax1.hist(train_df['RUL'], bins=50, edgecolor='black', alpha=0.7)
ax1.set(
    xlabel='RUL (cycles)',
    ylabel='Frequency',
    title='RUL Distribution (All Samples)',
)
ax1.grid(True, alpha=0.3)

# Right panel: total lifespan of each unit, i.e. the largest cycle index
# recorded for that unit (this is a lifespan, not an RUL value)
unit_lifespans = train_df.groupby('unit_id')['time_cycles'].max()
ax2.hist(unit_lifespans, bins=30, edgecolor='black', alpha=0.7, color='orange')
ax2.set(
    xlabel='Total Lifespan (cycles)',
    ylabel='Frequency',
    title='Unit Lifespan Distribution',
)
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 3. Visualize Sensor Readings Over Time

In [None]:
# Select a sample unit
sample_unit = 1
unit_data = train_df[train_df['unit_id'] == sample_unit]

# Get sensor columns (every column whose name starts with 'sensor_')
sensor_cols = [col for col in train_df.columns if col.startswith('sensor_')]

# Size the grid from the number of sensors instead of hard-coding 5x4:
# a fixed 20-axes grid silently drops every sensor past the 20th.
n_cols = 4
n_rows = max(1, -(-len(sensor_cols) // n_cols))  # ceiling division, at least one row

# 2.8 inches per row reproduces the original 16x14 figure for a 5-row grid.
# squeeze=False guarantees a 2-D axes array even when there is only one row.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 2.8 * n_rows), squeeze=False)
axes = axes.flatten()

# One panel per sensor for the selected unit
for ax, sensor in zip(axes, sensor_cols):
    ax.plot(unit_data['time_cycles'], unit_data[sensor], linewidth=1.5)
    ax.set_xlabel('Time (cycles)')
    ax.set_ylabel(sensor)
    ax.set_title(f'{sensor} - Unit {sample_unit}')
    ax.grid(True, alpha=0.3)

# Remove the unused trailing panels of the last row
for unused_ax in axes[len(sensor_cols):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

## 4. Analyze Sensor Variance

In [None]:
# Per-sensor variance over the whole training frame, largest first
sensor_variance = train_df.loc[:, sensor_cols].var().sort_values(ascending=False)

# Bar chart of the sorted variances
fig, ax = plt.subplots(figsize=(12, 6))
sensor_variance.plot.bar(ax=ax, color='skyblue', edgecolor='black')
ax.set(xlabel='Sensor', ylabel='Variance', title='Sensor Variance Analysis')
ax.grid(True, axis='y', alpha=0.3)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# List the sensors whose variance falls below the 0.01 cut-off
print("Low variance sensors (likely to be dropped):")
print(sensor_variance[sensor_variance.lt(0.01)])

## 5. Correlation Analysis

In [None]:
# Keep only the sensors that are not listed in loader.DROP_SENSORS
active_sensors = list(
    filter(lambda name: name not in loader.DROP_SENSORS, sensor_cols)
)

# Pairwise correlation between the remaining sensors
corr_matrix = train_df[active_sensors].corr()

# Heatmap on a diverging colour scale centred at zero and pinned to [-1, 1]
fig, ax = plt.subplots(figsize=(12, 10))
heatmap_style = dict(
    cmap='coolwarm',
    center=0,
    vmin=-1,
    vmax=1,
    square=True,
    linewidths=0.5,
    cbar_kws={'label': 'Correlation'},
)
sns.heatmap(corr_matrix, ax=ax, **heatmap_style)
ax.set_title('Sensor Correlation Heatmap')
plt.tight_layout()
plt.show()

## 6. RUL vs Sensor Readings

In [None]:
# Sensors to compare against RUL (this list is reused by the degradation-pattern cell)
important_sensors = ['sensor_2', 'sensor_3', 'sensor_4', 'sensor_7', 'sensor_11', 'sensor_12']

# Draw ONE subsample up front instead of re-sampling inside the loop:
#  - every panel then shows the same rows, so panels are directly comparable
#  - the sampling work is done once rather than once per sensor
#  - a fixed random_state makes the figure reproducible on Restart & Run All
sample = train_df.sample(n=min(5000, len(train_df)), random_state=42)

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

# One scatter panel per sensor; points are coloured by their RUL value
for ax, sensor in zip(axes, important_sensors):
    scatter = ax.scatter(
        sample['RUL'],
        sample[sensor],
        c=sample['RUL'],
        cmap='viridis',
        alpha=0.3,
        s=10
    )

    ax.set_xlabel('RUL (cycles)')
    ax.set_ylabel(sensor)
    ax.set_title(f'{sensor} vs RUL')
    ax.grid(True, alpha=0.3)

    # Per-panel colour bar for the RUL colouring
    fig.colorbar(scatter, ax=ax, label='RUL')

plt.tight_layout()
plt.show()

## 7. Degradation Patterns

In [None]:
# Pick up to 5 distinct units to overlay.
#  - A seeded generator keeps the selection stable across Restart & Run All.
#  - The size is clamped because choice(..., replace=False) raises when asked
#    for more items than exist.
unit_ids = train_df['unit_id'].unique()
rng = np.random.default_rng(42)
sample_units = rng.choice(unit_ids, size=min(5, len(unit_ids)), replace=False)

# Slice each selected unit once, instead of re-filtering train_df for every
# sensor/unit pair. A separate name also avoids overwriting `unit_data`,
# which the single-unit sensor plot cell defines.
unit_frames = {unit: train_df[train_df['unit_id'] == unit] for unit in sample_units}

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

for i, sensor in enumerate(important_sensors):
    ax = axes[i]

    for unit, unit_df in unit_frames.items():
        # Scale each unit's cycle index by its final cycle so units with
        # different lifespans share one x-axis ending at 1
        normalized_time = unit_df['time_cycles'] / unit_df['time_cycles'].max()
        ax.plot(normalized_time, unit_df[sensor], alpha=0.7, label=f'Unit {unit}')

    ax.set_xlabel('Normalized Time (0=start, 1=failure)')
    ax.set_ylabel(sensor)
    ax.set_title(f'{sensor} Degradation Pattern')
    ax.grid(True, alpha=0.3)
    # The same units appear in every panel, so one legend is enough
    if i == 0:
        ax.legend()

plt.tight_layout()
plt.show()

## 8. Summary Statistics

In [None]:
# Per-unit lifespan (largest recorded cycle index), computed once and reused
# for the mean / min / max lines below
lifespans = train_df.groupby('unit_id')['time_cycles'].max()

print(f"Dataset: {dataset_name}")

# Training split
print("\nTraining Data:")
print(f"  Total samples: {len(train_df):,}")
print(f"  Number of units: {train_df['unit_id'].nunique()}")
print(f"  Average lifespan: {lifespans.mean():.1f} cycles")
print(f"  Min lifespan: {lifespans.min()} cycles")
print(f"  Max lifespan: {lifespans.max()} cycles")

# Test split, including the RUL values loaded alongside it
print("\nTest Data:")
print(f"  Total samples: {len(test_df):,}")
print(f"  Number of units: {test_df['unit_id'].nunique()}")
print(f"  Average RUL: {test_rul.mean():.1f} cycles")
print(f"  Min RUL: {test_rul.min()} cycles")
print(f"  Max RUL: {test_rul.max()} cycles")

# Sensor bookkeeping: all sensor columns vs. those kept for the correlation analysis
print("\nSensor Information:")
print(f"  Total sensors: {len(sensor_cols)}")
print(f"  Active sensors (after dropping low variance): {len(active_sensors)}")
print(f"  Dropped sensors: {loader.DROP_SENSORS}")

## Key Insights

From this exploration, we can observe:

1. **Sensor Variability**: Some sensors have very low variance and can be dropped
2. **Degradation Patterns**: Clear degradation trends in certain sensors as engines approach failure
3. **Correlations**: Strong correlations between related sensors (e.g., temperature sensors)
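4. **RUL Distribution**: RUL labels are generated with `max_rul=125` (Section 2), so the all-samples histogram should be read with that cap in mind: expect a concentration of samples at the cap in addition to the lower values that count down to failure, rather than assuming most samples are near failure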
5. **Operating Conditions**: Different datasets have different levels of complexity

## Next Steps

1. Preprocess data (normalization, windowing)
2. Build and train models
3. Evaluate performance
4. Analyze attention mechanisms
5. Compare with baselines