# 03 - Data Preparation

**CRISP-DM Phase 3: Data Preparation**

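This notebook covers data cleaning, Remaining Useful Life (RUL) labeling, failure-window labeling for classification, feature engineering, the train/validation/test split, and saving the processed datasets.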

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# Make the project's `src` directory importable. The path is resolved from the
# current working directory, so this assumes the kernel was started in a folder
# that sits directly under the project root (e.g. `notebooks/`).
SRC_DIR = os.path.join(os.path.dirname(os.getcwd()), 'src')
# Guard the append so re-running this cell does not pile up duplicate entries.
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

from data_processing.rul_calculator import RULCalculator
from data_processing.event_labeler import EventLabeler
from feature_engineering.time_series_features import TimeSeriesFeatureGenerator
from feature_engineering.statistical_features import StatisticalFeatureGenerator
from utils.helpers import load_config, handle_missing_values, save_dataframe

# Use seaborn's 'whitegrid' style for the plots in this notebook.
sns.set_style('whitegrid')
# Load the project configuration via utils.helpers.load_config. The disabled
# cells below read keys such as config['data_processing'],
# config['modeling'] and config['feature_engineering'] from it.
# NOTE(review): load_config() is called without a path — confirm which file it
# resolves and whether that depends on the working directory.
config = load_config()

## 1. Load Raw Data

In [None]:
# Load data
# Disabled scaffold: reads the two raw CSVs into `sensor_data` and
# `failure_events`, the frames every later cell in this notebook refers to.
# NOTE(review): these paths are relative to the working directory, while the
# setup cell resolves `src` from the *parent* of the working directory —
# confirm whether they should point at '../data/raw/...' instead.
# sensor_data = pd.read_csv('data/raw/sensor_data.csv')
# failure_events = pd.read_csv('data/raw/failure_events.csv')

print("Placeholder: Load raw data")

## 2. Data Cleaning

In [None]:
# Handle missing values
# Disabled scaffold: the strategy name comes from
# config['data_processing']['missing_value_strategy'] and is forwarded to
# handle_missing_values (imported from utils.helpers).
# strategy = config['data_processing']['missing_value_strategy']
# sensor_data = handle_missing_values(sensor_data, strategy=strategy)

# Convert timestamps
# Parse the timestamp columns of both frames with pd.to_datetime.
# NOTE(review): presumably the RUL and labeling steps need real datetimes
# rather than strings — confirm against RULCalculator / EventLabeler.
# sensor_data['timestamp'] = pd.to_datetime(sensor_data['timestamp'])
# failure_events['failure_timestamp'] = pd.to_datetime(failure_events['failure_timestamp'])

print("Placeholder: Clean data")

## 3. RUL Calculation

In [None]:
# Initialize RUL calculator
# Disabled scaffold: clip_max is read from
# config['data_processing']['rul_clip_max'] and passed to RULCalculator.
# NOTE(review): presumably an upper cap applied to the computed RUL — confirm
# in data_processing.rul_calculator.
# clip_max = config['data_processing']['rul_clip_max']
# rul_calc = RULCalculator(clip_max=clip_max)

# Compute RUL
# The result is assigned back to `sensor_data`; the histogram below reads an
# 'RUL' column from it.
# sensor_data = rul_calc.compute_rul(
#     sensor_data,
#     failure_events,
#     time_unit='hours'
# )

# Display RUL distribution
# Keep the x-axis label in sync with the time_unit passed to compute_rul above.
# sensor_data['RUL'].hist(bins=50, figsize=(10, 6))
# plt.title('RUL Distribution')
# plt.xlabel('RUL (hours)')
# plt.ylabel('Frequency')
# plt.show()

print("Placeholder: Calculate RUL")

## 4. Failure Labeling for Classification

In [None]:
# Initialize event labeler
# Disabled scaffold: the prediction horizon is read from
# config['modeling']['failure_classification']['prediction_horizon_days'] and
# passed to EventLabeler together with time_unit='days'.
# horizon_days = config['modeling']['failure_classification']['prediction_horizon_days']
# labeler = EventLabeler(prediction_horizon=horizon_days, time_unit='days')

# Label failure windows
# The result is assigned back to `sensor_data`; the checks below read a
# 'failure_label' column from it.
# sensor_data = labeler.label_failure_window(sensor_data, failure_events)

# Check class balance
# NOTE(review): using .mean() as the positive-class ratio assumes
# 'failure_label' is encoded as 0/1 — confirm in EventLabeler.
# print("Class distribution:")
# print(sensor_data['failure_label'].value_counts())
# print(f"\nPositive class ratio: {sensor_data['failure_label'].mean():.4f}")

print("Placeholder: Label failure windows")

## 5. Feature Engineering

In [None]:
# Initialize feature generators
# Disabled scaffold: rolling-window sizes and lag sizes come from
# config['feature_engineering'] and configure TimeSeriesFeatureGenerator;
# StatisticalFeatureGenerator is built with no arguments.
# window_sizes = config['feature_engineering']['rolling_windows']
# lag_sizes = config['feature_engineering']['lag_features']

# ts_generator = TimeSeriesFeatureGenerator(window_sizes=window_sizes, lag_sizes=lag_sizes)
# stat_generator = StatisticalFeatureGenerator()

# Generate time-series features
# Each generator's output is assigned back to `sensor_data`; the shape is
# printed after each step so the change in size is visible.
# sensor_data = ts_generator.generate_features(sensor_data)
# print(f"After time-series features: {sensor_data.shape}")

# Generate statistical features
# sensor_data = stat_generator.generate_features(sensor_data)
# print(f"After statistical features: {sensor_data.shape}")

print("Placeholder: Generate features")

## 6. Train/Validation/Test Split

In [None]:
# NOTE(review): when this cell is enabled, move the import below into the
# import cell at the top of the notebook so all dependencies are declared there.
# from sklearn.model_selection import train_test_split

# Split data
# Disabled scaffold: test_size and validation_size are read from
# config['data_processing'].
# test_size = config['data_processing']['test_size']
# val_size = config['data_processing']['validation_size']

# Two-stage split with a fixed random_state. The second call divides val_size
# by (1 - test_size) because it operates on the remainder left after the test
# split, so the validation set ends up as val_size of the full dataset.
# NOTE(review): this is a random row-level split applied after rolling-window
# and lag features were generated, so neighbouring rows may land in different
# splits and leak information — consider a time-based or per-asset split;
# confirm with the modeling notebook.
# train_val, test = train_test_split(sensor_data, test_size=test_size, random_state=42)
# train, val = train_test_split(train_val, test_size=val_size/(1-test_size), random_state=42)

# print(f"Train size: {len(train)}")
# print(f"Validation size: {len(val)}")
# print(f"Test size: {len(test)}")

print("Placeholder: Split data")

## 7. Save Processed Data

In [None]:
# Save processed data
# Disabled scaffold: writes the three splits with save_dataframe (imported from
# utils.helpers). `train`, `val` and `test` only exist once the split cell
# above has been enabled and run.
# NOTE(review): the output paths are relative to the working directory, while
# the setup cell resolves `src` from the *parent* of the working directory —
# confirm whether they should point at '../data/processed/...' instead.
# save_dataframe(train, 'data/processed/train_data.csv')
# save_dataframe(val, 'data/processed/val_data.csv')
# save_dataframe(test, 'data/processed/test_data.csv')

print("Placeholder: Save processed data")

## Next Steps

Proceed to **04 - Modeling** to train XGBoost models for RUL prediction and failure classification.