# 02 - Data Understanding

**CRISP-DM Phase 2: Data Understanding**

This notebook explores the sensor data, failure events, and data quality for the predictive maintenance project.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# Add the project's src/ directory to the import path.
# Assumes the kernel's working directory is a direct child of the project
# root, so that <parent of cwd>/src is the folder to import from.
# The membership check keeps the cell idempotent: re-running it does not
# append a duplicate entry to sys.path.
src_dir = os.path.join(os.path.dirname(os.getcwd()), 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

from data_extraction.sql_extractor import SQLExtractor
from utils.helpers import load_config, setup_logging

# Plot defaults applied to the figures created in this notebook
sns.set_style('whitegrid')  # seaborn grid theme
plt.rcParams.update({'figure.figsize': (12, 6)})  # default figure size (width, height)

# Project logger, configured by the helper from utils.helpers
logger = setup_logging()

## 1. Data Extraction

Extract data from the SQL Server database.

In [None]:
# Data extraction (template: nothing below is executed by default).
#
# The real extraction is left commented out so the notebook runs without
# database access. Before enabling it, make sure database_config.yaml is
# configured with your credentials.
#
# NOTE(review): every path below is relative to the kernel's working
# directory, while the import cell resolves src/ relative to the *parent* of
# the working directory. If the notebook runs from a sub-folder of the
# project, these paths probably need a '../' prefix; confirm before use.

# Option A: extract from SQL Server and write the raw CSV files.
# with SQLExtractor('config/database_config.yaml') as extractor:
#     sensor_data = extractor.extract_sensor_data(
#         start_date='2023-01-01',
#         end_date='2023-12-31',
#         output_path='data/raw/sensor_data.csv'
#     )
#
#     failure_events = extractor.extract_failure_events(
#         start_date='2023-01-01',
#         end_date='2023-12-31',
#         output_path='data/raw/failure_events.csv'
#     )

# Option B: load previously extracted CSV files instead of querying.
# sensor_data = pd.read_csv('data/raw/sensor_data.csv')
# failure_events = pd.read_csv('data/raw/failure_events.csv')

# Nothing has been extracted or loaded at this point, so report the cell as a
# placeholder (matching the other cells) instead of claiming completion.
print("Placeholder: Extract data from SQL Server or load existing files")

## 2. Initial Data Exploration

Explore basic statistics and structure of the data.

In [None]:
# Sensor data overview (template: uncomment once sensor_data is loaded).
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
# display() is used for the preview because head() is not the last expression
# of the cell and would otherwise not be rendered.
# print("Sensor Data Info:")
# sensor_data.info()
# print("\nFirst few rows:")
# display(sensor_data.head())

print("Placeholder: Display sensor data summary")

In [None]:
# Failure events overview (template: uncomment once failure_events is loaded).
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
# display() is used for the preview because head() is not the last expression
# of the cell and would otherwise not be rendered.
# print("Failure Events Info:")
# failure_events.info()
# print("\nFirst few rows:")
# display(failure_events.head())

print("Placeholder: Display failure events summary")

## 3. Data Quality Assessment

In [None]:
# Missing-value check (template: uncomment once sensor_data is loaded).
# Counts nulls per column and prints only the columns that have at least one.
# missing_data = sensor_data.isnull().sum()
# print("Missing values per column:")
# print(missing_data[missing_data > 0])

# Heatmap of the null mask (rows = records, columns = fields).
# NOTE(review): this draws every row of sensor_data; on a large extract it may
# be slow or unreadable, so consider plotting a sample instead.
# plt.figure(figsize=(12, 6))
# sns.heatmap(sensor_data.isnull(), cbar=False, yticklabels=False)
# plt.title('Missing Data Heatmap')
# plt.show()

print("Placeholder: Check for missing values")

## 4. Sensor Data Analysis

In [None]:
# Histogram per sensor column (template: uncomment once sensor_data is loaded).
# Sensor columns are selected by name: any column whose lower-cased name
# contains 'sensor'.
# sensor_cols = [col for col in sensor_data.columns if 'sensor' in col.lower()]
#
# One subplot row per sensor column. squeeze=False keeps `axes` a 2-D array
# even when there is exactly one sensor column; without it plt.subplots
# returns a bare Axes object and indexing it fails.
# fig, axes = plt.subplots(len(sensor_cols), 1, figsize=(12, 4*len(sensor_cols)), squeeze=False)
# for idx, col in enumerate(sensor_cols):
#     ax = axes[idx, 0]
#     sensor_data[col].hist(bins=50, ax=ax)
#     ax.set_title(f'{col} Distribution')
#     ax.set_xlabel(col)
#     ax.set_ylabel('Frequency')
# plt.tight_layout()
# plt.show()

print("Placeholder: Plot sensor distributions")

## 5. Failure Analysis

In [None]:
# Failure frequency per machine (template: uncomment once failure_events is loaded).
# groupby('machine_id') only yields machines that appear in failure_events, so
# len(failure_counts) is the number of machines with at least one failure, not
# the size of the whole fleet, and the mean is taken over those machines only.
# failure_counts = failure_events.groupby('machine_id').size()
# print(f"Machines with at least one failure: {len(failure_counts)}")
# print(f"Total failures: {failure_counts.sum()}")
# print(f"Failures per machine (mean): {failure_counts.mean():.2f}")

# Histogram of the per-machine failure counts.
# plt.figure(figsize=(10, 6))
# failure_counts.hist(bins=20)
# plt.title('Distribution of Failures per Machine')
# plt.xlabel('Number of Failures')
# plt.ylabel('Number of Machines')
# plt.show()

print("Placeholder: Analyze failure patterns")

## 6. Time Series Patterns

In [None]:
# Sensor readings over time for one sample machine (template: uncomment once
# sensor_data is loaded).
# The sample is the machine_id found in the first row; only that machine's
# first 1000 rows are plotted.
# NOTE(review): rows are taken in their existing order, presumably already
# chronological; sort by 'timestamp' first if that is not guaranteed.
# NOTE(review): if sensor_data came from a plain pd.read_csv, 'timestamp' is
# still text; convert it with pd.to_datetime for a proper time axis.
# sample_machine = sensor_data['machine_id'].iloc[0]
# machine_data = sensor_data[sensor_data['machine_id'] == sample_machine].head(1000)

# One subplot row per sensor column. squeeze=False keeps `axes` a 2-D array
# even when there is exactly one sensor column; without it plt.subplots
# returns a bare Axes object and indexing it fails.
# sensor_cols = [col for col in machine_data.columns if 'sensor' in col.lower()]
# fig, axes = plt.subplots(len(sensor_cols), 1, figsize=(14, 3*len(sensor_cols)), squeeze=False)
# for idx, col in enumerate(sensor_cols):
#     ax = axes[idx, 0]
#     ax.plot(machine_data['timestamp'], machine_data[col])
#     ax.set_title(f'{col} Over Time (Machine {sample_machine})')
#     ax.set_xlabel('Timestamp')
#     ax.set_ylabel(col)
# plt.tight_layout()
# plt.show()

print("Placeholder: Plot time series patterns")

## 7. Summary and Findings

Document key findings:
- Data quality issues identified
- Sensor value ranges and distributions
- Failure frequency and patterns
- Temporal patterns in sensor data

## Next Steps

Proceed to **03 - Data Preparation** to:
1. Clean and preprocess data
2. Compute RUL labels
3. Engineer features from time-series data
4. Prepare train/validation/test splits