# 02 - Feature Engineering and Feature Store Setup

This notebook demonstrates:
- Loading and validating raw data
- Setting up the feature store
- Registering features
- Computing and storing features
- Exploring feature lineage

In [None]:
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from feature_store import FeatureStore
from data_validators import RawDataValidator
from features import ClinicalFeatureEngineer, get_feature_lineage, get_feature_descriptions

# Notebook-wide display settings.
# Setting display.max_columns to None lifts pandas' column limit, so wide
# DataFrames are rendered with every column visible.
pd.set_option('display.max_columns', None)
# Apply seaborn's 'whitegrid' style to the plots drawn later in the notebook.
sns.set_style('whitegrid')

# Confirmation message: reaching this line means every import above succeeded.
print("Libraries imported successfully!")

## 1. Load Raw Data

In [None]:
# Read the synthetic patient table from the raw-data folder
raw_csv_path = '../data/raw/synthetic_patients.csv'
raw_data = pd.read_csv(raw_csv_path)

n_raw_records = len(raw_data)
print(f"Loaded {n_raw_records} patient records")

# Preview the first rows as the cell output
raw_data.head()

## 2. Validate Raw Data

Before ingesting into the feature store, we validate to catch data quality issues.

In [None]:
# Initialize validator
# RawDataValidator is imported from the project's data_validators module and
# is constructed here without arguments.
validator = RawDataValidator()

# Run validation
# The report object is kept in a variable so it stays available after this
# cell; the same validator instance is reused later on the cleaned data.
validation_report = validator.validate(raw_data)

# Print report
# The report is asked to print itself via its print_report() method.
validation_report.print_report()

The validation report flags the errors that were intentionally included in the synthetic data. Let's filter out the invalid rows before proceeding.

In [None]:
# Remove rows with invalid data.
# Every column is checked against BOTH ends of its plausible range: a negative
# TMB score or comorbidity count is just as invalid as an out-of-range age.
# Series.between is inclusive on both ends, and a NaN in a checked column
# evaluates to False, so such rows are dropped exactly as they were with the
# plain comparison operators.
valid_rows = (
    raw_data['age'].between(0, 120) &
    raw_data['tmb_score'].between(0, 100) &
    raw_data['comorbidity_count'].between(0, 10)
)

# .copy() makes clean_data an independent DataFrame instead of a slice of
# raw_data, so modifying it later cannot trigger SettingWithCopyWarning or
# be confused with a write to the raw frame.
clean_data = raw_data[valid_rows].copy()

print(f"Removed {len(raw_data) - len(clean_data)} invalid records")
print(f"Clean dataset: {len(clean_data)} patients")

# Re-validate with the same validator to confirm the filter removed the
# problems reported for the raw data
clean_report = validator.validate(clean_data)
clean_report.print_report()

## 3. Initialize Feature Store

In [None]:
# Build the feature store, pointing it at its DuckDB file and config directory
feature_store_kwargs = {
    'db_path': '../data/feature_store.duckdb',
    'config_dir': '../config',
}
fs = FeatureStore(**feature_store_kwargs)

print("Feature store initialized!")

## 4. Register Features

Register features from our configuration file.

In [None]:
# Register the feature definitions found in the store's configuration
fs.register_features_from_config()

# Fetch the registry and report how many entries it holds
features_df = fs.list_features()
n_registered = len(features_df)
print(f"\nRegistered {n_registered} features")

# Display only the key metadata columns as the cell output
summary_columns = ['feature_name', 'feature_type', 'category', 'version']
features_df[summary_columns]

## 5. Ingest Raw Data

In [None]:
# Store the cleaned records in the feature store as data version 1,
# with the store's own validation switched on
fs.ingest_raw_data(
    clean_data,
    validate=True,
    data_version=1,
)

## 6. Compute Features

Apply feature engineering pipeline to create ML-ready features.

In [None]:
# Run the feature pipeline for feature version 1 with validation enabled
features = fs.compute_features(
    feature_version=1,
    validate=True,
)

n_patients_with_features = len(features)
print(f"\nComputed features for {n_patients_with_features} patients")

# Preview the computed feature table as the cell output
features.head()

## 7. Explore Computed Features

In [None]:
# Histogram each standardized feature; after scaling, both should have a mean
# near 0 and a standard deviation near 1 (shown in each panel title)
scaled_panels = [
    # (column, x-axis label, title prefix)
    ('age_scaled', 'Age (scaled)', 'Scaled Age'),
    ('tmb_score_scaled', 'TMB Score (scaled)', 'Scaled TMB'),
]

fig, axes = plt.subplots(1, len(scaled_panels), figsize=(14, 5))

for ax, (column, x_label, title_prefix) in zip(axes, scaled_panels):
    values = features[column]
    ax.hist(values, bins=30, edgecolor='black')
    ax.set_xlabel(x_label)
    ax.set_ylabel('Count')
    ax.set_title(f'{title_prefix}: mean={values.mean():.3f}, std={values.std():.3f}')
    # Dashed reference line at the expected post-scaling mean
    ax.axvline(0, color='red', linestyle='--', label='Mean=0')
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# One panel per derived feature: mutation burden, risk score, age group
fig, axes = plt.subplots(1, 3, figsize=(16, 5))
ax_burden, ax_risk, ax_age = axes

# Panel 1: patients per mutation-burden value, ordered by the value itself
mutation_counts = features['mutation_burden'].value_counts().sort_index()
ax_burden.bar(mutation_counts.index, mutation_counts.values)
ax_burden.set(
    xlabel='Number of Mutations',
    ylabel='Count',
    title='Mutation Burden Distribution',
)
ax_burden.set_xticks([0, 1, 2, 3])

# Panel 2: risk-score histogram with the high-risk cut-off marked at 20
ax_risk.hist(features['clinical_risk_score'], bins=30, edgecolor='black')
ax_risk.set(
    xlabel='Clinical Risk Score',
    ylabel='Count',
    title='Clinical Risk Score Distribution',
)
ax_risk.axvline(20, color='red', linestyle='--', label='High risk threshold')
ax_risk.legend()

# Panel 3: patients per age group (value_counts order, most frequent first)
age_group_counts = features['age_group'].value_counts()
ax_age.bar(age_group_counts.index, age_group_counts.values)
ax_age.set(
    xlabel='Age Group',
    ylabel='Count',
    title='Age Group Distribution',
)
ax_age.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Count and share of patients flagged as high risk (flag value 1)
is_high_risk = features['high_risk_patient'] == 1
print(f"\nHigh-risk patients: {is_high_risk.sum()} ({is_high_risk.mean()*100:.1f}%)")

## 8. Feature Lineage

Trace each engineered feature back to the raw columns it was derived from.

In [None]:
# Get feature lineage: a mapping of feature name -> source column names,
# plus a separate mapping of feature name -> description
lineage = get_feature_lineage()
descriptions = get_feature_descriptions()

print("Feature Lineage:\n")
for feature_name, source_cols in lineage.items():
    # The two mappings come from separate helpers and may fall out of sync.
    # Fall back to a placeholder instead of raising KeyError, so one missing
    # description does not abort the whole listing.
    description = descriptions.get(feature_name, '(no description available)')

    print(f"{feature_name}:")
    print(f"  Sources: {', '.join(source_cols)}")
    print(f"  Description: {description}")
    print()

## 9. Retrieve Features from Store

Show how to retrieve features for specific patients or feature sets.

In [None]:
# Use the IDs of the first ten cleaned patients as the lookup keys
first_ten_patients = clean_data.head(10)
patient_ids = first_ten_patients['patient_id'].tolist()

# Fetch version-1 features for just those patients
patient_features = fs.get_features(
    patient_ids=patient_ids,
    feature_version=1,
)

n_retrieved = len(patient_features)
print(f"Retrieved features for {n_retrieved} patients")

# Display the retrieved rows as the cell output
patient_features

In [None]:
# Request a subset of feature columns; no patient filter is passed
selected_features = ['mutation_burden', 'clinical_risk_score', 'high_risk_patient']
specific_features = fs.get_features(
    feature_list=selected_features,
    feature_version=1,
)

n_rows = len(specific_features)
print(f"Retrieved {n_rows} rows with selected features")

# Preview the result as the cell output
specific_features.head()

## 10. Data Quality History

In [None]:
# Ask the store for its data quality check history, limited to 10 entries
quality_history = fs.get_data_quality_history(limit=10)

print("Recent data quality checks:")

# Show only the summary columns of each check as the cell output
quality_columns = [
    'check_timestamp',
    'check_type',
    'passed',
    'error_count',
    'warning_count',
]
quality_history[quality_columns]

## Summary

In this notebook we:
1. Validated raw clinical data
2. Set up a feature store with DuckDB
3. Registered features with metadata
4. Computed and stored features
5. Explored feature lineage
6. Retrieved features for ML

Next: `03_model_training_demo.ipynb` - Train a model using the feature store!

In [None]:
# Close feature store connection
# Run this last: the earlier cells all call methods on `fs`, so it must stay
# open until they have finished.
fs.close()

In [None]:
## End of Notebook ##