# 04 - Data Quality Monitoring

This notebook demonstrates:
- Generating data quality reports
- Detecting distribution drift
- Monitoring feature correlations
- Tracking data quality over time
- Best practices for ML monitoring

In [None]:
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from feature_store import FeatureStore
from monitoring import DataQualityMonitor
from data_generator import ClinicalDataGenerator

# Notebook-wide display defaults: show every column of wide frames and
# draw all figures on seaborn's white grid theme.
pd.options.display.max_columns = None
sns.set_style(style='whitegrid')

print("Libraries imported successfully!")

## 1. Load Data from Feature Store

In [None]:
# Open the feature store using the project's relative database/config paths
store_paths = {
    'db_path': '../data/feature_store.duckdb',
    'config_dir': '../config',
}
fs = FeatureStore(**store_paths)

# Pull the raw patient table and version 1 of the engineered features
raw_data = fs.get_raw_data()
features = fs.get_features(feature_version=1)

# Row counts act as a quick sanity check that both loads returned data
print("Loaded {} patient records".format(len(raw_data)))
print("Loaded {} feature records".format(len(features)))

## 2. Initialize Monitor and Generate Report

In [None]:
# The monitor is bound to the open feature store
monitor = DataQualityMonitor(feature_store=fs)

# Build the full quality report for the raw table; the call returns the
# location of the generated report, which is echoed below.
report_options = {
    'df': raw_data,
    'report_name': "raw_data_quality",
    'output_dir': "../reports",
}
report_path = monitor.generate_quality_report(**report_options)

print("\nReport generated: {}".format(report_path))
print("Open the HTML file in a browser to view the full report.")

## 3. Missing Data Analysis

In [None]:
# Missing-rate level (in percent) drawn as the reference line on the chart.
MISSING_RATE_THRESHOLD_PCT = 15

# Compute per-column missing data metrics
missing_metrics = monitor.compute_missing_data_metrics(raw_data)

# One row per column; keep only columns that actually have gaps, worst first
missing_df = pd.DataFrame(missing_metrics).T
missing_df = missing_df[missing_df['missing_rate'] > 0].sort_values('missing_rate', ascending=False)

print("Missing Data Summary:")
if missing_df.empty:
    # Nothing to tabulate or plot - say so instead of rendering an empty chart
    print("  No columns contain missing values")
else:
    print(missing_df[['missing_count', 'missing_rate']])

    # Transposing a frame with mixed metric types can leave object dtype,
    # so coerce to float before scaling to percent for plotting.
    missing_pct = missing_df['missing_rate'].astype(float) * 100

    # Visualize
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(missing_df.index, missing_pct)
    # barh draws the first row at the bottom; flip the axis so the chart
    # reads top-to-bottom in the same order as the printed table.
    ax.invert_yaxis()
    ax.set_xlabel('Missing Data (%)')
    ax.set_title('Missing Data by Column')
    ax.axvline(MISSING_RATE_THRESHOLD_PCT, color='red', linestyle='--',
               label=f'{MISSING_RATE_THRESHOLD_PCT}% threshold')
    ax.legend()
    plt.tight_layout()
    plt.show()

## 4. Distribution Analysis

In [None]:
# Per-column distribution metrics from the monitor, reshaped so each
# analysed column becomes one row of the summary table.
dist_metrics = monitor.compute_distribution_metrics(raw_data)
dist_df = pd.DataFrame(dist_metrics).transpose()

# Show the headline statistics for the first ten columns only
summary_columns = ['count', 'mean', 'median', 'std']
print("Distribution Statistics:")
print(dist_df[summary_columns].head(10))

In [None]:
# Histogram each key feature on its own panel of a 2x2 grid, marking the
# mean (red) and median (green) so skew is visible at a glance.
key_features = ['age', 'tmb_score', 'wbc_count', 'survival_months']

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.ravel()

for panel, column in zip(axes, key_features):
    # Missing values are dropped before plotting and before computing the markers
    observed = raw_data[column].dropna()
    mean_value = observed.mean()
    median_value = observed.median()

    panel.hist(observed, bins=30, edgecolor='black', alpha=0.7)
    panel.axvline(mean_value, color='red', linestyle='--', label=f'Mean: {mean_value:.2f}')
    panel.axvline(median_value, color='green', linestyle='--', label=f'Median: {median_value:.2f}')
    panel.set(xlabel=column, ylabel='Count', title=f'{column} Distribution')
    panel.legend()

plt.tight_layout()
plt.show()

## 5. Categorical Feature Analysis

In [None]:
# Summarise every categorical column: cardinality and its dominant value
cat_metrics = monitor.compute_categorical_metrics(raw_data)

print("Categorical Feature Summary:\n")
for column, stats in cat_metrics.items():
    summary_lines = (
        f"{column}:",
        f"  Unique values: {stats['unique_values']}",
        f"  Most common: {stats['most_common']} ({stats['most_common_pct']:.1f}%)",
        "",  # trailing blank line separates consecutive features
    )
    print(*summary_lines, sep="\n")

In [None]:
# Four categorical views on one 2x2 figure, one named axis per panel
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
(ax_diagnosis, ax_response), (ax_msi, ax_sex) = axes

# Diagnosis: horizontal bars keep the category labels readable
diagnosis_counts = raw_data['diagnosis'].value_counts()
ax_diagnosis.barh(diagnosis_counts.index, diagnosis_counts.values)
ax_diagnosis.set(xlabel='Count', title='Diagnosis Distribution')

# Treatment response: bars at integer positions, labelled with the categories
response_counts = raw_data['treatment_response'].value_counts()
response_positions = range(len(response_counts))
ax_response.bar(response_positions, response_counts.values)
ax_response.set_xticks(response_positions)
ax_response.set_xticklabels(response_counts.index, rotation=45, ha='right')
ax_response.set(ylabel='Count', title='Treatment Response Distribution')

# MSI status: share of each status as a pie
msi_counts = raw_data['msi_status'].value_counts()
ax_msi.pie(msi_counts.values, labels=msi_counts.index, autopct='%1.1f%%')
ax_msi.set_title('MSI Status Distribution')

# Sex: plotted as-is so inconsistently formatted labels show up as separate bars
sex_counts = raw_data['sex'].value_counts()
sex_positions = range(len(sex_counts))
ax_sex.bar(sex_positions, sex_counts.values)
ax_sex.set_xticks(sex_positions)
ax_sex.set_xticklabels(sex_counts.index, rotation=45)
ax_sex.set(ylabel='Count', title='Sex Distribution (Note: Inconsistent Format)')

plt.tight_layout()
plt.show()

## 6. Feature Correlation Analysis

In [None]:
# Ask the monitor for correlation metrics using a 0.5 threshold
corr_metrics = monitor.compute_correlation_metrics(raw_data, threshold=0.5)

print("High Correlations Detected:\n")
flagged_pairs = corr_metrics.get('high_correlations')
if not flagged_pairs:
    # Key absent or list empty: nothing crossed the threshold
    print("  No high correlations above threshold")
else:
    for pair in flagged_pairs:
        print(f"  {pair['feature1']} <-> {pair['feature2']}: {pair['correlation']:.3f}")

In [None]:
# Pairwise correlations between the numeric clinical columns, as a heatmap
numeric_cols = [
    'age',
    'comorbidity_count',
    'tmb_score',
    'wbc_count',
    'hemoglobin',
    'platelet_count',
    'survival_months',
]
corr_matrix = raw_data[numeric_cols].corr()

fig, ax = plt.subplots(figsize=(10, 8))
# Draw onto the axes created above explicitly rather than relying on the
# implicit "current axes"; the colour map is centred on zero correlation.
heatmap_style = dict(annot=True, fmt='.2f', cmap='coolwarm', center=0,
                     square=True, linewidths=1, cbar_kws={"shrink": 0.8})
sns.heatmap(corr_matrix, ax=ax, **heatmap_style)
ax.set_title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

## 7. Simulate Distribution Drift

Generate a new batch of data and compare it against the original data to detect drift.

In [None]:
# Generate a second batch to stand in for data arriving later in production.
# NOTE(review): the only change visible here is the seed (100); whether that
# yields a genuinely shifted distribution or just a fresh sample depends on
# ClinicalDataGenerator - confirm before reading much into the drift results.
generator = ClinicalDataGenerator(seed=100)
new_data = generator.generate_dataset(n_patients=500, introduce_errors=False)

# Keep only rows inside the accepted ranges. Comparisons against NaN are
# False, so rows missing any of these three values are dropped as well.
age_in_range = new_data['age'].between(0, 120)
tmb_in_range = new_data['tmb_score'].le(100)
comorbidity_in_range = new_data['comorbidity_count'].le(10)
new_data_clean = new_data[age_in_range & tmb_in_range & comorbidity_in_range]

print(f"Original data: {len(raw_data)} patients")
print(f"New data batch: {len(new_data_clean)} patients")

In [None]:
# Labels for the two batches. The per-feature result keys read below follow
# the "<label>_mean" pattern, so the keys are built from these same
# variables and cannot fall out of sync with the labels passed to the monitor.
label_original = "Original"
label_new = "New Batch"

# Compare distributions
comparison = monitor.compare_distributions(
    df1=raw_data,
    df2=new_data_clean,
    label1=label_original,
    label2=label_new
)

original_mean_key = f"{label_original}_mean"
new_mean_key = f"{label_new}_mean"

# Keep only the features the monitor flagged as drifted
drifted = {
    feature: metrics
    for feature, metrics in comparison['differences'].items()
    if metrics['drift_detected']
}

# Display features with drift
print("Distribution Drift Analysis:\n")
if not drifted:
    # Report the clean case explicitly so an empty output is not mistaken
    # for a cell that failed to run.
    print("  No drift detected between batches")
for feature, metrics in drifted.items():
    print(f"{feature}:")
    print(f"  Original mean: {metrics[original_mean_key]:.2f}")
    print(f"  New mean: {metrics[new_mean_key]:.2f}")
    print(f"  Drift: {metrics['mean_drift_pct']:.1f}%")
    print()

In [None]:
# Visualize drift for key features by overlaying both batches per feature
drift_features = ['age', 'tmb_score', 'comorbidity_count']

fig, axes = plt.subplots(1, 3, figsize=(16, 5))

for ax, feature in zip(axes, drift_features):
    original_values = raw_data[feature].dropna().to_numpy(dtype=float)
    new_values = new_data_clean[feature].dropna().to_numpy(dtype=float)

    # Both batches share one set of bin edges computed over their combined
    # range; independent bins=30 per batch would misalign the bars.
    bin_edges = np.histogram_bin_edges(
        np.concatenate([original_values, new_values]), bins=30
    )

    # The batches differ in size, so plot densities rather than raw counts;
    # otherwise the larger batch dominates regardless of distribution shape.
    shared_style = dict(bins=bin_edges, density=True, alpha=0.5, edgecolor='black')
    ax.hist(original_values, label='Original', **shared_style)
    ax.hist(new_values, label='New Batch', **shared_style)

    ax.set_xlabel(feature)
    ax.set_ylabel('Density')
    ax.set_title(f'{feature} Distribution Comparison')
    ax.legend()

plt.tight_layout()
plt.show()

## 8. Data Quality History

Track quality checks over time.

In [None]:
# Fetch quality-check records from the feature store (limit=20)
quality_history = fs.get_data_quality_history(limit=20)

# Show only the columns needed to read pass/fail status and issue counts
history_columns = ['check_timestamp', 'check_type', 'passed', 'error_count', 'warning_count']

print("Recent Data Quality Checks:")
print(quality_history[history_columns])

## 9. Best Practices for ML Monitoring

### Key Monitoring Metrics:

1. **Data Quality Metrics**
   - Missing data rates
   - Invalid values
   - Data type errors

2. **Distribution Metrics**
   - Mean/median shifts
   - Standard deviation changes
   - Outlier detection

3. **Feature Metrics**
   - Feature correlations
   - Feature importance drift
   - New categorical values

4. **Model Performance**
   - Prediction distribution
   - Confidence scores
   - Error rates by segment

### Alerting Thresholds:

- Missing data > 20%: **Alert**
- Distribution shift > 20%: **Warning**
- New categorical values: **Warning**
- Model performance drop > 5%: **Alert**

### Monitoring Frequency:

- **Real-time**: Critical features and model predictions
- **Daily**: Data quality and distribution checks
- **Weekly**: Feature importance and correlation analysis
- **Monthly**: Comprehensive reports and trend analysis

## Summary

In this notebook we:
1. Generated comprehensive data quality reports
2. Analyzed missing data patterns
3. Monitored feature distributions
4. Detected distribution drift between batches
5. Tracked feature correlations
6. Reviewed quality check history
7. Learned monitoring best practices

### Why Monitoring Matters:

- **Prevent Silent Failures**: Catch data issues before they affect models
- **Detect Drift**: Identify when data patterns change over time
- **Maintain Quality**: Ensure consistent feature engineering
- **Enable Debugging**: Track down root causes of issues
- **Build Trust**: Demonstrate data quality to stakeholders

In [None]:
# Close the feature store now that every analysis cell has run
fs.close()

print()
print("Monitoring demo complete!")

In [None]:
## End of Notebook ##