## Real-World Case Studies

### Healthcare - Medical Prediction Errors:
**Description**: Implement validation rules using a healthcare dataset to reduce errors in
predictive models by automating data quality checks.

In [None]:
# write your code from here

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import unittest

# --- Step 1: Simulated Healthcare Data (e.g., predicting disease from vitals/lab tests) ---
def generate_healthcare_data():
    np.random.seed(0)
    size = 1000
    data = pd.DataFrame({
        'Age': np.random.randint(18, 90, size),
        'Blood_Pressure': np.random.normal(120, 15, size),
        'Cholesterol': np.random.normal(200, 30, size),
        'Heart_Rate': np.random.normal(75, 10, size),
        'Diabetes': np.random.choice([0, 1], size=size, p=[0.85, 0.15]),  # target variable
    })

    # Introduce some invalid data to test validation
    data.loc[5, 'Age'] = 200       # Invalid age
    data.loc[10, 'Cholesterol'] = -50  # Invalid cholesterol
    return data

# --- Step 2: Validation Rules for Data Quality ---
def validate_healthcare_data(df):
    errors = []
    if df['Age'].between(0, 120).mean() < 0.99:
        errors.append("Age out of valid range (0-120)")
    if (df['Blood_Pressure'] < 0).any():
        errors.append("Negative blood pressure values found")
    if (df['Cholesterol'] < 0).any():
        errors.append("Negative cholesterol values found")
    if (df['Heart_Rate'] < 0).any():
        errors.append("Negative heart rate values found")
    
    if errors:
        raise ValueError(f"Data validation failed: {errors}")
    return True

# --- Step 3: Preprocessing ---
def preprocess_data(df):
    df_clean = df[(df['Age'] <= 120) & (df['Cholesterol'] >= 0)]
    X = df_clean.drop('Diabetes', axis=1)
    y = df_clean['Diabetes']
    return X, y

# --- Step 4: Splitting Data ---
def split_data(X, y):
    return train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# --- Step 5: Model Training ---
def train_model(X_train, y_train):
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    return model

# --- Step 6: Evaluation ---
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
    acc = accuracy_score(y_test, y_pred)
    if acc < 0.85:
        raise ValueError("Model accuracy below acceptable threshold")
    print(f"✅ Model Accuracy: {acc:.2f}")
    return acc

# --- Pipeline Execution ---
def run_pipeline():
    print("🔍 Generating Healthcare Data...")
    df = generate_healthcare_data()

    print("🛡️ Validating Data...")
    validate_healthcare_data(df)

    print("⚙️ Preprocessing...")
    X, y = preprocess_data(df)

    print("✂️ Splitting Data...")
    X_train, X_test, y_train, y_test = split_data(X, y)

    print("🎯 Training Model...")
    model = train_model(X_train, y_train)

    print("📊 Evaluating Model...")
    evaluate_model(model, X_test, y_test)

    print("✅ Pipeline completed successfully.")

# --- Unit Tests for Validation and Preprocessing ---
class TestMedicalPredictionPipeline(unittest.TestCase):

    def test_valid_data_pass(self):
        df = pd.DataFrame({
            'Age': [25, 40, 60],
            'Blood_Pressure': [110, 120, 130],
            'Cholesterol': [180, 200, 220],
            'Heart_Rate': [70, 72, 74],
            'Diabetes': [0, 1, 0]
        })
        self.assertTrue(validate_healthcare_data(df))

    def test_invalid_age_fails(self):
        df = pd.DataFrame({
            'Age': [25, 140, 60],
            'Blood_Pressure': [110, 120, 130],
            'Cholesterol': [180, 200, 220],
            'Heart_Rate': [70, 72, 74],
            'Diabetes': [0, 1, 0]
        })
        with self.assertRaises(ValueError):
            validate_healthcare_data(df)

    def test_preprocessing_removes_invalid_rows(self):
        df = pd.DataFrame({
            'Age': [25, 200],
            'Blood_Pressure': [110, 120],
            'Cholesterol': [180, -10],
            'Heart_Rate': [70, 72],
            'Diabetes': [0, 1]
        })
        X, y = preprocess_data(df)
        self.assertEqual(len(X), 1)

# --- Main Entry ---
if __name__ == "__main__":
    run_pipeline()
    print("\n🧪 Running Tests...")
    unittest.main(argv=[''], exit=False)


🔍 Generating Healthcare Data...
🛡️ Validating Data...


ValueError: Data validation failed: ['Negative cholesterol values found']