### Handling Missing Values - Imputation within ML Pipelines
**Description**: Implement a machine learning pipeline that includes imputation and a classifier.

In [None]:
# write your code from here

In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import unittest

# Load the iris data and inject artificial missing values.
# Produces the module-level names `X` (feature DataFrame containing NaNs) and
# `y` (class labels) used by the rest of the script.
try:
    # Load dataset
    iris = load_iris()
    y = iris.target

    # Work on a private float copy of the feature matrix. Writing NaNs through
    # `DataFrame.values` is unreliable: the returned array may be a copy or a
    # read-only view (pandas Copy-on-Write), and when it *is* a writable view
    # the write also leaks into `iris.data`.
    data = np.array(iris.data, dtype=float)

    # Introduce some artificial missing values.
    # Row/column indices are drawn independently with replacement, so the same
    # cell can be picked twice; `n_missing_samples` is therefore an upper
    # bound on the number of NaN cells.
    rng = np.random.RandomState(0)
    missing_rate = 0.1
    n_missing_samples = int(np.floor(missing_rate * data.size))
    missing_features = rng.randint(0, data.shape[1], n_missing_samples)
    missing_samples = rng.randint(0, data.shape[0], n_missing_samples)
    data[missing_samples, missing_features] = np.nan

    X = pd.DataFrame(data, columns=iris.feature_names)

except Exception as e:
    print(f"Data loading or preprocessing failed: {e}")
    # Stop with a non-zero status; bare `exit()` is an interactive helper
    # injected by `site` and would report success.
    raise SystemExit(1) from e

# Split dataset into training and test sets (80% / 20%, fixed seed so the
# reported accuracy is reproducible).
try:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
except Exception as e:
    print(f"Train-test split failed: {e}")
    # Non-zero status signals the failure; bare `exit()` is an interactive
    # helper injected by `site` and would report success.
    raise SystemExit(1) from e

# Create ML pipeline. Keeping imputation and scaling inside the pipeline means
# their statistics are learned from whatever data is passed to `fit` only.
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),        # Handle missing values
    ('scaler', StandardScaler()),                       # Feature scaling
    ('classifier', RandomForestClassifier(random_state=42))  # Classifier
])

# Fit model
try:
    pipeline.fit(X_train, y_train)
except Exception as e:
    print(f"Pipeline training failed: {e}")
    raise SystemExit(1) from e

# Predict and evaluate.
# A failure here is reported but deliberately not fatal: nothing later in the
# script needs `y_pred` or `accuracy`.
try:
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Test Accuracy: {accuracy:.2f}")
except Exception as e:
    print(f"Prediction or evaluation failed: {e}")

# ----------------------------
# ✅ Unit Test to Validate Pipeline Behavior
# ----------------------------
class TestPipeline(unittest.TestCase):
    """Smoke tests for the imputer -> scaler -> classifier pipeline.

    Every test runs against a fresh, unfitted clone of the module-level
    ``pipeline`` so that fitting on toy data neither overwrites the model
    trained on the real dataset nor leaks state between tests.
    """

    def setUp(self):
        # Imported locally so the class carries its own dependency.
        from sklearn.base import clone

        # `clone` copies the configuration (steps and hyper-parameters) but
        # none of the fitted state.
        self.model = clone(pipeline)

    def test_imputer_handles_nan(self):
        """Fitting and predicting on data containing NaNs must not raise."""
        test_data = pd.DataFrame({
            'feature1': [1, 2, np.nan],
            'feature2': [4, np.nan, 6],
            'feature3': [7, 8, 9],
            'feature4': [np.nan, 11, 12]
        })
        test_target = [0, 1, 0]
        self.model.fit(test_data, test_target)
        predictions = self.model.predict(test_data)
        # One prediction per input row.
        self.assertEqual(len(predictions), 3)

    def test_input_type_validation(self):
        """A non-numeric value in a feature column is rejected by ``fit``."""
        # Introduce string in numeric column
        test_data = pd.DataFrame({
            'feature1': [1, 2, 'error'],
            'feature2': [4, 5, 6],
            'feature3': [7, 8, 9],
            'feature4': [10, 11, 12]
        })
        # Only `fit` is inside the context manager, so a ValueError raised
        # while building the fixture cannot make the test pass by accident.
        with self.assertRaises(ValueError):
            self.model.fit(test_data, [0, 1, 0])

# Run the unit tests in-process.
# argv=[''] supplies only a placeholder program name, so unittest does not try
# to parse the host process's own command-line arguments (e.g. those of a
# notebook kernel). exit=False stops unittest from calling sys.exit() when the
# run finishes, so execution continues after the tests.
unittest.main(argv=[''], exit=False)


..
----------------------------------------------------------------------
Ran 2 tests in 0.117s

OK


Test Accuracy: 0.97


<unittest.main.TestProgram at 0x7c8d773691b0>