### Ensuring Feature Consistency Between Training & Inference Pipelines

**Task 1**: Consistent Feature Preparation
- Step 1: Write a function for data preprocessing and imputation shared by both training and inference pipelines.
- Step 2: Demonstrate consistent application on both datasets.

In [None]:
# write your code from here

**Task 2**: Pipeline Integration
- Step 1: Use sklearn pipelines to encapsulate the preprocessing steps.
- Step 2: Configure identical pipelines for both training and inference.

In [None]:
# write your code from here

**Task 3**: Saving and Loading Preprocessing Models
- Step 1: Save the transformation model after fitting it to the training data.
- Step 2: Load and apply the saved model during inference.

In [None]:
# write your code from here

In [4]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer # For more robust preprocessing
import joblib
import logging
import os # For file operations and cleanup in tests
import unittest
import numpy as np # For assertions in tests

# --- Configuration ---
# All tunable settings live in this single mapping so they can be found and
# changed in one place. A larger project would typically move them into a
# dedicated config module or a YAML file.
CONFIG = dict(
    model_save_dir='models',                          # directory the fitted pipeline is written to
    pipeline_filename='preprocessing_pipeline.pkl',   # file name used inside model_save_dir
    imputation_strategy='mean',                       # forwarded to SimpleImputer(strategy=...)
    numeric_cols_to_process=None,                     # None -> auto-detect; otherwise a list of column names
)

# --- Logging Setup ---
# Timestamped, level-tagged messages on the root logger give feedback on each
# step and make failures easier to trace.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# --- Feature Preparation Functions ---

def load_data(data_path: str) -> pd.DataFrame:
    """
    Read a CSV file into a DataFrame, logging and re-raising any failure.

    Args:
        data_path (str): Location of the CSV file to read.

    Returns:
        pd.DataFrame: The parsed, non-empty contents of the file.

    Raises:
        FileNotFoundError: If nothing exists at ``data_path``.
        pd.errors.EmptyDataError: If pandas reports the file as empty, or if
            the file parses to a DataFrame for which ``DataFrame.empty`` is True.
        Exception: Any other error raised while reading; it is logged with its
            traceback and then propagated unchanged.
    """
    logging.info(f"Attempting to load data from: {data_path}")
    try:
        frame = pd.read_csv(data_path)
        # A file that parses but contains no rows is treated as empty as well.
        if frame.empty:
            raise pd.errors.EmptyDataError(f"The file at '{data_path}' is empty.")
    except FileNotFoundError:
        logging.error(f"Error: Data file not found at '{data_path}'. Please check the path.")
        raise
    except pd.errors.EmptyDataError as empty_err:
        logging.error(f"Error: {empty_err}")
        raise
    except Exception as unexpected_err:
        logging.error(
            f"An unexpected error occurred while loading data from '{data_path}': {unexpected_err}",
            exc_info=True,
        )
        raise
    else:
        logging.info("Data loaded successfully.")
        return frame

def create_preprocessing_pipeline(df: pd.DataFrame, imputer_strategy: str = 'mean', numeric_cols: list = None) -> Pipeline:
    """
    Build an unfitted scikit-learn Pipeline that imputes, then scales, numeric features.

    The returned pipeline does NOT select columns itself. ``numeric_cols`` is
    only used to validate the request and to decide whether there is anything
    to preprocess. Callers must pass the same numeric columns, in the same
    order, to ``fit`` and to every later ``transform`` call.

    Args:
        df (pd.DataFrame): Sample DataFrame used to auto-detect numeric columns
            (when ``numeric_cols`` is None) or to validate the given names.
        imputer_strategy (str): Strategy forwarded to ``SimpleImputer``
            (e.g., 'mean', 'median', 'most_frequent').
        numeric_cols (list, optional): Names of the numeric columns to process.
            If None, every numeric column of ``df`` is selected. Names missing
            from ``df`` are dropped with a warning.

    Returns:
        Pipeline: A pipeline with the single step ``'numerical_preprocessing'``
        (itself a Pipeline of ``'imputer'`` -> ``'scaler'``). When there are no
        numeric columns to process, a one-step ``'passthrough'`` pipeline is
        returned instead; it can be fitted and returns its input unchanged.

    Raises:
        ValueError: If ``numeric_cols`` is given and none of its entries exist
            in ``df``.
    """
    logging.info(f"Creating preprocessing pipeline with imputation strategy: '{imputer_strategy}'")

    if numeric_cols is None:
        # Nothing specified: fall back to every numeric-dtype column in the sample frame.
        numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
        logging.info(f"Auto-detected numeric columns for preprocessing: {numeric_cols}")
    else:
        # Validate the requested names against the sample frame.
        missing_cols = [col for col in numeric_cols if col not in df.columns]
        if missing_cols:
            logging.warning(f"The following specified numeric columns are not found in the DataFrame: {missing_cols}")
            # Keep going with the columns that do exist; fail only if none are left.
            numeric_cols = [col for col in numeric_cols if col in df.columns]
            if not numeric_cols:
                raise ValueError("No valid numeric columns found or specified for preprocessing.")
            logging.info(f"Proceeding with existing numeric columns: {numeric_cols}")

    if not numeric_cols:
        # A Pipeline with an empty step list cannot be fitted (scikit-learn
        # rejects it during step validation), so return a usable no-op
        # pipeline instead of an empty one.
        logging.warning("No numeric columns found or specified to apply preprocessing. Returning a pass-through pipeline.")
        return Pipeline(steps=[('passthrough', 'passthrough')])

    # Impute first so the scaler never sees missing values.
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy=imputer_strategy)),
        ('scaler', StandardScaler())
    ])

    # NOTE: only numeric features are handled here. If categorical features are
    # added later, wrap `numerical_transformer` in a ColumnTransformer together
    # with a categorical encoder so that column selection is part of the
    # fitted object.
    pipeline = Pipeline(steps=[
        ('numerical_preprocessing', numerical_transformer)
    ])

    return pipeline

def save_pipeline(pipeline: Pipeline, save_path: str):
    """
    Saves a fitted scikit-learn pipeline to disk using joblib.

    The parent directory of ``save_path`` is created if it does not exist. A
    bare filename (no directory component) is written relative to the current
    working directory.

    Args:
        pipeline (Pipeline): The fitted scikit-learn pipeline to save.
        save_path (str): The full path including filename to save the pipeline.

    Raises:
        Exception: Any error raised while creating the directory or writing the
            file is logged with its traceback and re-raised.
    """
    logging.info(f"Attempting to save pipeline to: {save_path}")
    try:
        target_dir = os.path.dirname(save_path)
        # dirname() is '' for a bare filename, and os.makedirs('') raises
        # FileNotFoundError, so only create a directory when one is named.
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        joblib.dump(pipeline, save_path)
        logging.info("Pipeline saved successfully.")
    except Exception as e:
        logging.error(f"Error saving pipeline to '{save_path}': {e}", exc_info=True)
        raise

def load_pipeline(load_path: str) -> Pipeline:
    """
    Restore a previously saved scikit-learn pipeline from disk via joblib.

    NOTE(review): joblib files are pickle-based, so this should presumably only
    be pointed at files from a trusted source - confirm against how callers
    obtain ``load_path``.

    Args:
        load_path (str): Full path, including the filename, of the saved pipeline.

    Returns:
        Pipeline: The object read back from ``load_path``.

    Raises:
        FileNotFoundError: If no file exists at ``load_path``.
        Exception: Any other error raised while loading; it is logged with its
            traceback and then propagated unchanged.
    """
    logging.info(f"Attempting to load pipeline from: {load_path}")
    try:
        restored = joblib.load(load_path)
    except FileNotFoundError:
        logging.error(f"Error: Pipeline file not found at '{load_path}'.")
        raise
    except Exception as load_err:
        logging.error(
            f"An unexpected error occurred while loading pipeline from '{load_path}': {load_err}",
            exc_info=True,
        )
        raise
    else:
        logging.info("Pipeline loaded successfully.")
        return restored

# --- Main Execution / Example Usage ---

def run_example_pipeline():
    """
    Demonstrate the full preprocessing lifecycle on small in-memory datasets.

    Steps performed:
        1. Build dummy train / test / inference DataFrames.
        2. Create the preprocessing pipeline and fit it on the training data.
        3. Transform the train and test data with the fitted pipeline.
        4. Save the fitted pipeline to the path configured in ``CONFIG``.
        5. Reload the pipeline from disk and apply the reloaded object to the
           inference data, as an inference job would.

    A failure in any pipeline step is logged and ends the demo early. The
    function returns None.
    """
    logging.info("\n--- Starting Pipeline Example ---")

    # --- 1. Data Loading (dummy DataFrames for the demonstration) ---
    # In a real scenario these frames would come from `load_data(<csv path>)`.
    logging.info("Creating dummy data for demonstration.")
    train_df = pd.DataFrame({'A': [1, 2, None, 4, 10], 'B': [5, None, 7, 8, 20], 'C': ['cat1', 'cat2', 'cat1', 'cat3', 'cat2']})
    test_df = pd.DataFrame({'A': [None, 2, 3], 'B': [5, 6, None], 'C': ['cat2', 'cat1', 'cat3']})
    inference_df = pd.DataFrame({'A': [2, None, 5], 'B': [None, 1, 8], 'C': ['cat1', 'cat3', 'cat2']})

    # --- 2. Pipeline Integration and Fitting ---
    logging.info("Creating and fitting the preprocessing pipeline on training data.")

    # Use the configured columns, or auto-detect the numeric ones from the training data.
    if CONFIG['numeric_cols_to_process'] is None:
        numeric_cols_for_pipeline = train_df.select_dtypes(include=np.number).columns.tolist()
    else:
        numeric_cols_for_pipeline = CONFIG['numeric_cols_to_process']

    try:
        preprocessing_pipeline = create_preprocessing_pipeline(
            train_df,
            imputer_strategy=CONFIG['imputation_strategy'],
            numeric_cols=numeric_cols_for_pipeline
        )
        # The pipeline does not select columns itself, so only the chosen
        # numeric columns are handed to fit (and to every transform below).
        preprocessing_pipeline.fit(train_df[numeric_cols_for_pipeline])
        logging.info("Preprocessing pipeline fitted successfully.")
    except Exception as e:
        logging.error(f"Error during pipeline creation or fitting: {e}", exc_info=True)
        return

    # --- 3. Transformation (training side) ---
    logging.info("Transforming train and test data using the fitted pipeline.")
    try:
        train_processed = preprocessing_pipeline.transform(train_df[numeric_cols_for_pipeline])
        test_processed = preprocessing_pipeline.transform(test_df[numeric_cols_for_pipeline])

        logging.info("Train after preprocessing (first 5 rows):\n%s", train_processed[:5])
        logging.info("Test after preprocessing (first 5 rows):\n%s", test_processed[:5])
    except Exception as e:
        logging.error(f"Error during data transformation: {e}", exc_info=True)
        return

    # --- 4. Model Saving (Pipeline Saving) ---
    logging.info("Saving the fitted pipeline.")
    pipeline_save_path = os.path.join(CONFIG['model_save_dir'], CONFIG['pipeline_filename'])
    try:
        save_pipeline(preprocessing_pipeline, pipeline_save_path)
    except Exception as e:
        logging.error(f"Failed to save pipeline: {e}", exc_info=True)
        return

    # --- 5. Loading Pipeline (for inference) ---
    logging.info("Loading the saved pipeline for inference.")
    try:
        loaded_pipeline = load_pipeline(pipeline_save_path)
        logging.info("Pipeline loaded successfully for inference.")
    except Exception as e:
        logging.error(f"Failed to load pipeline: {e}", exc_info=True)
        return

    # --- 6. Inference with the *reloaded* pipeline ---
    # The object read back from disk is the one applied here, so the logged
    # result really reflects what an inference job would compute.
    try:
        inference_processed = loaded_pipeline.transform(inference_df[numeric_cols_for_pipeline])
        logging.info("Inference data after applying loaded pipeline (first 5 rows):\n%s", inference_processed[:5])
    except Exception as e:
        logging.error(f"Error applying loaded pipeline to inference data: {e}", exc_info=True)
        return

    logging.info("\n--- Pipeline Example Complete ---")


# --- Unit Tests ---
# This section uses Python's built-in 'unittest' framework.
# To run these tests, save the file (e.g., `pipeline_script.py`) and then
# from your terminal, navigate to its directory and run:
# `python -m unittest pipeline_script.py`

class TestPreprocessing(unittest.TestCase):
    """Unit tests for the data-loading, pipeline-building and persistence helpers."""

    @classmethod
    def setUpClass(cls):
        """Set up test data once for all tests in this class."""
        cls.train_df = pd.DataFrame({
            'A': [1, 2, np.nan, 4, 10],
            'B': [5, np.nan, 7, 8, 20],
            'C': ['cat1', 'cat2', 'cat1', 'cat3', 'cat2']
        })
        cls.test_df = pd.DataFrame({
            'A': [np.nan, 2, 3],
            'B': [5, 6, np.nan],
            'C': ['cat2', 'cat1', 'cat3']
        })
        cls.save_dir = 'test_models'
        cls.pipeline_path = os.path.join(cls.save_dir, 'test_preprocessing_pipeline.pkl')
        os.makedirs(cls.save_dir, exist_ok=True)  # Ensure test directory exists

    @classmethod
    def tearDownClass(cls):
        """Clean up generated test files after all tests are done."""
        if os.path.exists(cls.save_dir):
            for f in os.listdir(cls.save_dir):
                os.remove(os.path.join(cls.save_dir, f))
            os.rmdir(cls.save_dir)

    def _numeric_columns(self) -> list:
        """Return the names of the numeric columns of the shared training frame."""
        return self.train_df.select_dtypes(include=np.number).columns.tolist()

    def test_load_data_success(self):
        """Test successful loading of a valid CSV file."""
        test_file_path = os.path.join(self.save_dir, 'temp_data.csv')
        pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}).to_csv(test_file_path, index=False)
        # Registered cleanup runs even when an assertion below fails.
        self.addCleanup(os.remove, test_file_path)

        df = load_data(test_file_path)
        self.assertIsInstance(df, pd.DataFrame)
        self.assertFalse(df.empty)
        self.assertEqual(df.shape, (2, 2))

    def test_load_data_file_not_found(self):
        """Test error handling for non-existent file."""
        with self.assertRaises(FileNotFoundError):
            load_data('non_existent_file.csv')

    def test_load_data_empty_file(self):
        """Test error handling for an empty CSV file."""
        empty_file_path = os.path.join(self.save_dir, 'empty.csv')
        # 'w' guarantees a zero-byte file even if a stale one was left behind.
        with open(empty_file_path, 'w'):
            pass
        self.addCleanup(os.remove, empty_file_path)

        with self.assertRaises(pd.errors.EmptyDataError):
            load_data(empty_file_path)

    def test_create_preprocessing_pipeline_imputer_and_scaler(self):
        """Test that the pipeline correctly includes imputer and scaler."""
        pipeline = create_preprocessing_pipeline(self.train_df, imputer_strategy='mean')
        self.assertIsInstance(pipeline, Pipeline)
        self.assertEqual(len(pipeline.steps), 1)  # numerical_preprocessing step

        # Check the steps within the 'numerical_preprocessing' sub-pipeline
        numerical_sub_pipeline = pipeline.named_steps['numerical_preprocessing']
        self.assertIsInstance(numerical_sub_pipeline, Pipeline)
        self.assertEqual(len(numerical_sub_pipeline.steps), 2)
        self.assertEqual(numerical_sub_pipeline.steps[0][0], 'imputer')
        self.assertEqual(numerical_sub_pipeline.steps[1][0], 'scaler')
        self.assertIsInstance(numerical_sub_pipeline.named_steps['imputer'], SimpleImputer)
        self.assertIsInstance(numerical_sub_pipeline.named_steps['scaler'], StandardScaler)

    def test_create_preprocessing_pipeline_dynamic_columns(self):
        """Test that the pipeline correctly auto-detects numeric columns."""
        df_mixed = pd.DataFrame({'num1': [1, 2], 'num2': [3, 4], 'text': ['a', 'b']})

        # The detected columns are only observable through the log output, so
        # capture it and check that exactly the numeric columns were reported.
        with self.assertLogs(level='INFO') as captured:
            pipeline = create_preprocessing_pipeline(df_mixed)
        self.assertTrue(
            any("['num1', 'num2']" in line for line in captured.output),
            f"Auto-detected numeric columns were not logged as expected: {captured.output}"
        )

        # The resulting pipeline must fit and transform those two columns.
        fitted_pipeline = pipeline.fit(df_mixed[['num1', 'num2']])
        transformed_data = fitted_pipeline.transform(df_mixed[['num1', 'num2']])
        self.assertEqual(transformed_data.shape, (2, 2))  # Should process 2 numeric columns

    def test_pipeline_fit_transform(self):
        """Test that the pipeline can be fitted and transforms data correctly."""
        numeric_cols = self._numeric_columns()
        pipeline = create_preprocessing_pipeline(self.train_df, numeric_cols=numeric_cols)
        fitted_pipeline = pipeline.fit(self.train_df[numeric_cols])

        # Independent oracle computed with pandas rather than by re-running the
        # same sklearn estimators:
        #   * mean imputation uses the per-column mean of the non-missing
        #     training values (A -> 4.25, B -> 10);
        #   * scaling uses the mean and *population* std (ddof=0) of the
        #     imputed training columns.
        train_numeric = self.train_df[numeric_cols]
        imputation_means = train_numeric.mean()
        train_imputed = train_numeric.fillna(imputation_means)
        scale_mean = train_imputed.mean()
        scale_std = train_imputed.std(ddof=0)

        expected_train_transformed = ((train_imputed - scale_mean) / scale_std).to_numpy()
        train_transformed = fitted_pipeline.transform(train_numeric)
        np.testing.assert_array_almost_equal(train_transformed, expected_train_transformed)

        # Test data must be imputed and scaled with the statistics learned
        # from the training data, never with its own.
        test_imputed = self.test_df[numeric_cols].fillna(imputation_means)
        expected_test_transformed = ((test_imputed - scale_mean) / scale_std).to_numpy()
        test_transformed = fitted_pipeline.transform(self.test_df[numeric_cols])
        np.testing.assert_array_almost_equal(test_transformed, expected_test_transformed)

    def test_save_and_load_pipeline(self):
        """Test that a pipeline can be saved and loaded successfully."""
        numeric_cols = self._numeric_columns()
        pipeline = create_preprocessing_pipeline(self.train_df, numeric_cols=numeric_cols)
        fitted_pipeline = pipeline.fit(self.train_df[numeric_cols])

        save_pipeline(fitted_pipeline, self.pipeline_path)
        self.assertTrue(os.path.exists(self.pipeline_path))

        loaded_pipeline = load_pipeline(self.pipeline_path)
        self.assertIsInstance(loaded_pipeline, Pipeline)

        # Verify that the loaded pipeline produces the same transformation
        train_transformed_original = fitted_pipeline.transform(self.train_df[numeric_cols])
        train_transformed_loaded = loaded_pipeline.transform(self.train_df[numeric_cols])
        np.testing.assert_array_almost_equal(train_transformed_original, train_transformed_loaded)

    def test_load_pipeline_file_not_found(self):
        """Test error handling when loading a non-existent pipeline file."""
        with self.assertRaises(FileNotFoundError):
            load_pipeline('non_existent_pipeline.pkl')

# --- Main execution block ---
# The guard makes `run_example_pipeline()` run only when this file is executed
# directly, not when it is imported as a module (e.g., by the unittest runner).
if __name__ == "__main__":
    # Run the end-to-end demonstration: fit, transform, save, and load.
    run_example_pipeline()

    # The unit tests are not run here. Either run them from a terminal with
    # `python -m unittest <this_script>.py`, or uncomment the line below to run
    # them programmatically (e.g., inside a notebook):
    # unittest.main(argv=['first-arg-is-ignored'], exit=False)

2025-05-28 15:05:06,672 - INFO - 
--- Starting Pipeline Example ---
2025-05-28 15:05:06,673 - INFO - Creating dummy data for demonstration.
2025-05-28 15:05:06,676 - INFO - Creating and fitting the preprocessing pipeline on training data.
2025-05-28 15:05:06,678 - INFO - Creating preprocessing pipeline with imputation strategy: 'mean'
2025-05-28 15:05:06,712 - INFO - Preprocessing pipeline fitted successfully.
2025-05-28 15:05:06,715 - INFO - Transforming train and test data using the fitted pipeline.
2025-05-28 15:05:06,761 - INFO - Train after preprocessing (first 5 rows):
[[-1.040833   -0.95173373]
 [-0.72057669  0.        ]
 [ 0.         -0.57104024]
 [-0.08006408 -0.38069349]
 [ 1.84147377  1.90346747]]
2025-05-28 15:05:06,778 - INFO - Test after preprocessing (first 5 rows):
[[ 0.         -0.95173373]
 [-0.72057669 -0.76138699]
 [-0.40032038  0.        ]]
2025-05-28 15:05:06,780 - INFO - Inference data after applying loaded pipeline (first 5 rows):
[[-0.72057669  0.        ]
 [ 0