    Task: Complete Pipeline for a Dataset
1. Objective: Build a complex pipeline with multiple transformations.
2. Steps:
    - Load a sample dataset.
    - Define a transformation pipeline with both imputation and scaling.

In [1]:
# Write your code from here
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import numpy as np

# 1. Load a sample dataset
data = {'feature1': [10, np.nan, 30, 40, 50],
        'feature2': [5, 15, np.nan, 35, 45],
        'feature3': [1.0, 2.0, 1.5, 2.5, np.nan],
        'category': ['A', 'B', 'A', 'C', 'B']}
df = pd.DataFrame(data)

print("Original DataFrame with missing values:")
print(df)

# Identify numerical and categorical features
numerical_features = ['feature1', 'feature2', 'feature3']
categorical_features = ['category']

# 2. Define a transformation pipeline with both imputation and scaling

# Pipeline for numerical features: Impute missing values with the mean, then scale
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# For categorical features, we'll just impute missing values with the most frequent
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent'))
])

# Apply the numerical pipeline to the numerical features
df[numerical_features] = numerical_pipeline.fit_transform(df[numerical_features])

# Apply the categorical pipeline to the categorical features
df[categorical_features] = categorical_pipeline.fit_transform(df[categorical_features])

# Display the transformed DataFrame
print("\nDataFrame after imputation and scaling:")
print(df)

Original DataFrame with missing values:
   feature1  feature2  feature3 category
0      10.0       5.0       1.0        A
1       NaN      15.0       2.0        B
2      30.0       NaN       1.5        A
3      40.0      35.0       2.5        C
4      50.0      45.0       NaN        B

DataFrame after imputation and scaling:
   feature1  feature2  feature3 category
0 -1.700840 -1.414214      -1.5        A
1  0.000000 -0.707107       0.5        B
2 -0.188982  0.000000      -0.5        A
3  0.566947  0.707107       1.5        C
4  1.322876  1.414214       0.0        B


In [2]:
# Task: Imputation Function








# Scaling Function









# Combined Transformation Function









In [3]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

def impute_missing_values(df, numerical_cols=None, categorical_cols=None, numerical_strategy='mean', categorical_strategy='most_frequent'):
    """
    Imputes missing values in a pandas DataFrame for specified numerical and categorical columns
    using different strategies.

    Args:
        df (pd.DataFrame): The input DataFrame.
        numerical_cols (list, optional): List of numerical column names. Defaults to None (imputes all numerical columns).
        categorical_cols (list, optional): List of categorical column names. Defaults to None (imputes all object dtype columns).
        numerical_strategy (str, optional): Imputation strategy for numerical columns ('mean', 'median', 'constant'). Defaults to 'mean'.
        categorical_strategy (str, optional): Imputation strategy for categorical columns ('most_frequent', 'constant'). Defaults to 'most_frequent'.
        fill_value_numerical (float or int, optional): Value to fill missing numerical data when strategy is 'constant'. Defaults to None.
        fill_value_categorical (object, optional): Value to fill missing categorical data when strategy is 'constant'. Defaults to None.

    Returns:
        pd.DataFrame: The DataFrame with missing values imputed.
    """
    df_copy = df.copy()
    transformers = []

    if numerical_cols is None:
        numerical_cols = df_copy.select_dtypes(include=['number']).columns.tolist()
    if numerical_cols:
        numerical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy=numerical_strategy))
        ])
        transformers.append(('num', numerical_transformer, numerical_cols))

    if categorical_cols is None:
        categorical_cols = df_copy.select_dtypes(include=['object', 'category']).columns.tolist()
    if categorical_cols:
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy=categorical_strategy))
        ])
        transformers.append(('cat', categorical_transformer, categorical_cols))

    if not transformers:
        print("No numerical or categorical columns specified or found.")
        return df_copy

    preprocessor = ColumnTransformer(transformers=transformers, remainder='passthrough')
    df_imputed = pd.DataFrame(preprocessor.fit_transform(df_copy), columns=preprocessor.get_feature_names_out())

    return df_imputed

def scale_features(df, numerical_cols=None, scaler_type='standard'):
    """
    Scales numerical features in a pandas DataFrame using either StandardScaler or MinMaxScaler.

    Args:
        df (pd.DataFrame): The input DataFrame.
        numerical_cols (list, optional): List of numerical column names to scale. Defaults to None (scales all numerical columns).
        scaler_type (str, optional): Type of scaler to use ('standard' for StandardScaler, 'minmax' for MinMaxScaler). Defaults to 'standard'.

    Returns:
        pd.DataFrame: The DataFrame with the specified numerical features scaled.
    """
    df_copy = df.copy()

    if numerical_cols is None:
        numerical_cols = df_copy.select_dtypes(include=['number']).columns.tolist()

    if not numerical_cols:
        print("No numerical columns specified or found to scale.")
        return df_copy

    if scaler_type == 'standard':
        scaler = StandardScaler()
    elif scaler_type == 'minmax':
        scaler = MinMaxScaler()
    else:
        raise ValueError("Invalid scaler_type. Choose 'standard' or 'minmax'.")

    df_copy[numerical_cols] = scaler.fit_transform(df_copy[numerical_cols])
    return df_copy

def combined_transformation(df, numerical_cols=None, categorical_cols=None,
                            numerical_imputer_strategy='mean', categorical_imputer_strategy='most_frequent',
                            scaler_type='standard', encoder_type='onehot'):
    """
    Performs imputation, scaling (for numerical), and encoding (for categorical) on a pandas DataFrame.

    Args:
        df (pd.DataFrame): The input DataFrame.
        numerical_cols (list, optional): List of numerical column names. Defaults to None (uses all numerical columns).
        categorical_cols (list, optional): List of categorical column names. Defaults to None (uses all object/category columns).
        numerical_imputer_strategy (str, optional): Imputation strategy for numerical columns. Defaults to 'mean'.
        categorical_imputer_strategy (str, optional): Imputation strategy for categorical columns. Defaults to 'most_frequent'.
        scaler_type (str, optional): Type of scaler for numerical columns ('standard', 'minmax'). Defaults to 'standard'.
        encoder_type (str, optional): Type of encoder for categorical columns ('onehot', 'label'). Defaults to 'onehot'.

    Returns:
        pd.DataFrame: The transformed DataFrame.
    """
    df_copy = df.copy()
    transformers = []

    if numerical_cols is None:
        numerical_cols = df_copy.select_dtypes(include=['number']).columns.tolist()
    if numerical_cols:
        numerical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy=numerical_imputer_strategy)),
            ('scaler', StandardScaler() if scaler_type == 'standard' else MinMaxScaler())
        ])
        transformers.append(('num', numerical_transformer, numerical_cols))

    if categorical_cols is None:
        categorical_cols = df_copy.select_dtypes(include=['object', 'category']).columns.tolist()
    if categorical_cols:
        if encoder_type == 'onehot':
            categorical_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy=categorical_imputer_strategy)),
                ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
            ])
        elif encoder_type == 'label':
            # LabelEncoder works on one column at a time, so we'll handle it differently
            df_encoded_categorical = df_copy[categorical_cols].copy()
            for col in categorical_cols:
                label_encoder = LabelEncoder()
                df_encoded_categorical[col] = label_encoder.fit_transform(df_encoded_categorical[col].astype(str)) # Handle potential NaNs
            # We'll return the concatenated DataFrame later
            pass
        else:
            raise ValueError("Invalid encoder_type. Choose 'onehot' or 'label'.")

        if encoder_type == 'onehot':
            transformers.append(('cat', categorical_transformer, categorical_cols))

    preprocessor = ColumnTransformer(transformers=transformers, remainder='passthrough')
    df_transformed = pd.DataFrame(preprocessor.fit_transform(df_copy), columns=preprocessor.get_feature_names_out())

    if encoder_type == 'label' and categorical_cols:
        # Drop original categorical columns and concatenate the label encoded ones
        df_transformed = df_transformed.drop(columns=categorical_cols, errors='ignore')
        df_transformed = pd.concat([df_transformed, df_encoded_categorical.reset_index(drop=True)], axis=1)

    return df_transformed

from sklearn.preprocessing import LabelEncoder

if __name__ == '__main__':
    # Create a sample DataFrame
    data = {'Age': [25, 30, np.nan, 40, 35],
            'Income': [50000, np.nan, 60000, 75000, 55000],
            'City': ['Bangalore', 'Mumbai', 'Bangalore', np.nan, 'Chennai'],
            'Education': ['Bachelor', 'Master', 'Bachelor', 'PhD', 'Master'],
            'Score': [0.8, 0.9, 0.75, 0.95, 0.85]}
    sample_df = pd.DataFrame(data)
    print("Original DataFrame:")
    print(sample_df)

    print("\n--- Imputation Function ---")
    df_imputed = impute_missing_values(sample_df)
    print(df_imputed)

    print("\n--- Scaling Function (StandardScaler) ---")
    df_scaled_standard = scale_features(sample_df.copy(), scaler_type='standard')
    print(df_scaled_standard)

    print("\n--- Scaling Function (MinMaxScaler) ---")
    df_scaled_minmax = scale_features(sample_df.copy(), scaler_type='minmax')
    print(df_scaled_minmax)

    print("\n--- Combined Transformation Function (StandardScaler, One-Hot Encoding) ---")
    df_transformed_onehot = combined_transformation(sample_df.copy(), scaler_type='standard', encoder_type='onehot')
    print(df_transformed_onehot)

    print("\n--- Combined Transformation Function (MinMaxScaler, Label Encoding) ---")
    df_transformed_label = combined_transformation(sample_df.copy(), scaler_type='minmax', encoder_type='label')
    print(df_transformed_label)

Original DataFrame:
    Age   Income       City Education  Score
0  25.0  50000.0  Bangalore  Bachelor   0.80
1  30.0      NaN     Mumbai    Master   0.90
2   NaN  60000.0  Bangalore  Bachelor   0.75
3  40.0  75000.0        NaN       PhD   0.95
4  35.0  55000.0    Chennai    Master   0.85

--- Imputation Function ---
  num__Age num__Income num__Score  cat__City cat__Education
0     25.0     50000.0        0.8  Bangalore       Bachelor
1     30.0     60000.0        0.9     Mumbai         Master
2     32.5     60000.0       0.75  Bangalore       Bachelor
3     40.0     75000.0       0.95  Bangalore            PhD
4     35.0     55000.0       0.85    Chennai         Master

--- Scaling Function (StandardScaler) ---
        Age    Income       City Education     Score
0 -1.341641 -1.069045  Bangalore  Bachelor -0.707107
1 -0.447214       NaN     Mumbai    Master  0.707107
2       NaN  0.000000  Bangalore  Bachelor -1.414214
3  1.341641  1.603567        NaN       PhD  1.414214
4  0.447214 -

NameError: name 'MinMaxScaler' is not defined