In [33]:
%pwd

'/Users/rajusubba/Documents/End-to-End MLOps/customer-churn-project'

In [34]:
import os
# NOTE(review): this is an absolute path under one user's home directory, so the
# cell raises on any machine where that directory does not exist. The relative
# imports/paths used by later cells (e.g. `src.customerchurn...`, `artifacts/...`)
# presumably depend on this being the project root — TODO derive it instead of
# hardcoding it.
os.chdir('/Users/rajusubba/Documents/End-to-End MLOps/customer-churn-project')
%pwd

'/Users/rajusubba/Documents/End-to-End MLOps/customer-churn-project'

In [None]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataPreprocessingConfig:
    """Immutable settings consumed by the data-preprocessing stage."""

    root_dir: Path                     # output directory of the stage; created before anything is written
    train_data_path: Path              # CSV with the training split (read via pandas.read_csv)
    test_data_path: Path               # CSV with the test split (read via pandas.read_csv)
    target_column: str                 # name of the label column; separated from the features before fit/transform
    processed_train_path: Path         # destination CSV for the transformed training split
    processed_test_path: Path          # destination CSV for the transformed test split
    preprocessor_object_path: Path     # destination of the joblib-dumped preprocessor bundle

In [36]:
from src.customerchurn.constants import *
from src.customerchurn.utils.common import read_yaml, create_directories

In [37]:
class ConfigurationManager:
    """Reads the config, params and schema YAML files and builds stage configs."""

    def __init__(
        self,
        config_file_path=CONFIG_FILE_PATH,
        params_file_path=PARAMS_FILE_PATH,
        schema_file_path=SCHEMA_FILE_PATH,
    ):
        """Load the three YAML files and create the artifacts root directory.

        Args:
            config_file_path: Location of the main config YAML.
            params_file_path: Location of the params YAML.
            schema_file_path: Location of the schema YAML.
        """
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_file_path)
        self.schema = read_yaml(schema_file_path)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_preprocessing_config(self) -> DataPreprocessingConfig:
        """Build the config object for the data-preprocessing stage.

        Creates the stage's root directory as a side effect.

        Returns:
            A DataPreprocessingConfig whose path fields are wrapped in
            pathlib.Path and whose target column is read from the schema file.
        """
        stage_cfg = self.config.data_preprocessing
        create_directories([stage_cfg.root_dir])

        # Keyword arguments are assembled in field order; only the target
        # column comes from the schema, everything else from the stage section.
        field_values = {
            "root_dir": Path(stage_cfg.root_dir),
            "train_data_path": Path(stage_cfg.train_data_path),
            "test_data_path": Path(stage_cfg.test_data_path),
            "target_column": self.schema.target_column,
            "processed_train_path": Path(stage_cfg.processed_train_path),
            "processed_test_path": Path(stage_cfg.processed_test_path),
            "preprocessor_object_path": Path(stage_cfg.preprocessor_object_path),
        }
        return DataPreprocessingConfig(**field_values)

In [41]:
import os
from src.customerchurn.logging.logger import logger
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import pandas as pd
import joblib

In [42]:
class DataPreprocessor:
    """Scale numeric features and one-hot encode categorical features.

    The scaler and encoder are fitted on the training split only and then
    reused unchanged for every other frame, so no statistics leak from the
    test split.

    Attributes (all ``None`` until ``fit`` has run):
        ohe: OneHotEncoder; fitted only when categorical columns exist.
        scaler: StandardScaler; fitted only when numerical columns exist.
        categorical_cols: Names of the categorical feature columns.
        numerical_cols: Names of the numerical feature columns.
    """

    # dtype selectors handed to DataFrame.select_dtypes. "number" covers every
    # integer/float width (not only int64/float64); the categorical list also
    # covers pandas string, category and bool columns in addition to object.
    _NUMERIC_DTYPES = ["number"]
    _CATEGORICAL_DTYPES = ["object", "string", "category", "bool"]

    def __init__(self, config: "DataPreprocessingConfig"):
        """Store the stage config; no fitting or I/O happens here.

        Args:
            config: Stage settings (input/output paths and target column name).
        """
        self.config = config
        self.ohe = None
        self.scaler = None
        self.categorical_cols = None
        self.numerical_cols = None

    def load_data(self, file_path: Path) -> pd.DataFrame:
        """Read a CSV file into a DataFrame.

        Args:
            file_path: Path of the CSV file to read.

        Returns:
            The file's contents as parsed by ``pandas.read_csv`` defaults.
        """
        logger.info(f"Loading data from {file_path}")
        return pd.read_csv(file_path)

    def fit(self, df: pd.DataFrame) -> None:
        """Fit preprocessing objects on TRAIN data only.

        Args:
            df: Training frame; must contain the configured target column.

        Raises:
            KeyError: If the target column is not present in ``df``.
        """
        X = df.drop(columns=[self.config.target_column])

        self.categorical_cols = X.select_dtypes(
            include=self._CATEGORICAL_DTYPES
        ).columns.tolist()
        self.numerical_cols = X.select_dtypes(
            include=self._NUMERIC_DTYPES
        ).columns.tolist()

        # Anything that is neither numeric nor categorical (e.g. datetimes) is
        # left out of the processed output; say so instead of dropping silently.
        selected = set(self.categorical_cols) | set(self.numerical_cols)
        ignored_cols = [col for col in X.columns if col not in selected]
        if ignored_cols:
            logger.warning(
                f"Columns with unsupported dtypes are ignored by the preprocessor: {ignored_cols}"
            )

        # handle_unknown="ignore": categories unseen during fit encode as all zeros.
        self.ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
        self.scaler = StandardScaler()

        if self.categorical_cols:
            self.ohe.fit(X[self.categorical_cols])

        if self.numerical_cols:
            self.scaler.fit(X[self.numerical_cols])

        logger.info("Preprocessor fitted on training data")

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Transform using already-fitted objects.

        Args:
            df: Frame containing every feature column seen during ``fit``.
                The target column is optional, so the same method can be used
                on unlabeled data at inference time.

        Returns:
            A frame with a fresh 0..n-1 index holding the scaled numerical
            columns, then the one-hot columns, then (only if it was present
            in ``df``) the untouched target column.

        Raises:
            ValueError: If ``fit`` has not been called.
            KeyError: If a feature column seen during ``fit`` is missing.
        """
        if self.ohe is None or self.scaler is None:
            raise ValueError("Preprocessor is not fitted. Call fit() first.")

        target = self.config.target_column
        has_target = target in df.columns
        X = df.drop(columns=[target]) if has_target else df

        parts = []

        if self.numerical_cols:
            num_data = self.scaler.transform(X[self.numerical_cols])
            parts.append(pd.DataFrame(num_data, columns=self.numerical_cols))

        if self.categorical_cols:
            cat_data = self.ohe.transform(X[self.categorical_cols])
            cat_cols = self.ohe.get_feature_names_out(self.categorical_cols)
            parts.append(pd.DataFrame(cat_data, columns=cat_cols))

        if parts:
            X_processed = pd.concat(parts, axis=1)
        else:
            # No feature columns at all: pd.concat([]) would raise, so return
            # a column-less frame that still has one row per input row.
            X_processed = pd.DataFrame(index=pd.RangeIndex(len(df)))

        if not has_target:
            return X_processed

        # Positional alignment: both sides carry a 0..n-1 index.
        y = df[target].reset_index(drop=True)
        return pd.concat([X_processed.reset_index(drop=True), y], axis=1)

    def preprocess_data(self):
        """Pipeline entry point: load train/test, fit on train, transform both, save artifacts.

        Writes the two processed CSVs and a joblib bundle containing the
        fitted encoder/scaler, the column lists and the target column name.

        Returns:
            Tuple ``(processed_train_path, processed_test_path)`` from the config.
        """
        train_df = self.load_data(self.config.train_data_path)
        test_df = self.load_data(self.config.test_data_path)

        # Fit on train only
        self.fit(train_df)

        train_processed = self.transform(train_df)
        test_processed = self.transform(test_df)

        # Ensure the stage dir and the parent of every output file exist; the
        # output paths are configured independently and need not sit in root_dir.
        os.makedirs(self.config.root_dir, exist_ok=True)
        for out_path in (
            self.config.processed_train_path,
            self.config.processed_test_path,
            self.config.preprocessor_object_path,
        ):
            Path(out_path).parent.mkdir(parents=True, exist_ok=True)

        # Save processed datasets (optional but useful)
        train_processed.to_csv(self.config.processed_train_path, index=False)
        test_processed.to_csv(self.config.processed_test_path, index=False)

        # Save preprocessor bundle (for inference)
        bundle = {
            "categorical_cols": self.categorical_cols,
            "numerical_cols": self.numerical_cols,
            "ohe": self.ohe,
            "scaler": self.scaler,
            "target_column": self.config.target_column,
        }
        joblib.dump(bundle, self.config.preprocessor_object_path)

        logger.info(f"Saved processed train to: {self.config.processed_train_path}")
        logger.info(f"Saved processed test to: {self.config.processed_test_path}")
        logger.info(f"Saved preprocessor bundle to: {self.config.preprocessor_object_path}")

        return self.config.processed_train_path, self.config.processed_test_path

In [43]:
# Run the preprocessing stage end to end. Any failure is logged with its
# traceback and then propagated so the cell still fails visibly.
try:
    config_manager = ConfigurationManager()
    data_preprocessor_config = config_manager.get_data_preprocessing_config()
    data_preprocessor = DataPreprocessor(config=data_preprocessor_config)
    data_preprocessor.preprocess_data()
except Exception as e:
    logger.exception(e)
    # Bare `raise` re-raises the active exception with its original traceback.
    raise

[2026-02-05 11:35:32,043: INFO: common: YAML file config/config.yaml loaded successfully.]
[2026-02-05 11:35:32,045: INFO: common: YAML file params.yaml loaded successfully.]
[2026-02-05 11:35:32,047: INFO: common: YAML file schema.yaml loaded successfully.]
[2026-02-05 11:35:32,048: INFO: common: Directory created at: artifacts]
[2026-02-05 11:35:32,049: INFO: common: Directory created at: artifacts/data_preprocessing]
[2026-02-05 11:35:32,049: INFO: 184846966: Loading data from artifacts/data_transformation/train.csv]
[2026-02-05 11:35:32,061: INFO: 184846966: Loading data from artifacts/data_transformation/test.csv]
[2026-02-05 11:35:32,071: INFO: 184846966: Preprocessor fitted on training data]
[2026-02-05 11:35:39,215: INFO: 184846966: Saved processed train to: artifacts/data_preprocessing/train_processed.csv]
[2026-02-05 11:35:39,216: INFO: 184846966: Saved processed test to: artifacts/data_preprocessing/test_processed.csv]
[2026-02-05 11:35:39,216: INFO: 184846966: Saved preproc