In [1]:
import os

In [2]:
%pwd

'd:\\ML_OPS_BABBY_FULL_STACK_NEW\\End-to-End-wine-quality-ML-Project\\research'

In [3]:
# Step up from the notebook's folder to its parent so the relative paths used
# later in the notebook (e.g. "artifacts/...") resolve from there.
# The move is relative to the current directory: re-running this cell climbs
# one more level each time, so run it once per kernel session.
os.chdir(os.pardir)

In [4]:
%pwd

'd:\\ML_OPS_BABBY_FULL_STACK_NEW\\End-to-End-wine-quality-ML-Project'

In [5]:
import pandas as pd

In [6]:
# Load the ingested red-wine CSV and preview it; the trailing expression is
# rendered by the notebook as the cell output (first five rows).
data = pd.read_csv("artifacts/data_ingestion/winequality-red.csv")
data.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [7]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """
    Immutable settings bundle for the data transformation stage.

    ``frozen=True`` makes instances read-only: assigning to a field after
    construction raises ``dataclasses.FrozenInstanceError``.

    Attributes:
        root_dir (Path): Output directory of the stage; the transformation
                         component writes its result files into it.
        data_path (Path): Location of the CSV file the stage reads as input.

    Note:
        Dataclasses do not enforce annotations at runtime, so callers are
        responsible for passing path-like values.
    """
    # Where the stage writes its outputs.
    root_dir: Path
    # Which CSV file the stage reads.
    data_path: Path

In [8]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories

In [9]:
class ConfigurationManager:
    """
    Loads the project's YAML files and builds per-component config objects.

    Attributes:
        config: Parsed content of the main configuration file.
        params: Parsed content of the parameters file.
        schema: Parsed content of the schema file.

    Methods:
        get_data_transformation_config: Returns the configuration for the
            data transformation component.
    """
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """
        Read the three YAML files and create the root artifacts directory.

        Args:
            config_filepath: Path to the main configuration file (default: CONFIG_FILE_PATH)
            params_filepath: Path to the parameters file (default: PARAMS_FILE_PATH)
            schema_filepath: Path to the schema file (default: SCHEMA_FILE_PATH)

        Note:
            Has a filesystem side effect: the directory named by
            ``artifacts_root`` in the main configuration is created via
            ``create_directories``.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """
        Build the configuration object for the data transformation component.

        Reads the ``data_transformation`` section of the main configuration,
        creates the section's ``root_dir`` via ``create_directories``, and
        wraps the section's values in a DataTransformationConfig.

        Returns:
            DataTransformationConfig: ``root_dir`` and ``data_path`` taken from
            the ``data_transformation`` section, converted to ``Path`` so the
            values match the dataclass's declared field types.
        """
        section = self.config.data_transformation

        # The raw config value is handed to the helper unchanged; only the
        # values stored on the returned config object are converted.
        create_directories([section.root_dir])

        return DataTransformationConfig(
            root_dir=Path(section.root_dir),
            data_path=Path(section.data_path),
        )

In [10]:
import os
from mlProject import logger
from sklearn.model_selection import train_test_split
import pandas as pd

In [11]:
class DataTransformation:
    """
    Data transformation component of the pipeline.

    The only transformation implemented is a train/test split of the input
    CSV. Further steps (scaling, PCA, feature engineering, ...) can be added
    to this class as extra methods.

    Attributes:
        config (DataTransformationConfig): Supplies ``data_path`` (input CSV)
                                           and ``root_dir`` (output directory).
    """
    def __init__(self, config: DataTransformationConfig):
        """
        Store the configuration used by the transformation methods.

        Args:
            config (DataTransformationConfig): Input file and output directory
                                               for this component.
        """
        self.config = config

    def train_test_spliting(self, test_size: float = 0.25, random_state: "int | None" = 42):
        """
        Split the input CSV into train and test sets and write both to disk.

        Process:
        1. Reads the CSV at ``config.data_path``
        2. Splits the rows with ``train_test_split``
        3. Writes ``train.csv`` and ``test.csv`` (without the index column)
           into ``config.root_dir``
        4. Logs and prints the shape of both parts

        Args:
            test_size (float): Fraction of rows placed in the test set.
                               Defaults to 0.25, the ratio the original
                               implicit call produced.
            random_state (int | None): Seed for the shuffle. Defaults to a
                               fixed seed so repeated runs produce the same
                               split; pass ``None`` to get a different split
                               on every run.

        Returns:
            None: The split datasets are saved to disk, not returned.
        """
        data = pd.read_csv(self.config.data_path)

        # Explicit ratio and seed: the split is reproducible across kernel
        # restarts and both knobs are visible to the caller.
        train, test = train_test_split(
            data,
            test_size=test_size,
            random_state=random_state,
        )

        train.to_csv(os.path.join(self.config.root_dir, "train.csv"), index=False)
        test.to_csv(os.path.join(self.config.root_dir, "test.csv"), index=False)

        logger.info("Splited data into training and test sets")
        logger.info(train.shape)
        logger.info(test.shape)

        print(train.shape)
        print(test.shape)

In [12]:
"""
Data Transformation Pipeline

This cell runs the data transformation stage end to end.

Steps:
1. Create the ConfigurationManager, which loads the YAML configuration files
2. Ask it for the data transformation configuration
3. Build the DataTransformation component from that configuration
4. Split the data into train and test sets and save them as separate files

Error handling:
- Any exception raised by the steps above is written to the project log
  together with its traceback and then re-raised unchanged, so the cell
  still fails visibly.

Note:
- Only train-test splitting is performed here; additional transformation
  techniques such as scaling, normalization, or feature engineering can be
  added to the DataTransformation component as needed.
"""

try:
    # Load configuration files
    config = ConfigurationManager()

    # Component-specific configuration
    data_transformation_config = config.get_data_transformation_config()

    # Build the component
    data_transformation = DataTransformation(config=data_transformation_config)

    # Run the split and write train.csv / test.csv
    data_transformation.train_test_spliting()

except Exception as e:
    # Record the failure (message + traceback) in the log, then re-raise the
    # active exception as-is; a bare `raise` keeps the original traceback.
    logger.exception(e)
    raise

[2025-05-08 10:23:08,792: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-05-08 10:23:08,794: INFO: common: yaml file: params.yaml loaded successfully]
[2025-05-08 10:23:08,806: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-05-08 10:23:08,810: INFO: common: created directory at: artifacts]
[2025-05-08 10:23:08,812: INFO: common: created directory at: artifacts/data_transformation]
[2025-05-08 10:23:08,860: INFO: 4217672154: Splited data into training and test sets]
[2025-05-08 10:23:08,862: INFO: 4217672154: (1199, 12)]
[2025-05-08 10:23:08,863: INFO: 4217672154: (400, 12)]
(1199, 12)
(400, 12)
