In [1]:
import os

In [2]:
%pwd

'd:\\ML_OPS_BABBY_FULL_STACK_NEW\\End-to-End-wine-quality-ML-Project\\research'

In [3]:
# Move the working directory up one level so the relative paths used in later
# cells resolve from the parent directory.
# NOTE: this cell is not idempotent -- every re-run moves up one more level.
# Restart the kernel before re-running it.
os.chdir('../')

In [4]:
%pwd

'd:\\ML_OPS_BABBY_FULL_STACK_NEW\\End-to-End-wine-quality-ML-Project'

##### In the data validation step, we check whether the dataset contains all of the expected columns. If any column is missing, we do not start training; we raise an exception instead.

In [5]:
import pandas as pd

In [6]:
# Load the ingested CSV for a quick manual inspection. The path is relative to
# the current working directory, so it depends on the os.chdir('../') cell
# having been run exactly once.
data = pd.read_csv("artifacts/data_ingestion/winequality-red.csv")
# Preview the first rows.
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [7]:
# Print column names, non-null counts and dtypes of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [8]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataValidationConfig:
    """
    Immutable bundle of settings consumed by the data validation step.

    Because the dataclass is declared with ``frozen=True``, assigning to any
    field after construction raises an error.

    Attributes:
        root_dir (Path): Directory for the data validation outputs.
        STATUS_FILE (str): File that receives the validation status text.
        unzip_data_dir (Path): Location of the CSV file to validate.
        all_schema (dict): Mapping whose keys are the expected column names.

    Note:
        The annotations are not enforced at runtime; the dataclass stores
        whatever objects it is given.
    """
    # Output directory of the validation step.
    root_dir: Path
    # File the validation status is written to.
    STATUS_FILE: str
    # CSV file whose columns are validated.
    unzip_data_dir: Path
    # Expected columns; the keys are the column names.
    all_schema: dict

In [9]:
from src.mlProject.constants import *
from src.mlProject.utils.common import read_yaml, create_directories

In [10]:
class ConfigurationManager:
    """
    Loads the project's YAML settings and builds per-component config objects.

    Attributes:
        config: Result of read_yaml() on the main configuration file.
        params: Result of read_yaml() on the parameters file.
        schema: Result of read_yaml() on the schema file.

    Methods:
        get_data_validation_config: Builds the DataValidationConfig used by
            the data validation component.
    """
    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """
        Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Location of the main configuration file
                (defaults to CONFIG_FILE_PATH).
            params_filepath: Location of the parameters file
                (defaults to PARAMS_FILE_PATH).
            schema_filepath: Location of the schema file
                (defaults to SCHEMA_FILE_PATH).
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # The artifacts root is created eagerly, before any component runs.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """
        Build the settings object for the data validation component.

        Reads the ``data_validation`` section of the main configuration and the
        ``COLUMNS`` section of the schema, creates the section's ``root_dir``
        via create_directories(), and packs everything into a
        DataValidationConfig.

        Returns:
            DataValidationConfig: Settings for the data validation component,
            with ``all_schema`` set to the schema's ``COLUMNS`` section.
        """
        validation_section = self.config.data_validation
        column_schema = self.schema.COLUMNS

        create_directories([validation_section.root_dir])

        return DataValidationConfig(
            root_dir=validation_section.root_dir,
            STATUS_FILE=validation_section.STATUS_FILE,
            unzip_data_dir=validation_section.unzip_data_dir,
            all_schema=column_schema,
        )

In [11]:
import os
from mlProject import logger

In [12]:
class DataValiadtion:
    """
    Validates the ingested dataset's columns against the expected schema.

    Attributes:
        config (DataValidationConfig): Settings object providing
            ``unzip_data_dir`` (CSV to validate), ``all_schema`` (mapping whose
            keys are the expected column names) and ``STATUS_FILE`` (where the
            outcome is written).
    """
    # The annotation is quoted so defining this class does not require
    # DataValidationConfig to already exist in the namespace.
    def __init__(self, config: "DataValidationConfig"):
        """
        Store the configuration used by the validation methods.

        Args:
            config (DataValidationConfig): Settings for data validation.
        """
        self.config = config

    def validate_all_columns(self) -> bool:
        """
        Check that the dataset's columns are exactly the schema's columns.

        Process:
        1. Read the CSV file at ``config.unzip_data_dir``.
        2. Compare the set of its column names with the set of keys of
           ``config.all_schema``.
        3. Write ``Validation status: <True|False>`` to ``config.STATUS_FILE``
           exactly once.

        Validation fails when a schema column is missing from the dataset or
        when the dataset contains a column that is not in the schema. The
        verdict covers every column, not just the last one inspected.

        Returns:
            bool: True when both column sets are identical, False otherwise.

        Raises:
            Exception: Errors from reading the CSV or writing the status file
                propagate unchanged.
        """
        data = pd.read_csv(self.config.unzip_data_dir)

        actual_cols = set(data.columns)
        expected_cols = set(self.config.all_schema.keys())

        # One verdict for the whole dataset: any missing or unexpected column
        # makes the two sets differ.
        validation_status = actual_cols == expected_cols

        with open(self.config.STATUS_FILE, 'w') as f:
            f.write(f"Validation status: {validation_status}")

        return validation_status

In [13]:
"""
Data Validation Pipeline

This script orchestrates the data validation process by initializing the configuration,
creating the data validation component, and executing the column validation step.

The pipeline follows these steps:
1. Initialize the ConfigurationManager to load all configuration parameters
2. Get the specific data validation configuration and schema
3. Initialize the DataValiadtion component with the configuration
4. Validate that all columns in the dataset match the expected schema

The entire process is wrapped in a try-except block to catch and propagate
any exceptions that might occur during execution, ensuring proper error handling.

Note:
- This is the second stage in the ML pipeline, following data ingestion
- It ensures data quality and consistency before proceeding to data transformation
- The validation results are written to a status file that can be checked by
  subsequent pipeline stages to determine if they should proceed
"""

try:
    # Build the configuration manager with its default file paths; its
    # constructor reads the YAML files and creates the artifacts root directory.
    config = ConfigurationManager()
    
    # Build the DataValidationConfig (also creates the data validation root_dir).
    data_validation_config = config.get_data_validation_config()
    
    # Create the validation component from that configuration.
    data_validation = DataValiadtion(config=data_validation_config)
    
    # Run the column check. The outcome is written to STATUS_FILE; the boolean
    # return value is discarded here, so a failed validation does not raise.
    data_validation.validate_all_columns()
    
except Exception as e:
    # Re-raise unchanged; this handler adds no handling of its own.
    raise e

[2025-05-08 09:58:20,684: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-05-08 09:58:20,688: INFO: common: yaml file: params.yaml loaded successfully]
[2025-05-08 09:58:20,695: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-05-08 09:58:20,698: INFO: common: created directory at: artifacts]
[2025-05-08 09:58:20,703: INFO: common: created directory at: artifacts/data_validation]
