In [1]:
import os

In [2]:
# Show the current working directory (expected here: the notebook's own `research/` folder)
%pwd

'd:\\ML_OPS_BABBY_FULL_STACK_NEW\\End-to-End-wine-quality-ML-Project\\research'

In [3]:
# Move up to the project root so that relative paths such as `config/config.yaml`
# and the `src.mlProject` imports resolve.
# The guard keeps this cell idempotent: an unconditional chdir('..') climbs one
# more level every time the cell is re-run without restarting the kernel.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("..")

In [4]:
# Check again: the working directory should now be the project root
%pwd

'd:\\ML_OPS_BABBY_FULL_STACK_NEW\\End-to-End-wine-quality-ML-Project'

### Now update the values in `config/config.yaml` for the `data_ingestion` stage

- This is useful for you and for the next developer: change a value in one place and all the code that reads it picks up the change.

##### The next step is to update `schema.yaml`. It is not really required yet, because we are not doing data validation at this stage (e.g., checking the datatype of each column, how many columns there are, etc.), but we will write some dummy `key: value` entries so that reading the file does not raise an error.

##### The next step is to update `params.yaml`, where we will keep all the model parameters instead of hardcoding them, just as we do with `config.yaml`.

##### Now update the entity. Because we are currently experimenting in a notebook, we define it here and do not change anything in the config module yet; afterwards we will copy and paste it into our modular code.

- An entity is the return type of a function (here, `DataIngestionConfig` is what `get_data_ingestion_config` returns).

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataIngestionConfig:
    """
    Read-only bundle of settings consumed by the data ingestion step.

    Because the dataclass is declared with ``frozen=True``, assigning to any
    field after construction raises ``dataclasses.FrozenInstanceError``.

    Attributes:
        root_dir (Path): Folder that holds everything the ingestion step produces.
        source_URL (str): Location the raw data archive is downloaded from.
        local_data_file (Path): Where the downloaded archive is written on disk.
        unzip_dir (Path): Folder the archive contents are extracted into.

    Note:
        The annotations document the intended types only; a dataclass neither
        validates nor converts the values it is given, so callers are
        responsible for passing suitable objects.
    """
    # Field order is part of the interface: it defines the positional
    # argument order of the generated __init__.
    root_dir: Path
    source_URL: str
    local_data_file: Path
    unzip_dir: Path

##### Now update the `src/configuration.py` file. It will return all the configurations that we defined in the previous step, and for that we need to import a few things.

##### But before that, we have to update the `constants` package (its `__init__.py`) so that it provides the paths to all the YAML files.

##### We also use the `utils` package, whose `common.py` file holds the code that is common to all the modules; in this module we will use `read_yaml` and `create_directories`.

In [6]:
from src.mlProject.constants import *
from src.mlProject.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """
    Single access point for the pipeline's YAML-driven settings.

    On construction the three YAML files are read once and kept on the
    instance; component-specific getters then turn the relevant section into
    a typed configuration object.

    Attributes:
        config: Contents of the main configuration file (config.yaml).
        params: Contents of the parameters file (params.yaml).
        schema: Contents of the schema file (schema.yaml).

    Methods:
        get_data_ingestion_config: Build the settings object for data ingestion.
    """
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH
    ):
        """
        Read the YAML files and make sure the artifacts root folder exists.

        Args:
            config_filepath: Location of the main configuration file
                (defaults to CONFIG_FILE_PATH).
            params_filepath: Location of the parameters file
                (defaults to PARAMS_FILE_PATH).
            schema_filepath: Location of the schema file
                (defaults to SCHEMA_FILE_PATH).
        """
        # The objects returned by read_yaml are accessed by attribute below
        # (e.g. `.artifacts_root`), so they must support dotted access.
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """
        Build the data ingestion settings from the `data_ingestion` section.

        The section's `root_dir` is passed to `create_directories` before the
        settings object is assembled.

        Returns:
            DataIngestionConfig: Settings populated with `root_dir`,
            `source_URL`, `local_data_file` and `unzip_dir` exactly as they
            appear in the configuration (no conversion is applied here).
        """
        ingestion_section = self.config.data_ingestion

        create_directories([ingestion_section.root_dir])

        return DataIngestionConfig(
            root_dir=ingestion_section.root_dir,
            source_URL=ingestion_section.source_URL,
            local_data_file=ingestion_section.local_data_file,
            unzip_dir=ingestion_section.unzip_dir,
        )

##### Now we have to create the data ingestion component, and for that we need to import a few libraries.

In [8]:
import os
import urllib.request as request
import zipfile
from src.mlProject import logger
from src.mlProject.utils.common import get_size

In [9]:
class DataIngestion:
    """
    Downloads the raw data archive and unpacks it.

    This is the data acquisition step of the pipeline: fetch the archive from
    a URL (unless it is already on disk) and extract it.

    Attributes:
        config (DataIngestionConfig): Settings read by the methods below
            (`source_URL`, `local_data_file` and `unzip_dir`).
    """
    def __init__(self, config: DataIngestionConfig):
        """
        Store the configuration; no I/O is performed here.

        Args:
            config (DataIngestionConfig): Settings for the ingestion step.
        """
        self.config = config

    def download_file(self):
        """
        Download the archive from `source_URL` to `local_data_file`.

        If `local_data_file` already exists the download is skipped and its
        size (as reported by `get_size`) is logged instead.

        The data is first written to a sibling ``<local_data_file>.part`` file
        and only moved into place once the transfer has finished. Without
        this, an interrupted download would leave a truncated file behind,
        the next run would treat it as "already exists", and extraction would
        then fail on a corrupt archive.

        Returns:
            None

        Raises:
            Whatever `urllib.request.urlretrieve` raises on a failed transfer
            (e.g. `urllib.error.URLError`); the partial file is removed first.
        """
        target = self.config.local_data_file

        if os.path.exists(target):
            logger.info(f"File already exists of size: {get_size(Path(target))}")
            return

        # Make the method usable on its own, mirroring what extract_zip_file
        # does for its output folder.
        parent_dir = os.path.dirname(target)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        partial = f"{target}.part"
        try:
            _, headers = request.urlretrieve(
                url = self.config.source_URL,
                filename = partial
            )
            # Same-directory rename: the final name only ever points at a
            # completely downloaded file.
            os.replace(partial, target)
        finally:
            # After a successful os.replace the partial file is gone; it only
            # still exists here when the transfer or the rename failed.
            if os.path.exists(partial):
                os.remove(partial)

        logger.info(f"{target} download! with following info: \n{headers}")

    def extract_zip_file(self):
        """
        Extract the downloaded archive into `unzip_dir`.

        The target folder is created if it does not exist, then every member
        of the zip file at `local_data_file` is extracted into it.

        Returns:
            None

        Raises:
            FileNotFoundError: If `local_data_file` does not exist.
            zipfile.BadZipFile: If `local_data_file` is not a valid zip archive.
        """
        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path, exist_ok=True)
        with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
            zip_ref.extractall(unzip_path)

##### Now we have to create the pipeline, i.e., the execution flow: which method runs first, which one runs after it, and so on.

In [10]:
"""
Data Ingestion Pipeline

This script orchestrates the data ingestion process by initializing the configuration,
creating the data ingestion component, and executing the download and extraction steps.

The pipeline follows these steps:
1. Initialize the ConfigurationManager to load all configuration parameters
2. Get the specific data ingestion configuration
3. Initialize the DataIngestion component with the configuration
4. Download the data file from the source URL
5. Extract the zip file to the specified directory

The steps run inside a try-except block that re-raises any exception, so a
failure in any step stops the pipeline and surfaces to the caller (here, the notebook cell).
"""

try:
    # Load config.yaml / params.yaml / schema.yaml
    config = ConfigurationManager()

    # Build the settings object for the ingestion step
    data_ingestion_config = config.get_data_ingestion_config()

    # Create the component from those settings
    data_ingestion = DataIngestion(config=data_ingestion_config)

    # Step 1: fetch the archive (skipped when it is already on disk)
    data_ingestion.download_file()

    # Step 2: unpack the archive
    data_ingestion.extract_zip_file()

except Exception:
    # Record the failure (with traceback) through the project logger, then
    # re-raise the same exception. A bare `raise` is the idiomatic way to
    # propagate the active exception.
    logger.exception("Data ingestion pipeline failed")
    raise

[2025-05-08 08:13:41,421: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-05-08 08:13:41,424: INFO: common: yaml file: params.yaml loaded successfully]
[2025-05-08 08:13:41,432: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-05-08 08:13:41,435: INFO: common: created directory at: artifacts]
[2025-05-08 08:13:41,437: INFO: common: created directory at: artifacts/data_ingestion]
[2025-05-08 08:13:43,665: INFO: 2625114949: artifacts/data_ingestion/data.zip download! with following info: 
Connection: close
Content-Length: 23329
Cache-Control: max-age=300
Content-Security-Policy: default-src 'none'; style-src 'unsafe-inline'; sandbox
Content-Type: application/zip
ETag: "c69888a4ae59bc5a893392785a938ccd4937981c06ba8a9d6a21aa52b4ab5b6e"
Strict-Transport-Security: max-age=31536000
X-Content-Type-Options: nosniff
X-Frame-Options: deny
X-XSS-Protection: 1; mode=block
X-GitHub-Request-Id: 92AF:3D8379:E23A:23F1F:681C1A9E
Accept-Ranges: bytes
Date: Thu, 08 Ma