In [1]:
import os

In [2]:
# Switch the working directory to its parent; the %pwd output below shows this
# lands in the project root, which is where the relative config paths resolve.
# NOTE: the path is relative, so re-running this cell moves up one more level
# each time. Restart the kernel before re-running the notebook.
os.chdir('../')

In [3]:
# Show the current working directory to verify the chdir above.
%pwd

'c:\\Users\\sridhar\\Desktop\\DATA ANALYTICS PROJECTS\\DATA SCIENCE\\End-To-End-ML-Project'

In [4]:
# Purpose: Data class definition for DataIngestionConfig.
# Creating a custom entity for the data ingestion configuration.
"""Custom entities can be used to represent any type of object or concept that is not already defined in the Python standard library. 
They can be used to improve the readability and maintainability of your code."""

from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable bundle of settings for the data ingestion stage.

    Attributes:
        root_dir: Root directory for data ingestion artifacts.
        source_URL: URL from which the data will be downloaded.
        local_data_file: Local path to store the downloaded data file.
        unzip_dir: Directory where the downloaded data will be extracted.
    """

    root_dir: Path
    source_URL: str
    local_data_file: Path
    unzip_dir: Path


In [6]:
# Purpose: Import the project's constants and shared helper utilities.

# Importing constants and utility functions from mlproject package.
from mlproject.constants import *
from mlproject.utils.common import read_yaml,create_directories

In [7]:
# Purpose: Definition of the ConfigurationManager class for managing project configurations.

class ConfigurationManager:
    """Loads the project's YAML files and builds per-stage configuration objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH):
        """
        Reads the three YAML files and creates the artifacts root directory.

        Args:
            config_filepath (Path, optional): Path to the configuration file. Defaults to CONFIG_FILE_PATH.
            params_filepath (Path, optional): Path to the parameters file. Defaults to PARAMS_FILE_PATH.
            schema_filepath (Path, optional): Path to the schema file. Defaults to SCHEMA_FILE_PATH.
        """
        # The loaded objects are read by attribute below (e.g. `.artifacts_root`),
        # so read_yaml is expected to return an attribute-accessible mapping.
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # Every stage writes beneath this directory, so create it up front.
        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """
        Builds the data ingestion settings from the `data_ingestion` config section.

        As a side effect, the stage's root directory is created.

        Returns:
            DataIngestionConfig: Frozen data class holding the ingestion settings,
            with the path-like fields converted to `Path`.
        """
        section = self.config.data_ingestion

        # Creating the root directory for data ingestion artifacts.
        create_directories([section.root_dir])

        # The YAML values are plain strings, while DataIngestionConfig annotates
        # these three fields as Path. Convert them here so the object matches its
        # declared types.
        return DataIngestionConfig(
            root_dir=Path(section.root_dir),
            source_URL=section.source_URL,
            local_data_file=Path(section.local_data_file),
            unzip_dir=Path(section.unzip_dir),
        )


In [8]:
import os  # Operating system interface.
import urllib.request as request  # URL handling module.
import zipfile  # ZIP file processing module.
from mlproject import logger  # Logger specific to the mlProject package.
from mlproject.utils.common import get_size  # Common utility function for file size.

In [9]:
# Purpose: Definition of the DataIngestion class for handling data download and extraction.

class DataIngestion:
    """Downloads the configured archive and extracts it into the configured directory."""

    def __init__(self, config: DataIngestionConfig):
        """
        Stores the ingestion settings for later use.

        Args:
            config (DataIngestionConfig): DataIngestionConfig object containing data ingestion settings.
        """
        self.config = config

    def download_file(self):
        """
        Downloads the data file from the source URL unless it already exists locally.

        The payload is first written to a sibling ``<local_data_file>.part`` file and
        only moved to its final name once the transfer has finished. An interrupted
        download therefore never leaves a truncated file that a later run would
        mistake for a complete one.

        Raises:
            Any exception raised by ``urllib.request.urlretrieve`` or the file
            system calls is propagated after the partial file has been removed.
        """
        target = self.config.local_data_file

        if os.path.exists(target):
            logger.info(f"File already exists of size: {get_size(Path(target))}")
            return

        # urlretrieve does not create missing directories. dirname is "" for a
        # bare filename, and os.makedirs("") raises, so guard against that.
        parent_dir = os.path.dirname(target)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        partial = f"{target}.part"
        try:
            _, headers = request.urlretrieve(
                url=self.config.source_URL,
                filename=partial
            )
            # Move the finished download into place in a single step.
            os.replace(partial, target)
        finally:
            # After a successful replace the partial file no longer exists, so
            # this only runs when the download or the move failed.
            if os.path.exists(partial):
                os.remove(partial)

        logger.info(f"{target} downloaded! with the following info:\n{headers}")

    def extract_zip_file(self):
        """
        Extracts the contents of the downloaded ZIP file into ``unzip_dir``.

        The destination directory is created if it does not exist.
        """
        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path, exist_ok=True)
        with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
            zip_ref.extractall(unzip_path)


In [10]:
# Purpose: Run the data ingestion stage end to end (configure, download, extract).

# Exceptions are deliberately left to propagate, so a failure shows the full,
# unmodified traceback in the cell output.

# Creating ConfigurationManager to manage project configurations.
config = ConfigurationManager()

# Retrieving data ingestion configuration.
data_ingestion_config = config.get_data_ingestion_config()

# Creating DataIngestion object with the obtained configuration.
data_ingestion = DataIngestion(config=data_ingestion_config)

# Downloading the data file.
data_ingestion.download_file()

# Extracting contents from the ZIP file.
data_ingestion.extract_zip_file()


[2023-12-01 20:46:04,006: INFO: common: YAML file: config\config.yaml loaded successfully]
[2023-12-01 20:46:04,016: INFO: common: YAML file: params.yaml loaded successfully]
[2023-12-01 20:46:04,021: INFO: common: YAML file: schema.yaml loaded successfully]
[2023-12-01 20:46:04,021: INFO: common: Created directory at: artifacts]
[2023-12-01 20:46:04,024: INFO: common: Created directory at: artifacts/data_ingestion]
[2023-12-01 20:46:07,016: INFO: 1058992654: artifacts/data_ingestion/data.zip downloaded! with the following info:
Connection: close
Content-Length: 23329
Cache-Control: max-age=300
Content-Security-Policy: default-src 'none'; style-src 'unsafe-inline'; sandbox
Content-Type: application/zip
ETag: "c69888a4ae59bc5a893392785a938ccd4937981c06ba8a9d6a21aa52b4ab5b6e"
Strict-Transport-Security: max-age=31536000
X-Content-Type-Options: nosniff
X-Frame-Options: deny
X-XSS-Protection: 1; mode=block
X-GitHub-Request-Id: 7CFC:91CD:CFB4FF:FD9A2D:6569F8B3
Accept-Ranges: bytes
Date: Fri,