In [21]:
import os


In [12]:
%pwd

'e:\\MLOps\\DataScienceProject2.0'

In [22]:
# NOTE(review): hard-coded absolute path to one machine's checkout. The `src....` imports and the
# relative config/artifact paths used in later cells appear to rely on the working directory
# being the project root - confirm, and consider deriving this path instead of hard-coding it.
os.chdir("e:/MLOps/DataScienceProject2.0")

In [14]:
%pwd

'e:\\MLOps\\DataScienceProject2.0'

In [23]:
from dataclasses import dataclass
from pathlib import Path

# @dataclass generates __init__ from the annotated fields, so no hand-written self.<field> assignments are needed

@dataclass
class DataIngestionConfig:
    """Settings for the data-ingestion step.

    Instances are built by ``ConfigurationManager.get_data_ingestion_config``
    from the ``data_ingestion`` section of the loaded configuration and are
    consumed by ``DataIngestion``.
    """

    # Directory for this step; the configuration manager passes it to create_directories.
    root_dir: Path
    # URL handed to urllib's urlretrieve when the data file is not already present.
    source_URL: str
    # Destination path of the downloaded file; also the file checked for a .zip suffix.
    local_data_file: Path
    # Directory the archive is extracted into (created on demand by DataIngestion).
    unzip_dir: Path


In [24]:
from src.DataScienceProject.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH, SCHEMA_FILE_PATH
from src.DataScienceProject.utils.common import read_yaml, create_directories

### Explanation of `ConfigurationManager` Class

The `ConfigurationManager` class helps organize and manage the settings needed for a data science project. Here’s how it works in simple terms:

- **Initialization (`__init__` method):**
    - It reads three configuration files: one for general settings, one for parameters, and one for schema (structure of data).
    - It stores the information from these files so the rest of the project can use them easily.
    - It also makes sure that the main folder for saving results (artifacts) exists.

- **Getting Data Ingestion Settings (`get_data_ingestion_config` method):**
    - This method looks at the part of the configuration that tells how to get the data (where to download it from, where to save it, etc.).
    - It makes sure the folder for storing the raw data exists.
    - It creates a simple object (`DataIngestionConfig`) that holds all the important details for getting the data, like the download link and file paths.
    - This object is then returned so other parts of the project can use it to fetch and store data.

In summary, this class makes it easy to read settings from files and prepare everything needed to start working with data, without having to manually set up folders or paths.

In [25]:
class ConfigurationManager:
    """Loads the project's YAML files and hands out per-stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """Read the config, params and schema files and prepare the artifacts root.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # The artifacts root named in the main config is handed to create_directories up front.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the ``DataIngestionConfig`` from the ``data_ingestion`` config section.

        The section's ``root_dir`` is passed to ``create_directories`` before
        the config object is assembled.

        Returns:
            DataIngestionConfig: ``root_dir``, ``local_data_file`` and
            ``unzip_dir`` wrapped in ``Path``; ``source_URL`` passed through
            unchanged.
        """
        section = self.config.data_ingestion
        create_directories([section.root_dir])

        return DataIngestionConfig(
            root_dir=Path(section.root_dir),
            source_URL=section.source_URL,
            local_data_file=Path(section.local_data_file),
            unzip_dir=Path(section.unzip_dir),
        )

In [26]:
import os
import urllib.request as request
from src.DataScienceProject import logger
import zipfile

In [29]:
# Component: data ingestion


class DataIngestion:
    """Fetches the raw data file described by a ``DataIngestionConfig`` and unpacks it if it is a zip."""

    def __init__(self, config: "DataIngestionConfig"):
        """Store the ingestion settings.

        Args:
            config: Object exposing ``source_URL``, ``local_data_file`` and
                ``unzip_dir``.
        """
        self.config = config

    def download_file(self):
        """Download ``source_URL`` to ``local_data_file`` unless that file already exists.

        The payload is first written to ``<local_data_file>.part`` and only
        moved into place once the transfer has finished, so an interrupted
        download can never be mistaken for a complete file on the next run.
        The destination's parent directory is created when missing.

        Returns:
            None

        Raises:
            Any error raised by ``urllib.request.urlretrieve`` or by the final
            rename is propagated after the partial file has been removed.
        """
        target = self.config.local_data_file
        if os.path.exists(target):
            logger.info(f"file already exists")
            return

        parent_dir = os.path.dirname(target)
        if parent_dir:  # empty when the target is a bare file name in the cwd
            os.makedirs(parent_dir, exist_ok=True)

        partial = f"{target}.part"
        try:
            _, headers = request.urlretrieve(
                url=self.config.source_URL,
                filename=partial,
            )
            # Atomic on the same filesystem: the target appears only when complete.
            os.replace(partial, target)
        except BaseException:
            # Also covers KeyboardInterrupt, so no stale partial file is left behind.
            if os.path.exists(partial):
                os.remove(partial)
            raise

        logger.info(f"{target} download! with following info: \n{headers}")

    def extract_zip_file(self):
        """Extract ``local_data_file`` into ``unzip_dir`` when it is a ``.zip`` archive.

        ``unzip_dir`` is always created. The suffix check is case-insensitive,
        so ``data.ZIP`` is extracted as well. Files with any other suffix
        (for example a plain CSV) are left untouched and only logged.

        Returns:
            None

        Raises:
            Errors from opening or reading the archive are not caught here.
        """
        data_file = self.config.local_data_file
        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path, exist_ok=True)

        if str(data_file).lower().endswith(".zip"):
            with zipfile.ZipFile(data_file, "r") as zip_ref:
                zip_ref.extractall(unzip_path)
            logger.info(f"Extracted zip file to {unzip_path}")
        else:
            logger.info(f"File {data_file} is not a zip file, skipping extraction")
            # For non-zip files like CSV, just log that they're ready to use
            logger.info(f"Data file is ready at: {data_file}")

In [30]:
# Run the data-ingestion stage end to end: load the configuration, build the
# ingestion settings, download the data file, then extract it if it is a zip.
# No try/except wrapper: catching an exception only to re-raise it unchanged
# adds nothing, so errors are simply allowed to propagate.
config = ConfigurationManager()
data_ingestion_config = config.get_data_ingestion_config()
data_ingestion = DataIngestion(config=data_ingestion_config)
data_ingestion.download_file()
data_ingestion.extract_zip_file()

[2025-08-31 23:08:04,889: INFO: common : yaml file: config\config.yml loaded successfully]
[2025-08-31 23:08:04,892: INFO: common : yaml file: params.yml loaded successfully]
[2025-08-31 23:08:04,895: INFO: common : yaml file: schema.yml loaded successfully]
[2025-08-31 23:08:04,896: INFO: common : created directory at : artifacts]
[2025-08-31 23:08:04,898: INFO: common : created directory at : artifacts/data_ingestion]
[2025-08-31 23:08:04,892: INFO: common : yaml file: params.yml loaded successfully]
[2025-08-31 23:08:04,895: INFO: common : yaml file: schema.yml loaded successfully]
[2025-08-31 23:08:04,896: INFO: common : created directory at : artifacts]
[2025-08-31 23:08:04,898: INFO: common : created directory at : artifacts/data_ingestion]
[2025-08-31 23:08:04,898: INFO: 2557496555 : file already exists]
[2025-08-31 23:08:04,898: INFO: 2557496555 : File artifacts\data_ingestion\data.csv is not a zip file, skipping extraction]
[2025-08-31 23:08:04,898: INFO: 2557496555 : Data fil

In [None]:
# # Restart kernel and reimport everything
# %reset -f