We prototype the data-ingestion stage in this notebook first, because it is easier to follow step by step; afterwards the notebook has to be converted into modular code.

i.e. entity -> source configuration -> component -> pipeline

'src' is the main folder that contains all of these modules; the 'research' folder was created so we can experiment first and check that everything works.

For the modular version, we just copy each part of this notebook into its respective folder and import it from there. I mark these changes with orange comments to highlight them.

Note: once the code is moved into modules, the editor marks any names that have not been imported with a yellow underline, which tells us which imports still need to be added.

# Change to root dir to create artifacts folder

In [3]:
import os

In [4]:
# Project root directory. The path is machine-specific, so it can be overridden
# through the WINE_QUALITY_ML_PROJ_ROOT environment variable instead of editing
# the notebook; the original location remains the default.
directory_path = os.environ.get('WINE_QUALITY_ML_PROJ_ROOT', 'C:/WINE_QUALITY_ML_PROJ')

In [5]:
# Switch the working directory to the project root so that the relative paths
# used later in this notebook (the YAML files, the 'artifacts' folder) resolve there.
os.chdir(directory_path)

In [6]:
%pwd

'C:\\WINE_QUALITY_ML_PROJ'

# Create data ingestion Entity (class)

In [24]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """Read-only bundle of settings for the data-ingestion stage.

    Instances are frozen, so fields cannot be reassigned after construction.
    Field order matters for positional construction:
    root_dir, source_url, local_data_file, unzip_dir.
    """

    # Directory created for this stage (artifacts/data_ingestion in the sample run).
    root_dir: Path
    # Location the archive is downloaded from; a plain str because Python has
    # a Path type for filesystem paths but no dedicated URL type.
    source_url: str
    # Local file the downloaded archive is written to.
    local_data_file: Path
    # Directory the archive is extracted into.
    unzip_dir: Path

# Create Data ingestion source configuration

In [25]:
# The YAML file locations are fetched from mlProject.constants.
# Two helpers from mlProject.utils.common are used: read_yaml to load the YAML files and create_directories to create folders.

from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories

class ConfigurationManager:
    """Reads the project's YAML files and builds per-stage configuration objects.

    Creating an instance loads the config, params and schema YAML files and
    creates the top-level artifacts directory named in the config file.
    """

    def __init__(
            self,
            config_filepath = CONFIG_FILE_PATH,
            params_file_path = PARAMS_FILE_PATH,
            schema_file_path = SCHEMA_FILE_PATH):
        """Load the three YAML files and create the artifacts root folder.

        Args:
            config_filepath: path of the main config YAML file.
            params_file_path: path of the params YAML file.
            schema_file_path: path of the schema YAML file.
        """
        # read_yaml and create_directories are shared helpers from utils.common.
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_file_path)
        self.schema = read_yaml(schema_file_path)

        # 'artifacts_root' is a key in config.yaml; its value is the folder
        # name to create in the current working directory.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Create the data-ingestion folder and return that stage's settings.

        Returns:
            A DataIngestionConfig populated from the 'data_ingestion' section
            of the loaded config.
        """
        # 'data_ingestion' is a nested mapping inside the config; keep a short
        # handle to it so its entries can be read by attribute.
        ingestion_section = self.config.data_ingestion

        # root_dir holds the stage folder (artifacts/data_ingestion in the sample run).
        create_directories([ingestion_section.root_dir])

        return DataIngestionConfig(
            root_dir=ingestion_section.root_dir,
            source_url=ingestion_section.source_url,
            local_data_file=ingestion_section.local_data_file,
            unzip_dir=ingestion_section.unzip_dir,
        )

# creating data ingestion component

In [20]:
import os
import urllib.request as request # with the help of it, we download the data from the given url
import zipfile # to unzip the file
from pathlib import Path
from mlProject.logging.logging import logger
from mlProject.utils.common import get_size

In [26]:
class DataIngestion:
    """Downloads the dataset archive and unpacks it, driven by a DataIngestionConfig."""

    def __init__(self, config: DataIngestionConfig):
        # Settings for this stage (source URL, local archive path, unzip folder).
        self.config = config

    def download_file(self):
        """Download the archive from ``config.source_url`` unless it already exists locally.

        If ``config.local_data_file`` is already present, nothing is fetched and
        only its size is logged. Otherwise the archive is downloaded to that path.

        If the download fails or is interrupted, any partially written file is
        removed before the error propagates. Without this, the next run would
        see the leftover file, skip the download, and then fail while
        extracting a corrupt archive.

        Returns:
            None

        Raises:
            Any exception raised by ``urllib.request.urlretrieve``
            (for example ``URLError`` or ``ContentTooShortError``).
        """
        target = self.config.local_data_file

        if os.path.exists(target):
            logger.info(f"File already exists of size: {get_size(Path(target))}")
            return

        try:
            filename, headers = request.urlretrieve(
                url=self.config.source_url,
                filename=target
            )
        except BaseException:
            # urlretrieve writes straight into `target`, so a failure (or a
            # kernel interrupt, hence BaseException) can leave a truncated file.
            # Cleanup is best-effort: a failure to delete must not hide the
            # original error, which is always re-raised below.
            try:
                if os.path.exists(target):
                    os.remove(target)
            except OSError:
                pass
            raise

        logger.info(f"{filename} download! with following info: \n{headers}")

    def extract_zip_file(self):
        """Extract the downloaded archive into ``config.unzip_dir``.

        The target directory is created if it does not exist yet.

        Returns:
            None

        Raises:
            zipfile.BadZipFile: if ``config.local_data_file`` is not a valid zip archive.
            FileNotFoundError: if ``config.local_data_file`` does not exist.
        """
        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path, exist_ok=True)
        with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
            zip_ref.extractall(unzip_path)

        

# create the pipeline

In [27]:
try:
    # Step 1: read the YAML files (config, params, schema) and create the
    # 'artifacts' folder.
    config = ConfigurationManager()

    # Step 2: build the data-ingestion settings (source URL, local archive
    # path, unzip folder); this also creates the 'data_ingestion' folder
    # inside 'artifacts'.
    data_ingestion_config = config.get_data_ingestion_config()

    # Step 3: create the component from the settings obtained in step 2.
    data_ingestion = DataIngestion(config=data_ingestion_config)

    # Step 4: download the archive (skipped when it is already on disk).
    data_ingestion.download_file()

    # Step 5: unpack the archive into the unzip folder.
    data_ingestion.extract_zip_file()

except Exception:
    # Nothing is handled here; the error is passed on unchanged.
    raise

[2024-02-28 19:50:07,833: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-02-28 19:50:07,836: INFO: common: yaml file: params.yaml loaded successfully]
[2024-02-28 19:50:07,844: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-02-28 19:50:07,847: INFO: common: created directory at: artifacts]
[2024-02-28 19:50:07,851: INFO: common: created directory at: artifacts/data_ingestion]


[2024-02-28 19:50:09,388: INFO: 4022175765: artifacts/data_ingestion/data.zip download! with following info: 
Connection: close
Content-Length: 26148
Cache-Control: max-age=300
Content-Security-Policy: default-src 'none'; style-src 'unsafe-inline'; sandbox
Content-Type: application/zip
ETag: "026ed31829e6874ab63255ebd95cc71ee7e6404d584eaf71ae3f079e260ffc7b"
Strict-Transport-Security: max-age=31536000
X-Content-Type-Options: nosniff
X-Frame-Options: deny
X-XSS-Protection: 1; mode=block
X-GitHub-Request-Id: 5C54:3393CE:C7453:D61AE:65DF4118
Accept-Ranges: bytes
Date: Wed, 28 Feb 2024 14:20:08 GMT
Via: 1.1 varnish
X-Served-By: cache-bom4750-BOM
X-Cache: MISS
X-Cache-Hits: 0
X-Timer: S1709130008.366017,VS0,VE405
Vary: Authorization,Accept-Encoding,Origin
Access-Control-Allow-Origin: *
Cross-Origin-Resource-Policy: cross-origin
X-Fastly-Request-ID: 8382fb2b24e93752de12ce22b0907bb5626cf451
Expires: Wed, 28 Feb 2024 14:25:08 GMT
Source-Age: 0

]
