In [1]:
# config_entity.py
from dataclasses import dataclass
from pathlib import Path
import os


@dataclass(frozen=True)            # frozen=True makes instances immutable: assigning to a field raises
class DataIngestionConfig:
    """Settings consumed by the data-ingestion stage.

    Built by ``ConfigurationManager.get_data_ingestion_config`` from the
    ``data_ingestion`` section of the config file and handed to
    ``DataIngestion``.

    Note: dataclass annotations are not enforced at runtime; each field holds
    whatever value the caller passes in.
    """
    root_dir: Path                 # directory created for this stage before the config is returned
    source_URL: str                # URL the archive is downloaded from
    local_data_file: Path          # path the downloaded archive is written to and later opened from
    unzip_dir: Path                # directory the archive's .jpg members are extracted into

In [2]:
# configuration.py
from CNNClassifier.constants import *
from CNNClassifier.utils.utils import read_yaml, create_directory

# Configuration for the Data Ingestion Component

In [3]:
# configuration.py
class ConfigurationManager:
    """Map the YAML configuration files onto the entities in config_entity.py.

    Both YAML files are read once, at construction time. Each ``get_*_config``
    method then picks one section of the loaded config and returns it as the
    matching dataclass.
    """

    def __init__(self, config_file_path=CONFIG_FILE_PATH, params_file_path=PARAMS_FILE_PATH):
        """Load the config and params files and create the artifacts root.

        Args:
            config_file_path: Location of the config YAML file.
            params_file_path: Location of the params YAML file.
        """
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_file_path)
        # Create the artifacts root directory named in the config file.
        create_directory([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion settings from the ``data_ingestion`` section.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A ``DataIngestionConfig`` whose path-typed fields are real
            ``pathlib.Path`` objects, as the entity's annotations declare.
        """
        config = self.config.data_ingestion
        # artifacts/data_ingestion folder created
        create_directory([config.root_dir])

        # The YAML loader yields plain values; convert the filesystem entries so
        # the entity actually carries the Path types it advertises.
        return DataIngestionConfig(
            root_dir=Path(config.root_dir),
            source_URL=config.source_URL,
            local_data_file=Path(config.local_data_file),
            unzip_dir=Path(config.unzip_dir),
        )

# Data Ingestion Class Definition

In [4]:
# Stage_01_data_ingestion.py
import urllib.request as request
from zipfile import ZipFile
class DataIngestion:
    """Download the dataset archive and extract its image members.

    All locations come from the config object given to the constructor; the
    methods read its ``source_URL``, ``local_data_file`` and ``unzip_dir``
    attributes.
    """

    # Quoted annotation: it is resolved lazily, so defining this class does not
    # require the config entity to be in scope yet.
    def __init__(self, config: "DataIngestionConfig"):
        """Store the ingestion settings.

        Args:
            config: Object exposing ``source_URL``, ``local_data_file`` and
                ``unzip_dir``.
        """
        self.config = config

    def download_file(self):
        """Download ``source_URL`` to ``local_data_file`` unless it already exists.

        The payload is written to a ``.part`` sibling first and only renamed to
        its final name once the transfer has finished. An interrupted download
        therefore never leaves a truncated archive that a later run would
        mistake for a complete one.

        Raises:
            urllib.error.URLError: If the URL cannot be fetched (propagated
                from ``urlretrieve``).
        """
        target = os.fspath(self.config.local_data_file)
        if os.path.exists(target):
            # Already fetched by a previous run; nothing to do.
            return

        parent = os.path.dirname(target)
        if parent:
            os.makedirs(parent, exist_ok=True)

        partial = target + ".part"
        try:
            request.urlretrieve(url=self.config.source_URL, filename=partial)
            os.replace(partial, target)
        finally:
            # Still present only when the transfer or the rename failed.
            if os.path.exists(partial):
                os.remove(partial)

    def get_updated_list_of_files(self, list_of_files, extensions=(".jpg",)):
        """Return the names from ``list_of_files`` that end with a wanted suffix.

        Args:
            list_of_files: Iterable of archive member names.
            extensions: Suffix or collection of suffixes to keep. Matching is
                case-sensitive. Defaults to ``(".jpg",)``.

        Returns:
            A list of the matching names, in their original order.
        """
        if isinstance(extensions, str):
            extensions = (extensions,)
        suffixes = tuple(extensions)
        return [f for f in list_of_files if f.endswith(suffixes)]

    def preprocess(self, zf: ZipFile, f: str, working_dir: str):
        """Extract member ``f`` of ``zf`` into ``working_dir`` if not already there.

        Args:
            zf: Open archive to read from.
            f: Name of the member to extract.
            working_dir: Directory to extract into.
        """
        target_file_path = os.path.join(working_dir, f)
        if not os.path.exists(target_file_path):
            zf.extract(f, working_dir)

    def unzip_and_clean(self):
        """Extract every ``.jpg`` member of the downloaded archive into ``unzip_dir``.

        Members that already exist at the destination are left untouched.
        """
        with ZipFile(self.config.local_data_file, mode='r') as zipObj:
            # Keep only the image members; everything else in the archive is skipped.
            wanted = self.get_updated_list_of_files(zipObj.namelist())
            for member in wanted:
                self.preprocess(zipObj, member, self.config.unzip_dir)

# Data Ingestion Pipeline

In [5]:
# Step up one directory from the current working directory so that the relative
# paths used later resolve from there (presumably the project root — confirm).
# NOTE(review): not idempotent — each re-run of this cell climbs one more level.
os.chdir("../")
# Display the working directory to confirm where the kernel now is.
%pwd

'f:\\Machine_Learning\\full_stack_data_science-ineuron\\Computer Vision\\CNNProjectStructure'

In [6]:
# data_ingestion_pipeline.py
# Runs stage 01 end to end: load configuration, download the archive, extract the images.
# NOTE(review): every logger call in this cell is commented out, presumably because
# no `logger` is imported in the notebook — re-enable them once one is available.

# Reads the config and params YAML files and creates the artifacts root directory.
config = ConfigurationManager()
# Creates the stage's root_dir and returns the frozen DataIngestionConfig.
data_ingestion_config = config.get_data_ingestion_config()
# Pass the settings taken from config.yaml to the DataIngestion class (stage_01_data_ingestion.py).
data_ingestion = DataIngestion(config=data_ingestion_config)

# Download the archive from source_URL; skipped when local_data_file already exists.
#logger.info("Downloading Data")
data_ingestion.download_file()
#logger.info("Downloaded Data Successfully")

# Extract the .jpg members of the downloaded archive into unzip_dir.
#logger.info("Unzipping and Cleaning Data")
data_ingestion.unzip_and_clean()
#logger.info("Unzipped and Cleaned Data Successfully")

#logger.info("Stage 01: Data Ingestion completed successfully.")

[2024-01-10 19:00:03,403: INFO: utils]: Directory created at artifacts
[2024-01-10 19:00:03,404: INFO: utils]: Directory created at artifacts/data_ingestion
