In [1]:
import os
import urllib.request as request
import zipfile
from dataclasses import dataclass, replace
from pathlib import Path
from urllib.error import HTTPError

import yaml

from ImageClassification import logger
from ImageClassification.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH
from ImageClassification.utils.common import get_size
from ImageClassification.utils.common import read_yaml, create_directories

In [2]:
'''
Setting up the directories
'''

CURRENT_DIRECTORY = os.getcwd()
PARENT_DIRECTORY = os.path.dirname(CURRENT_DIRECTORY)

# data Directory
DATA_PARENT_DIRECTORY = os.path.dirname(PARENT_DIRECTORY)
DATA_DIRECTORY = os.path.join(DATA_PARENT_DIRECTORY, '/_data_DL_Chicken_Disease_Classification_AWS_AZURE_DVC')

TRAIN_DATA = os.path.join(DATA_PARENT_DIRECTORY, '/train')
VALIDATION_DATA = os.path.join(DATA_PARENT_DIRECTORY, '/test')
TEST_DATA = os.path.join(DATA_PARENT_DIRECTORY, '/validation')

# # # Usage
# CONFIG_DIRECTORY = os.path.join(PARENT_DIRECTORY, 'config')
# CONFIG_FILE_PATH = os.path.join(CONFIG_DIRECTORY, 'config.yaml')


# print('CURRENT_DIRECTORY', CURRENT_DIRECTORY)
# print('PARENT_DIRECTORY', PARENT_DIRECTORY)
# print('DATA_PARENT_DIRECTORY', DATA_PARENT_DIRECTORY)
# print('DATA_DIRECTORY', DATA_DIRECTORY)
# print('TRAIN_DATA', TRAIN_DATA)
# print('VALIDATION_DATA', VALIDATION_DATA)
# print('TEST_DATA', TEST_DATA)
# print('CONFIG_DIRECTORY', CONFIG_DIRECTORY)
# print('CONFIG_FILE_PATH', CONFIG_FILE_PATH)

In [3]:


@dataclass(frozen=True)
class DataIngestionConfig:
    """Settings consumed by the data-ingestion step.

    The dataclass is declared with ``frozen=True``, so fields cannot be
    reassigned after construction; build a new instance to change a value.
    """
    # Read from the 'root_dir' entry of the 'data_ingestion' config section.
    root_dir: Path
    # URL handed to urllib's urlretrieve() when downloading the archive.
    source_URL: str
    # Destination of the download; later opened as a zip archive.
    local_data_file: Path
    # Directory the archive is extracted into.
    unzip_dir: Path
    # Dataset split directories (train / validation / test).
    train_dir: Path
    validation_dir: Path
    test_dir: Path

In [4]:
class ConfigurationManager:
    """Load the project YAML files and build typed configuration objects."""

    def __init__(self,
        config_filepath: Path = CONFIG_FILE_PATH,
        params_filepath: Path = PARAMS_FILE_PATH):
        """Read the config and params files and prepare the artifacts root.

        Args:
            config_filepath: Path of the main YAML configuration file.
            params_filepath: Path of the YAML parameters file.
        """
        # Remember where the files live so later methods can re-read them.
        self.config_filepath = config_filepath
        self.params_filepath = params_filepath

        # NOTE(review): read_yaml is a project helper; its result is accessed
        # by attribute below, so it is assumed to be an attribute-style mapping.
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion settings from the config file.

        The file at ``self.config_filepath`` is parsed again with
        ``yaml.safe_load``. ``root_dir`` is taken verbatim from the
        ``data_ingestion`` section; every other path is resolved relative to
        the top-level ``data_root_path`` entry.

        Returns:
            A populated ``DataIngestionConfig``.

        Raises:
            KeyError: If ``data_root_path``, ``data_ingestion`` or one of the
                expected ``data_ingestion`` entries is missing.
        """
        # Parse into a local variable. Rebinding self.config to the plain dict
        # returned by safe_load would break attribute access such as
        # self.config.artifacts_root for any code that runs afterwards.
        with open(self.config_filepath, 'r', encoding='utf-8') as file:
            raw_config = yaml.safe_load(file)

        # An empty `data_root_path:` entry parses to None; treat it like ''.
        data_root_path = Path(raw_config['data_root_path'] or '')
        data_ingestion = raw_config['data_ingestion']

        return DataIngestionConfig(
            root_dir=Path(data_ingestion['root_dir']),
            train_dir=data_root_path / data_ingestion['train_dir'],
            validation_dir=data_root_path / data_ingestion['validation_dir'],
            test_dir=data_root_path / data_ingestion['test_dir'],
            source_URL=data_ingestion['source_URL'],
            local_data_file=data_root_path / data_ingestion['local_data_file'],
            unzip_dir=data_root_path / data_ingestion['unzip_dir'],
        )


In [5]:
class DataIngestion:
    """Download the dataset archive and unpack it as described by the config."""

    def __init__(self, config: DataIngestionConfig):
        """Store the ingestion settings used by the other methods."""
        self.config = config

    def download_file(self):
        """Download ``config.source_URL`` to ``config.local_data_file``.

        Nothing is downloaded when either setting is empty or when the
        destination already exists. The payload is first written to a sibling
        ``<name>.part`` file and only moved into place once the transfer has
        finished, so an interrupted download never leaves a truncated archive
        that a later run would mistake for a complete one.

        Raises:
            HTTPError: Logged and re-raised when the server answers with an
                HTTP error status.
            Exception: Any other download failure is logged and re-raised.
        """
        if not (self.config.source_URL and self.config.local_data_file):
            logger.info('Skipping download, source_URL or local_data_file not provided.')
            return

        local_data_file_path = Path(self.config.local_data_file)
        local_data_file_path.parent.mkdir(parents=True, exist_ok=True)

        if local_data_file_path.exists():
            logger.info(f'File already exists of size: {get_size(local_data_file_path)}')
            return

        partial_path = local_data_file_path.with_name(local_data_file_path.name + '.part')
        try:
            request.urlretrieve(url=self.config.source_URL, filename=partial_path)
            # Publish the archive only after the whole payload has arrived.
            os.replace(partial_path, local_data_file_path)
        except HTTPError as e:
            logger.error(f'HTTP Error: {e}')
            raise
        except Exception as e:
            logger.error(f'Error during file download : {e}')
            raise
        finally:
            # After a successful move the .part file is gone; anything still
            # here is the leftover of a failed transfer.
            if partial_path.exists():
                partial_path.unlink()

    def extract_zip_file(self):
        """Extract ``config.local_data_file`` into ``config.unzip_dir``.

        The target directory is created when missing. Returns None.
        """
        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path, exist_ok=True)
        with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
            zip_ref.extractall(unzip_path)

    def setup_local_data(self):
        """Prefix the split directories with ``root_dir`` and check they exist.

        The config dataclass is frozen, so its fields cannot be assigned to.
        A copy carrying the joined paths is built with ``dataclasses.replace``
        and stored back on ``self.config``. A warning is logged for every
        resulting directory that does not exist. Returns None.

        Note that each call joins ``root_dir`` again, so relative split paths
        gain another prefix on repeated calls.
        """
        root_dir = Path(self.config.root_dir)
        self.config = replace(
            self.config,
            train_dir=root_dir / self.config.train_dir,
            validation_dir=root_dir / self.config.validation_dir,
            test_dir=root_dir / self.config.test_dir,
        )

        for path in (self.config.train_dir, self.config.validation_dir, self.config.test_dir):
            if not os.path.exists(path):
                logger.warning(f'Missing expected directory: {path}')


In [6]:
# Run the ingestion steps in order: load the configuration, download the
# archive, then unpack it. Any failure is logged and propagated.
try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config)
    data_ingestion.download_file()
    data_ingestion.extract_zip_file()
except Exception as e:
    logger.error(f"Error during data ingestion: {e}")
    # Bare raise re-raises the active exception with its original traceback.
    raise


YAML content: {'artifacts_root': 'artifacts', 'data_root_path': '', 'data_ingestion': {'root_dir': 'artifacts/data_ingestion', 'source_URL': 'https://github.com/entbappy/Branching-tutorial/raw/master/Chicken-fecal-images.zip', 'local_data_file': 'artifacts/data_ingestion/data.zip', 'unzip_dir': 'artifacts/data_ingestion', 'train_dir': '', 'validation_dir': '', 'test_dir': ''}, 'prepare_base_model': {'root_dir': 'artifacts/prepare_base_model', 'base_model_path': 'artifacts/prepare_base_model/base_model.h5', 'update_base_model_path': 'artifacts/prepare_base_model/base_model_update.h5'}, 'perpare_callbacks': {'root_dir': 'artifacts/prepare_callbacks', 'tensorboard_root_log_dir': 'artifacts/prepare_callbacks/tensorboard_log_dir', 'checkpoint_model_filepath': 'artifacts/prepare_callbacks/checkpoint_dir/model.keras'}, 'training': {'root_dir': 'artifacts/training', 'trained_model_path': 'artifacts/training/model.h5'}}
[2024-05-24 13:36:55,577: INFO: common: YAML file: C:\Users\petersunny.mark

In [7]:
# ----------------------------------- Demo Code end here -----