### Create DataIngestionWrappers
- Entity (just a user-defined return type)
    - has a `dataclass decorator`



In [1]:
import os
%pwd

'/home/prateek/ThinkAuto/MLOPS/ChestDiseaseClassification/MLOPS_Deep_Learning/research'

In [2]:
os.chdir("../")

In [3]:
%pwd

'/home/prateek/ThinkAuto/MLOPS/ChestDiseaseClassification/MLOPS_Deep_Learning'

### Create data ingestion entity
- WIll finally go inside `src/vggClassifier/entity/config_entity.py`

In [4]:
from pathlib import Path
from dataclasses import dataclass # 

# dataclass is used to store data
# Type safety -> mention the datatypes
# frozen=True -> immutabilty throughout program's execution
@dataclass(frozen=True)
class DataIngestionConfig: # wraps the dict of config.yaml
    root_dir: Path
    source_url: str
    unzip_dir: Path
    local_data_file: Path



# config = DataIngestionConfig(
#     root_dir=Path('/path/to/root'),
#     source_url='http://example.com/data.zip',
#     unzip_dir=Path('/path/to/unzip'),
#     local_data_file=Path('/path/to/local/file.zip')
# )

# # Accessing the attributes
# print(config.root_dir)  # Output: /path/to/root
# print(config.source_url)  # Output: http://example.com/data.zip

In [5]:
from vggClassifier.constants import *
from vggClassifier.utils.common import read_yaml, create_directories
from vggClassifier import *


In [6]:
# dataclass is used to store data
# Type safety -> mention the datatypes
# frozen=True -> immutabilty throughout program's execution
@dataclass(frozen=True)
class DataIngestionConfig: # wraps the dict of config.yaml
    root_dir: Path
    source_url: str
    unzip_dir: Path
    local_data_file: Path



class DataIngestionManager:
    '''
        - Creates the required folders using config.yaml
        - Sets the data_ingestion properties as a dict
        - Finally wraps the dict as an entity (here DataIngestionConfig)
    '''
    def __init__(self, \
                 CONFIG_FILE_PATH,
                 PARAMS_FILE_PATH):
        self.params_filepath = PARAMS_FILE_PATH
        self.config_filepath = CONFIG_FILE_PATH

        # load these yaml
        self.config_dict = read_yaml(filepath=self.config_filepath)
        self.params_dict = read_yaml(filepath=self.params_filepath)

        # create artifacts_root directory 
        create_directories([self.config_dict.artifacts_root]) # BoxConfig type


    def get_data_ingestion_as_entity(self)->DataIngestionConfig:
        """
            Creates data_ingestion folder
            Sets the key-value of data_ingestion in config.yaml
            Returns as a DataIngestionConfig type
        """
        config = self.config_dict.data_ingestion

        # create root directory
        create_directories([config.root_dir])

        data_ingest_obj = DataIngestionConfig( root_dir=config.root_dir,\
                             source_url=config.source_url,
                             unzip_dir=config.unzip_dir,
                             local_data_file=config.local_data_file
                              )
        return data_ingest_obj


In [7]:
import gdown

class DataIngestion:
    '''
        - Downloads data from gdrive
        - Then unzips the zip file
    '''
    def __init__(self, config: DataIngestionConfig):
        self.config = config # this is the config.data_ingestion

    def download_data(self):
        try: 
            url = self.config.source_url
            file_id = url.split("/")[-2]
            prefix = "https://drive.google.com/uc?/export=download&id="
            zip_filename = self.config.local_data_file
            gdown.download(prefix + file_id, zip_filename)
            logger.info(f"Downloading complete and renamed as: {zip_filename}")
        except Exception as e:
            raise e

    def unzip_data(self):
        '''
            - Make the unzip_dir
            - unzip inside this dir
        '''
        # create dir to put the files after unzip
        unzip_dir = self.config.unzip_dir
        create_directories([unzip_dir])

        # unzip the files
        import zipfile
        zip_filename = self.config.local_data_file
        with zipfile.ZipFile(zip_filename, "r") as f:
            f.extractall(unzip_dir)
            logger.info(f"Zipped file: {zip_filename} is unzipped at: {unzip_dir}")


### Pipeline

In [9]:
dim = DataIngestionManager( CONFIG_FILE_PATH, PARAMS_FILE_PATH )
print( dim.params_dict ) # {'key': 'value'} dummy yaml have to have something else error
config = dim.get_data_ingestion_as_entity()

data_ingestion_object = DataIngestion(config)
data_ingestion_object.download_data()
data_ingestion_object.unzip_data()


[2024-07-24 13:23:57,968: INFO: common: yaml file config/config.yaml is loaded successfully]
[2024-07-24 13:23:57,974: INFO: common: yaml file params.yaml is loaded successfully]
[2024-07-24 13:23:57,975: INFO: common: created directory at: artifacts]
{'key': 'value'}
[2024-07-24 13:23:57,978: INFO: common: created directory at: artifacts/data_ingestion/]


Downloading...
From (original): https://drive.google.com/uc?/export=download&id=101_DIOzF9sP9T7HJHh5Z8V2eDHgPrtTo
From (redirected): https://drive.google.com/uc?%2Fexport=download&id=101_DIOzF9sP9T7HJHh5Z8V2eDHgPrtTo&confirm=t&uuid=b9fd6df6-9a92-4773-bf83-7d6d191b428e
To: /home/prateek/ThinkAuto/MLOPS/ChestDiseaseClassification/MLOPS_Deep_Learning/artifacts/data_ingestion/data.zip
100%|██████████| 49.0M/49.0M [00:08<00:00, 5.89MB/s]

[2024-07-24 13:24:10,996: INFO: 475719230: Downloading complete and renamed as: artifacts/data_ingestion/data.zip]
[2024-07-24 13:24:10,998: INFO: common: created directory at: artifacts/data_ingestion]





[2024-07-24 13:24:11,359: INFO: 475719230: Zipped file: artifacts/data_ingestion/data.zip is unzipped at: artifacts/data_ingestion]
