In [8]:
import os

## Data Ingestion Trial Notebook
Experiment here before running the data ingestion script:

In [26]:
# check current directory
%pwd

'c:\\Users\\memor\\Documents\\MLProjects\\DataScience-w-Bappy\\MLOPs-Production-Ready-Deep-Learning-Project\\research'

In [32]:
# Going one folder up, from the "research" folder to the project root.
# Guarded so that re-running this cell does not keep climbing the directory
# tree (an unconditional chdir("../") moves up again on every execution).
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [33]:
# Configuring the data class

from dataclasses import dataclass
from pathlib import Path


# All values will be read in from the config.yaml file

@dataclass(frozen=True)
class DataIngestionConfig:
    """Entity class holding the data-ingestion settings.

    The dataclass is frozen, so fields cannot be reassigned after creation.
    """

    # Path to the root directory
    root_dir: Path
    # URL of the source of dataset
    source_url: str
    # Path to the local folder containing downloaded zip file
    local_data_file: Path
    # Path to the directory containing unzipped dataset
    extract_dir: Path
    

In [34]:
# Read constants from the *chest_xray_classifier/constants* file containing paths to 
# the config.yaml and params.yaml files

from chest_xray_classifier.constants import *   # import all 

# Import the utilities from the *chest_xray_classifier/utils* package
from chest_xray_classifier.utils.common import read_yaml, create_directories



## Writing the Configuration manager file/class
The manager receives the `CONFIG_FILE_PATH` and `PARAMS_FILE_PATH` variables, reads both YAML files,
and is then used to construct the configuration for the different pipelines.
* src\chest_xray_classifier\config\configuration.py

The file above will now be populated with this class.

In [36]:
# Create draft Configuration Manager class
class ConfigurationManager:
    """Draft configuration manager.

    Reads the config and params YAML files and hands out the entity
    configuration objects built from them.
    """

    def __init__(self, config_file_path=CONFIG_FILE_PATH, params_file_path=PARAMS_FILE_PATH):
        """Load both YAML files and set up the artifacts root.

        Args:
            config_file_path: location of the config YAML file.
            params_file_path: location of the params YAML file.
        """
        # Load the config and params YAML files.
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_file_path)

        # Hand the artifacts root to the directory-creation helper.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion entity from the ``data_ingestion`` section of the config."""
        ingestion_section = self.config.data_ingestion

        # Add other data ingestion config parameters here if required by the project.
        return DataIngestionConfig(
            root_dir=ingestion_section.root_dir,
            source_url=ingestion_section.source_url,
            local_data_file=ingestion_section.local_data_file,
            extract_dir=ingestion_section.extract_dir,
        )
        
        

## Implementing the Components
Define the components that use the entity configuration classes, e.g. `DataIngestion`, which describes how data ingestion is handled.

In [37]:
# import all necessary packages
import os
import zipfile
import gdown
from chest_xray_classifier import logger
from chest_xray_classifier.utils.common import get_size


In [38]:
# data ingestion pipeline
class DataIngestion:
    """Data ingestion component: downloads the dataset archive and unpacks it.

    The component reads ``source_url``, ``local_data_file`` and ``extract_dir``
    from the configuration object it is given.
    """

    # The annotation is quoted so defining this class does not require the
    # entity class to be evaluated first.
    def __init__(self, config: "DataIngestionConfig"):
        self.config = config

    # download data from source
    def download_file(self) -> str:
        """Fetch the dataset archive from the source URL.

        The parent directory of ``config.local_data_file`` is created if it
        does not exist, then the file is downloaded there with ``gdown``.

        Returns:
            The local path the archive was downloaded to, as a string.

        Raises:
            Any exception raised by ``gdown`` or the filesystem propagates
            unchanged.
        """
        dataset_url = self.config.source_url
        # gdown is handed a plain string path even if the config holds a Path.
        zip_download_path = str(self.config.local_data_file)

        # Create the directory that will hold the archive, derived from the
        # configured path rather than a hard-coded folder name.
        parent_dir = os.path.dirname(zip_download_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        logger.info(f"Downloading data from {dataset_url} into {zip_download_path}")

        # Download from gdrive storage.
        # NOTE(review): assumes a share link whose second-to-last path segment
        # is the file id (e.g. ".../d/<file_id>/view") - confirm against config.yaml.
        file_id = dataset_url.split("/")[-2]
        # Construct the download URL
        prefix_download_url = f"https://drive.google.com/uc?id={file_id}&export=download"

        # Download the file into the local path
        gdown.download(prefix_download_url, zip_download_path, quiet=False)
        return zip_download_path

    # Extract the downloaded zip file
    def extract_zip(self) -> None:
        """Extract ``config.local_data_file`` into ``config.extract_dir``.

        The extract directory is created if it does not exist.
        Returns None.
        """
        unzip_path = self.config.extract_dir
        # create extract directory in case it doesn't exist
        os.makedirs(unzip_path, exist_ok=True)
        # read zip file and extract it; extractall is a method of the opened
        # archive object, not of the zipfile module
        with zipfile.ZipFile(self.config.local_data_file, "r") as zip_file:
            zip_file.extractall(unzip_path)
        

## Execute the Pipeline
Put all pieces together

In [None]:
# Run the whole ingestion pipeline; on failure, log the error with its
# traceback and re-raise it so the cell still fails visibly.

try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config)
    data_ingestion.download_file()
    data_ingestion.extract_zip()
except Exception:
    # A bare `raise` preserves the original traceback.
    logger.exception("Data ingestion pipeline failed")
    raise
