In [1]:
pwd

'E:\\Projects\\MLOPS\\kidney-disease-classification-mlops\\research'

# All of these operations are executed from the root of the project directory

In [2]:
import os

In [3]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable bundle of settings describing one data-ingestion source."""

    # Working directory for this ingestion source; the configuration manager
    # creates it before building the config.
    root_dir: Path
    # URL of the dataset to download.
    source_URL: str
    # Local download target handed to the downloader.
    local_data_file: Path
    # Directory into which the downloaded archive is unpacked.
    extracted_dir: Path

In [4]:
from kidneyDiseaseClassifier.constants import *
from kidneyDiseaseClassifier.utils.common import read_yaml, create_directories


In [5]:
class ConfigurationManager:
    """Load the project configuration and build per-stage config objects."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH):
        """
        Read the config and params YAML files and create the artifacts root.

        :param config_filepath: path of the main configuration YAML file
        :param params_filepath: path of the parameters YAML file
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        # Backward-compatible alias: the attribute was originally misspelled.
        self.prams = self.params
        create_directories([self.config.artifacts_root])

    def _build_data_ingestion_config(self, section) -> DataIngestionConfig:
        """
        Create the section's root directory and wrap the section in a DataIngestionConfig.

        :param section: config section exposing root_dir, source_URL,
            local_data_file and extracted_dir as attributes
        :return: the populated DataIngestionConfig
        """
        create_directories([section.root_dir])

        # Values are passed through exactly as loaded from the YAML file.
        return DataIngestionConfig(
            root_dir=section.root_dir,
            source_URL=section.source_URL,
            local_data_file=section.local_data_file,
            extracted_dir=section.extracted_dir,
        )

    def get_data_ingestion_gdrive_config(self) -> DataIngestionConfig:
        """
        Build the ingestion config from the ``data_ingestion_gdrive`` section.

        :return: DataIngestionConfig for the Google Drive source
        """
        return self._build_data_ingestion_config(self.config.data_ingestion_gdrive)

    def get_data_ingestion_kaggle_config(self) -> DataIngestionConfig:
        """
        Build the ingestion config from the ``data_ingestion_kaggle`` section.

        :return: DataIngestionConfig for the Kaggle source
        """
        return self._build_data_ingestion_config(self.config.data_ingestion_kaggle)

In [16]:
import os
import zipfile
import tarfile
import shutil
import gdown
import subprocess
import opendatasets as od
from kidneyDiseaseClassifier import logger
from kidneyDiseaseClassifier.utils.common import get_size

In [17]:
class DataIngestionGoogle:
    """Download a zip archive from Google Drive and unpack it locally."""

    def __init__(self, config: "DataIngestionConfig"):
        """
        :param config: ingestion settings (source_URL, local_data_file, extracted_dir)
        """
        self.config = config

    @staticmethod
    def _extract_file_id(dataset_url: str) -> str:
        """
        Pull the Google Drive file id out of a share URL.

        Handles ``.../d/<id>/...`` and ``...?id=<id>`` forms; any other shape
        falls back to the original "second-to-last path segment" rule.

        :param dataset_url: Google Drive share URL
        :return: the file id
        """
        import re

        match = (
            re.search(r"/d/([A-Za-z0-9_-]+)", dataset_url)
            or re.search(r"[?&]id=([A-Za-z0-9_-]+)", dataset_url)
        )
        if match:
            return match.group(1)
        return dataset_url.split('/')[-2]

    def download_gdrive_data(self):
        """
        Fetch the data from Gdrive

        The archive is written to ``config.local_data_file``; its parent
        directory is created if missing.

        :return: None
        :raises RuntimeError: if gdown reports that nothing was downloaded
        """

        try:
            dataset_url = self.config.source_URL
            zip_download_dir = self.config.local_data_file

            # Derive the target directory from the configured file path rather
            # than hard-coding it, so a changed config still works.
            target_dir = os.path.dirname(zip_download_dir)
            if target_dir:
                os.makedirs(target_dir, exist_ok=True)

            logger.info(f"Downloading data from the {dataset_url} into {zip_download_dir} location")
            file_id = self._extract_file_id(dataset_url)
            prefix = "https://drive.google.com/uc?/export=download&id="
            downloaded = gdown.download(prefix+file_id, zip_download_dir, resume=True)
            # gdown returns the output path on success; treat None as a failed download
            # instead of logging a success that never happened.
            if downloaded is None:
                raise RuntimeError(f"Download of {dataset_url} failed: gdown returned no output file")
            logger.info(f"Data has been downloaded at {zip_download_dir}")
        except Exception as e:
            logger.error(e)
            raise

    def extractor(self):
        """
        Extract the downloaded zip file (``config.local_data_file``) into
        ``config.extracted_dir``, creating that directory if needed.

        :return: None
        """
        unzip_path = self.config.extracted_dir
        os.makedirs(unzip_path, exist_ok=True)
        with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
            zip_ref.extractall(unzip_path)
        logger.info(f"Successfully extract the file at {unzip_path}")
            
            

In [19]:
class DataIngestionKaggle:
    """Download a public Kaggle dataset with the ``kaggle`` CLI and unpack any archive left behind."""

    def __init__(self, config: "DataIngestionConfig"):
        """
        :param config: ingestion settings (source_URL, local_data_file, extracted_dir)
        """
        self.config = config

    def download_kaggle_data(self):
        """
        Download data publicly available from kaggle

        Credentials are read from the YAML file at ``KAGGLE_SECRET_FILE_PATH`` and
        exported as ``KAGGLE_USERNAME`` / ``KAGGLE_KEY`` before the CLI is invoked.
        ``config.local_data_file`` is used as the CLI's ``-p`` download directory.

        :return: None
        :raises subprocess.CalledProcessError: if the kaggle CLI exits with a non-zero status
        """
        dataset_url = self.config.source_URL
        zip_download_dir = self.config.local_data_file
        # Create the configured download directory instead of a hard-coded one.
        os.makedirs(zip_download_dir, exist_ok=True)
        logger.info(f"Downloading data from the {dataset_url} into {zip_download_dir} location")

        kaggle_api = read_yaml(KAGGLE_SECRET_FILE_PATH)

        os.environ["KAGGLE_USERNAME"] = kaggle_api.kaggle_username
        os.environ["KAGGLE_KEY"] = kaggle_api.kaggle_api_key

        # "<owner>/<dataset>" part of the URL; strip slashes so a trailing "/"
        # in the configured URL does not end up in the slug.
        dataset_slug = dataset_url.split('/datasets/')[-1].strip('/')

        # Pass an argument list (not a whitespace-split string) so paths that
        # contain spaces survive, and fail loudly when the CLI fails.
        command = [
            "kaggle", "datasets", "download", dataset_slug,
            "-p", str(zip_download_dir),
            "--unzip",
        ]
        subprocess.run(command, check=True)

    def get_newly_downloaded_file(self, directory: str):
        """
        Return the most recently modified regular file directly inside ``directory``.

        :param directory: directory to inspect (sub-directories are ignored)
        :return: path of the newest file, or None when the directory is missing
            or contains no regular files
        """
        if not os.path.isdir(directory):
            return None

        candidates = [
            os.path.join(directory, name)
            for name in os.listdir(directory)
            if os.path.isfile(os.path.join(directory, name))
        ]
        if not candidates:
            return None

        return max(candidates, key=os.path.getmtime)

    def extractor(self):
        """
        Extract the newest archive found in ``config.local_data_file`` into
        ``config.extracted_dir`` and delete the archive afterwards.

        ``.zip`` files are handled with ``zipfile``; ``.tar``, ``.gz`` and ``.bz2``
        files with ``tarfile``. Any other file type is left untouched.

        :return: None
        :raises FileNotFoundError: if the download directory holds no file
        """
        unzip_path = self.config.extracted_dir
        compressed_file = self.get_newly_downloaded_file(self.config.local_data_file)
        if compressed_file is None:
            # Previously this surfaced as an opaque TypeError from os.path.exists(None).
            raise FileNotFoundError(
                f"No downloaded file found in {self.config.local_data_file}. Make sure file exists"
            )
        os.makedirs(unzip_path, exist_ok=True)

        logger.info(f"{compressed_file} file extraction is started!")

        file_extension = os.path.splitext(compressed_file)[1].lower()
        if file_extension == ".zip":
            with zipfile.ZipFile(compressed_file, 'r') as zip_ref:
                zip_ref.extractall(unzip_path)
        elif file_extension in (".tar", ".gz", ".bz2"):
            with tarfile.open(compressed_file, "r") as tar_ref:
                # Extract into the configured directory, like the zip branch.
                if hasattr(tarfile, "data_filter"):
                    # Downloaded archives are untrusted; use the safe extraction
                    # filter where this Python version provides it.
                    tar_ref.extractall(unzip_path, filter="data")
                else:
                    tar_ref.extractall(unzip_path)
        else:
            logger.warning(f"{compressed_file} is not a supported archive, nothing to extract")
            return

        logger.info(f"Successfully extract the file at {compressed_file}")
        # Delete the compressed file after extraction
        os.remove(compressed_file)
            

In [20]:
pwd

'E:\\Projects\\MLOPS\\kidney-disease-classification-mlops'

In [12]:
# The notebook starts in research/; step up to the project root exactly once so
# that relative paths such as "artifacts/..." resolve. The guard makes the cell
# safe to re-run (it will not climb a second level).
if os.path.basename(os.getcwd()) == "research":
    os.chdir("..")

NameError: name 'pwd' is not defined

In [14]:
# Google Drive ingestion: load the config, download the archive, then unpack it.
# Any exception propagates unchanged to the notebook.
config = ConfigurationManager()
data_ingestion_config = config.get_data_ingestion_gdrive_config()
data_ingestion = DataIngestionGoogle(config=data_ingestion_config)
data_ingestion.download_gdrive_data()
data_ingestion.extractor()

[2023-12-09 21:49:32,754]: INFO: common: yaml file: config\config.yaml loaded successfully!
[2023-12-09 21:49:32,755]: INFO: common: yaml file: params.yaml loaded successfully!
[2023-12-09 21:49:32,757]: INFO: common: Created directory at : artifacts
[2023-12-09 21:49:32,757]: INFO: common: Created directory at : artifacts/data_ingestion/gdrive
[2023-12-09 21:49:32,759]: INFO: 1950354356: Downloading data from the https://drive.google.com/file/d/1vlhZ5c7abUKF8xXERIw6m9Te8fW7ohw3/view?usp=sharing into artifacts/data_ingestion/gdrive/data.zip location


Downloading...
Resume: artifacts/data_ingestion/gdrive\data.zip
From (uriginal): https://drive.google.com/uc?/export=download&id=1vlhZ5c7abUKF8xXERIw6m9Te8fW7ohw3
From (redirected): https://drive.google.com/uc?/export=download&id=1vlhZ5c7abUKF8xXERIw6m9Te8fW7ohw3&confirm=t&uuid=0262a44f-9f98-484a-99fd-6b842d6257c7
To: E:\Projects\MLOPS\kidney-disease-classification-mlops\artifacts\data_ingestion\gdrive\data.zip
0.00B [00:00, ?B/s]

[2023-12-09 21:49:36,785]: INFO: 1950354356: Data has been downloaded at artifacts/data_ingestion/gdrive/data.zip





[2023-12-09 21:49:37,201]: INFO: 1950354356: Successfully extract the file at artifacts/data_ingestion/gdrive/extraction


In [21]:
# Kaggle ingestion: load the config, download the dataset, then unpack it.
# Any exception propagates unchanged to the notebook.
config = ConfigurationManager()
data_ingestion_config = config.get_data_ingestion_kaggle_config()
data_ingestion = DataIngestionKaggle(config=data_ingestion_config)
data_ingestion.download_kaggle_data()
data_ingestion.extractor()

[2023-12-09 23:34:07,020]: INFO: common: yaml file: config\config.yaml loaded successfully!
[2023-12-09 23:34:07,025]: INFO: common: yaml file: params.yaml loaded successfully!
[2023-12-09 23:34:07,028]: INFO: common: Created directory at : artifacts
[2023-12-09 23:34:07,029]: INFO: common: Created directory at : artifacts/data_ingestion/kaggle
[2023-12-09 23:34:07,030]: INFO: 1076420398: Downloading data from the https://www.kaggle.com/datasets/nazmul0087/ct-kidney-dataset-normal-cyst-tumor-and-stone/ into artifacts/data_ingestion/kaggle/data location
[2023-12-09 23:34:07,118]: INFO: common: yaml file: config\secrets.yaml loaded successfully!
