In [1]:
# Show the kernel's current working directory (the recorded output shows it starts in notebook/).
%pwd

'/workspaces/Credit_Card_Behaviour_Predicition/notebook'

In [2]:
import os
os.chdir('../')
%pwd

'/workspaces/Credit_Card_Behaviour_Predicition'

In [24]:
from dataclasses import dataclass
from pathlib import Path
from typing import List

@dataclass
class DataIngestionConfig:
    """Settings for the data-ingestion stage.

    Instances are built by ConfigurationManager.get_data_ingestion_config
    from the `data_ingestion` section of the loaded config and consumed by
    DataIngestion.
    """
    # Stage directory; created by get_data_ingestion_config before this object is built.
    root_dir: Path
    # URLs to download; DataIngestion.extract_file_id only recognises Google Drive links.
    source_URL: List[str]
    # NOTE(review): DataIngestion.download_files calls mkdir() on this path, so it is
    # treated as a directory despite the "file" in its name -- confirm against config.yaml.
    local_data_file: Path
    # Destination directories, index-aligned with source_URL: URL i is saved under csv_dir[i].
    csv_dir: List[Path]

In [5]:
from src.constants import *
from src.utils.common import read_yaml, create_directories

In [25]:
class ConfigurationManager:
    """Reads the project's YAML files and hands out per-stage config objects.

    The three YAML documents are parsed with `read_yaml` and kept on the
    instance as `config`, `params` and `schema`. Constructing the manager
    also makes sure the artifacts root directory exists.
    """

    def __init__(self,
                 config_filepath=CONFIG_FILE_PATH,
                 params_filepath=PARAMS_FILE_PATH,
                 schema_filepath=SCHEMA_FILE_PATH):
        """Load the config, params and schema files from the given paths.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_root = Path(self.config.artifacts_root)
        create_directories([artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the DataIngestionConfig from the `data_ingestion` section.

        The stage's root directory is created as a side effect.

        Returns:
            A DataIngestionConfig whose path-like settings are wrapped in
            `Path`; `source_URL` is passed through as read from the config.
        """
        section = self.config.data_ingestion

        stage_root = Path(section.root_dir)
        create_directories([stage_root])

        return DataIngestionConfig(
            root_dir=stage_root,
            source_URL=section.source_URL,
            local_data_file=Path(section.local_data_file),
            csv_dir=list(map(Path, section.csv_dir)),
        )

In [28]:
import re
from pathlib import Path
from typing import Optional

import gdown

from src import logger

In [39]:
class DataIngestion:
    """Downloads the configured Google Drive files into their target folders.

    Each entry of `config.source_URL` is paired by position with an entry of
    `config.csv_dir`; the file is saved there as `<file_id>.csv`.
    """

    # Drive file IDs consist of letters, digits, '-' and '_'.
    # Share links embed the ID in the path:  .../file/d/<id>/view
    _PATH_ID_RE = re.compile(r'/d/([A-Za-z0-9_-]+)')
    # Direct/open links carry it in the query: .../uc?id=<id>, .../open?id=<id>
    _QUERY_ID_RE = re.compile(r'[?&]id=([A-Za-z0-9_-]+)')

    def __init__(self, config: "DataIngestionConfig"):
        """Store the ingestion settings.

        Args:
            config: Object exposing `local_data_file`, `csv_dir` and
                `source_URL` (a DataIngestionConfig).
        """
        self.config = config

    def download_files(self):
        """Download every configured URL into its matching `csv_dir` entry.

        The run is best-effort: a URL that cannot be parsed, has no matching
        target directory, or fails to download is logged and skipped so the
        remaining URLs are still attempted. Nothing is returned.
        """
        # NOTE(review): local_data_file is created as a *directory* here
        # despite its name -- kept as-is; confirm against config.yaml.
        self.config.local_data_file.mkdir(parents=True, exist_ok=True)

        target_dirs = self.config.csv_dir
        for csv_dir in target_dirs:
            Path(csv_dir).mkdir(parents=True, exist_ok=True)

        for i, url in enumerate(self.config.source_URL):
            try:
                file_id = self.extract_file_id(url)
                if not file_id:
                    logger.error(f"Failed to extract file ID from URL: {url}")
                    continue

                # Report a config mismatch explicitly instead of letting it
                # surface as a bare "list index out of range".
                if i >= len(target_dirs):
                    logger.error(
                        f"Failed to download {url}: no csv_dir entry at index {i}"
                    )
                    continue

                direct_url = f"https://drive.google.com/uc?id={file_id}"
                output = Path(target_dirs[i]) / f'{file_id}.csv'
                logger.info(f"Downloading file from {direct_url} to {output}")
                saved_to = gdown.download(direct_url, str(output), quiet=False)
                # Some gdown versions signal failure by returning None rather
                # than raising, so do not treat that as success.
                if saved_to is None:
                    logger.error(f"Failed to download {url}: gdown returned no output path")
            except Exception as e:
                logger.error(f"Failed to download {url}: {e}")

    def extract_file_id(self, url: str) -> Optional[str]:
        """Extract the file ID from a Google Drive URL.

        Recognises both the path form (`/d/<id>`) and the query form
        (`?id=<id>` / `&id=<id>`). The ID stops at the first character that
        cannot be part of a Drive ID, so a trailing `?usp=sharing` or
        `/view` is never included.

        Args:
            url: The link to inspect.

        Returns:
            The file ID, or None when `url` is not a string, does not
            mention drive.google.com, or contains no recognisable ID.
        """
        if not isinstance(url, str) or 'drive.google.com' not in url:
            return None

        match = self._PATH_ID_RE.search(url) or self._QUERY_ID_RE.search(url)
        return match.group(1) if match else None

In [40]:
# Run the ingestion stage end to end: load the YAML config, build the stage
# settings, then download the files. Any failure propagates to the notebook
# with its original traceback.
config = ConfigurationManager()
data_ingestion_config = config.get_data_ingestion_config()
data_ingestion = DataIngestion(config=data_ingestion_config)
data_ingestion.download_files()

[2025-01-12 04:44:48,961: INFO: common: yaml file: config/config.yaml loaded successfully]
[2025-01-12 04:44:48,962: INFO: common: yaml file: params.yaml loaded successfully]
[2025-01-12 04:44:48,963: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-01-12 04:44:48,963: INFO: common: created directory at: artifacts]
[2025-01-12 04:44:48,964: INFO: common: created directory at: artifacts/data_ingestion]
[2025-01-12 04:44:48,966: INFO: 3005931499: Downloading file from https://drive.google.com/uc?id=1nZEHX3_xK0gUYiDCJoJ3CecARvIrQDlF to artifacts/data_ingestion/validation/1nZEHX3_xK0gUYiDCJoJ3CecARvIrQDlF.csv]


Downloading...
From (original): https://drive.google.com/uc?id=1nZEHX3_xK0gUYiDCJoJ3CecARvIrQDlF
From (redirected): https://drive.google.com/uc?id=1nZEHX3_xK0gUYiDCJoJ3CecARvIrQDlF&confirm=t&uuid=bb77739b-1e9e-4ad0-822a-a0978468ed76
To: /workspaces/Credit_Card_Behaviour_Predicition/artifacts/data_ingestion/validation/1nZEHX3_xK0gUYiDCJoJ3CecARvIrQDlF.csv
100%|██████████| 127M/127M [00:03<00:00, 34.4MB/s] 

[2025-01-12 04:44:57,204: INFO: 3005931499: Downloading file from https://drive.google.com/uc?id=1xo0d7jACK7yxmMpGA-PnPLLRxv5aLuxI to artifacts/data_ingestion/raw/1xo0d7jACK7yxmMpGA-PnPLLRxv5aLuxI.csv]



Downloading...
From (original): https://drive.google.com/uc?id=1xo0d7jACK7yxmMpGA-PnPLLRxv5aLuxI
From (redirected): https://drive.google.com/uc?id=1xo0d7jACK7yxmMpGA-PnPLLRxv5aLuxI&confirm=t&uuid=d75c553d-5ae9-42f4-b745-c64870fd40a4
To: /workspaces/Credit_Card_Behaviour_Predicition/artifacts/data_ingestion/raw/1xo0d7jACK7yxmMpGA-PnPLLRxv5aLuxI.csv
100%|██████████| 293M/293M [00:05<00:00, 52.5MB/s] 
