In [9]:
import os
import zipfile
from dataclasses import dataclass
from pathlib import Path
from urllib import request

In [10]:
# !ls

In [11]:
@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings for the data-ingestion stage (download + unzip).

    NOTE: ConfigurationManager forwards the values read from the YAML config
    unchanged, so at runtime the ``Path``-annotated fields may hold plain
    strings.
    """
    root_dir: Path  # stage directory; created by ConfigurationManager.get_data_ingestion_config
    source_URL: str  # URL that DataIngestion.download_dataset fetches
    local_data_path: Path  # where the downloaded archive is stored and later opened as a zip
    unzip_dir: Path  # default extraction target of DataIngestion.extract_zip_file


@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable settings for the data-validation stage.

    NOTE: ConfigurationManager forwards the values read from the YAML config
    unchanged, so at runtime the ``Path``-annotated fields may hold plain
    strings.
    """
    root_dir: Path  # stage directory; created by ConfigurationManager.get_data_validation_config
    STATUS_FILE: Path  # text file DataValidation.validate_files writes the validation status to
    ALL_REQUIRED_FILES: list  # entry names DataValidation.validate_files compares against the dataset directory listing

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation stage.

    NOTE: ConfigurationManager forwards the values read from the YAML config
    unchanged, so at runtime the ``Path``-annotated fields may hold plain
    strings.
    """
    root_dir: Path  # stage directory; created by ConfigurationManager.get_data_transformation_config
    data_path: Path  # presumably the location of the ingested dataset - TODO confirm against the consumer
    tokenizer_name: str  # presumably a tokenizer identifier - TODO confirm against the consumer

@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable settings for the model-training stage.

    NOTE: ConfigurationManager forwards the values read from the YAML config
    unchanged, so at runtime the ``Path``-annotated fields may hold plain
    strings.
    """
    root_dir: Path  # stage directory; created by ConfigurationManager.get_model_trainer_config
    data_path: Path  # presumably the location of the training data - TODO confirm against the consumer
    model_ckpt: str  # presumably a model checkpoint identifier - TODO confirm against the consumer

@dataclass(frozen=True)
class ModelEvaluationConfig:
    """Immutable settings for the model-evaluation stage.

    All five fields are required: none declares a default, so the generated
    ``__init__`` rejects calls that omit any of them.
    """
    root_dir: Path  # stage directory
    data_path: Path  # presumably the location of the evaluation data - TODO confirm against the consumer
    model_path: Path  # presumably where the trained model is stored - TODO confirm
    tokenizer_path: Path  # presumably where the tokenizer is stored - TODO confirm
    metric_file_name: Path  # presumably the file the metrics are written to - TODO confirm


In [12]:
from src.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH
from src.utils import read_yaml, get_size
from src.logger import logging
from src.exception import CustomException

In [13]:
# Scratch check: load the config YAML (resolved relative to the parent directory)
# and display the configured data-ingestion root directory.
h=read_yaml(Path(f'../{CONFIG_FILE_PATH}'))
h.data_ingestion.root_dir

['artifacts', 'data_ingestion.ipynb', 'logs', 'text_summerization.ipynb']
[2023-07-18 16:24:19,713] 23 root - INFO - Yaml Config file successfully loaded from path: ..\config\config.yaml


'artifacts/data_ingestion'

In [14]:
class ConfigurationManager:
    """Load the YAML config/params files and build the per-stage config objects.

    Creating an instance reads both YAML files and creates the
    ``artifacts_root`` directory. Each ``get_*_config`` method creates the
    stage's ``root_dir`` (when it does not exist yet) and returns the matching
    frozen dataclass, populated with the values from the stage's section of
    the config file.

    Values are forwarded exactly as ``read_yaml`` returned them; no conversion
    to ``Path`` is performed here.
    """

    def __init__(self, config_file_path=Path(f'../{CONFIG_FILE_PATH}'),
                 param_file_path=Path(f'../{PARAMS_FILE_PATH}')) -> None:
        """Read the config and params files and create the artifacts root.

        Args:
            config_file_path: Location of the main config YAML file.
            param_file_path: Location of the params YAML file.
        """
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(param_file_path)

        os.makedirs(self.config.artifacts_root, exist_ok=True)

    def _prepare_stage(self, stage_name):
        """Return the config section for ``stage_name`` after creating its ``root_dir``."""
        stage_cfg = getattr(self.config, stage_name)
        os.makedirs(stage_cfg.root_dir, exist_ok=True)
        return stage_cfg

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion settings from the ``data_ingestion`` section."""
        config = self._prepare_stage("data_ingestion")
        return DataIngestionConfig(
            root_dir=config.root_dir,
            source_URL=config.source_URL,
            local_data_path=config.local_data_path,
            unzip_dir=config.unzip_dir,
        )

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the data-validation settings from the ``data_validation`` section."""
        config = self._prepare_stage("data_validation")
        return DataValidationConfig(
            root_dir=config.root_dir,
            STATUS_FILE=config.STATUS_FILE,
            ALL_REQUIRED_FILES=config.ALL_REQUIRED_FILES,
        )

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the data-transformation settings from the ``data_transformation`` section."""
        config = self._prepare_stage("data_transformation")
        return DataTransformationConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            tokenizer_name=config.tokenizer_name,
        )

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Build the model-training settings from the ``model_trainer`` section."""
        config = self._prepare_stage("model_trainer")
        return ModelTrainerConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            model_ckpt=config.model_ckpt,
        )

    def get_model_evaluation_config(self) -> ModelEvaluationConfig:
        """Build the model-evaluation settings from the ``model_evaluation`` section."""
        config = self._prepare_stage("model_evaluation")
        # ModelEvaluationConfig has no ``model_ckpt`` field and requires
        # ``model_path``, ``tokenizer_path`` and ``metric_file_name``; passing
        # ``model_ckpt`` here raised TypeError on every call.
        # NOTE(review): assumes the ``model_evaluation`` section of the config
        # YAML defines these three keys - confirm against config.yaml.
        return ModelEvaluationConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            model_path=config.model_path,
            tokenizer_path=config.tokenizer_path,
            metric_file_name=config.metric_file_name,
        )

In [17]:
class DataIngestion:
    """Download the dataset archive and unpack it.

    Reads ``source_URL``, ``local_data_path`` and ``unzip_dir`` from the
    config object it is given.
    """

    # Forward-reference annotation: the class can be defined without the
    # config dataclass being present at definition time.
    def __init__(self, config: "DataIngestionConfig"):
        self.config = config

    def download_dataset(self):
        """Download the archive to ``local_data_path`` unless it already exists.

        When the file is already present nothing is downloaded and only its
        size is logged.
        """
        archive_path = self.config.local_data_path
        if not os.path.exists(archive_path):
            # urlretrieve opens the target file directly, so the containing
            # directory has to exist first. An empty dirname means the current
            # directory, for which there is nothing to create.
            parent_dir = os.path.dirname(archive_path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
            file, header = request.urlretrieve(url=self.config.source_URL,
                                               filename=archive_path)
            logging.info(f"{file} successfully downloaded! Info: \n{header}")
        else:
            logging.info(f"File already exists of size: {get_size(Path(archive_path))} KB")

    def extract_zip_file(self, unzip_path=None):
        """Extract the downloaded zip archive.

        Args:
            unzip_path: str, optional
                The path to extract the zip file to. If not provided, it
                defaults to `self.config.unzip_dir`.

        Raises:
            zipfile.BadZipFile: The archive is not a valid zip file.
            OSError: The archive is missing/unreadable or the target directory
                cannot be created or written.
        """
        if unzip_path is None:
            unzip_path = self.config.unzip_dir
        try:
            os.makedirs(unzip_path, exist_ok=True)
            with zipfile.ZipFile(self.config.local_data_path, 'r') as zip_ref:
                zip_ref.extractall(unzip_path)
        except (zipfile.BadZipFile, OSError) as err:
            # These are the errors the calls above actually raise. The previous
            # handler only matched CustomException, which they never raise, so
            # failures propagated without being logged.
            logging.warning(f"Error while extracting file path: {unzip_path}")
            logging.error(f"Error explaination: {err}")
            raise


In [19]:
class DataValidation:
    """Check that the extracted dataset contains every required entry.

    Reads ``ALL_REQUIRED_FILES`` and ``STATUS_FILE`` from the config object it
    is given.
    """

    # Forward-reference annotation: the class can be defined without the
    # config dataclass being present at definition time.
    def __init__(self, config: "DataValidationConfig") -> None:
        self.config = config

    def validate_files(self, data_dir=None) -> bool:
        """Verify that all required entries exist in the dataset directory.

        The result is ``True`` only when every name in
        ``config.ALL_REQUIRED_FILES`` appears in the directory listing. Entries
        that are present but not required do not affect the result. The status
        (and any missing names) is written to ``config.STATUS_FILE`` exactly
        once, replacing previous content.

        Previously the status was recomputed and the file overwritten for each
        directory entry, so the outcome reflected only the entry listed last
        and a missing required entry was never detected.

        Args:
            data_dir: Directory to inspect. Defaults to
                ``artifacts/data_ingestion/samsum_dataset``.

        Returns:
            bool: ``True`` when nothing required is missing, else ``False``.

        Raises:
            Exception: Any error raised while listing the directory or writing
                the status file is logged and re-raised unchanged.
        """
        if data_dir is None:
            data_dir = os.path.join("artifacts", "data_ingestion", "samsum_dataset")
        try:
            present = set(os.listdir(data_dir))
            missing = [name for name in self.config.ALL_REQUIRED_FILES
                       if name not in present]
            validation_status = not missing

            with open(self.config.STATUS_FILE, 'w') as fp:
                fp.write(f"Validation status: {validation_status}")
                if missing:
                    fp.write(f"\nMissing files: {missing}")

            if validation_status:
                logging.info(f"Files validation Success. VALIDATION_STATUS: {validation_status}")
            else:
                logging.warning(
                    f"Files validation failed, missing: {missing}. "
                    f"VALIDATION_STATUS: {validation_status}"
                )
            return validation_status

        except Exception as err:
            logging.warning(f"Cannot validate due to error: \n{err}")
            # Bare raise keeps the original traceback intact.
            raise

In [21]:
# Run the pipeline stages in order. Exceptions propagate unchanged; the former
# ``try: ... except Exception as e: raise e`` wrapper did nothing else.
config = ConfigurationManager()

# Stage 1: download the archive (skipped when already present) and unpack it.
data_ingestion_config = config.get_data_ingestion_config()
data_ingestion = DataIngestion(config=data_ingestion_config)
data_ingestion.download_dataset()
data_ingestion.extract_zip_file()

# Stage 2: validate the extracted dataset.
data_validation_config = config.get_data_validation_config()
data_validation = DataValidation(config=data_validation_config)
data_validation.validate_files()

# Stage 3: transform the data.
# NOTE(review): DataTransformation is not defined in the cells visible here -
# confirm it is defined or imported before this cell runs on a fresh kernel.
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)
data_transformation.convert()

['artifacts', 'data_ingestion.ipynb', 'logs', 'text_summerization.ipynb']
[2023-07-18 18:46:16,883] 23 root - INFO - Yaml Config file successfully loaded from path: ..\config\config.yaml
['artifacts', 'data_ingestion.ipynb', 'logs', 'text_summerization.ipynb']
[2023-07-18 18:46:16,916] 23 root - INFO - Yaml Config file successfully loaded from path: ..\params.yaml
--------------------------------------------
{'root_dir': 'artifacts/data_ingestion', 'source_URL': 'https://github.com/entbappy/Branching-tutorial/raw/master/summarizer-data.zip', 'local_data_path': 'artifacts/data_ingestion/data.zip', 'unzip_dir': 'artifacts/data_ingestion'} <class 'box.config_box.ConfigBox'>
artifacts/data_ingestion <class 'box.config_box.ConfigBox'>
[2023-07-18 18:46:16,928] 11 root - INFO - File already exists of size: 7718.35 KB
