In [1]:
import os

In [2]:
%pwd

'e:\\Projects for portfolio\\Toxic Comment Classifier\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'e:\\Projects for portfolio\\Toxic Comment Classifier'

In [9]:
# Entity
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataValidationConfig: # defined for the config components present in artifacts for data validation
    root_dir : Path
    file_path : Path 
    STATUS_FILE : str
    ALL_REQUIRED_FILES : list

In [10]:
# Configuration manager
from ToxicCommentClassifier.constants import *
from ToxicCommentClassifier.utils.common import read_yaml,create_directories

class ConfigurationManager:
    def __init__(
            self,
            config_filepath = CONFIG_FILE_PATH,
            params_filepath = PARAMS_FILE_PATH):
    # Here we are reading the yaml file and we can now use the file paths present inside pararms and config.yaml    
        print(config_filepath)    
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root]) # Here we are calling the artifacts_root key values using '.' , which was the purpose of @ensure_annotations

    def get_data_validation_config(self) -> DataValidationConfig: # Here we are using the entity to specify the return type classes to make sure proper output is returned
        config= self.config.data_validation # Calling the data_validation dictionary created in config.yaml file

        create_directories([config.root_dir]) # Creating a directory using the root directory

        data_validation_config = DataValidationConfig( # Extracting the values from the config.yaml to here inside data_ingestion_config
            root_dir=config.root_dir,
            STATUS_FILE=config.STATUS_FILE,
            file_path=config.file_path,
            ALL_REQUIRED_FILES=config.ALL_REQUIRED_FILES
        )

        return data_validation_config

In [12]:
# Data Validation
import pickle
from ToxicCommentClassifier.logging import logger
from ToxicCommentClassifier.utils.common import get_size

In [18]:
class DataValidation:
    def __init__(self,config:DataValidationConfig): # It will take the configuration from DataIngestionConfig defined earlier , which will in turn use Configuration Manager to take data from config.yaml
        self.config = config

    def validation_all_files_exist(self) -> bool:

        try:
            validation_status = None # Validation status counter for checking all files exists or not

            if get_size(Path(self.config.file_path)) != '0 KB' : # If File is not empty, we will load the dataset
                with open(self.config.file_path, "rb") as f:
                    dataset = pickle.load(f)
                    logger.info("Dataset loaded successfully for validation")
                

            for file in self.config.ALL_REQUIRED_FILES:
                if file not in dataset.keys(): # checking whether all the files mentioned in ALL_REQUIRED_FILES is present or not in the dataset file
                    validation_status = False
                    with open(self.config.STATUS_FILE,'a') as f:
                        f.write(f"Validation Status : {validation_status} - file : {file}\n")
                else:
                    validation_status = True
                    with open(self.config.STATUS_FILE,'a') as f:
                        f.write(f"Validation Status : {validation_status} - file : {file}\n")
                
            return validation_status
        
        except Exception as e:
            raise e

In [19]:
#Pipeline
try:
    config = ConfigurationManager()
    data_validation_config = config.get_data_validation_config() # Storing the configuration
    data_validation = DataValidation(config=data_validation_config) # Using the configuration saved earlier to call data_ingestion
    data_validation.validation_all_files_exist()
except Exception as e:
    raise e

config\config.yaml
[2024-04-16 00:28:15,711: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-04-16 00:28:15,713: INFO: common: yaml file: params.yaml loaded successfully]
[2024-04-16 00:28:15,715: INFO: common: created directory at: artifacts]
[2024-04-16 00:28:15,716: INFO: common: created directory at: artifacts/data_validation]


  from .autonotebook import tqdm as notebook_tqdm


[2024-04-16 00:28:17,456: INFO: config: PyTorch version 2.2.2+cu118 available.]
