In [1]:
import os

In [2]:
%pwd

'e:\\Projects for portfolio\\Toxic Comment Classifier\\research'

In [3]:
# Going into the root directory
os.chdir("../") 

In [4]:
%pwd

'e:\\Projects for portfolio\\Toxic Comment Classifier'

In [5]:
#Entity.py
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig: # defined for the config components present in artifacts for data ingestion
    root_dir : Path 
    source_URL : str
    local_data_file : Path
    unzip_dir : Path

In [10]:
# Configuration manager
from ToxicCommentClassifier.constants import *
from ToxicCommentClassifier.utils.common import read_yaml,create_directories

In [11]:
class ConfigurationManager:
    def __init__(
            self,
            config_filepath = CONFIG_FILE_PATH,
            params_filepath = PARAMS_FILE_PATH):
    # Here we are reading the yaml file and we can now use the file paths present inside pararms and config.yaml        
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root]) # Here we are calling the artifacts_root key values using '.' , which was the purpose of @ensure_annotations

    def get_data_ingestion_config(self) -> DataIngestionConfig: # Here we are using the entity to specify the return type classes to make sure proper output is returned
        config= self.config.data_ingestion # Calling the data_ingestion dictionary created in config.yaml file

        create_directories([config.root_dir]) # Creating a directory using the root directory

        data_ingestion_config = DataIngestionConfig( # Extracting the values from the config.yaml to here inside data_ingestion_config
            root_dir=config.root_dir,
            source_URL=config.source_URL,
            local_data_file=config.local_data_file,
            unzip_dir=config.unzip_dir
        )

        return data_ingestion_config

In [14]:
# Data ingestion
from ToxicCommentClassifier.logging import logger
from ToxicCommentClassifier.utils.common import get_size
from datasets import load_dataset
import pickle

In [19]:
class DataIngestion:
    def __init__(self,config:DataIngestionConfig): # It will take the configuration from DataIngestionConfig defined earlier , which will in turn use Configuration Manager to take data from config.yaml
        self.config = config

    def load_file(self):
        if not os.path.exists(self.config.local_data_file): # If file does not exist
            dataset = load_dataset(self.config.source_URL)
            logger.info(f"{self.config.source_URL} : loaded from hugging face")
            return dataset
        
        else:
            logger.info(f"File already exists of size : {get_size(Path(self.config.local_data_file))}") # Checking file size present already in the path
    
    def save_file(self):  # Save the dataset using pickle
        with open(self.config.local_data_file, "wb") as f:
            pickle.dump(self.load_file, f)

In [21]:
# Data Ingestion Pipeline

try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config() # Storing the configuration
    data_ingestion = DataIngestion(config=data_ingestion_config) # Using the configuration saved earlier to call data_ingestion
    data_ingestion.load_file()
    data_ingestion.save_file()
except Exception as e:
    raise e

[2024-04-15 19:48:39,080: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-04-15 19:48:39,081: INFO: common: yaml file: params.yaml loaded successfully]
[2024-04-15 19:48:39,082: INFO: common: created directory at: artifacts]
[2024-04-15 19:48:39,083: INFO: common: created directory at: artifacts/data_ingestion]
[2024-04-15 19:48:53,620: INFO: 880016668: Arsive/toxicity_classification_jigsaw : loaded from hugging face]
