# To create configuration for each component of pipeline we require 4 files:
Here, "configuration" simply means building the directory paths (and URLs) that each component reads from or writes to.
For Example:
tgz_download_dir='d:\\Project\\machine_learning_project\\notebook\\housing\\artifact\\data_ingestion\\2022-06-25-13-25-32\\tgz_data'
    
    1. main_project_dir --> housing --> entity --> config_entity.py
    2. main_project_dir --> config --> config.yaml
    3. main_project_dir --> housing --> constant --> __init__.py
    4. main_project_dir --> housing --> config --> configuration.py
    
Components (the steps of the project pipeline) --> Data Ingestion, Data Validation, Data Transformation


# config_entity.py: how a config entity is declared --> here we create one for DataIngestionConfig

In [1]:
from collections import namedtuple

In [2]:
# Immutable record describing where the data-ingestion step downloads,
# extracts and splits the dataset. Field order defines the tuple order.
DataIngestionConfig = namedtuple(
    "DataIngestionConfig",
    (
        "dataset_download_url",
        "tgz_download_dir",
        "raw_data_dir",
        "ingested_train_dir",
        "ingested_test_dir",
    ),
)

In [3]:
DataIngestionConfig(dataset_download_url='abcdef',tgz_download_dir='abcde',raw_data_dir='abvde',ingested_test_dir='test',
                    ingested_train_dir = 'abcdfd')

DataIngestionConfig(dataset_download_url='abcdef', tgz_download_dir='abcde', raw_data_dir='abvde', ingested_train_dir='abcdfd', ingested_test_dir='test')

# How to read yaml file

In [4]:
!pip install pYAML

Defaulting to user installation because normal site-packages is not writeable


You should consider upgrading via the 'C:\Program Files\Python310\python.exe -m pip install --upgrade pip' command.


In [5]:
import yaml
import os

In [6]:
os.getcwd()

'C:\\Shubham\\Projects\\Personal\\ML_Project\\notebook'

In [7]:
change_dir = os.chdir("C:\\Shubham\\Projects\\Personal\\ML_Project")
os.getcwd()

'C:\\Shubham\\Projects\\Personal\\ML_Project'

In [8]:
config_yaml_folder_name = 'config'
config_yaml_file_name = 'config.yaml'
config_yaml_path = os.path.join(os.getcwd(),config_yaml_folder_name,config_yaml_file_name)
config_yaml_path

'C:\\Shubham\\Projects\\Personal\\ML_Project\\config\\config.yaml'

In [9]:
os.path.exists(config_yaml_path)

True

In [10]:
def read_yaml_file(file_path):
    """Open ``file_path`` in binary mode and return the object produced by ``yaml.safe_load``."""
    with open(file_path, mode='rb') as stream:
        parsed = yaml.safe_load(stream)
    return parsed
    
config_data_info = read_yaml_file(config_yaml_path)
config_data_info

{'training_pipeline_config': {'pipeline_name': 'housing',
  'artifact_dir': 'artifact'},
 'data_ingestion_config': {'dataset_download_url': 'https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.tgz',
  'raw_data_dir': 'raw_data',
  'tgz_download_dir': 'tgz_data',
  'ingested_dir': 'ingested_data',
  'ingested_train_dir': 'train',
  'ingested_test_dir': 'test'},
 'data_validation_config': {'schema_dir': 'config',
  'schema_file_name': 'schema.yaml',
  'report_file_name': 'report.json',
  'report_page_file_name': 'report.html'},
 'data_transformation_config': {'add_bedroom_per_room': True,
  'transformed_dir': 'transformed_data',
  'transformed_train_dir': 'train',
  'transformed_test_dir': 'test',
  'preprocessing_dir': 'preprocessed',
  'preprocessed_object_file_name': 'preprocessed.pkl'},
 'model_trainer_config': {'trained_model_dir': 'trained_model',
  'model_file_name': 'model.pkl',
  'base_accuracy': 0.6,
  'model_config_dir': 'config',
  'model_confi

In [11]:
config_data_info['training_pipeline_config']

{'pipeline_name': 'housing', 'artifact_dir': 'artifact'}

In [12]:
# Data Ingestion related variable
# The *_KEY names below are the keys looked up in the "data_ingestion_config"
# section of config.yaml. DATA_INGESTION_ARTIFACT_DIR is not a YAML key: it is
# the folder name joined under the pipeline's artifact directory.

DATA_INGESTION_CONFIG_KEY = "data_ingestion_config"
DATA_INGESTION_ARTIFACT_DIR = "data_ingestion"
DATA_INGESTION_DOWNLOAD_URL_KEY = "dataset_download_url"
DATA_INGESTION_RAW_DATA_DIR_KEY = "raw_data_dir"
DATA_INGESTION_TGZ_DOWNLOAD_DIR_KEY = "tgz_download_dir"
DATA_INGESTION_INGESTED_DIR_NAME_KEY = "ingested_dir"
DATA_INGESTION_TRAIN_DIR_KEY = "ingested_train_dir"
DATA_INGESTION_TEST_DIR_KEY = "ingested_test_dir"

# Training pipeline related variable
# Keys of the "training_pipeline_config" section of config.yaml.
TRAINING_PIPELINE_CONFIG_KEY = "training_pipeline_config"
TRAINING_PIPELINE_ARTIFACT_DIR_KEY = "artifact_dir"
TRAINING_PIPELINE_NAME_KEY = "pipeline_name"

In [13]:
from housing.entity.config_entity import DataIngestionConfig, DataTransformationConfig,DataValidationConfig,   \
ModelTrainerConfig,ModelEvaluationConfig,ModelPusherConfig,TrainingPipelineConfig
from housing.util.util import read_yaml_file
import os
from housing.constant import *

In [14]:
class Configuration:
    """Turn the project's YAML config into ready-to-use config entities."""

    def __init__(self,config_file_path = CONFIG_FILE_PATH,current_time_stamp = CURRENT_TIME_STAMP):
        """Read the YAML file, cache the training-pipeline config and keep the time stamp."""
        self.config_info = read_yaml_file(file_path = config_file_path)
        self.training_pipeline_config = self.get_training_pipeline_config()
        self.time_stamp = current_time_stamp

    def get_training_pipeline_config(self):
        """Return a ``TrainingPipelineConfig`` whose artifact dir is
        ``ROOT_DIR/<pipeline_name>/<artifact_dir>`` as named in the YAML file."""
        pipeline_section = self.config_info[TRAINING_PIPELINE_CONFIG_KEY]

        pipeline_name = pipeline_section[TRAINING_PIPELINE_NAME_KEY]
        artifact_folder = pipeline_section[TRAINING_PIPELINE_ARTIFACT_DIR_KEY]

        return TrainingPipelineConfig(
            artifact_dir=os.path.join(ROOT_DIR, pipeline_name, artifact_folder)
        )


In [15]:
c = Configuration().get_training_pipeline_config()
c.artifact_dir

'C:\\Shubham\\Projects\\Personal\\ML_Project\\housing\\artifact'

In [16]:
config_data_info[TRAINING_PIPELINE_CONFIG_KEY]

{'pipeline_name': 'housing', 'artifact_dir': 'artifact'}

In [17]:
config_data_info[TRAINING_PIPELINE_CONFIG_KEY][TRAINING_PIPELINE_ARTIFACT_DIR_KEY]

'artifact'

In [18]:
ROOT_DIR

'C:\\Shubham\\Projects\\Personal\\ML_Project'

In [19]:
training_pipeline_config = config_data_info[TRAINING_PIPELINE_CONFIG_KEY]
os.path.join(ROOT_DIR,
             training_pipeline_config[TRAINING_PIPELINE_NAME_KEY],
             training_pipeline_config[TRAINING_PIPELINE_ARTIFACT_DIR_KEY])

'C:\\Shubham\\Projects\\Personal\\ML_Project\\housing\\artifact'

In [20]:
training_pipeline_config

{'pipeline_name': 'housing', 'artifact_dir': 'artifact'}

In [21]:
# Keys looked up in the "data_ingestion_config" section of config.yaml, plus
# the folder name (DATA_INGESTION_ARTIFACT_DIR) used under the artifact directory.
DATA_INGESTION_CONFIG_KEY = "data_ingestion_config"
DATA_INGESTION_ARTIFACT_DIR = "data_ingestion"
DATA_INGESTION_DOWNLOAD_URL_KEY = "dataset_download_url"
DATA_INGESTION_RAW_DATA_DIR_KEY = "raw_data_dir"
DATA_INGESTION_TGZ_DOWNLOAD_DIR_KEY = "tgz_download_dir"
DATA_INGESTION_INGESTED_DIR_NAME_KEY = "ingested_dir"
DATA_INGESTION_TRAIN_DIR_KEY = "ingested_train_dir"
DATA_INGESTION_TEST_DIR_KEY = "ingested_test_dir"

In [22]:
class Configuration:
    """Turn the project's YAML config into ready-to-use config entities."""

    def __init__(self,config_file_path = CONFIG_FILE_PATH,current_time_stamp = CURRENT_TIME_STAMP):
        """Read the YAML file, cache the training-pipeline config and keep the time stamp."""
        self.config_info = read_yaml_file(file_path = config_file_path)
        self.training_pipeline_config = self.get_training_pipeline_config()
        self.time_stamp = current_time_stamp

    def get_data_ingestion_config(self):
        """Return the ``DataIngestionConfig`` for this run.

        Every directory lives under
        ``<artifact_dir>/<DATA_INGESTION_ARTIFACT_DIR>/<time_stamp>``; the
        train and test directories sit one level deeper, inside the
        ingested-data directory. The download URL is passed through unchanged.
        """
        section = self.config_info[DATA_INGESTION_CONFIG_KEY]

        run_dir = os.path.join(
            self.training_pipeline_config.artifact_dir,
            DATA_INGESTION_ARTIFACT_DIR,
            self.time_stamp,
        )
        ingested_root = os.path.join(run_dir, section[DATA_INGESTION_INGESTED_DIR_NAME_KEY])

        return DataIngestionConfig(
            dataset_download_url=section[DATA_INGESTION_DOWNLOAD_URL_KEY],
            tgz_download_dir=os.path.join(run_dir, section[DATA_INGESTION_TGZ_DOWNLOAD_DIR_KEY]),
            raw_data_dir=os.path.join(run_dir, section[DATA_INGESTION_RAW_DATA_DIR_KEY]),
            ingested_train_dir=os.path.join(ingested_root, section[DATA_INGESTION_TRAIN_DIR_KEY]),
            ingested_test_dir=os.path.join(ingested_root, section[DATA_INGESTION_TEST_DIR_KEY]),
        )

    def get_training_pipeline_config(self):
        """Return a ``TrainingPipelineConfig`` whose artifact dir is
        ``ROOT_DIR/<pipeline_name>/<artifact_dir>`` as named in the YAML file."""
        pipeline_section = self.config_info[TRAINING_PIPELINE_CONFIG_KEY]

        pipeline_name = pipeline_section[TRAINING_PIPELINE_NAME_KEY]
        artifact_folder = pipeline_section[TRAINING_PIPELINE_ARTIFACT_DIR_KEY]

        return TrainingPipelineConfig(
            artifact_dir=os.path.join(ROOT_DIR, pipeline_name, artifact_folder)
        )

In [23]:
config_data_info[DATA_INGESTION_CONFIG_KEY]['raw_data_dir']

'raw_data'

In [24]:
c = Configuration()
c.get_data_ingestion_config()

DataIngestionConfig(dataset_download_url='https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.tgz', tgz_download_dir='C:\\Shubham\\Projects\\Personal\\ML_Project\\housing\\artifact\\data_ingestion\\2023-02-27-12-48-12\\tgz_data', raw_data_dir='C:\\Shubham\\Projects\\Personal\\ML_Project\\housing\\artifact\\data_ingestion\\2023-02-27-12-48-12\\raw_data', ingested_train_dir='C:\\Shubham\\Projects\\Personal\\ML_Project\\housing\\artifact\\data_ingestion\\2023-02-27-12-48-12\\ingested_data\\train', ingested_test_dir='C:\\Shubham\\Projects\\Personal\\ML_Project\\housing\\artifact\\data_ingestion\\2023-02-27-12-48-12\\ingested_data\\test')

In [25]:
config_data_info

{'training_pipeline_config': {'pipeline_name': 'housing',
  'artifact_dir': 'artifact'},
 'data_ingestion_config': {'dataset_download_url': 'https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.tgz',
  'raw_data_dir': 'raw_data',
  'tgz_download_dir': 'tgz_data',
  'ingested_dir': 'ingested_data',
  'ingested_train_dir': 'train',
  'ingested_test_dir': 'test'},
 'data_validation_config': {'schema_dir': 'config',
  'schema_file_name': 'schema.yaml',
  'report_file_name': 'report.json',
  'report_page_file_name': 'report.html'},
 'data_transformation_config': {'add_bedroom_per_room': True,
  'transformed_dir': 'transformed_data',
  'transformed_train_dir': 'train',
  'transformed_test_dir': 'test',
  'preprocessing_dir': 'preprocessed',
  'preprocessed_object_file_name': 'preprocessed.pkl'},
 'model_trainer_config': {'trained_model_dir': 'trained_model',
  'model_file_name': 'model.pkl',
  'base_accuracy': 0.6,
  'model_config_dir': 'config',
  'model_confi

# Data Validation

config_entity.py:-
DataValidationConfig
1. schema_file_path
2. report_file_path
3. report_page_file_path

config.yaml
data_validation_config:
    1. schema_file_name: schema.yaml
    2. report_file_name: report.json
    3. report_page_file_name: report.html
    4. schema_dir: config
    
constant/__init__.py
DATA_VALIDATION_CONFIG_KEY = 'data_validation_config'
DATA_VALIDATION_SCHEMA_FILE_NAME_KEY = 'schema_file_name'
DATA_VALIDATION_REPORT_FILE_NAME_KEY = 'report_file_name'
DATA_VALIDATION_REPORT_PAGE_FILE_NAME_KEY = 'report_page_file_name'
DATA_VALIDATION_ARTIFACT_DIR_NAME = 'data_validation'
DATA_VALIDATION_SCHEMA_DIR_KEY = 'schema_dir'

In [26]:
from housing.entity.config_entity import DataIngestionConfig, DataTransformationConfig,DataValidationConfig,   \
ModelTrainerConfig,ModelEvaluationConfig,ModelPusherConfig,TrainingPipelineConfig
from housing.util.util import read_yaml_file
import os
from housing.constant import *

In [39]:
class Configuration:
    """Resolve the paths each pipeline component needs from the YAML config.

    The YAML file is read once in ``__init__``. Each ``get_*_config`` method
    combines values from it with ``ROOT_DIR`` and/or the run's time stamp and
    returns the matching config entity.
    """

    def __init__(self,config_file_path = CONFIG_FILE_PATH,current_time_stamp = CURRENT_TIME_STAMP):
        """Load the config file and cache the training-pipeline config.

        Args:
            config_file_path: Path of the YAML file passed to ``read_yaml_file``.
            current_time_stamp: Used as the per-run sub-directory name inside
                each component's artifact directory.
        """
        self.config_info = read_yaml_file(file_path = config_file_path)
        self.training_pipeline_config = self.get_training_pipeline_config()
        self.time_stamp = current_time_stamp

    def get_data_ingestion_config(self):
        """Return the ``DataIngestionConfig`` for this run.

        All directories are placed under
        ``<artifact_dir>/<DATA_INGESTION_ARTIFACT_DIR>/<time_stamp>``; the
        train/test directories are nested inside the ingested-data directory.
        """
        artifact_dir = self.training_pipeline_config.artifact_dir
        data_ingestion_artifact_dir = os.path.join(artifact_dir, DATA_INGESTION_ARTIFACT_DIR, self.time_stamp)
        data_ingestion_info = self.config_info[DATA_INGESTION_CONFIG_KEY]

        dataset_download_url = data_ingestion_info[DATA_INGESTION_DOWNLOAD_URL_KEY]

        raw_data_dir = os.path.join(data_ingestion_artifact_dir, data_ingestion_info[DATA_INGESTION_RAW_DATA_DIR_KEY])
        tgz_download_dir = os.path.join(data_ingestion_artifact_dir, data_ingestion_info[DATA_INGESTION_TGZ_DOWNLOAD_DIR_KEY])

        ingested_dir = os.path.join(data_ingestion_artifact_dir, data_ingestion_info[DATA_INGESTION_INGESTED_DIR_NAME_KEY])
        ingested_train_dir = os.path.join(ingested_dir, data_ingestion_info[DATA_INGESTION_TRAIN_DIR_KEY])
        ingested_test_dir = os.path.join(ingested_dir, data_ingestion_info[DATA_INGESTION_TEST_DIR_KEY])

        return DataIngestionConfig(
            dataset_download_url=dataset_download_url,
            raw_data_dir=raw_data_dir,
            tgz_download_dir=tgz_download_dir,
            ingested_train_dir=ingested_train_dir,
            ingested_test_dir=ingested_test_dir)

    def get_data_validation_config(self):
        """Return the ``DataValidationConfig`` for this run.

        The schema file path is ``ROOT_DIR/<schema_dir>/<schema_file_name>``.
        Both report files are placed under
        ``<artifact_dir>/<DATA_VALIDATION_ARTIFACT_DIR_NAME>/<time_stamp>``.
        """
        # NOTE: the imported DataValidationConfig entity is used here. Building
        # a fresh namedtuple class on every call would shadow that entity, so
        # returned objects would fail isinstance checks against it and could
        # not be pickled. Assumes the entity declares schema_file_path,
        # report_file_path and report_page_file_path -- confirm in
        # housing/entity/config_entity.py.
        artifact_dir = self.training_pipeline_config.artifact_dir
        data_validation_artifact_dir = os.path.join(artifact_dir, DATA_VALIDATION_ARTIFACT_DIR_NAME, self.time_stamp)

        data_validation_info = self.config_info[DATA_VALIDATION_CONFIG_KEY]

        # The schema is a project input, so it is resolved from ROOT_DIR
        # rather than from the per-run artifact directory.
        schema_dir_path = os.path.join(ROOT_DIR, data_validation_info[DATA_VALIDATION_SCHEMA_DIR_KEY])
        schema_file_path = os.path.join(schema_dir_path, data_validation_info[DATA_VALIDATION_SCHEMA_FILE_NAME_KEY])

        report_file_path = os.path.join(data_validation_artifact_dir,
                                        data_validation_info[DATA_VALIDATION_REPORT_FILE_NAME_KEY])
        report_page_file_path = os.path.join(data_validation_artifact_dir,
                                             data_validation_info[DATA_VALIDATION_REPORT_PAGE_FILE_NAME_KEY])

        return DataValidationConfig(schema_file_path=schema_file_path,
                                    report_file_path=report_file_path,
                                    report_page_file_path=report_page_file_path)

    def get_training_pipeline_config(self):
        """Return a ``TrainingPipelineConfig`` whose ``artifact_dir`` is
        ``ROOT_DIR/<pipeline_name>/<artifact_dir>`` as named in the YAML file."""
        training_pipeline_config = self.config_info[TRAINING_PIPELINE_CONFIG_KEY]

        artifact_dir = os.path.join(ROOT_DIR,
                                    training_pipeline_config[TRAINING_PIPELINE_NAME_KEY],
                                    training_pipeline_config[TRAINING_PIPELINE_ARTIFACT_DIR_KEY])

        return TrainingPipelineConfig(artifact_dir=artifact_dir)

In [40]:
c = Configuration().get_data_validation_config()
c

DataValidationConfig(schema_file_path='C:\\Shubham\\Projects\\Personal\\ML_Project\\config\\schema.yaml', report_file_path='C:\\Shubham\\Projects\\Personal\\ML_Project\\housing\\artifact\\data_validation\\2023-02-27-12-48-12\\report.json', report_page_file_path='C:\\Shubham\\Projects\\Personal\\ML_Project\\housing\\artifact\\data_validation\\2023-02-27-12-48-12\\report.html')

In [42]:
class Configuration:
    """Resolve the paths each pipeline component needs from the YAML config.

    The YAML file is read once in ``__init__``. Each ``get_*_config`` method
    combines values from it with ``ROOT_DIR`` and/or the run's time stamp and
    returns the matching config entity.
    """

    def __init__(self,config_file_path = CONFIG_FILE_PATH,current_time_stamp = CURRENT_TIME_STAMP):
        """Load the config file and cache the training-pipeline config.

        Args:
            config_file_path: Path of the YAML file passed to ``read_yaml_file``.
            current_time_stamp: Used as the per-run sub-directory name inside
                each component's artifact directory.
        """
        self.config_info = read_yaml_file(file_path = config_file_path)
        self.training_pipeline_config = self.get_training_pipeline_config()
        self.time_stamp = current_time_stamp

    def get_data_ingestion_config(self):
        """Return the ``DataIngestionConfig`` for this run.

        All directories are placed under
        ``<artifact_dir>/<DATA_INGESTION_ARTIFACT_DIR>/<time_stamp>``; the
        train/test directories are nested inside the ingested-data directory.
        """
        artifact_dir = self.training_pipeline_config.artifact_dir
        data_ingestion_artifact_dir = os.path.join(artifact_dir, DATA_INGESTION_ARTIFACT_DIR, self.time_stamp)
        data_ingestion_info = self.config_info[DATA_INGESTION_CONFIG_KEY]

        dataset_download_url = data_ingestion_info[DATA_INGESTION_DOWNLOAD_URL_KEY]

        raw_data_dir = os.path.join(data_ingestion_artifact_dir, data_ingestion_info[DATA_INGESTION_RAW_DATA_DIR_KEY])
        tgz_download_dir = os.path.join(data_ingestion_artifact_dir, data_ingestion_info[DATA_INGESTION_TGZ_DOWNLOAD_DIR_KEY])

        ingested_dir = os.path.join(data_ingestion_artifact_dir, data_ingestion_info[DATA_INGESTION_INGESTED_DIR_NAME_KEY])
        ingested_train_dir = os.path.join(ingested_dir, data_ingestion_info[DATA_INGESTION_TRAIN_DIR_KEY])
        ingested_test_dir = os.path.join(ingested_dir, data_ingestion_info[DATA_INGESTION_TEST_DIR_KEY])

        return DataIngestionConfig(
            dataset_download_url=dataset_download_url,
            raw_data_dir=raw_data_dir,
            tgz_download_dir=tgz_download_dir,
            ingested_train_dir=ingested_train_dir,
            ingested_test_dir=ingested_test_dir)

    def get_data_validation_config(self):
        """Return the ``DataValidationConfig`` for this run.

        The schema file path is ``ROOT_DIR/<schema_dir>/<schema_file_name>``.
        Both report files are placed under
        ``<artifact_dir>/<DATA_VALIDATION_ARTIFACT_DIR_NAME>/<time_stamp>``.
        """
        # NOTE: the imported DataValidationConfig entity is used here. Building
        # a fresh namedtuple class on every call would shadow that entity, so
        # returned objects would fail isinstance checks against it and could
        # not be pickled. Assumes the entity declares schema_file_path,
        # report_file_path and report_page_file_path -- confirm in
        # housing/entity/config_entity.py.
        artifact_dir = self.training_pipeline_config.artifact_dir
        data_validation_artifact_dir = os.path.join(artifact_dir, DATA_VALIDATION_ARTIFACT_DIR_NAME, self.time_stamp)

        data_validation_info = self.config_info[DATA_VALIDATION_CONFIG_KEY]

        # The schema is a project input, so it is resolved from ROOT_DIR
        # rather than from the per-run artifact directory.
        schema_dir_path = os.path.join(ROOT_DIR, data_validation_info[DATA_VALIDATION_SCHEMA_DIR_KEY])
        schema_file_path = os.path.join(schema_dir_path, data_validation_info[DATA_VALIDATION_SCHEMA_FILE_NAME_KEY])

        report_file_path = os.path.join(data_validation_artifact_dir,
                                        data_validation_info[DATA_VALIDATION_REPORT_FILE_NAME_KEY])
        report_page_file_path = os.path.join(data_validation_artifact_dir,
                                             data_validation_info[DATA_VALIDATION_REPORT_PAGE_FILE_NAME_KEY])

        return DataValidationConfig(schema_file_path=schema_file_path,
                                    report_file_path=report_file_path,
                                    report_page_file_path=report_page_file_path)

    def get_data_transformation_config(self):
        """Return the ``DataTransformationConfig`` for this run.

        Paths are placed under
        ``<artifact_dir>/<DATA_TRANSFORMATION_ARTIFACT_DIR>/<time_stamp>``:
        the preprocessing object file inside the preprocessing directory, and
        the train/test directories inside the transformed-data directory.
        ``add_bedroom_per_room`` is passed through from the YAML file unchanged.
        """
        artifact_dir = self.training_pipeline_config.artifact_dir
        data_transformation_artifact_dir = os.path.join(artifact_dir, DATA_TRANSFORMATION_ARTIFACT_DIR, self.time_stamp)
        data_transformation_config_info = self.config_info[DATA_TRANSFORMATION_CONFIG_KEY]

        add_bedroom_per_room = data_transformation_config_info[DATA_TRANSFORMATION_ADD_BEDROOM_PER_ROOM_KEY]

        preprocessed_object_file_path = os.path.join(
            data_transformation_artifact_dir,
            data_transformation_config_info[DATA_TRANSFORMATION_PREPROCESSING_DIR_KEY],
            data_transformation_config_info[DATA_TRANSFORMATION_PREPROCESSED_FILE_NAME_KEY])

        # Train and test share the same parent directory; build it once.
        transformed_dir = os.path.join(
            data_transformation_artifact_dir,
            data_transformation_config_info[DATA_TRANSFORMATION_DIR_NAME_KEY])
        transformed_train_dir = os.path.join(
            transformed_dir,
            data_transformation_config_info[DATA_TRANSFORMATION_TRAIN_DIR_NAME_KEY])
        transformed_test_dir = os.path.join(
            transformed_dir,
            data_transformation_config_info[DATA_TRANSFORMATION_TEST_DIR_NAME_KEY])

        return DataTransformationConfig(
            add_bedroom_per_room=add_bedroom_per_room,
            transformed_train_dir=transformed_train_dir,
            transformed_test_dir=transformed_test_dir,
            preprocessed_object_file_path=preprocessed_object_file_path)

    def get_training_pipeline_config(self):
        """Return a ``TrainingPipelineConfig`` whose ``artifact_dir`` is
        ``ROOT_DIR/<pipeline_name>/<artifact_dir>`` as named in the YAML file."""
        training_pipeline_config = self.config_info[TRAINING_PIPELINE_CONFIG_KEY]

        artifact_dir = os.path.join(ROOT_DIR,
                                    training_pipeline_config[TRAINING_PIPELINE_NAME_KEY],
                                    training_pipeline_config[TRAINING_PIPELINE_ARTIFACT_DIR_KEY])

        return TrainingPipelineConfig(artifact_dir=artifact_dir)

In [43]:
c = Configuration().get_data_transformation_config()
c

DataTransformationConfig(add_bedroom_per_room=True, transformed_train_dir='C:\\Shubham\\Projects\\Personal\\ML_Project\\housing\\artifact\\data_transformation\\2023-02-27-12-48-12\\transformed_data\\train', transformed_test_dir='C:\\Shubham\\Projects\\Personal\\ML_Project\\housing\\artifact\\data_transformation\\2023-02-27-12-48-12\\transformed_data\\test', preprocessed_object_file_path='C:\\Shubham\\Projects\\Personal\\ML_Project\\housing\\artifact\\data_transformation\\2023-02-27-12-48-12\\preprocessed\\preprocessed.pkl')