In [1]:
import os

In [2]:
os.chdir("../")
%pwd

'/Users/hh/MLops/dataScienceProject'

In [3]:
import pandas as pd
# Load the ingested dataset for a quick manual look. The file name ends in
# ".xls" but it is parsed as CSV text here.
data = pd.read_csv("artifacts/data_ingestion/heart.csv.xls")

In [4]:
# Peek at the first five rows to confirm the file parsed into named columns.
data.head(5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [5]:
# Per-column count of missing values (isna is the canonical name for isnull).
data.isna().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [6]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class DataValidationConfig:
    """Settings consumed by the data-validation stage."""
    # Directory for this stage's outputs; ConfigurationManager.get_data_validation
    # creates it before building this object.
    # NOTE(review): the manager passes the raw config value through without
    # converting it to Path -- confirm what type read_yaml actually yields.
    root_dir: Path
    # File that DataValidation.validate_all_columns overwrites with
    # "Validation Status: <bool>".
    STATUS_FILE: str
    # Location of the data file that validate_all_columns reads with pd.read_csv.
    unzip_data_dir: Path
    # Parsed schema; validate_all_columns takes the allowed column names from
    # the keys of its "COLUMNS" entry.
    all_schema: dict



In [7]:
from src.datascience.constants import *
from src.datascience.utils.common import read_yaml, create_directories

In [8]:
class ConfigurationManager:
    """Loads the project's YAML settings and builds per-stage config objects."""

    def __init__(self,
                 config_file_path=CONFIG_FILE_PATH,
                 params_file_path=PARAMS_FILE_PATH,
                 schema_file_path=SCHEMA_FILE_PATH):
        """Read the three YAML files and make sure the artifacts root exists.

        Args:
            config_file_path: Location of the main configuration YAML.
            params_file_path: Location of the parameters YAML.
            schema_file_path: Location of the schema YAML.
        """
        # Keep the load order config -> params -> schema so the log lines
        # emitted by read_yaml appear in the same sequence.
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_file_path)
        self.schema = read_yaml(schema_file_path)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_validation(self) -> DataValidationConfig:
        """Build the data-validation settings from the loaded configuration.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            DataValidationConfig: populated from the ``data_validation``
            section of the config, with the whole loaded schema attached as
            ``all_schema``.
        """
        section = self.config.data_validation
        create_directories([section.root_dir])

        return DataValidationConfig(
            root_dir=section.root_dir,
            STATUS_FILE=section.STATUS_FILE,
            unzip_data_dir=section.unzip_data_dir,
            all_schema=self.schema,
        )


In [9]:
from src.datascience import logger
import os
import numpy as np

In [10]:


class DataValidation:
    """Checks the columns of an ingested CSV file against the project schema."""

    def __init__(self, config: DataValidationConfig):
        """Store the stage settings.

        Args:
            config: Provides ``unzip_data_dir`` (the CSV to read),
                ``all_schema`` (a mapping with a ``"COLUMNS"`` entry) and
                ``STATUS_FILE`` (where the outcome is written).
        """
        self.config = config

    def validate_all_columns(self) -> bool:
        """Verify that every column in the data file is declared in the schema.

        The file at ``config.unzip_data_dir`` is read with ``pd.read_csv`` and
        each column name is looked up in ``config.all_schema["COLUMNS"]``.
        The outcome is written once to ``config.STATUS_FILE`` as
        ``"Validation Status: <bool>"``.

        Returns:
            bool: ``True`` only when the file has at least one column and all
            of its columns are declared in the schema; ``False`` otherwise.

        Raises:
            Exception: Any error raised while reading the data or writing the
                status file is logged and re-raised unchanged.
        """
        try:
            data = pd.read_csv(self.config.unzip_data_dir)
            all_columns = list(data.columns)

            list_all_schema = self.config.all_schema["COLUMNS"].keys()

            logger.info("Validating Each Column in Data File")
            # Collect every offending column first. Deciding inside the loop
            # would let a later valid column overwrite an earlier failure.
            unexpected_columns = [
                column for column in all_columns if column not in list_all_schema
            ]
            if unexpected_columns:
                logger.warning(f"Columns not declared in schema: {unexpected_columns}")

            # A frame with no columns has nothing to validate and must not pass.
            columns_validation_status = bool(all_columns) and not unexpected_columns

            # Write the final verdict exactly once.
            with open(self.config.STATUS_FILE, "w") as f:
                f.write(f"Validation Status: {columns_validation_status}")

            logger.info(f"Validation Status {columns_validation_status} Recorded in {self.config.STATUS_FILE}")
            return columns_validation_status

        except Exception as e:
            logger.exception(e)
            raise e



        


In [11]:
# Run the data-validation stage end to end: load the YAML configuration, build
# the stage settings, then check the data file's columns against the schema.
try:
    config = ConfigurationManager()
    data_validation_config = config.get_data_validation()
    data_validation = DataValidation(config=data_validation_config)
    data_validation.validate_all_columns()
except Exception as e:
    # Nothing to clean up in this cell; surface the failure unchanged.
    raise e

[2025-08-07 21:37:37,597: INFO: common: yaml file config/config.yaml is loaded successfully]
[2025-08-07 21:37:37,599: INFO: common: yaml file params.yaml is loaded successfully]
[2025-08-07 21:37:37,603: INFO: common: yaml file schema.yaml is loaded successfully]
[2025-08-07 21:37:37,604: INFO: common: created directory at artifacts]
[2025-08-07 21:37:37,604: INFO: common: created directory at artifacts/data_validation]
[2025-08-07 21:37:37,607: INFO: 4251991257: Validating Each Column in Data File]
[2025-08-07 21:37:37,610: INFO: 4251991257: Validation Status True Recorded in artifacts/data_validation/status.txt]
