In [38]:
import os

In [39]:
%pwd

'/Users/macbook/PycharmProjects/end_to_end_mlops'

In [4]:
os.chdir('../')

In [40]:
%pwd

'/Users/macbook/PycharmProjects/end_to_end_mlops'

In [41]:
from dataclasses import dataclass
from pathlib import Path

In this file we need to define the data we want to validate which is the first stage in this pipeline

In [42]:
#/Users/macbook/PycharmProjects/end_to_end_mlops/artifacts/data_ingestion/email_phishing_data.csv
@dataclass(frozen=True)
class DataValidationConfig:
    root_dir: Path
    unzip_data_dir: Path
    STATUS_FILE: Path
    all_schema: dict

In [43]:
from mlProject.constants import *

In [44]:
from mlProject.utils.common import * #here are all teh reusable code blocks are defigned as functions and can be called and used more than once

In [45]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])


    
    def get_data_validation_config(self) -> DataValidationConfig:
        config = self.config.data_validation
        schema=self.schema.COLUMNS
        create_directories([config.root_dir])
        return DataValidationConfig(
            root_dir=Path(config.root_dir),
            unzip_data_dir=Path(config.unzip_data_dir),
            STATUS_FILE=Path(config.STATUS_FILE),
            all_schema= schema
        )



In [46]:
import os
import urllib.request as request
import zipfile
from mlProject import logger
from mlProject.utils.common import get_size

In [47]:
import pandas as pd

In [58]:
class DataValiadtion:
    def __init__(self, config: DataValidationConfig):
        self.config = config


    def validate_all_columns(self)-> bool:
        try:
            validation_status = None
            errors=[]

            data = pd.read_csv(self.config.unzip_data_dir)
            
            all_cols = list(data.columns)
            all_schema = self.config.all_schema
            
            logger.info(f"Columns in data: {all_cols}")
            logger.info(f"Expected schema: {all_schema}")
            
            for col in all_cols:
                if col not in all_schema:
                    validation_status = False
                    with open(self.config.STATUS_FILE, 'w') as f:
                        f.write(f"Validation status: {validation_status}")
                        errors.append(f"Missing column: {col}")
                else:
                    actual_dtype = str(data[col].dtype)
                    expected_dtype = str(all_schema[col])
                    if actual_dtype != expected_dtype:
                        validation_status = False
                        errors.append(
                            f"Type mismatch in column '{col}': expected {expected_dtype}, got {actual_dtype}"
                        )
                    else:
                        validation_status = True
                    
                    with open(self.config.STATUS_FILE, 'w') as f:
                        f.write(f"Validation status: {validation_status}")

            return validation_status
        
        except Exception as e:
            raise e


In [59]:
try:
    config = ConfigurationManager()
    data_validation_config = config.get_data_validation_config()
    data_validation = DataValiadtion(config=data_validation_config)
    data_validation.validate_all_columns()
except Exception as e:
    raise e

[2025-05-24 14:57:00,732: INFO: common: yaml file: config/config.yaml loaded successfully]
[2025-05-24 14:57:00,792: INFO: common: yaml file: params.yaml loaded successfully]
[2025-05-24 14:57:00,800: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-05-24 14:57:00,802: INFO: common: created directory at: artifacts]
[2025-05-24 14:57:00,803: INFO: common: created directory at: artifacts/data_validation]
[2025-05-24 14:57:01,079: INFO: 3719562795: Columns in data: ['num_words', 'num_unique_words', 'num_stopwords', 'num_links', 'num_unique_domains', 'num_email_addresses', 'num_spelling_errors', 'num_urgent_keywords', 'label']]
[2025-05-24 14:57:01,082: INFO: 3719562795: Expected schema: {'num_words': 'int64', 'num_unique_words': 'int64', 'num_stopwords': 'int64', 'num_links': 'int64', 'num_unique_domains': 'int64', 'num_email_addresses': 'int64', 'num_spelling_errors': 'int64', 'num_urgent_keywords': 'int64', 'label': 'int64'}]
