In [1]:
import os 

In [2]:
os.chdir('../')

In [3]:
!pwd

/Users/samaheddaoudi/E2E-datasciene_project-18/e2e-datascience_project-1


In [4]:
import pandas as pd
data=pd.read_csv("artifacts/data_ingestion/winequality-red.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [5]:
data.isnull().sum() ## to see if there is null values
data.dtypes


fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object

In [6]:
data.shape
all_types = list(data.dtypes)
print(all_types)

[dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('int64')]


In [7]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class DataValidationConfig:
    root_dir: Path
    unzip_data_dir: Path
    STATUS_FILE: str ## str not path because we will use this value to create the file and write to it True or False later
    all_schema: dir ## Contains all schemas from schema.yaml
    

In [8]:
from src.datascience_project_1.constants import *
from src.datascience_project_1.utils.common import read_yaml , create_directories

In [9]:
class ConfigurationManager:
    def __init__(self,
                 config_filepath=CONFIG_FILE_PATH,
                 params_filepath=PARAMS_FILE_PATH,
                 schema_filepath=SCHEMA_FILE_PATH


                 ):
        self.config=read_yaml(config_filepath)
        self.params=read_yaml(params_filepath)
        self.schema=read_yaml(schema_filepath)
        create_directories([self.config.artifacts_root])
    ## Get the needed informations about the data validation itself
    def get_data_validation_config(self) -> DataValidationConfig: 
        config=self.config.data_validation 
        schema=self.schema.COLUMNS 

        create_directories([config.root_dir])

        data_validation_config=DataValidationConfig(
            root_dir=config.root_dir,
            unzip_data_dir=config.unzip_data_dir,
            STATUS_FILE=config.STATUS_FILE,
            all_schema=schema,

        )
        return data_validation_config

In [10]:
import os
from src.datascience_project_1 import logger

In [11]:
## validate data by comparing its schema to the schema in schema.yaml
class DataValiadtion:
    def __init__(self, config: DataValidationConfig):
        self.config = config

    def validate_all_columns(self)-> bool:
        try:
            validation_status = None

            data = pd.read_csv(self.config.unzip_data_dir)
            all_cols = list(data.columns)

            all_schema = self.config.all_schema.keys()

            
            for col in all_cols:
                if col not in all_schema:
                    validation_status = False
                    with open(self.config.STATUS_FILE, 'w') as f:
                        f.write(f"Validation status: {validation_status}")
                else:
                    validation_status = True
                    with open(self.config.STATUS_FILE, 'w') as f:
                        f.write(f"Validation status: {validation_status}")

            return validation_status
        
        except Exception as e:
            raise e

In [12]:
try:
    config = ConfigurationManager()
    data_validation_config = config.get_data_validation_config()
    data_validation = DataValiadtion(config=data_validation_config)
    data_validation.validate_all_columns()
except Exception as e:
    raise e

[2025-04-27 17:42:13,543: INFO: common: yaml file: config/config.yaml loaded successfully]
[2025-04-27 17:42:13,548: INFO: common: yaml file: params.yaml loaded successfully]
[2025-04-27 17:42:13,554: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-04-27 17:42:13,557: INFO: common: created directory at: artifacts]
[2025-04-27 17:42:13,560: INFO: common: created directory at: artifacts/data_validation]


##### validate types

In [None]:
class DataValiadtion:
    def __init__(self, config: DataValidationConfig):
        self.config = config

    def validate_all_types(self)-> bool:
        try:
            validation_status = None

            data = pd.read_csv(self.config.unzip_data_dir)
            all_types = list(data.dtypes)

            all_schema = self.config.all_schema.values()

            
            for type in all_types:
                if type not in all_schema:
                    validation_status = False
                    with open(self.config.STATUS_FILE, 'w') as f:
                        f.write(f"Validation status for types: {validation_status}")
                else:
                    validation_status = True
                    with open(self.config.STATUS_FILE, 'w') as f:
                        f.write(f"Validation status for types : {validation_status}")

            return validation_status
        
        except Exception as e:
            raise e

In [None]:
try:
    config = ConfigurationManager()
    data_validation_config = config.get_data_validation_config()
    data_validation = DataValiadtion(config=data_validation_config)
    data_validation.validate_all_types()
except Exception as e:
    raise e

[2025-04-27 17:39:47,713: INFO: common: yaml file: config/config.yaml loaded successfully]
[2025-04-27 17:39:47,716: INFO: common: yaml file: params.yaml loaded successfully]
[2025-04-27 17:39:47,720: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-04-27 17:39:47,723: INFO: common: created directory at: artifacts]
[2025-04-27 17:39:47,724: INFO: common: created directory at: artifacts/data_validation]
