In [6]:
import os

In [7]:
# Show the kernel's working directory before it is changed in the next cell.
%pwd

'c:\\Users\\dell\\Documents\\mlops\\walmart-sales\\research'

In [8]:
# Step up from the notebook folder to the project root so that the relative
# "artifacts/..." paths used in later cells resolve.
# The guard keeps this cell idempotent: an unconditional chdir("../") climbs one
# more directory level every time the cell is re-run.
# NOTE(review): assumes the notebook lives in a folder named "research" — confirm.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")
os.getcwd()

'c:\\Users\\dell\\Documents\\mlops\\walmart-sales'

In [9]:
import pandas as pd

In [10]:
# Load the ingested CSV and print its column names, dtypes and non-null counts.
# NOTE(review): the frame is named `features` but is read from test.csv —
# confirm which file was intended.
features = pd.read_csv("artifacts/data_ingestion/test.csv")
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115064 entries, 0 to 115063
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   Store      115064 non-null  int64 
 1   Dept       115064 non-null  int64 
 2   Date       115064 non-null  object
 3   IsHoliday  115064 non-null  bool  
dtypes: bool(1), int64(2), object(1)
memory usage: 2.7+ MB


In [11]:
# Collect column names and their dtypes as two parallel lists, then print one
# "name: dtype" line per column. `features.dtypes` is looked up once and reused.
column_dtypes = features.dtypes
cols, col_dtypes = column_dtypes.index.tolist(), column_dtypes.values.tolist()
for c, t in zip(cols, col_dtypes):
    print(f"{c}: {t}")

Store: int64
Dept: int64
Date: object
IsHoliday: bool


In [12]:
from dataclasses import dataclass, field
from pathlib import Path

@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable bundle of settings consumed by the data-validation stage.

    Fields cannot be reassigned after construction (``frozen=True``); the
    dictionaries they refer to are not copied or frozen by this class.
    """

    # Directory associated with this stage's outputs.
    root_dir: Path
    # Path of the text file that validation results are written to.
    STATUS_FILE: str
    # Schema mapping, indexed by dataset name.
    all_schema: dict
    # Dataset name -> CSV path. When omitted, each instance gets its own fresh
    # dict with the four known dataset names mapped to None.
    data_dirs: dict = field(
        default_factory=lambda: dict.fromkeys(('features', 'stores', 'train', 'test'))
    )

In [13]:
from projectFiles.constants import *
from projectFiles.utils.common import read_yaml, create_directories

In [14]:
class ConfigurationManager:
    """Read the project's YAML files and build per-stage configuration objects."""

    def __init__(self, config_filepath = CONFIG_FILE_PATH, params_filepath = PARAMS_FILE_PATH, schema_filepath = SCHEMA_FILE_PATH):
        """Load the config, params and schema YAML files, in that order.

        Each path is passed to ``read_yaml`` and the results are stored on
        ``self.config``, ``self.params`` and ``self.schema``. Afterwards
        ``create_directories`` is called with the configured ``artifacts_root``.
        """
        yaml_paths = (config_filepath, params_filepath, schema_filepath)
        self.config, self.params, self.schema = (read_yaml(path) for path in yaml_paths)

        create_directories([self.config.artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Assemble the ``DataValidationConfig`` from the ``data_validation`` section.

        The section's ``data_dirs`` entries are copied into a plain dict,
        ``create_directories`` is called with the section's ``root_dir``, and
        the full schema loaded in ``__init__`` is attached as ``all_schema``.
        """
        validation_section = self.config.data_validation

        # Copy into a plain dict rather than handing over the config's own mapping.
        dataset_paths = {name: path for name, path in validation_section.data_dirs.items()}

        create_directories([validation_section.root_dir])

        return DataValidationConfig(
            root_dir = validation_section.root_dir,
            STATUS_FILE = validation_section.STATUS_FILE,
            all_schema = self.schema,
            data_dirs = dataset_paths,
        )

In [15]:
# Build the manager with its default YAML paths and derive the config object
# for the data-validation stage.
cm = ConfigurationManager()
val_config = cm.get_data_validation_config()

DEBUG: YAML Content from config\config.yaml -> {'artifacts_root': 'artifacts', 'data_ingestion': {'root_dir': 'artifacts/data_ingestion', 'source_URL': 'https://github.com/rkdsai/walmart-sales/raw/refs/heads/main/raw_data.zip', 'local_data_file': 'artifacts/data_ingestion/data.zip', 'unzip_dir': 'artifacts/data_ingestion'}, 'data_validation': {'root_dir': 'artifacts/data_validation', 'data_dirs': {'features': 'artifacts/data_ingestion/features.csv', 'stores': 'artifacts/data_ingestion/stores.csv', 'train': 'artifacts/data_ingestion/train.csv', 'test': 'artifacts/data_ingestion/test.csv'}, 'STATUS_FILE': 'artifacts/data_validation/status.txt'}}
[2025-03-07 19:06:50,726: INFO: common: yaml file: config\config.yaml loaded successfully]
DEBUG: YAML Content from params.yaml -> {'LGBMRegressor': {'n_estimators': 250, 'learning_rate': 0.05}}
[2025-03-07 19:06:50,729: INFO: common: yaml file: params.yaml loaded successfully]
DEBUG: YAML Content from schema.yaml -> {'features': {'COLUMNS': {'St

In [16]:
# Column names that the schema declares for the "features" dataset.
val_config.all_schema["features"]["COLUMNS"].keys()

dict_keys(['Store', 'Date', 'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment', 'IsHoliday'])

In [17]:
from projectFiles import logger


In [18]:
class DataValidation:
    """Check that every column of each configured CSV is declared in the schema."""

    def __init__(self, config: DataValidationConfig):
        """Keep the validation settings (status file, schema, dataset paths)."""
        self.config = config

    def validate_data(self) -> bool:
        """Validate the column names of every dataset in ``config.data_dirs``.

        For each ``name -> path`` entry the CSV is read with pandas and each of
        its columns is looked up in ``config.all_schema[name]["COLUMNS"]``.
        A per-column line, a per-file summary and a header are written to
        ``config.STATUS_FILE``, which is overwritten on every call.

        Only unexpected columns are detected: columns that the schema declares
        but the CSV lacks are not reported, and dtypes are not compared.

        Returns:
            True when every column of every CSV is present in its schema,
            False as soon as at least one column is not.

        Raises:
            Whatever ``open`` or ``pandas.read_csv`` raise for unreadable
            files, and a lookup error when the schema has no entry (or no
            "COLUMNS" key) for a dataset name. Errors propagate unchanged.
        """
        full_status = True

        # One handle for the whole run instead of reopening the file for every
        # column; the context manager still flushes what was written so far if
        # a later dataset fails to load.
        with open(self.config.STATUS_FILE, 'w') as sfile:
            sfile.write("Initializing validation tests")

            for k, v in self.config.data_dirs.items():
                df = pd.read_csv(v)
                schema_cols = self.config.all_schema[k]["COLUMNS"].keys()
                df_status = True

                sfile.write(f"\n\nValidating columns for {k}.csv")
                for col in df.columns:
                    validation_status = col in schema_cols
                    if not validation_status:
                        df_status = False
                        full_status = False
                    # Name the column being reported; the file path alone made
                    # every line for a dataset indistinguishable.
                    sfile.write(f"\nStatus of {col}: {validation_status}")
                sfile.write(f"\n{k}.csv final validation status: {df_status}")

        return full_status

In [19]:
# Run the data-validation stage end to end. Exceptions propagate on their own,
# so a try/except that only re-raises adds nothing and has been removed.
config = ConfigurationManager()
data_validation_config = config.get_data_validation_config()
data_validation = DataValidation(config = data_validation_config)

# Keep the overall result instead of discarding it, and show it as the cell output.
validation_passed = data_validation.validate_data()
validation_passed

DEBUG: YAML Content from config\config.yaml -> {'artifacts_root': 'artifacts', 'data_ingestion': {'root_dir': 'artifacts/data_ingestion', 'source_URL': 'https://github.com/rkdsai/walmart-sales/raw/refs/heads/main/raw_data.zip', 'local_data_file': 'artifacts/data_ingestion/data.zip', 'unzip_dir': 'artifacts/data_ingestion'}, 'data_validation': {'root_dir': 'artifacts/data_validation', 'data_dirs': {'features': 'artifacts/data_ingestion/features.csv', 'stores': 'artifacts/data_ingestion/stores.csv', 'train': 'artifacts/data_ingestion/train.csv', 'test': 'artifacts/data_ingestion/test.csv'}, 'STATUS_FILE': 'artifacts/data_validation/status.txt'}}
[2025-03-07 19:07:43,027: INFO: common: yaml file: config\config.yaml loaded successfully]
DEBUG: YAML Content from params.yaml -> {'LGBMRegressor': {'n_estimators': 250, 'learning_rate': 0.05}}
[2025-03-07 19:07:43,032: INFO: common: yaml file: params.yaml loaded successfully]
DEBUG: YAML Content from schema.yaml -> {'features': {'COLUMNS': {'St