In [1]:
# data ingestion code

import zipfile
import os 
from pathlib import Path


class DataIngestion:
    """Unpack a zip archive of source data into a target directory."""

    def __init__(self, zip_path: str, extract_path: str) -> None:
        """Store the archive and destination locations as ``Path`` objects.

        Args:
            zip_path: Location of the zip archive to unpack.
            extract_path: Directory the archive contents are written to.
        """
        self.zip_path = Path(zip_path)
        self.extract_dir = Path(extract_path)

    def ingest(self) -> Path:
        """Extract the whole archive into ``extract_dir``.

        The destination directory (and any missing parents) is created first.

        Returns:
            The directory the archive was extracted into.

        Raises:
            FileNotFoundError: If ``zip_path`` is missing or is not a regular file.
        """
        # is_file() instead of exists(): a directory at zip_path would pass an
        # existence check and then fail inside zipfile with a platform-dependent
        # error (IsADirectoryError / PermissionError).
        if not self.zip_path.is_file():
            raise FileNotFoundError(f"Source data not found in {self.zip_path}")

        self.extract_dir.mkdir(parents=True, exist_ok=True)

        with zipfile.ZipFile(self.zip_path, 'r') as archive:
            archive.extractall(self.extract_dir)

        return self.extract_dir


In [3]:
# Hard-coded, machine-specific locations of the source archive and the extraction target.
ZIP_FILE_PATH= r"E:\project_archive\emovision-complete-mlops\data\sources\archive.zip"
EXTRACT_DIR_PATH = r"E:\project_archive\emovision-complete-mlops\data\raw"
# The ingestion call is left commented out.
# NOTE(review): later cells read `data_path`; in the visible cells its only assignment
# is the commented-out line below, so those cells depend on leftover kernel state.
# d = DataIngestion(ZIP_FILE_PATH, EXTRACT_DIR_PATH)
# data_path = d.ingest()

In [23]:
# data validation

import os
from collections import Counter
from pathlib import Path
from typing import Dict, Iterable, List, Optional

class DataValidation:
    """Validate the folder layout of an image dataset.

    The layout checked is ``<raw_data_path>/<split>/<class_name>/<image files>``:
    every split folder must exist, every split must contain the same class
    folders, and every class folder must be non-empty and hold only files with
    an allowed extension.
    """

    def __init__(
        self,
        raw_data_path: str,
        allowed_extension: Optional[Iterable[str]] = None,
        splits: Optional[List[str]] = None,
    ) -> None:
        """Configure the validator.

        Args:
            raw_data_path: Root directory containing the split folders.
            allowed_extension: File suffixes (with leading dot) accepted as
                images. Defaults to ``{".jpg", ".png", ".jpeg"}``.
            splits: Names of the split folders to check. Defaults to
                ``["test", "train"]``.
        """
        self.raw_data_dir = Path(raw_data_path)

        if isinstance(allowed_extension, str):
            # A bare string would otherwise be iterated character by character.
            allowed_extension = [allowed_extension]
        # validate_image compares lower-cased suffixes, so the allowed set is
        # lower-cased as well; otherwise ".PNG" could never match.
        self.allowed_extension = {
            ext.lower() for ext in (allowed_extension or {".jpg", ".png", ".jpeg"})
        }
        # Copy so later changes to the caller's list do not affect the validator.
        self.splits = list(splits or ["test", "train"])

    def validate_splits(self) -> None:
        """Check that every configured split exists as a directory.

        Raises:
            FileNotFoundError: If a split folder is missing or is not a directory.
        """
        for split in self.splits:
            if not (self.raw_data_dir / split).is_dir():
                raise FileNotFoundError(f"Missing split folder: {split}")

    def get_class(self, split: str) -> List[str]:
        """Return the sorted class-folder names found under ``split``.

        Only directories are treated as classes, so stray files in the split
        folder are ignored. ``name`` is used instead of ``stem`` so folder names
        containing a dot are not truncated, and the result is sorted because
        ``iterdir()`` yields entries in arbitrary order.
        """
        split_path = self.raw_data_dir / split
        return sorted(entry.name for entry in split_path.iterdir() if entry.is_dir())

    def validate_class_consistency(self) -> List[str]:
        """Check that all splits contain the same classes.

        Returns:
            The class names of the first configured split.

        Raises:
            ValueError: If any other split has a different set of classes.
        """
        base_split = self.splits[0]
        base_class = self.get_class(base_split)
        for split in self.splits[1:]:
            other_class = self.get_class(split)
            if other_class != base_class:
                differing = sorted(set(base_class) ^ set(other_class))
                raise ValueError(
                    f"Class Mismatch between {base_split} and {split}: {differing}"
                )

        return base_class

    def validate_image(self, split: str, class_name: str) -> int:
        """Check one class folder and return the number of entries in it.

        Raises:
            ValueError: If the folder is empty or contains an entry whose
                suffix is not in ``allowed_extension``.
        """
        class_path = self.raw_data_dir / split / class_name
        images = list(class_path.iterdir())

        if not images:
            raise ValueError(f"Empty folder: {class_path}")

        for img in images:
            if img.suffix.lower() not in self.allowed_extension:
                raise ValueError(f"Invalid image format: {img}")

        return len(images)

    def run_validation(self) -> Dict[str, Dict[str, int]]:
        """Run every check and return ``{split: {class_name: image_count}}``."""
        self.validate_splits()
        classes = self.validate_class_consistency()

        return {
            split: {cls: self.validate_image(split, cls) for cls in classes}
            for split in self.splits
        }



In [24]:
# Root of the extracted dataset (hard-coded, machine-specific path).
RAW_DATA_PATH =  r"E:\project_archive\emovision-complete-mlops\data\raw"
# NOTE(review): SPLITS and ALLOWED_EXT are defined but not passed below; the
# validator is built with None for both and so falls back to its own defaults.
SPLITS = ["train", "test"]
ALLOWED_EXT = {".jpg", ".png", ".jpeg"}
dv = DataValidation(RAW_DATA_PATH, None, None)

# Raises FileNotFoundError if a split folder is missing under RAW_DATA_PATH.
dv.validate_splits()

In [8]:
# Show the class names found under the "test" split folder.
dv.get_class('test')

['angry', 'disgusted', 'fearful', 'happy', 'neutral', 'sad', 'surprised']

In [21]:
# Check that every split has the same classes; the shared class list is the cell output.
dv.validate_class_consistency()

['angry', 'disgusted', 'fearful', 'happy', 'neutral', 'sad', 'surprised']

In [27]:
# Run all checks and keep the nested {split: {class: count}} report for display.
report = dv.run_validation()
report

{'test': {'angry': 958,
  'disgusted': 111,
  'fearful': 1024,
  'happy': 1774,
  'neutral': 1233,
  'sad': 1247,
  'surprised': 831},
 'train': {'angry': 3995,
  'disgusted': 436,
  'fearful': 4097,
  'happy': 7215,
  'neutral': 4965,
  'sad': 4830,
  'surprised': 3171}}

In [9]:
# NOTE(review): `data_path` has no active assignment in the visible cells (its only
# assignment is commented out), so this relies on leftover kernel state.
print(data_path.exists())

True


In [13]:
# Walk the whole extracted tree and print the sub-directory names found at each level.
for root, dirs, files in os.walk(data_path):
    print(f"{dirs}")

['test', 'train']
['angry', 'disgusted', 'fearful', 'happy', 'neutral', 'sad', 'surprised']
[]
[]
[]
[]
[]
[]
[]
['angry', 'disgusted', 'fearful', 'happy', 'neutral', 'sad', 'surprised']
[]
[]
[]
[]
[]
[]
[]


In [14]:
# Print the immediate children of the extracted data folder.
for item in data_path.iterdir():
    print(item)

E:\project_archive\emovision-complete-mlops\data\raw\test
E:\project_archive\emovision-complete-mlops\data\raw\train


In [15]:
# Path of the "train" split folder; displayed as the cell output.
train_path = data_path / 'train'
train_path

WindowsPath('E:/project_archive/emovision-complete-mlops/data/raw/train')

In [18]:
# Print each entry of the train split by its stem (final path component without suffix).
for item in train_path.iterdir():
    print(item.stem)

angry
disgusted
fearful
happy
neutral
sad
surprised


In [20]:
# Collect the stem of every entry under the train split into a list.
class_dirs = [d.stem for d in train_path.iterdir()]
class_dirs

['angry', 'disgusted', 'fearful', 'happy', 'neutral', 'sad', 'surprised']

In [21]:
# Folder of the "happy" class inside the test split; displayed as the cell output.
category = data_path / 'test' / 'happy'
category

WindowsPath('E:/project_archive/emovision-complete-mlops/data/raw/test/happy')

In [23]:
# Materialize the folder listing and preview the first ten entries.
images = list(category.iterdir())
images[:10]

[WindowsPath('E:/project_archive/emovision-complete-mlops/data/raw/test/happy/im0.png'),
 WindowsPath('E:/project_archive/emovision-complete-mlops/data/raw/test/happy/im1.png'),
 WindowsPath('E:/project_archive/emovision-complete-mlops/data/raw/test/happy/im10.png'),
 WindowsPath('E:/project_archive/emovision-complete-mlops/data/raw/test/happy/im100.png'),
 WindowsPath('E:/project_archive/emovision-complete-mlops/data/raw/test/happy/im1000.png'),
 WindowsPath('E:/project_archive/emovision-complete-mlops/data/raw/test/happy/im1001.png'),
 WindowsPath('E:/project_archive/emovision-complete-mlops/data/raw/test/happy/im1002.png'),
 WindowsPath('E:/project_archive/emovision-complete-mlops/data/raw/test/happy/im1003.png'),
 WindowsPath('E:/project_archive/emovision-complete-mlops/data/raw/test/happy/im1004.png'),
 WindowsPath('E:/project_archive/emovision-complete-mlops/data/raw/test/happy/im1005.png')]