In [1]:
import os

In [2]:
%pwd

'd:\\Data_science\\Projects\\kidney_disease_classification\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'd:\\Data_science\\Projects\\kidney_disease_classification'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the dataset train/validation/test split stage.

    Built by ``ConfigurationManager.get_data_transformation_config`` and read
    by ``DataTransformation.perform_split``.
    """
    # Directory whose "dataset" subfolder is used as the split input.
    root_dir: Path
    # Directory passed to splitfolders.ratio as the output location.
    data_split_dir: Path
    # Fractions passed to splitfolders.ratio as (train, validation, test).
    # NOTE(review): presumably expected to sum to 1.0 - confirm against params.yaml.
    train_ratio: float
    validation_ratio: float
    test_ratio: float
    # Seed value passed to splitfolders.ratio.
    split_seed: int

In [6]:
from KidneyCNN.constants import *
from KidneyCNN.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Loads the project's YAML settings and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Read both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Location of the config YAML file
                (defaults to ``CONFIG_FILE_PATH``).
            params_filepath: Location of the params YAML file
                (defaults to ``PARAMS_FILE_PATH``).
        """
        # Both results are accessed with attribute syntax below.
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the settings object for the data-splitting stage.

        Creates ``data_transformation.root_dir`` as a side effect, then packs
        the paths from the config file and the split parameters from the
        params file into a ``DataTransformationConfig``.

        Returns:
            DataTransformationConfig: Paths (as ``Path`` objects), split
            ratios and seed.
        """
        config = self.config.data_transformation
        params = self.params

        create_directories([config.root_dir])

        data_transformation_config = DataTransformationConfig(
            # Paths (from config.yaml), converted to Path so the values match
            # the types declared on DataTransformationConfig.
            # Input: directory whose "dataset" subfolder holds the class folders.
            root_dir = Path(config.root_dir),
            # Output: directory the split folders are written to.
            data_split_dir = Path(config.data_split_dir),

            # Parameters (from params.yaml)
            train_ratio = params.TRAIN_RATIO,
            validation_ratio = params.VALIDATION_RATIO,
            test_ratio = params.TEST_RATIO,
            split_seed = params.SPLIT_SEED,
        )

        return data_transformation_config

In [8]:
import os
import shutil
import random
from pathlib import Path
import splitfolders

In [None]:
class DataTransformation:
    """Splits a class-per-folder image dataset into train/validation/test sets."""

    def __init__(self, config: "DataTransformationConfig"):
        """Store the split settings.

        Args:
            config: Object providing ``root_dir``, ``data_split_dir``,
                ``train_ratio``, ``validation_ratio``, ``test_ratio`` and
                ``split_seed``.
        """
        self.config = config


    def perform_split(self):
        """
        Executes the stratified split using the split-folders library.

        Reads ``<root_dir>/dataset`` and copies its files into
        ``data_split_dir`` according to the configured ratios and seed.

        Returns:
            None in every case. Problems (missing input, no class subfolders,
            or an exception raised while splitting) are reported with
            ``print`` and are not raised.
            NOTE(review): callers therefore cannot detect a failed split;
            consider raising once all callers are known.
        """
        # 1. Prepare input and output paths
        input_path = Path(self.config.root_dir) / "dataset"

        # The input directory must exist ...
        if not input_path.exists():
            print(f"Error: Raw data directory not found at {input_path}. Ensure data ingestion ran successfully.")
            return

        # ... and must contain at least one class subfolder. Without this
        # guard an empty "dataset" folder would still create the output
        # directory before the split is attempted.
        if input_path.is_dir():
            class_dirs = [entry for entry in input_path.iterdir() if entry.is_dir()]
        else:
            class_dirs = []
        if not class_dirs:
            print(f"Error: No class subfolders found in {input_path}. Expected one subfolder per class.")
            return

        # Create the output directory up front; only reached for valid input.
        os.makedirs(self.config.data_split_dir, exist_ok=True)

        # 2. Define ratios in (train, validation, test) order
        ratios = (
            self.config.train_ratio,
            self.config.validation_ratio,
            self.config.test_ratio
        )

        # 3. Perform the split
        print(f"Starting stratified data split (Train:{ratios[0]}, Validation:{ratios[1]}, Test:{ratios[2]})")

        try:
            splitfolders.ratio(
                str(input_path),
                output=str(self.config.data_split_dir),
                seed=self.config.split_seed,
                ratio=ratios,
                group_prefix=None,
                move=False # Set to True if you want to move files instead of copying
            )
            print(f"Data split successful. New structure created in: {self.config.data_split_dir}")

        except Exception as e:
            # Kept as print-only reporting to preserve the existing contract.
            print(f"An error occurred during data splitting: {e}")

In [10]:
# Run the data-transformation stage: load the settings, build the stage config,
# then perform the split. Exceptions propagate to the notebook unchanged, so no
# try/except wrapper that merely re-raises is needed.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)
data_transformation.perform_split()

[2025-10-04 03:42:42,845: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-10-04 03:42:42,851: INFO: common: yaml file: params.yaml loaded successfully]
[2025-10-04 03:42:42,853: INFO: common: created directory at: artifacts]
[2025-10-04 03:42:42,853: INFO: common: created directory at: artifacts/data_ingestion]
Starting stratified data split (Train:0.7, Val:0.15, Test:0.15)


Copying files: 4000 files [00:44, 89.64 files/s]

Data split successful. New structure created in: artifacts/data_transformation



