In [1]:
import os

In [2]:
os.chdir("../")
%pwd

'/Users/hh/MLops/dataScienceProject'

In [11]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class DataTransformationConfig:
    """Settings consumed by the data-transformation stage.

    Built by ``ConfigurationManager.get_data_transformation`` from the loaded
    YAML files. Values are stored exactly as they are passed in; nothing here
    converts or validates them, so the ``Path`` annotations are not enforced
    at runtime.
    """
    # Directory that receives the ``train.csv`` / ``test.csv`` split files.
    root_dir: Path
    # Raw input CSV read at the start of preprocessing.
    data_path: Path
    # Name of the label column; preprocessing raises if the raw CSV lacks it.
    target_column: str
    # Destination of the cleaned CSV, which the split step reads back.
    cleaned_data: Path

In [12]:
from src.datascience.constants import *
from src.datascience.utils.common import read_yaml, create_directories

In [13]:
class ConfigurationManager:
    """Load the project's YAML files and build per-stage config objects.

    On construction the config, params and schema files are read (in that
    order) and the top-level artifacts directory named in the config file is
    created. The loaded objects are kept on ``self.config``, ``self.params``
    and ``self.schema``; they are accessed by attribute below.
    """

    def __init__(self,
                 config_file_path=CONFIG_FILE_PATH,
                 params_file_path=PARAMS_FILE_PATH,
                 schema_file_path=SCHEMA_FILE_PATH):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config_file_path: location of the main config YAML.
            params_file_path: location of the params YAML.
            schema_file_path: location of the schema YAML.
        """
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_file_path)
        self.schema = read_yaml(schema_file_path)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation(self) -> DataTransformationConfig:
        """Assemble the settings for the data-transformation stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A ``DataTransformationConfig`` whose paths come from the
            ``data_transformation`` section of the config file and whose
            target column name comes from ``TARGET_COLUMN.name`` in the schema.
        """
        stage_section = self.config.data_transformation
        target_spec = self.schema.TARGET_COLUMN

        # Make sure the stage's output directory exists before anything uses it.
        create_directories([stage_section.root_dir])

        return DataTransformationConfig(
            root_dir=stage_section.root_dir,
            data_path=stage_section.data_path,
            target_column=target_spec.name,
            cleaned_data=stage_section.cleaned_data,
        )


In [14]:
import os
from src.datascience import logger
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

In [15]:
class DataTransformation:
    """Clean the raw dataset and split it into train/test CSV files.

    The instance only stores the configuration object. Both stages take their
    inputs and outputs from it:

    * ``data_path``     - raw CSV read by :meth:`preprocess_data`
    * ``target_column`` - label column that must be present in the raw CSV
    * ``cleaned_data``  - cleaned CSV written by :meth:`preprocess_data` and
      read back by :meth:`train_test_split`
    * ``root_dir``      - directory receiving ``train.csv`` and ``test.csv``
    """

    # Multi-valued categorical columns that are one-hot encoded when present.
    _ONE_HOT_COLUMNS = ("ChestPainType", "RestingECG", "ST_Slope")

    # The annotation is quoted so that defining this class does not require the
    # config class to be bound already (annotations are otherwise evaluated
    # when the ``def`` statement runs).
    def __init__(self, config: "DataTransformationConfig"):
        self.config = config

    def preprocess_data(self):
        """Encode the categorical features as numbers and save the cleaned CSV.

        Each step is applied only if the column it concerns exists:

        1. ``Sex``: trimmed, upper-cased, ``MALE``/``FEMALE`` shortened to
           ``M``/``F``, then mapped to 1.0 / 0.0. Any other value becomes NaN.
        2. ``ExerciseAngina``: trimmed, upper-cased, ``Y``/``N`` mapped to
           1.0 / 0.0. Any other value becomes NaN.
        3. ``ChestPainType``, ``RestingECG``, ``ST_Slope``: one-hot encoded
           with the first level dropped.
        4. Every column except the target is passed through
           ``pd.to_numeric(errors="coerce")``, so unparseable values become NaN.

        The result is written to ``config.cleaned_data`` without the index.

        Raises:
            ValueError: if the configured target column is not in the raw CSV.
        """
        df = pd.read_csv(self.config.data_path)

        target = self.config.target_column
        if target not in df.columns:
            raise ValueError(f"Target column '{target}' not found.")

        if "Sex" in df.columns:
            sex_norm = (
                df["Sex"].astype(str).str.strip().str.upper()
                .replace({"MALE": "M", "FEMALE": "F"})
            )
            df["Sex"] = sex_norm.map({"M": 1, "F": 0}).astype("float64")

        if "ExerciseAngina" in df.columns:
            ang_norm = df["ExerciseAngina"].astype(str).str.strip().str.upper()
            df["ExerciseAngina"] = ang_norm.map({"Y": 1, "N": 0}).astype("float64")

        cat_cols = [c for c in self._ONE_HOT_COLUMNS if c in df.columns]
        if cat_cols:
            df = pd.get_dummies(df, columns=cat_cols, drop_first=True, dtype="float64")

        for c in df.columns:
            if c != target:
                df[c] = pd.to_numeric(df[c], errors="coerce")

        # A bare file name has an empty dirname, and os.makedirs("") raises
        # FileNotFoundError - only create the parent when there is one.
        cleaned_path = self.config.cleaned_data
        parent_dir = os.path.dirname(cleaned_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        df.to_csv(cleaned_path, index=False)
        logger.info(f"Saved cleaned dataset → {self.config.cleaned_data}")

    def train_test_split(self, test_size=0.25, random_state=42):
        """Split the cleaned CSV into ``train.csv`` and ``test.csv``.

        Both files are written into ``config.root_dir`` without the index, and
        the shapes of the two parts are logged.

        Args:
            test_size: share (or absolute number) of rows placed in the test
                set. 0.25 is the value scikit-learn uses when none is given,
                so the default split proportions are unchanged.
            random_state: seed for the shuffle, so that repeated runs produce
                the same files. Pass ``None`` for a different random split on
                every call.
        """
        data = pd.read_csv(self.config.cleaned_data)

        # The bare name below resolves to the module-level function imported
        # from sklearn, not to this method: class attributes are not in scope
        # inside a method body.
        train, test = train_test_split(
            data, test_size=test_size, random_state=random_state
        )

        train.to_csv(os.path.join(self.config.root_dir, "train.csv"), index=False)
        test.to_csv(os.path.join(self.config.root_dir, "test.csv"), index=False)
        logger.info("Data Splitted into training and testing sets")
        logger.info(f"train set shape {train.shape}")
        logger.info(f"test set shape {test.shape}")



In [16]:
# Run the data-transformation stage end to end: load the configuration,
# clean the raw CSV, then write the train/test split.
try:
    obj = ConfigurationManager()
    data_transformation_config = obj.get_data_transformation()
    data_transformation = DataTransformation(data_transformation_config)
    data_transformation.preprocess_data()
    data_transformation.train_test_split()
except Exception as e:
    # Record the failure (with traceback) through the project logger, then
    # re-raise. A bare `raise` propagates the active exception without adding
    # this handler's frame to the traceback a second time.
    logger.exception(e)
    raise

[2025-08-10 13:58:56,316: INFO: common: yaml file config/config.yaml is loaded successfully]
[2025-08-10 13:58:56,318: INFO: common: yaml file params.yaml is loaded successfully]
[2025-08-10 13:58:56,321: INFO: common: yaml file schema.yaml is loaded successfully]
[2025-08-10 13:58:56,322: INFO: common: created directory at artifacts]
[2025-08-10 13:58:56,323: INFO: common: created directory at artifacts/data_transformation]
[2025-08-10 13:58:56,340: INFO: 2903973266: Saved cleaned dataset → artifacts/data_transformation/cleaned_data.csv]
[2025-08-10 13:58:56,356: INFO: 2903973266: Data Splitted into training and testing sets]
[2025-08-10 13:58:56,357: INFO: 2903973266: train set shape (688, 16)]
[2025-08-10 13:58:56,358: INFO: 2903973266: test set shape (230, 16)]
