In [2]:
import os

In [3]:
%pwd

'e:\\Data Science\\ML-Project\\research'

In [4]:
# Step up from the research/ folder to its parent so that the relative
# paths used below resolve from there. The guard makes this cell safe to
# re-run: without it, every execution would climb one directory higher.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [5]:
%pwd

'e:\\Data Science\\ML-Project'

In [6]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings consumed by the data-transformation stage."""

    # Directory into which the stage writes its output files.
    root_dir: Path
    # Location of the CSV file the stage reads as input.
    data_path: Path

In [7]:
from ML_Project.constants import *
from ML_Project.utils.common import read_yaml, create_directories

In [8]:
class ConfigurationManager:
    """Reads the project's YAML files and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH
        ):
        """Load the config, params and schema YAML files.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.

        The ``artifacts_root`` entry of the loaded config is passed to
        ``create_directories`` as a side effect of construction.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the config object for the data-transformation stage.

        Reads the ``data_transformation`` section of the loaded config,
        passes its ``root_dir`` to ``create_directories``, and returns a
        ``DataTransformationConfig`` populated from that section.

        Returns:
            DataTransformationConfig whose fields are ``pathlib.Path`` objects.
        """
        stage_cfg = self.config.data_transformation

        create_directories([stage_cfg.root_dir])

        # The dataclass declares both fields as Path; coerce the raw YAML
        # values so the declared types match what callers actually receive.
        return DataTransformationConfig(
            root_dir=Path(stage_cfg.root_dir),
            data_path=Path(stage_cfg.data_path),
        )

In [9]:
import os
from ML_Project import logger
from sklearn.model_selection import train_test_split
import pandas as pd

In [10]:
class DataTransformation:
    """Splits the configured dataset into train and test CSV files."""

    def __init__(self, config: DataTransformationConfig):
        """Store the stage configuration (input path and output directory)."""
        self.config = config

    # Note: further transformation steps (scaling, PCA, other preprocessing)
    # can be added to this class before the data is handed to the model.

    def train_test_splitting(self, test_size: float = 0.25, random_state: int = 42):
        """Read the input CSV, split it, and write ``train.csv`` / ``test.csv``.

        Args:
            test_size: Fraction of rows placed in the test set. The default
                of 0.25 equals scikit-learn's own default, so the split
                sizes are the same as when no value was passed.
            random_state: Seed forwarded to ``train_test_split`` so that
                repeated runs produce the same partition. Pass ``None`` to
                get a different random split on every call.

        Both output files are written into ``self.config.root_dir`` without
        the index column. The shapes of the two sets are logged and printed.
        """
        data = pd.read_csv(self.config.data_path)

        # Seeded split: an unseeded call yields different train/test rows on
        # every run, which makes downstream results impossible to reproduce.
        train, test = train_test_split(
            data, test_size=test_size, random_state=random_state
        )

        train.to_csv(os.path.join(self.config.root_dir, "train.csv"), index = False)
        test.to_csv(os.path.join(self.config.root_dir, "test.csv"), index = False)

        logger.info("Splitted data into training and test sets")
        logger.info(train.shape)
        logger.info(test.shape)

        print(train.shape)
        print(test.shape)
        

In [11]:
# Run the stage end to end: load configuration, build the stage config,
# then perform the train/test split.
try:
    config = ConfigurationManager()
    data_transformation_confg = config.get_data_transformation_config()
    data_transformation = DataTransformation(config= data_transformation_confg)
    data_transformation.train_test_splitting()
except Exception as e:
    # Record the failure (with traceback) through the project logger before
    # propagating it; a bare `raise` keeps the original traceback intact.
    logger.exception(e)
    raise

[2025-01-26 13:14:28,776: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-01-26 13:14:28,795: INFO: common: yaml file: params.yaml loaded successfully]
[2025-01-26 13:14:28,819: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-01-26 13:14:28,822: INFO: common: created direcory at: artifacts]
[2025-01-26 13:14:28,826: INFO: common: created direcory at: artifacts/data_transformation]
[2025-01-26 13:14:28,934: INFO: 1473436160: Splitted data into training and test sets]
[2025-01-26 13:14:28,936: INFO: 1473436160: (1199, 12)]
[2025-01-26 13:14:28,940: INFO: 1473436160: (400, 12)]
(1199, 12)
(400, 12)
