In [1]:
import os

In [2]:
# Show the kernel's current working directory.
%pwd

'd:\\Cdac_ML\\Assignments\\Assignment_4\\research'

In [3]:
# Move from research/ up to the project root so that `src` is importable and
# relative paths resolve. Guarded on the directory name so that re-running this
# cell (or starting the kernel at the project root) does not climb any further.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation stage."""

    # Directory that receives the transformed train.csv / test.csv files.
    root_dir: Path
    # Location of the input CSV that is read, split and transformed.
    data_path: Path

In [5]:
from src.Assignment_4.constants import *
from src.Assignment_4.utils.common import read_yaml, create_directories

class ConfigurationManager:
    """Load the project's YAML files and build per-stage configuration objects."""

    def __init__(
        self,
        config_filepath = Config_yaml_path,
        params_filepath = params_yaml_path,
        schema_filepath = schema_yaml_path):
        """Read the config, params and schema YAML files and set up the artifacts root.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
            schema_filepath: Location of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # The artifacts root is handed to the project's create_directories helper
        # up front, before any stage-specific directory is requested.
        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the configuration for the data-transformation stage.

        Passes the stage's ``root_dir`` to ``create_directories`` and returns
        the stage settings as a frozen ``DataTransformationConfig``.

        Returns:
            DataTransformationConfig whose ``root_dir`` and ``data_path`` are
            ``pathlib.Path`` objects, as the dataclass declares.
        """
        stage_cfg = self.config.data_transformation

        create_directories([stage_cfg.root_dir])

        # The YAML values arrive as loaded by read_yaml (presumably plain
        # strings); convert them so the instance matches its declared Path types.
        return DataTransformationConfig(
            root_dir=Path(stage_cfg.root_dir),
            data_path=Path(stage_cfg.data_path),
        )

In [None]:
from src.Assignment_4 import logger
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import os

class DataTransformation:
    """Split the dataset, one-hot encode and scale its features, and save train/test CSVs.

    The encoder and the scaler are fitted on the training split only and then
    applied to both splits.

    NOTE(review): the fitted encoder and scaler are not persisted anywhere, so
    the same transformation cannot be re-applied to new data later. Confirm
    whether a downstream stage needs them.
    """

    # Defaults applied by train_test_spliting() when the caller passes nothing.
    DEFAULT_TARGET_COL = "satisfaction"
    DEFAULT_CAT_COLS = ('Gender', 'Customer Type', 'Type of Travel', 'Class')

    def __init__(self, config: DataTransformationConfig):
        """Keep the stage configuration (``data_path`` to read, ``root_dir`` to write)."""
        self.config = config

    def train_test_spliting(self, target_col=None, cat_cols=None, test_size=0.2, random_state=42):
        """Read the input CSV, split it, transform the features and write both splits.

        Writes ``train.csv`` and ``test.csv`` (without the index) into
        ``config.root_dir``. Each file holds the non-categorical feature
        columns, then the one-hot columns, then the target column.

        Args:
            target_col: Name of the label column. Defaults to ``"satisfaction"``.
            cat_cols: Columns to one-hot encode. Defaults to
                ``DEFAULT_CAT_COLS``. An empty list skips encoding.
            test_size: Fraction (or count) of rows for the test split, passed
                to ``train_test_split``.
            random_state: Seed passed to ``train_test_split``.

        Raises:
            KeyError: If the target column or any categorical column is absent
                from the input file.
        """
        if target_col is None:
            target_col = self.DEFAULT_TARGET_COL
        cat_cols = list(self.DEFAULT_CAT_COLS if cat_cols is None else cat_cols)

        data = pd.read_csv(self.config.data_path)

        # Fail early with a single message that names every absent column.
        missing = [col for col in [target_col, *cat_cols] if col not in data.columns]
        if missing:
            raise KeyError(f"Columns not found in {self.config.data_path}: {missing}")

        train_df, test_df = train_test_split(data, test_size=test_size, random_state=random_state)

        X_train = train_df.drop(columns=[target_col])
        y_train = train_df[target_col]

        X_test = test_df.drop(columns=[target_col])
        y_test = test_df[target_col]

        # Choose the columns to scale BEFORE encoding. The one-hot columns are
        # float64, so selecting by dtype afterwards would also standardise the
        # 0/1 indicators instead of only the genuine numeric features.
        num_cols = (
            X_train.drop(columns=cat_cols)
            .select_dtypes(include=["int64", "float64"])
            .columns
        )

        # Both encoded frames come from the same source columns and the same
        # fitted encoder, so their columns already match; no alignment needed.
        X_train_enc, X_test_enc = self._one_hot_encode(X_train, X_test, cat_cols)
        X_train_enc, X_test_enc = self._standardize(X_train_enc, X_test_enc, num_cols)

        # Re-attach the target; the row index was preserved through every step.
        train_final = pd.concat([X_train_enc, y_train], axis=1)
        test_final = pd.concat([X_test_enc, y_test], axis=1)

        train_path = os.path.join(self.config.root_dir, "train.csv")
        test_path = os.path.join(self.config.root_dir, "test.csv")

        train_final.to_csv(train_path, index=False)
        test_final.to_csv(test_path, index=False)

        logger.info("Train & Test transformed and saved successfully.")
        logger.info(f"Train shape: {train_final.shape}")
        logger.info(f"Test  shape: {test_final.shape}")

        print("Final train:", train_final.shape)
        print("Final test:", test_final.shape)

    @staticmethod
    def _one_hot_encode(X_train, X_test, cat_cols):
        """Replace ``cat_cols`` with one-hot columns using an encoder fitted on ``X_train``."""
        if not cat_cols:
            # Nothing to encode; return copies so callers may modify them freely.
            return X_train.copy(), X_test.copy()

        ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        ohe.fit(X_train[cat_cols])
        ohe_names = ohe.get_feature_names_out(cat_cols)

        def encode(frame):
            # Keep the frame's index so the target can be concatenated back later.
            dummies = pd.DataFrame(
                ohe.transform(frame[cat_cols]),
                columns=ohe_names,
                index=frame.index,
            )
            return pd.concat([frame.drop(columns=cat_cols), dummies], axis=1)

        return encode(X_train), encode(X_test)

    @staticmethod
    def _standardize(X_train, X_test, num_cols):
        """Standardise ``num_cols`` in both frames with a scaler fitted on ``X_train``.

        The frames are modified in place and returned.
        """
        if len(num_cols) == 0:
            # StandardScaler cannot be fitted on zero features.
            return X_train, X_test

        scaler = StandardScaler()
        scaler.fit(X_train[num_cols])

        X_train[num_cols] = scaler.transform(X_train[num_cols])
        X_test[num_cols] = scaler.transform(X_test[num_cols])
        return X_train, X_test


In [9]:
# Run the data-transformation stage end to end. Exceptions are left to
# propagate so the notebook shows the original traceback.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)
data_transformation.train_test_spliting()

[2025-11-09 10:51:10,284: INFO: common: yaml file: D:\Cdac_ML\Assignments\Assignment_4\config\config.yaml loaded successfully]
[2025-11-09 10:51:10,287: INFO: common: yaml file: D:\Cdac_ML\Assignments\Assignment_4\params.yaml loaded successfully]
[2025-11-09 10:51:10,288: INFO: common: yaml file: D:\Cdac_ML\Assignments\Assignment_4\schema.yaml loaded successfully]
[2025-11-09 10:51:10,290: INFO: common: created directory at: artifacts]
[2025-11-09 10:51:10,290: INFO: common: created directory at: artifacts/data_transformation]
[2025-11-09 10:51:10,965: INFO: 1944876077: Splited data into training and test sets]
[2025-11-09 10:51:10,965: INFO: 1944876077: (97410, 24)]
[2025-11-09 10:51:10,965: INFO: 1944876077: (32470, 24)]
(97410, 24)
(32470, 24)
