In [1]:
import  os

In [2]:
%pwd

'c:\\Users\\Priyanshu\\Desktop\\ML Lab Assignment\\research'

In [3]:
# Step from the notebook folder up to its parent (presumably the project root,
# so that `src.ML_LAB` imports and relative config paths resolve - verify).
# The guard makes the cell safe to re-run: an unconditional relative chdir
# climbs one more level every time the cell is executed. "research" is the
# notebook folder name shown by the `%pwd` output above.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation stage.

    NOTE(review): a later cell declares another class with this same name;
    once that cell has run, it is the later declaration that the name refers to.
    """
    # Directory the stage writes its outputs into (train.csv / test.csv).
    root_dir: Path
    # Location of the CSV file that the stage reads as its input.
    data_path: Path

In [5]:
from src.ML_LAB.constants import *
from src.ML_LAB.utils.common import read_yaml, create_directories

class ConfigurationManager:
    """Load the project's YAML settings and build per-stage config objects."""

    def __init__(
        self,
        config_filepath=Config_yaml_path,
        params_filepath=params_yaml_path,
        schema_filepath=schema_yaml_path,
    ):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        # Files are read in this fixed order: config, params, schema.
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the data-transformation settings from the loaded configuration.

        The stage's ``root_dir`` is created as a side effect.

        Returns:
            DataTransformationConfig: Populated from the ``data_transformation``
            section of the configuration.
        """
        section = self.config.data_transformation
        create_directories([section.root_dir])

        return DataTransformationConfig(
            root_dir=section.root_dir,
            data_path=section.data_path,
        )

In [6]:
import os
import pandas as pd
import numpy as np
from src.ML_LAB import logger 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

# NOTE(review): this re-declares DataTransformationConfig and therefore shadows
# the frozen dataclass declared in an earlier cell. It is written as the same
# frozen dataclass so that running this cell does not silently swap the
# immutable, typed config for a mutable, untyped one. The redeclaration is
# redundant and can be deleted once the notebook's cells are consolidated.
@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation stage."""
    # Directory the stage writes its outputs into (train.csv / test.csv).
    root_dir: Path
    # Location of the CSV file that the stage reads as its input.
    data_path: Path

class DataTransformation:
    """Build model-ready train/test CSV files from the raw bookings data.

    Column-dropping decisions, the one-hot encoder and the scaler are all
    derived from the training split only and then applied to the test split.
    """

    def __init__(self, config: "DataTransformationConfig"):
        """Keep the stage configuration.

        Args:
            config: Object exposing ``data_path`` (CSV to read) and
                ``root_dir`` (directory the outputs are written to).
        """
        # The annotation is a string so that defining this class does not
        # require the config class to be resolvable at definition time.
        self.config = config

    def drop_high_frequency_columns(self, df, threshold=0.95):
        """Find object-dtype columns that are dominated by a single value.

        Despite the name, nothing is removed from ``df``; the caller is
        expected to drop the returned columns.

        Args:
            df: DataFrame to inspect. Only columns whose dtype is ``object``
                are checked.
            threshold: A column is reported when its most common value makes
                up at least this share (``>=``) of the column's counted values.

        Returns:
            list: Names of the columns to drop, in ``df`` column order.
        """
        cols_to_drop = []
        for col in df.columns:
            if df[col].dtype == 'object':  # Only check categorical
                freq = df[col].value_counts(normalize=True).max()
                if freq >= threshold:
                    cols_to_drop.append(col)
                    logger.info(f"Dropping column '{col}' due to high frequency ({freq:.2f})")

        return cols_to_drop

    def drop_high_vif_features(self, X_train_num, vif_threshold=20):
        """Iteratively pick features to drop until no VIF exceeds the threshold.

        On each pass the VIF of every remaining feature is recomputed and the
        feature with the largest VIF is removed, as long as that VIF is above
        ``vif_threshold``. ``X_train_num`` itself is not modified.

        Args:
            X_train_num: DataFrame of numeric training features.
            vif_threshold: Largest VIF a feature may have and still be kept.

        Returns:
            list: Names of the features to drop, in the order they were removed.
        """
        cols_to_drop = []
        X_temp = X_train_num.copy()

        # Constant column so that each auxiliary regression has an intercept;
        # its own VIF is ignored below.
        X_temp['intercept'] = 1

        while True:
            vif_data = pd.DataFrame()
            vif_data["feature"] = X_temp.columns
            vif_data["VIF"] = [variance_inflation_factor(X_temp.values, i)
                               for i in range(X_temp.shape[1])]

            vif_data = vif_data[vif_data['feature'] != 'intercept']

            max_vif = vif_data['VIF'].max()
            # A NaN maximum (e.g. no features left) compares False and ends the loop.
            if not max_vif > vif_threshold:
                break

            feature_to_drop = vif_data.sort_values('VIF', ascending=False)['feature'].iloc[0]
            cols_to_drop.append(feature_to_drop)
            X_temp = X_temp.drop(columns=[feature_to_drop])
            logger.info(f"Dropping column '{feature_to_drop}' due to VIF: {max_vif:.2f}")

        return cols_to_drop

    @staticmethod
    def _encode_target(y, encoding_map):
        """Map target labels to integers, rejecting labels missing from the map.

        ``Series.map`` alone turns unmapped labels into NaN without any error,
        which would end up as missing targets in the saved files.

        Raises:
            ValueError: If ``y`` contains a value that is not a key of
                ``encoding_map`` (missing values included).
        """
        unexpected = y[~y.isin(list(encoding_map))]
        if not unexpected.empty:
            labels = sorted(unexpected.astype(str).unique())
            raise ValueError(f"Unexpected target labels: {labels}")
        return y.map(encoding_map)

    def initiate_data_transformation(self):
        """Run the full transformation and write ``train.csv`` / ``test.csv``.

        Steps: read ``config.data_path``; drop ``Booking_ID`` if present; make
        a stratified 80/20 split on ``booking_status``; encode the target
        (``Canceled`` -> 1, ``Not_Canceled`` -> 0); drop near-constant
        categorical columns and high-VIF numeric columns as decided on the
        training split; one-hot encode the remaining categoricals and
        standardise the remaining numerics (both fitted on the training split);
        write the results into ``config.root_dir``.

        Raises:
            ValueError: If the target holds a label other than the two above.
            Exception: Any other failure is logged and re-raised unchanged.
        """
        try:
            logger.info("Loading data...")
            df = pd.read_csv(self.config.data_path)

            if 'Booking_ID' in df.columns:
                df = df.drop(columns='Booking_ID')

            logger.info("Splitting data into Train and Test...")
            target_col = "booking_status"

            X = df.drop(columns=target_col)
            y = df[target_col]

            # Stratify on the original labels; the target is encoded only
            # after the split.
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, stratify=y, random_state=42
            )
            encoding_map = {'Canceled': 1, 'Not_Canceled': 0}
            y_train = self._encode_target(y_train, encoding_map)
            y_test = self._encode_target(y_test, encoding_map)

            cat_cols_train = X_train.select_dtypes(include=['object']).columns
            freq_drop_cols = self.drop_high_frequency_columns(X_train[cat_cols_train])

            # Reassign instead of dropping in place on the split outputs.
            X_train = X_train.drop(columns=freq_drop_cols)
            X_test = X_test.drop(columns=freq_drop_cols)  # Apply same drop to test

            num_cols_train = X_train.select_dtypes(include=['number']).columns
            vif_drop_cols = self.drop_high_vif_features(X_train[num_cols_train])

            X_train = X_train.drop(columns=vif_drop_cols)
            X_test = X_test.drop(columns=vif_drop_cols)  # Apply same drop to test

            num_cols = X_train.select_dtypes(include=['number']).columns.tolist()
            cat_cols = X_train.select_dtypes(include=['object']).columns.tolist()

            logger.info(f"Categorical cols: {cat_cols}")
            logger.info(f"Numerical cols: {num_cols}")

            # Categories unseen during fit are encoded as all zeros in the test split.
            ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
            ohe.fit(X_train[cat_cols])
            ohe_cols = ohe.get_feature_names_out(cat_cols)

            X_train_cat = pd.DataFrame(ohe.transform(X_train[cat_cols]), columns=ohe_cols, index=X_train.index)
            X_test_cat = pd.DataFrame(ohe.transform(X_test[cat_cols]), columns=ohe_cols, index=X_test.index)

            scaler = StandardScaler()
            scaler.fit(X_train[num_cols])

            X_train_num = pd.DataFrame(scaler.transform(X_train[num_cols]), columns=num_cols, index=X_train.index)
            X_test_num = pd.DataFrame(scaler.transform(X_test[num_cols]), columns=num_cols, index=X_test.index)

            # All pieces keep the split's original index, so concat aligns rows.
            train_final = pd.concat([X_train_num, X_train_cat, y_train], axis=1)
            test_final = pd.concat([X_test_num, X_test_cat, y_test], axis=1)

            # Make sure the output directory exists even when this class is
            # used without the configuration manager having created it.
            os.makedirs(self.config.root_dir, exist_ok=True)
            train_file_path = os.path.join(self.config.root_dir, "train.csv")
            test_file_path = os.path.join(self.config.root_dir, "test.csv")

            train_final.to_csv(train_file_path, index=False)
            test_final.to_csv(test_file_path, index=False)

            logger.info(f"Transformation Complete. Train Shape: {train_final.shape}, Test Shape: {test_final.shape}")
            print(f"Train Saved at: {train_file_path}")
            print(f"Test Saved at: {test_file_path}")

        except Exception as e:
            logger.error(f"Error in Data Transformation: {str(e)}")
            # Bare raise re-raises the active exception with its traceback intact.
            raise

In [7]:
# Run the data-transformation stage end to end. No try/except wrapper is
# needed here: catching an exception only to `raise e` again changes nothing
# except adding an extra frame to the traceback, so errors simply propagate.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)
data_transformation.initiate_data_transformation()

[2025-11-29 12:49:14,262: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-11-29 12:49:14,265: INFO: common: yaml file: params.yaml loaded successfully]
[2025-11-29 12:49:14,269: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-11-29 12:49:14,269: INFO: common: created directory at: artifacts]
[2025-11-29 12:49:14,269: INFO: common: created directory at: artifacts/data_transformation]
[2025-11-29 12:49:14,277: INFO: 1632261556: Loading data...]
[2025-11-29 12:49:14,406: INFO: 1632261556: Splitting data into Train and Test...]
[2025-11-29 12:49:14,898: INFO: 1632261556: Categorical cols: ['type_of_meal_plan', 'room_type_reserved', 'market_segment_type']]
[2025-11-29 12:49:14,898: INFO: 1632261556: Numerical cols: ['no_of_adults', 'no_of_children', 'no_of_weekend_nights', 'no_of_week_nights', 'required_car_parking_space', 'lead_time', 'arrival_year', 'arrival_month', 'arrival_date', 'repeated_guest', 'no_of_previous_cancellations', 'no_of_previous_boo