In [2]:
import os
os.chdir("../")

In [19]:
from src.carPricePrediction import logger

In [20]:
# from dataclasses import dataclass
# from pathlib import Path

# @dataclass(frozen=True)
# class DataTransformationConfig:
#     root_dir: Path
#     data_path: Path
#     preprocessor_name: str
#     target_column: str

In [21]:
from src.carPricePrediction.constants import *
from src.carPricePrediction.utils.common import read_yaml, create_directories
# from src.carPricePrediction.entity.config_entity import (DataIngestionConfig, 
#                                                 DataValidationConfig,
#                                                 DataTransformationConfig)

class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root]) 
   
   
    def get_data_transformation_config(self) -> DataTransformationConfig:
        config = self.config.data_transformation
        schema = self.schema.TARGET_COLUMN

        create_directories([config.root_dir])

        data_transformation_config = DataTransformationConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            preprocessor_name=config.preprocessor_name,
            target_column=schema.name
        )

        return data_transformation_config

In [22]:
# Custom transformer for Target Encoding
from sklearn.base import BaseEstimator, TransformerMixin

class TargetEncodingTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, cols=None):
        self.cols = cols
        self.encoder = TargetEncoder(cols=self.cols)
    
    def fit(self, X, y=None):
        self.encoder.fit(X[self.cols], y)
        return self
    
    def transform(self, X, y=None):
        X[self.cols] = self.encoder.transform(X[self.cols])
        return X

In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split
from category_encoders import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import KNNImputer
import joblib




class DataTransformation:
    def __init__(self, config: DataTransformationConfig):
        """
        Initialize the DataTransformation class with a given configuration.
        """
        self.config = config


    def _drop_irrelevant_columns(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Drop irrelevant columns from the data.

        Parameters:
        data (pd.DataFrame): The input data.

        Returns:
        pd.DataFrame: The data without the dropped columns.
        """

        data=data.drop(columns=['car_name','registration_year'])
        print(data.columns)

        return data

    def _handle_missing_values(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Drop rows with missing target values and handle other missing values as needed.

        Parameters:
        data (pd.DataFrame): The raw input data.

        Returns:
        pd.DataFrame: The data with missing target values removed.
        """

        print("self.config.target_column : ", self.config.target_column)
        data = data.dropna(subset=[self.config.target_column])

        # Additional handling for missing values in features can be added here if needed
        return data

    def _separate_features_and_target(self, data: pd.DataFrame) -> tuple:
        """
        Separate features and target variable from the data.

        Parameters:
        data (pd.DataFrame): The input data.

        Returns:
        tuple: Features (X) and target (y).
        """
        X = data.drop(columns=self.config.target_column)
        y = data[self.config.target_column]
        return X, y

    def _select_columns_by_type(self, X: pd.DataFrame) -> tuple:
        """
        Select numerical and categorical columns from the feature data.

        Parameters:
        X (pd.DataFrame): The feature data.

        Returns:
        tuple: Lists of numerical and categorical column names.
        """
        num_cols = ['manufacturing_year', 'seats', 'kms_driven',
       'mileage(kmpl)', 'engine(cc)', 'torque(Nm)']

        cat_cols = ['insurance_validity', 'fuel_type', 'ownsership', 'transmission']

        return num_cols, cat_cols

    def _create_transformer(self, num_cols: list, cat_cols: list) -> ColumnTransformer:
        """
        Create a column transformer for preprocessing.

        Parameters:
        num_cols (list): List of numerical column names.
        cat_cols (list): List of categorical column names.

        Returns:
        ColumnTransformer: The column transformer.
        """
        num_pipeline = Pipeline(steps=[
            ('imputer', KNNImputer(n_neighbors=3)),  # Using KNNImputer
            ('scaler', StandardScaler())
        ])


        cat_pipeline = ColumnTransformer(
        transformers=[
            # Label Encoding for insurance_validity
            ('insurance_validity', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ['insurance_validity']),

            # One-Hot Encoding for fuel_type and transmission
            ('fuel_type', OneHotEncoder(handle_unknown='ignore'), ['fuel_type']),
            ('transmission', OneHotEncoder(handle_unknown='ignore'), ['transmission']),

            # Ordinal Encoding for ownership
            ('ownsership', OrdinalEncoder(categories=[['First Owner', 'Second Owner', 'Third Owner', 'Fourth Owner','Fifth Owner']]), ['ownsership'])
        ],
        
        remainder='passthrough'  # Pass the numeric features through without transformation
)

        transformer = ColumnTransformer(transformers=[
            ('num_pipeline', num_pipeline, num_cols),
            ('cat_pipeline', cat_pipeline, cat_cols),
        ], remainder='drop', n_jobs=-1)

        return transformer

    def _save_transformer(self, transformer: ColumnTransformer) -> None:
        """
        Save the fitted transformer to a file.

        Parameters:
        transformer (ColumnTransformer): The fitted column transformer.
        """
        joblib.dump(transformer, os.path.join(self.config.root_dir, self.config.preprocessor_name))


    def _apply_target_encoding(self, X: pd.DataFrame, y: pd.Series) -> pd.DataFrame:

        """
        Apply target encoding to specified categorical columns.

        Parameters:
        X (pd.DataFrame): The feature data.
        y (pd.Series): The target variable.

        Returns:
        pd.DataFrame: The feature data with target-encoded columns.
        """
        
        target_encoder = TargetEncodingTransformer(cols=['short_carname'])
        target_encoder.fit(X, y)
        return target_encoder

    def preprocess_data(self, X : pd.DataFrame,y : pd.DataFrame, fit: bool = True) -> pd.DataFrame:
        """
        Perform the complete preprocessing pipeline on the data.

        This includes:
        - Dropping irrelevant columns
        - Handling missing values (dropping rows with missing target)
        - Separating features and target
        - Applying target encoding
        - Selecting numerical and categorical columns
        - Creating and saving the transformer
        - Transforming the feature data

        Parameters:
        data (pd.DataFrame): The raw input data.
        fit (bool): Whether to fit the transformers or just transform using pre-fitted transformers.

        Returns:
        tuple: Transformed feature data (X_transformed) and target variable (y).
        """



        target_encoder=self._load_target_encoder()
        X=target_encoder.transform(X)
        transformer = joblib.load(os.path.join(self.config.root_dir, self.config.preprocessor_name))
        X_transformed = transformer.transform(X)


        return X_transformed, y

    def _save_target_encoder(self, encoder: TargetEncodingTransformer) -> None:
        joblib.dump(encoder, os.path.join(self.config.root_dir, 'target_encoder.pkl'))

    def _load_target_encoder(self) -> TargetEncodingTransformer:
        return joblib.load(os.path.join(self.config.root_dir, 'target_encoder.pkl'))

    def train_test_splitting(self) -> None:
        """
        Load data, preprocess it, and split into training and test sets.
        """
        try:
            data = pd.read_csv(self.config.data_path)


            data = self._drop_irrelevant_columns(data)
            data = self._handle_missing_values(data)

            
            X, y = self._separate_features_and_target(data)

            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

            num_cols, cat_cols = self._select_columns_by_type(X)
            transformer = self._create_transformer(num_cols, cat_cols)
            transformer.fit(X_train,y_train)
            self._save_transformer(transformer)

            target_encoder = self._apply_target_encoding(X_train,y_train)
            self._save_target_encoder(target_encoder)
            

            X_train,y_train = self.preprocess_data(X_train,y_train)
            X_test,y_test = self.preprocess_data(X_test,y_test)



            X_train = pd.DataFrame(X_train).reset_index()
            X_test = pd.DataFrame(X_test).reset_index()

            y_train = pd.DataFrame(y_train).reset_index()
            y_test = pd.DataFrame(y_test).reset_index()

            print("checking null values ",X_train.isnull().sum())

            print("checking length of ",len(X_train),len(y_train))

            train_processed = pd.concat([X_train, y_train], axis=1)
            test_processed = pd.concat([X_test, y_test], axis=1)

            print("checking null values ",train_processed.isnull().sum())

            train_processed.to_csv(os.path.join(self.config.root_dir, "train.csv"), index=False)
            test_processed.to_csv(os.path.join(self.config.root_dir, "test.csv"), index=False)

            logger.info("Data split into training and test sets")
            logger.info(f"Shape of preprocessed training data: {train_processed.shape}")
            logger.info(f"Shape of preprocessed test data: {test_processed.shape}")

        except Exception as e:
            logger.error("An error occurred during train-test splitting", exc_info=True)
            raise


    

In [44]:
try:
    config = ConfigurationManager()
    print("1")
    data_transformation_config = config.get_data_transformation_config()
    print("2")
    data_transformation = DataTransformation(config=data_transformation_config)
    print("3")
    data_transformation.train_test_splitting()
    print("4")
except Exception as e:
    print("5")
    raise e

[2024-07-07 19:41:22,496: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-07-07 19:41:22,502: INFO: common: yaml file: params.yaml loaded successfully]
[2024-07-07 19:41:22,516: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-07-07 19:41:22,521: INFO: common: created directory at: artifacts]
1
[2024-07-07 19:41:22,527: INFO: common: created directory at: artifacts/data_transformation]
2
3
Index(['insurance_validity', 'fuel_type', 'ownsership', 'transmission',
       'manufacturing_year', 'seats', 'kms_driven', 'mileage(kmpl)',
       'engine(cc)', 'torque(Nm)', 'price(in lakhs)', 'short_carname'],
      dtype='object')
self.config.target_column :  price(in lakhs)
checking null values  index    0
0        0
1        0
2        0
3        0
4        0
5        0
6        0
7        0
8        0
9        0
10       0
11       0
12       0
dtype: int64
checking length of  813 813
checking null values  index              0
0                  0
1       