In [1]:
import os

In [2]:
%pwd

'f:\\Self Learning\\car_price_prediction\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'f:\\Self Learning\\car_price_prediction'

In [5]:
from dataclasses import dataclass
from pathlib import Path
from typing import List

@dataclass(frozen=True)
class DataIngestionConfig:
    root_dir: Path
    source_URL: str
    local_data_file: Path
    unzip_dir: Path
    
@dataclass
class PrepareBaseModelConfig:
    base_model_path: Path
    updated_base_model_path: Path
    feature_columns: List[str]
    target_column: str
    test_data_path: Path

@dataclass
class PreprocessingConfig:
    target_column: str
    test_size: float
    random_state: int

In [6]:
from car_price_prediction.constants import *
from car_price_prediction.utils.common import read_yaml, create_directories

In [7]:
import os
import urllib.request as request
from zipfile import ZipFile

In [16]:
import os
import joblib
from sklearn.linear_model import LinearRegression
from pathlib import Path
from typing import Optional
import pandas as pd

from car_price_prediction.entity.config_entity import PrepareBaseModelConfig

class PrepareBaseModel:
    def __init__(self, config: PrepareBaseModelConfig):
        self.config = config
        self.model = None
        self.full_model = None

    def get_base_model(self):
        """Initialize base Linear Regression model"""
        self.model = LinearRegression()
        self.save_model(self.config.base_model_path, self.model)

    @staticmethod
    def _prepare_full_model(model: LinearRegression, X: pd.DataFrame, y: pd.Series):
        """Train the linear regression model on the dataset"""
        model.fit(X, y)
        return model

    def update_base_model(self):
        df = pd.read_csv(self.config.test_data_path)

        # ✅ Clean column names
        df.columns = df.columns.str.replace(",", "", regex=False).str.strip()

        # ✅ Replace bad string values with NaN
        df.replace("-", pd.NA, inplace=True)

        # ✅ Drop rows missing target column
        df.dropna(subset=[self.config.target_column], inplace=True)

        # ✅ Convert all feature columns to numeric
        for col in self.config.feature_columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

        # ✅ Drop rows where all features are NaN
        df.dropna(subset=self.config.feature_columns, how='any', inplace=True)

        # 🚨 Guard: ensure dataset isn't empty
        if df.empty:
            raise ValueError("❌ No valid data left after cleaning. Please check your dataset and params.yaml.")

        # ✅ Debug logs
        print("✅ Final columns:", df.columns.tolist())
        print(f"✅ Using {len(df)} samples for training.")
        print("✅ Features Preview:")
        print(df[self.config.feature_columns].head())

        # ✅ Prepare features and target
        X = df[self.config.feature_columns]
        y = df[self.config.target_column]

        # ✅ Train model
        self.full_model = self._prepare_full_model(self.model, X, y)

        # ✅ Save the updated model
        self.save_model(self.config.updated_base_model_path, self.full_model)

    @staticmethod
    def save_model(path: Path, model: LinearRegression):
        """Save model using joblib"""
        os.makedirs(path.parent, exist_ok=True)
        joblib.dump(model, path)


In [17]:
from car_price_prediction.entity.config_entity import (
    DataIngestionConfig,
    PreprocessingConfig,
    PrepareBaseModelConfig
)
from car_price_prediction.utils.common import read_yaml
from car_price_prediction.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH

class ConfigurationManager:
    def __init__(self, config_path=CONFIG_FILE_PATH, params_path=PARAMS_FILE_PATH):
        self.config = read_yaml(config_path)
        self.params = read_yaml(params_path)

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        return DataIngestionConfig(
            local_data_file=Path(self.config["data_ingestion"]["local_data_file"])
        )

    def get_preprocessing_config(self) -> PreprocessingConfig:
        return PreprocessingConfig(
            target_column=self.params["preprocessing"]["target_column"]
        )

    def get_prepare_base_model_config(self) -> PrepareBaseModelConfig:
        model_config = self.config["prepare_base_model"]
        model_params = self.params["model"]
        return PrepareBaseModelConfig(
            base_model_path=Path(model_config["base_model_path"]),
            updated_base_model_path=Path(model_config["updated_base_model_path"]),
            feature_columns=model_params["feature_columns"],
            target_column=model_params["target_column"],
            test_data_path=Path(model_config["test_data_path"])
        )


In [18]:

config = ConfigurationManager()
prepare_base_model_config = config.get_prepare_base_model_config()

prepare_model = PrepareBaseModel(config=prepare_base_model_config)
prepare_model.get_base_model()
prepare_model.update_base_model()


[2025-07-15 00:10:19,491: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-07-15 00:10:19,498: INFO: common: yaml file: params.yaml loaded successfully]


ValueError: ❌ No valid data left after cleaning. Please check your dataset and params.yaml.