In [1]:
from dataclasses import dataclass
from pathlib import Path
import os

In [2]:
%pwd

'/Users/saiprakashlikky/Desktop/Projects/ML_projects/Flight_Fare_estimator_Project/research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'/Users/saiprakashlikky/Desktop/Projects/ML_projects/Flight_Fare_estimator_Project'

In [52]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataModellingConfig:
    root_dir: Path
    x_datapath: Path
    y_datapath: Path
    max_depth: float
    max_features: str
    min_samples_leaf: float
    min_samples_split: float
    n_estimators: float


In [53]:
from src.Flight_Fare_estimator_Project.constants import *
from src.Flight_Fare_estimator_Project.utils.common import read_yaml, create_directories
from src.Flight_Fare_estimator_Project.pipeline.stage_3_data_transformation import DataTransformation_stage

In [54]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_modelling_config(self) -> DataModellingConfig:
        config = self.config.data_modelling
        params= self.params.xgboost_params

        create_directories([config.root_dir])

        datamodelling_config = DataModellingConfig(
            root_dir=config.root_dir,
            x_datapath=config.x_datapath,
            y_datapath=config.y_datapath,
            max_depth=params.max_depth,
            max_features=params.max_features,
            min_samples_leaf=params.min_samples_leaf,
            min_samples_split=params.min_samples_split,
            n_estimators=params.n_estimators
        )

        return datamodelling_config


In [55]:
import warnings
import os
import pickle
import urllib.request as request
import zipfile
from src.Flight_Fare_estimator_Project import logger
from src.Flight_Fare_estimator_Project.utils.common import get_size
import pandas as pd


In [56]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler

In [57]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
import pickle
import os
from src.Flight_Fare_estimator_Project import logger
from sklearn.model_selection import train_test_split


In [58]:
class DataModelling:
    def __init__(self,config):
        self.config=config
    

    def get_data_transformed_object(self,file_path):
        """
        Method: To handle categorical vaiables and standardization
        Description: This method is used to standardize the data and handle categorical variables.
        Parameters: Outfile file path name
        Return: preprocessor_obj, Scaled Independent Features and output feature
        Version: 1.0
        """
        try:
            x=pd.read_csv(self.config.x_datapath)
            y=pd.read_csv(self.config.y_datapath)

            numerical_columns = ['Duration_minutes']
            categorical_columns = ['Airline', 'Source', 'Destination', 'Total_Stops']

            num_pipeline = Pipeline(
                steps=[
                    ("scaler", StandardScaler())
                ]
            )
            cat_pipeline = Pipeline(
                steps=[
                    ("one_hot_encoder", OneHotEncoder()),
                    ("scaler", StandardScaler(with_mean=False))
                ]
            )
            logger.info(f"Categorical columns: {categorical_columns}")
            logger.info(f"Numerical columns: {numerical_columns}")

            preprocessor = ColumnTransformer(
                [
                    ("num_pipeline", num_pipeline, numerical_columns),
                    ("cat_pipelines", cat_pipeline, categorical_columns)
                ]
            )

            scaled_x = preprocessor.fit_transform(x)
            pickle_filepath = os.path.join(self.config.root_dir, file_path)
            with open(pickle_filepath, 'wb') as file:
                pickle.dump(scaled_x, file)

            return preprocessor,scaled_x,y
        except Exception as e:
            raise e


    def train_test_variables(self, x_scaled_variable,y):
        """
        Method: Extracting Train and Test Variables
        Description: This method is used to extract Train and test variables from Dataframe.
        Parameters: dependent and Independent variable
        Return: dependent and Independent variables after Train test Split
        Version: 1.0
        """
        try:

            logger.info(f"Shape of data is X: {x_scaled_variable.shape},Y:{y.shape}")


            logger.info("Train Test Split of The Data Started")
            x_train, x_test, y_train, y_test = train_test_split(x_scaled_variable, y, test_size=0.2, random_state=42)
            logger.info(f"Train Test Split Completed. Shapes of training and test data:\n{x_train.shape}\n{x_test.shape}\n{y_train.shape}\n{y_test.shape}")
            return x_train, x_test, y_train, y_test
        except Exception as e:
            raise e

    def model_trainer(self, x_train, y_train, x_test, y_test):
        """
        Method: Training the model from getting best models after research test
        Description: This method is used create best model.
        Parameters: train and test variables
        Return: model file
        Version: 1.0
        """

        try:
            logger.info("Training XGBoost Model Started")

            # Access XGBoost parameters from the configuration
            xgb_params = {
                'max_depth': self.config.max_depth,
                'max_features': self.config.max_features,
                'min_samples_leaf': self.config.min_samples_leaf,
                'min_samples_split': self.config.min_samples_split,
                'n_estimators': self.config.n_estimators
            }

            xgb_model = XGBRegressor(**xgb_params)
            xgb_model.fit(x_train, y_train)
            logger.info("Training XGBoost Model Completed")

            xgb_model_filepath = os.path.join(self.config.root_dir, 'xgb_model.pkl')
            with open(xgb_model_filepath, 'wb') as file:
                pickle.dump(xgb_model, file)

            logger.info(f"XGBoost model saved to {xgb_model_filepath}")
            return xgb_model
        except Exception as e:
            raise e


In [59]:
try:
    config = ConfigurationManager()
    data_modelling_config = config.get_modelling_config()
    data_modelling = DataModelling(config=data_modelling_config)
    preprocessor_obj, x_scaled, y = data_modelling.get_data_transformed_object("scaler.pkl")
    x_train, x_test, y_train, y_test = data_modelling.train_test_variables(x_scaled, y)
    xgb_model = data_modelling.model_trainer(x_train, y_train, x_test, y_test)
    
except Exception as e:
    raise e

[2023-11-28 00:57:31,363:INFO:common:yaml file: config/config.yaml loaded successfully]
[2023-11-28 00:57:31,366:INFO:common:yaml file: params.yaml loaded successfully]
[2023-11-28 00:57:31,367:INFO:common:yaml file: schema.yaml loaded successfully]
[2023-11-28 00:57:31,368:INFO:common:created directory at: artifacts]
[2023-11-28 00:57:31,368:INFO:common:created directory at: artifacts/data_modelling]
[2023-11-28 00:57:31,382:INFO:1684455004:Categorical columns: ['Airline', 'Source', 'Destination', 'Total_Stops']]
[2023-11-28 00:57:31,382:INFO:1684455004:Numerical columns: ['Duration_minutes']]
[2023-11-28 00:57:31,400:INFO:1684455004:Shape of data is X: (10682, 29),Y:(10682, 1)]
[2023-11-28 00:57:31,401:INFO:1684455004:Train Test Split of The Data Started]
[2023-11-28 00:57:31,404:INFO:1684455004:Train Test Split Completed. Shapes of training and test data:
(8545, 29)
(2137, 29)
(8545, 1)
(2137, 1)]
[2023-11-28 00:57:31,404:INFO:1684455004:Training XGBoost Model Started]
[2023-11-28 0

Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } are not used.

