In [4]:
from dataclasses import dataclass
from pathlib import Path
import os

In [5]:
%pwd

'/Users/saiprakashlikky/Desktop/Projects/ML_projects/Flight_Fare_estimator_Project/research'

In [6]:
os.chdir("../")

In [7]:
%pwd

'/Users/saiprakashlikky/Desktop/Projects/ML_projects/Flight_Fare_estimator_Project'

In [8]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataModellingConfig:
    root_dir: Path
    x_datapath: Path
    y_datapath: Path
    model_file_path: Path
    preprocessor_file_path: Path
    max_depth: float
    max_features: str
    min_samples_leaf: float
    min_samples_split: float
    n_estimators: float


In [9]:
from src.Flight_Fare_estimator_Project.constants import *
from src.Flight_Fare_estimator_Project.utils.common import read_yaml, create_directories,save_json
from src.Flight_Fare_estimator_Project.pipeline.stage_3_data_transformation import DataTransformation_stage

In [10]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_modelling_config(self) -> DataModellingConfig:
        config = self.config.data_modelling
        params= self.params.xgboost_params

        create_directories([config.root_dir])

        datamodelling_config = DataModellingConfig(
            root_dir=config.root_dir,
            x_datapath=config.x_datapath,
            y_datapath=config.y_datapath,
            model_file_path=config.model_file_path,
            preprocessor_file_path= config.preprocessor_file_path,
            max_depth=params.max_depth,
            max_features=params.max_features,
            min_samples_leaf=params.min_samples_leaf,
            min_samples_split=params.min_samples_split,
            n_estimators=params.n_estimators
        )

        return datamodelling_config


In [11]:
import warnings
import os
import pickle
import urllib.request as request
import zipfile
from src.Flight_Fare_estimator_Project import logger
from src.Flight_Fare_estimator_Project.utils.common import get_size
import pandas as pd
import numpy as np

In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler

In [13]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
import pickle
import os
from src.Flight_Fare_estimator_Project import logger
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder


In [17]:
class DataModelling:
    def __init__(self,config):
        self.config=config
    

    def get_data_transformed_object(self):
        try:
            x = pd.read_csv(self.config.x_datapath)
            y = pd.read_csv(self.config.y_datapath)

            logger.info(f"Columns in x: {x.columns}")
            logger.info(f"{x.dtypes}")

            numerical_columns = ['Duration_minutes', 'Month_of_Month_of_journey', 'day_of_date_of_journey']
            categorical_columns = ['Airline', 'Source', 'Destination', 'Total_Stops']

            data_OHE = pd.concat([x[['Month_of_Month_of_journey', 'day_of_date_of_journey', 'Duration_minutes']],
                      pd.get_dummies(x.Airline),   
                      pd.get_dummies(x.Source, prefix='source'),  
                      pd.get_dummies(x.Destination, prefix='destination'), 
                      pd.get_dummies(x.Total_Stops)], 
                    axis=1)
            data_OHE = data_OHE.astype(int)
            numerical_data = data_OHE[numerical_columns]
            scaler = StandardScaler()
            scaled_numerical_data = scaler.fit_transform(numerical_data)
            scaled_numerical_df = pd.DataFrame(scaled_numerical_data, columns=numerical_columns)
            scaled_numerical_df.reset_index(drop=True, inplace=True)
            data_OHE.reset_index(drop=True, inplace=True)


            data_OHE.drop(columns=numerical_columns, inplace=True)

            scaled_data_OHE = pd.concat([scaled_numerical_df, data_OHE], axis=1)

            scaler_filename = self.config.preprocessor_file_path
            with open(scaler_filename, 'wb') as scaler_file:
                pickle.dump(scaler, scaler_file)

            logger.info(f"{scaled_data_OHE.shape}")

            return scaled_data_OHE, y
        except Exception as e:
            raise e

    def train_test_variables(self, x_scaled_variable, y):
        try:
            logger.info(f"Shape of data is X: {x_scaled_variable.shape},Y:{y.shape}")

            logger.info("Train Test Split of The Data Started")
            x_train, x_test, y_train, y_test = train_test_split(x_scaled_variable, y, test_size=0.2, random_state=42)
            logger.info(f"Train Test Split Completed. Shapes of training and test data:\n{x_train.shape}\n{x_test.shape}\n{y_train.shape}\n{y_test.shape}")
            return x_train, x_test, y_train, y_test
        except Exception as e:
            raise e

    def model_trainer(self, x_train, y_train, x_test, y_test):
        """
        Method: Training the model from getting best models after research test
        Description: This method is used create best model.
        Parameters: train and test variables
        Return: model file
        Version: 1.0
        """

        try:
            logger.info("Training XGBoost Model Started")

            # Access XGBoost parameters from the configuration
            xgb_params = {
                'max_depth': self.config.max_depth,
                'max_features': self.config.max_features,
                'min_samples_leaf': self.config.min_samples_leaf,
                'min_samples_split': self.config.min_samples_split,
                'n_estimators': self.config.n_estimators
            }

            xgb_model = XGBRegressor(**xgb_params)
            xgb_model.fit(x_train, y_train)
            logger.info("Training XGBoost Model Completed")

            xgb_model_filepath = os.path.join(self.config.root_dir, 'xgb_model.pkl')
            with open(xgb_model_filepath, 'wb') as file:
                pickle.dump(xgb_model, file)

            logger.info(f"XGBoost model saved to {xgb_model_filepath}")
            return xgb_model
        except Exception as e:
            raise e
    


In [18]:
try:
            config = ConfigurationManager()
            data_modelling_config = config.get_modelling_config()
            data_modelling = DataModelling(config=data_modelling_config)
            x_scaled, y = data_modelling.get_data_transformed_object()
            x_train, x_test, y_train, y_test = data_modelling.train_test_variables(x_scaled, y)
            xgb_model = data_modelling.model_trainer(x_train, y_train, x_test, y_test)
            
    
except Exception as e:
    raise e

[2023-11-29 16:03:55,729:INFO:common:yaml file: config/config.yaml loaded successfully]
[2023-11-29 16:03:55,731:INFO:common:yaml file: params.yaml loaded successfully]
[2023-11-29 16:03:55,732:INFO:common:yaml file: schema.yaml loaded successfully]
[2023-11-29 16:03:55,732:INFO:common:created directory at: artifacts]
[2023-11-29 16:03:55,733:INFO:common:created directory at: artifacts/data_modelling]
[2023-11-29 16:03:55,746:INFO:1934804010:Columns in x: Index(['Airline', 'Source', 'Destination', 'Total_Stops',
       'Month_of_Month_of_journey', 'day_of_date_of_journey',
       'Duration_minutes'],
      dtype='object')]
[2023-11-29 16:03:55,747:INFO:1934804010:Airline                      object
Source                       object
Destination                  object
Total_Stops                  object
Month_of_Month_of_journey     int64
day_of_date_of_journey        int64
Duration_minutes              int64
dtype: object]
[2023-11-29 16:03:55,763:INFO:1934804010:(10682, 31)]
[2023-1

Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } are not used.

