In [14]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataPredictionConfig:
    """Immutable bundle of settings handed to the prediction step."""

    # Directory that is created before the config object is built.
    root_dir: Path
    # Location of the model artifact (not read anywhere in this notebook).
    model_path: Path
    # Spreadsheet whose `Source` column supplies the one-hot categories.
    data_path: Path
    # Label-encoding tables; accessed as `.airlines` / `.destinations`.
    label_encoders: dict

In [16]:
import os
# Step up one directory so the relative paths used by later cells resolve.
# NOTE: a relative chdir is not idempotent - re-running this cell moves up one more level.
os.chdir(os.pardir)

In [26]:
# Display the current working directory (sanity check after the chdir cell).
%pwd

'r:\\Projects\\Flight_Price_Prediction_ML\\Flight-Fare-Prediction-ML-model'

In [18]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml,create_directories

class ConfigurationManager:
    """Reads the project's YAML files once and builds config objects from them."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH,
        label_encoding_filepath = LABEL_ENCODING_FILE_PATH,
    ):
        """Load the four YAML documents and keep each parsed result on the instance.

        Args:
            config_filepath: YAML file holding the per-stage settings.
            params_filepath: YAML file stored as ``self.params``.
            schema_filepath: YAML file stored as ``self.schema``.
            label_encoding_filepath: YAML file stored as ``self.labels``.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)
        self.labels = read_yaml(label_encoding_filepath)

    def get_data_prediction_config(self) -> DataPredictionConfig:
        """Build the prediction settings from the ``data_prediction`` config section.

        The section's ``root_dir`` is passed to ``create_directories`` before the
        config object is returned.

        Returns:
            DataPredictionConfig carrying the section's paths plus the loaded
            label-encoding document.
        """
        section = self.config.data_prediction

        create_directories([section.root_dir])

        return DataPredictionConfig(
            root_dir=section.root_dir,
            model_path=section.model_path,
            data_path=section.data_path,
            label_encoders=self.labels,
        )

In [19]:
import os
import numpy as np
import pandas as pd



class DataPrediction:
    """Converts raw flight records into the all-numeric feature frame used for prediction."""

    # Total_Stops text -> ordinal label.
    _STOPS_LABELS = {'non-stop': 0, '1 stop': 1, '2 stops': 2, '3 stops': 3, '4 stops': 4}

    # Columns that are discarded without being read; they may be absent from the input.
    _OPTIONAL_COLUMNS = ('Additional_Info', 'Route', 'Price')

    def __init__(self, config: "DataPredictionConfig"):
        """Store the configuration.

        Args:
            config: object exposing ``data_path`` (spreadsheet with a ``Source``
                column) and ``label_encoders`` (with ``airlines`` and
                ``destinations`` mappings).
        """
        self.config = config
        # Filled on first use so the spreadsheet is read once per instance.
        self._sources = None

    def _known_sources(self):
        """Return the distinct ``Source`` values of the configured spreadsheet (cached)."""
        if getattr(self, '_sources', None) is None:
            reference = pd.read_excel(self.config.data_path)
            self._sources = reference['Source'].dropna().unique().tolist()
        return self._sources

    @staticmethod
    def _duration_part(durations, unit):
        """Extract the integer that precedes ``unit`` ('h' or 'm') in each duration string.

        Rows where the unit is missing yield 0. Whitespace between the number and
        the unit, or between the two parts, is optional ("2h 50m" and "2h50m").
        """
        digits = durations.str.extract(r'(\d+)\s*' + unit, expand=False)
        return pd.to_numeric(digits, errors='coerce').fillna(0).astype('int64')

    @staticmethod
    def _encode(values, mapping, column):
        """Map ``values`` through ``mapping``; raise ValueError naming any unmapped value."""
        encoded = values.map(mapping)
        unknown = values[encoded.isna()].unique().tolist()
        if unknown:
            raise ValueError(f"Unknown {column} value(s): {unknown}")
        return encoded

    def data_transformation_for_prediction(self, df_):
        """Build the model's feature columns from raw flight records.

        Args:
            df_: DataFrame with columns ``Airline``, ``Date_of_Journey``, ``Source``,
                ``Destination``, ``Dep_Time``, ``Arrival_Time``, ``Duration`` and
                ``Total_Stops``. ``Route``, ``Additional_Info`` and ``Price`` are
                dropped when present and are not required.

        Returns:
            A new DataFrame holding journey month/day, departure/arrival hour and
            minute, duration hours/minutes/total minutes, one ``Source_<name>``
            indicator per known source, and the airline, destination and stops
            labels. The input frame is left unmodified.

        Raises:
            ValueError: if an airline, destination or stops value has no label.
        """
        # Work on a copy: the column assignments below must not leak to the caller.
        df_ = df_.copy()

        # Month and day of the journey (the year is not used as a feature).
        # NOTE(review): parsing relies on pandas' default inference, so an ambiguous
        # "d/m/yyyy" string may be read month-first - confirm against how the
        # training data was parsed before adding dayfirst=True.
        journey = pd.to_datetime(df_['Date_of_Journey'])
        df_['Journey_month'] = journey.dt.month
        df_['Journey_day'] = journey.dt.day

        # Hour and minute of departure and arrival.
        departure = pd.to_datetime(df_['Dep_Time'])
        df_['Dep_Time_hours'] = departure.dt.hour
        df_['Dep_Time_mins'] = departure.dt.minute
        arrival = pd.to_datetime(df_['Arrival_Time'])
        df_['Arrival_Time_hours'] = arrival.dt.hour
        df_['Arrival_Time_mins'] = arrival.dt.minute

        df_ = df_.drop(['Date_of_Journey', 'Dep_Time', 'Arrival_Time'], axis=1)

        # Duration text such as "2h 50m", "19h" or "45m" -> hours, minutes, total.
        df_['Duration_hours'] = self._duration_part(df_['Duration'], 'h')
        df_['Duration_minutes'] = self._duration_part(df_['Duration'], 'm')
        df_['Duration_total_minutes'] = df_['Duration_hours'] * 60 + df_['Duration_minutes']

        df_ = df_.drop('Duration', axis=1)

        # One-hot encode the source; a source absent from the reference data
        # leaves every indicator at 0.
        for source in self._known_sources():
            df_['Source_' + source] = np.where(df_['Source'] == source, 1, 0)

        # Label-encode the airline.
        airlines = dict(self.config.label_encoders.airlines)
        df_['Airline label'] = self._encode(df_['Airline'], airlines, 'Airline')
        df_ = df_.drop('Airline', axis=1)

        # Label-encode the destination; 'Delhi' is folded into 'New Delhi' first.
        destinations = dict(self.config.label_encoders.destinations)
        destination = df_['Destination'].replace('Delhi', 'New Delhi')
        df_['Destination label'] = self._encode(destination, destinations, 'Destination')

        df_ = df_.drop(['Source', 'Destination'], axis=1)

        # Label-encode the number of stops.
        stops = self._encode(df_['Total_Stops'], self._STOPS_LABELS, 'Total_Stops')
        df_['Total_Stops_label'] = stops.astype('int')
        df_ = df_.drop('Total_Stops', axis=1)

        # Discard columns that carry no feature; tolerate their absence.
        df_ = df_.drop(columns=list(self._OPTIONAL_COLUMNS), errors='ignore')

        # Any remaining float column is cast to int.
        for col in df_.columns:
            if df_[col].dtype == 'float64':
                df_[col] = df_[col].astype('int')

        return df_
    



In [20]:
# Wire the prediction step to the YAML-backed configuration.
obj = ConfigurationManager()
config = obj.get_data_prediction_config()
d = DataPrediction(config)

# Sample record with a dd/mm/yyyy journey date (kept for reference; not used below).
data1 = {
    column: {0: value}
    for column, value in [
        ('Airline', 'IndiGo'),
        ('Date_of_Journey', '24/03/2019'),
        ('Source', 'Banglore'),
        ('Destination', 'New Delhi'),
        ('Route', 'BLR → DEL'),
        ('Dep_Time', '22:20'),
        ('Arrival_Time', '01:10 '),
        ('Duration', '2h 50m'),
        ('Total_Stops', 'non-stop'),
        ('Additional_Info', 'No info'),
        ('Price', 'NA'),
    ]
}

# Sample record with an ISO journey date; this is the one that gets transformed.
data = {
    column: {0: value}
    for column, value in [
        ('Airline', 'IndiGo'),
        ('Date_of_Journey', '2024-07-11'),
        ('Source', 'Delhi'),
        ('Destination', 'Cochin'),
        ('Route', 'NA'),
        ('Dep_Time', '00:00'),
        ('Arrival_Time', '12:00'),
        ('Duration', '12h 0m'),
        ('Total_Stops', 'non-stop'),
        ('Additional_Info', 'No info'),
        ('Price', 'NA'),
    ]
}

df_ = pd.DataFrame(data)
d.data_transformation_for_prediction(df_)



[2024-07-15 22:13:22,098: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-07-15 22:13:22,106: INFO: common: yaml file: params.yaml loaded successfully]


[2024-07-15 22:13:22,119: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-07-15 22:13:22,136: INFO: common: yaml file: label_encoding.yaml loaded successfully]
[2024-07-15 22:13:22,141: INFO: common: created directory at: artifacts/model_evaluation]


Unnamed: 0,Journey_month,Journey_day,Dep_Time_hours,Dep_Time_mins,Arrival_Time_hours,Arrival_Time_mins,Duration_hours,Duration_minutes,Duration_total_minutes,Source_Banglore,Source_Kolkata,Source_Delhi,Source_Chennai,Source_Mumbai,Airline label,Destination label,Total_Stops_label
0,7,11,0,0,12,0,12,0,720,0,0,1,0,0,3,1,0


In [4]:
from datetime import datetime

def hours_minutes_calc(start_time, end_time):
    """Return ``(hours, minutes)`` elapsed from ``start_time`` to ``end_time``.

    Both arguments are "%H:%M" clock strings. An end time earlier than the
    start is treated as falling on the following day, so the result is always
    within one day.

    Raises:
        ValueError: if either string does not match "%H:%M".
    """
    minutes_per_day = 24 * 60
    clock_points = [datetime.strptime(stamp, "%H:%M") for stamp in (start_time, end_time)]
    begin, finish = (point.hour * 60 + point.minute for point in clock_points)
    elapsed = (finish - begin) % minutes_per_day
    return elapsed // 60, elapsed % 60

# Example: format the gap between two clock times as "<h>h <m>m"
start_time = "09:30"
end_time = "14:45"
hours, minutes = hours_minutes_calc(start_time, end_time)

f"{hours}h {minutes}m"

'5h 15m'

In [27]:
import pandas as pd
# Inspect the Total_Stops column of the ingested data.
# Forward slashes resolve on Windows and POSIX alike; the backslash form only worked on Windows.
pd.read_excel('artifacts/data_ingestion/data.xlsx').Total_Stops

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302
...,...,...,...,...,...,...,...,...,...,...,...
10678,Air Asia,9/04/2019,Kolkata,Banglore,CCU → BLR,19:55,22:25,2h 30m,non-stop,No info,4107
10679,Air India,27/04/2019,Kolkata,Banglore,CCU → BLR,20:45,23:20,2h 35m,non-stop,No info,4145
10680,Jet Airways,27/04/2019,Banglore,Delhi,BLR → DEL,08:20,11:20,3h,non-stop,No info,7229
10681,Vistara,01/03/2019,Banglore,New Delhi,BLR → DEL,11:30,14:10,2h 40m,non-stop,No info,12648
