## 1. Importing Libraries

In [51]:
import os

import joblib

import warnings

import numpy as np

import pandas as pd

import xgboost
from xgboost import XGBRegressor

import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score,mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import(
    OneHotEncoder,
    MinMaxScaler,
    PowerTransformer,
    FunctionTransformer,
    OrdinalEncoder,
    StandardScaler
)

from feature_engine.outliers import Winsorizer
from feature_engine.datetime import DatetimeFeatures
from feature_engine.selection import SelectBySingleFeaturePerformance
from feature_engine.encoding import (
    RareLabelEncoder,
    MeanEncoder,
    CountFrequencyEncoder
)

## 2. Display Settings

In [2]:
# Remove the column cap so wide DataFrames are displayed in full.
pd.set_option("display.max_columns", None)

In [3]:
# Make scikit-learn transformers return pandas DataFrames: the custom
# FunctionTransformer callables defined below use DataFrame methods
# (.assign, .columns, attribute access by column name).
sklearn.set_config(transform_output="pandas")

In [4]:
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and data-parsing
# warnings — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

## 3. Reading Datasets

In [5]:
# Load the training split. The folder can be overridden with the
# FLIGHT_DATA_DIR environment variable; the default is the original local path.
train = pd.read_csv(
    os.path.join(
        os.environ.get(
            "FLIGHT_DATA_DIR",
            r"C:\Users\rahul\OneDrive\Desktop\flight sagemaker project\data",
        ),
        "train.csv",
    )
)
train

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-05-27,Delhi,Cochin,20:55:00,12:35:00,940,1.0,In-flight meal not included,12898
1,Jet Airways,2019-06-12,Kolkata,Banglore,18:55:00,16:20:00,1285,1.0,No Info,13044
2,Air India,2019-05-18,Delhi,Cochin,09:45:00,09:25:00,1420,2.0,No Info,10975
3,Indigo,2019-06-03,Mumbai,Hyderabad,21:20:00,22:50:00,90,0.0,No Info,2227
4,Jet Airways,2019-04-01,Mumbai,Hyderabad,02:55:00,04:20:00,85,0.0,No Info,5678
...,...,...,...,...,...,...,...,...,...,...
6689,Spicejet,2019-06-09,Kolkata,Banglore,11:35:00,18:50:00,435,1.0,No Info,8479
6690,Multiple Carriers,2019-05-09,Delhi,Cochin,10:00:00,01:30:00,930,1.0,No Info,15078
6691,Air India,2019-05-18,Delhi,Cochin,12:00:00,07:40:00,1180,2.0,No Info,8603
6692,Air Asia,2019-05-18,Delhi,Cochin,07:55:00,13:25:00,330,1.0,No Info,8759


In [6]:
# Load the validation split. The folder can be overridden with the
# FLIGHT_DATA_DIR environment variable; the default is the original local path.
val = pd.read_csv(
    os.path.join(
        os.environ.get(
            "FLIGHT_DATA_DIR",
            r"C:\Users\rahul\OneDrive\Desktop\flight sagemaker project\data",
        ),
        "val.csv",
    )
)
val

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-05-27,Delhi,Cochin,09:00:00,19:00:00,600,1.0,In-flight meal not included,10675
1,Jet Airways,2019-05-24,Kolkata,Banglore,18:55:00,10:05:00,910,1.0,In-flight meal not included,8586
2,Jet Airways,2019-03-18,Banglore,Delhi,21:25:00,09:30:00,725,1.0,No Info,13555
3,Spicejet,2019-06-27,Chennai,Kolkata,17:45:00,20:05:00,140,0.0,No check-in baggage included,3543
4,Air Asia,2019-05-15,Kolkata,Banglore,07:35:00,19:25:00,710,1.0,No Info,5192
...,...,...,...,...,...,...,...,...,...,...
1669,Vistara,2019-05-06,Kolkata,Banglore,07:10:00,22:40:00,930,1.0,No Info,8452
1670,Indigo,2019-04-03,Delhi,Cochin,21:05:00,00:20:00,195,0.0,No Info,5021
1671,Air India,2019-03-01,Banglore,Delhi,17:00:00,19:45:00,165,0.0,No Info,25913
1672,Air India,2019-06-18,Mumbai,Hyderabad,06:20:00,07:40:00,80,0.0,No Info,3100


In [7]:
# Load the test split. The folder can be overridden with the
# FLIGHT_DATA_DIR environment variable; the default is the original local path.
test = pd.read_csv(
    os.path.join(
        os.environ.get(
            "FLIGHT_DATA_DIR",
            r"C:\Users\rahul\OneDrive\Desktop\flight sagemaker project\data",
        ),
        "test.csv",
    )
)
test

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-03-06,Banglore,Delhi,08:00:00,08:15:00,1455,1.0,No Info,17996
1,Spicejet,2019-06-06,Kolkata,Banglore,22:20:00,00:40:00,140,0.0,No Info,3873
2,Indigo,2019-03-18,Kolkata,Banglore,05:30:00,08:20:00,170,0.0,No Info,4462
3,Jet Airways,2019-03-24,Mumbai,Hyderabad,15:50:00,17:20:00,90,0.0,In-flight meal not included,2228
4,Spicejet,2019-04-27,Banglore,Delhi,09:30:00,12:20:00,170,0.0,No Info,4991
...,...,...,...,...,...,...,...,...,...,...
2088,Jet Airways,2019-05-27,Delhi,Cochin,19:15:00,12:35:00,1040,1.0,In-flight meal not included,12898
2089,Jet Airways,2019-05-27,Delhi,Cochin,02:15:00,19:00:00,1005,1.0,In-flight meal not included,12898
2090,Jet Airways,2019-06-03,Delhi,Cochin,02:15:00,04:25:00,1570,1.0,In-flight meal not included,11627
2091,Multiple Carriers,2019-06-06,Delhi,Cochin,15:15:00,01:30:00,615,1.0,No Info,6795


## 4. Preprocessing Operations

In [8]:
# airline: fill gaps with the most frequent carrier, fold infrequent carriers
# into a single "Others" label, then one-hot encode the result.
air_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        (
            "grouper",
            RareLabelEncoder(tol=0.1, replace_with="Others", n_categories=2),
        ),
        (
            "encoder",
            OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
        ),
    ]
)

# date_of_journey: derive calendar features from the date, then min-max scale them.
feature_to_extract = ["month", "week", "day_of_week", "day_of_year"]

doj_transformer = Pipeline(
    steps=[
        (
            "dt",
            DatetimeFeatures(
                features_to_extract=feature_to_extract,
                yearfirst=True,
                format="mixed",
            ),
        ),
        ("scaler", MinMaxScaler()),
    ]
)

# source & destination: group infrequent cities, replace each label with the
# mean target value, then apply a power transform to the encoded values.
loc_transformer = Pipeline(
    steps=[
        (
            "grouper",
            RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2),
        ),
        ("encoder", MeanEncoder()),
        ("scaler", PowerTransformer()),
    ]
)

# dep_time & arrival_time (numeric view): hour and minute, min-max scaled.
time_pipe1 = Pipeline(
    steps=[
        ("dt", DatetimeFeatures(features_to_extract=["hour", "minute"])),
        ("scaler", MinMaxScaler()),
    ]
)


def part_of_day(X):
    """Replace each time column with a coarse part-of-day label.

    Every column of ``X`` is parsed with ``pd.to_datetime`` and reduced to its
    hour. The hour is bucketed into half-open ranges: [4, 12) -> "morning",
    [12, 16) -> "afternoon", [16, 20) -> "evening"; anything else -> "night".

    Returns a DataFrame holding one ``<col>_part_of_day`` column per input
    column; the input columns themselves are dropped.
    """
    time_cols = X.columns.to_list()

    # Overwrite each time column with its hour of day.
    hours = X.assign(
        **{name: pd.to_datetime(X.loc[:, name]).dt.hour for name in time_cols}
    )

    def _bucket(hour):
        # Conditions are checked in order; hours matching none fall to "night".
        conditions = [
            hour.between(4, 12, inclusive="left"),
            hour.between(12, 16, inclusive="left"),
            hour.between(16, 20, inclusive="left"),
        ]
        return np.select(
            conditions,
            ["morning", "afternoon", "evening"],
            default="night",
        )

    labelled = {
        f"{name}_part_of_day": _bucket(hours.loc[:, name]) for name in time_cols
    }
    return hours.assign(**labelled).drop(columns=time_cols)

    
# dep_time & arrival_time (categorical view): part-of-day label, encoded by
# category frequency and min-max scaled.
time_pipe2 = Pipeline(
    steps=[
        ("part", FunctionTransformer(func=part_of_day)),
        ("encoder", CountFrequencyEncoder()),
        ("scaler", MinMaxScaler()),
    ]
)

# Both views of the time columns are computed side by side and concatenated.
time_transformer = FeatureUnion(
    transformer_list=[
        ("part1", time_pipe1),
        ("part2", time_pipe2),
    ]
)

# duration
def dur_cat(X):
    """Bucket ``duration`` into "short" / "medium" / "long".

    Durations below 180 are "short", durations in [180, 420) are "medium",
    everything else is "long". The result is returned in a new
    ``duration_cat`` column and the original ``duration`` column is dropped.
    """
    minutes = X.duration
    is_short = minutes.lt(180)
    is_medium = minutes.between(180, 420, inclusive="left")
    labels = np.select([is_short, is_medium], ["short", "medium"], default="long")
    return X.assign(duration_cat=labels).drop(columns="duration")

    
def is_over(X):
    """Flag rows whose ``duration`` is at least 1000.

    Returns a DataFrame with an integer ``duration_over_1000`` column
    (1 when duration >= 1000, else 0); the ``duration`` column is dropped.
    """
    over_flag = X.duration.ge(1000).astype(int)
    return X.assign(duration_over_1000=over_flag).drop(columns="duration")

    
# Duration bucket, ordinal-encoded with the order short < medium < long.
dur_pipe1 = Pipeline(
    steps=[
        ("cat", FunctionTransformer(func=dur_cat)),
        ("encoder", OrdinalEncoder(categories=[["short", "medium", "long"]])),
    ]
)

# Three parallel views of duration: the ordinal bucket, the >= 1000 flag, and
# the standardised value itself (the "scaler" branch is a union member, not a
# post-processing step).
dur_union = FeatureUnion(
    transformer_list=[
        ("part1", dur_pipe1),
        ("part2", FunctionTransformer(func=is_over)),
        ("scaler", StandardScaler()),
    ]
)

# duration: cap outliers with the IQR rule, fill missing values with the
# median, then fan out into the three duration features of `dur_union`.
# The Winsorizer runs before the imputer, and by default it raises on NaN, so
# the imputer could never see a missing value. `missing_values="ignore"` lets
# NaNs pass through the capping step so the median imputer can fill them.
dur_transformer = Pipeline(
    steps=[
        (
            "outlier",
            Winsorizer(capping_method="iqr", fold=1.5, missing_values="ignore"),
        ),
        ("imputer", SimpleImputer(strategy="median")),
        ("union", dur_union),
    ]
)

# total_stops
def is_direct(X):
    """Add an integer ``is_direct_flight`` column (1 when ``total_stops`` equals 0).

    The ``total_stops`` column is kept alongside the new flag.
    """
    nonstop_flag = X.total_stops.eq(0).astype(int)
    return X.assign(is_direct_flight=nonstop_flag)

# total_stops: fill gaps with the most frequent value, then append the
# direct-flight flag next to the stop count.
total_stops_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("direct", FunctionTransformer(func=is_direct)),
    ]
)

# additional_info (categorical view): group infrequent labels under "other",
# then one-hot encode.
info_pipe1 = Pipeline(
    steps=[
        (
            "group",
            RareLabelEncoder(tol=0.1, n_categories=2, replace_with="other"),
        ),
        (
            "encoder",
            OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
        ),
    ]
)

def have_info(X):
    """Overwrite ``additional_info`` with a 0/1 flag: 0 for "No Info", else 1.

    Any value other than the exact string "No Info" maps to 1.
    NOTE(review): in ``info_transformer`` missing values are first imputed as
    "unknown", so they also map to 1 — confirm this is intended.
    """
    has_info_flag = X.additional_info.ne("No Info").astype(int)
    return X.assign(additional_info=has_info_flag)

# Two parallel views of additional_info: one-hot labels and a has-info flag.
info_union = FeatureUnion(
    transformer_list=[
        ("part1", info_pipe1),
        ("part2", FunctionTransformer(func=have_info)),
    ]
)

# Missing values become the literal "unknown" before both views are built.
info_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="unknown")),
        ("union", info_union),
    ]
)

# column transformer: route each raw column (or column pair) to its dedicated
# transformer; columns not listed here are passed through untouched.
column_transformer = ColumnTransformer(
    transformers=[
        ("air", air_transformer, ["airline"]),
        ("doj", doj_transformer, ["date_of_journey"]),
        ("loc", loc_transformer, ["source", "destination"]),
        ("time", time_transformer, ["dep_time", "arrival_time"]),
        ("dur", dur_transformer, ["duration"]),
        ("stops", total_stops_transformer, ["total_stops"]),
        ("info", info_transformer, ["additional_info"]),
    ],
    remainder="passthrough",
)

# feature selector: score every feature on its own with a small random forest
# and keep those whose R2 reaches the 0.1 threshold.
estimator = RandomForestRegressor(
    n_estimators=10,
    max_depth=3,
    random_state=42,
)

selector = SelectBySingleFeaturePerformance(
    estimator=estimator,
    scoring="r2",
    threshold=0.1,
)

# preprocessor: engineer the features, then drop the uninformative ones.
preprocessor = Pipeline(
    steps=[
        ("ct", column_transformer),
        ("selector", selector),
    ]
)

In [9]:
# Fit the preprocessing pipeline on the training features; the target is passed
# as well because the pipeline contains target-aware steps (mean encoding,
# performance-based selection).
preprocessor.fit(X=train.drop(columns="price"), y=train.price.copy())

In [10]:
# Preview the engineered training features that survive feature selection.
preprocessor.transform(train.drop(columns="price"))

Unnamed: 0,air__airline_Indigo,air__airline_Jet Airways,air__airline_Others,doj__date_of_journey_week,doj__date_of_journey_day_of_year,loc__source,loc__destination,dur__duration_cat,dur__duration_over_1000,dur__duration,stops__total_stops,stops__is_direct_flight
0,0.0,1.0,0.0,0.764706,0.737288,1.040187,1.040187,2.0,0,0.609935,1.0,0
1,0.0,1.0,0.0,0.882353,0.872881,-0.190314,-0.190314,2.0,1,1.301752,1.0,0
2,0.0,0.0,0.0,0.647059,0.661017,1.040187,1.040187,2.0,1,1.572463,2.0,0
3,1.0,0.0,0.0,0.823529,0.796610,-1.915733,-1.915733,0.0,0,-1.094542,0.0,1
4,0.0,1.0,0.0,0.294118,0.262712,-1.915733,-1.915733,0.0,0,-1.104568,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
6689,0.0,0.0,1.0,0.823529,0.847458,-0.190314,-0.190314,2.0,0,-0.402725,1.0,0
6690,0.0,0.0,0.0,0.588235,0.584746,1.040187,1.040187,2.0,0,0.589882,1.0,0
6691,0.0,0.0,0.0,0.647059,0.661017,1.040187,1.040187,2.0,1,1.091199,2.0,0
6692,0.0,0.0,1.0,0.647059,0.661017,1.040187,1.040187,1.0,0,-0.613278,1.0,0


In [48]:
# Transform the features of every split with the already-fitted preprocessor
# and keep an independent copy of each target column.
X_train_t, X_val_t, X_test_t = (
    preprocessor.transform(split.drop(columns="price"))
    for split in (train, val, test)
)
y_train, y_val, y_test = (split.price.copy() for split in (train, val, test))

In [21]:
# End-to-end pipeline: preprocessing followed by a regressor. The "model" step
# is only a placeholder; the grid search swaps in each candidate estimator.
pipeline = Pipeline(
    steps=[
        ("pre", preprocessor),
        ("model", LinearRegression()),
    ]
)

# One sub-grid per model family: 4 + 81 + 243 = 328 candidates in total.
param_grid = [
    # Linear regression
    {
        'model': [LinearRegression()],
        'model__fit_intercept': [True, False],
        'model__positive': [False, True],
    },
    # Random forest
    {
        'model': [RandomForestRegressor(random_state=42)],
        'model__n_estimators': [100, 200, 300],      # number of decision trees
        'model__max_depth': [None, 10, 20],
        'model__min_samples_split': [2, 5, 10],      # minimum samples needed to split a node
        'model__min_samples_leaf': [1, 2, 4],        # minimum samples at a leaf node
    },
    # Gradient-boosted trees
    {
        'model': [
            XGBRegressor(
                objective='reg:squarederror',
                verbosity=0,
                random_state=42,
            )
        ],
        'model__n_estimators': [100, 200, 300],      # number of boosting rounds
        'model__learning_rate': [0.01, 0.05, 0.1],
        'model__max_depth': [3, 5, 7],
        'model__subsample': [0.6, 0.8, 1.0],         # fraction of training rows used per tree
        'model__colsample_bytree': [0.6, 0.8, 1.0],  # fraction of features used per tree
    },
]

# 5-fold cross-validated search over every candidate in `param_grid`, scored by R2.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', n_jobs=-1, verbose=1)

# `pipeline` already contains the preprocessor, so it must be fit on the RAW
# training features (not the pre-transformed X_train_t). The previous call used
# `X_train`, a name that is never defined in this notebook and only worked
# through leftover kernel state.
grid_search.fit(train.drop(columns="price"), train.price.copy())

Fitting 5 folds for each of 328 candidates, totalling 1640 fits


In [55]:
# Summarise the winning candidate: estimator class, CV score and the tuned
# hyper-parameters (only the keys that belong to the model step).
_best_step = grid_search.best_estimator_.named_steps['model']
_best_model_params = {
    name: value
    for name, value in grid_search.best_params_.items()
    if name.startswith('model__')
}

print("Best Model Type:", type(_best_step).__name__)
print("Best CV Score (R2):", grid_search.best_score_)
print("Best Tuned Model Params:", _best_model_params)

Best Model Type: XGBRegressor
Best CV Score (R2): 0.7560464859008789
Best Tuned Model Params: {'model__colsample_bytree': 0.6, 'model__learning_rate': 0.1, 'model__max_depth': 5, 'model__n_estimators': 300, 'model__subsample': 1.0}


* Here GridSearchCV selects XGBRegressor as the best model.
* To be sure, let's check the best cross-validation score of each model type.

In [56]:
# Tabulate the grid-search results and tag every row with its estimator class name.
results_df = pd.DataFrame(grid_search.cv_results_)
results_df['model_name'] = [type(m).__name__ for m in results_df['param_model']]

# Best mean CV score reached by each model family.
lr_best_score = results_df.loc[
    results_df['model_name'].eq('LinearRegression'), 'mean_test_score'
].max()
print("Best R2 for LinearRegression:", lr_best_score)

rf_best_score = results_df.loc[
    results_df['model_name'].eq('RandomForestRegressor'), 'mean_test_score'
].max()
print("Best R2 for RandomForestRegressor:", rf_best_score)

xgb_best_score = results_df.loc[
    results_df['model_name'].eq('XGBRegressor'), 'mean_test_score'
].max()
print("Best R2 for XGBRegressor:", xgb_best_score)

Best R2 for LinearRegression: 0.5267193366287165
Best R2 for RandomForestRegressor: 0.7527321810584411
Best R2 for XGBRegressor: 0.7560464859008789


* The performance of RandomForestRegressor and XGBRegressor is very close.
* Let's take the best parameters of each and check the R2 score on the validation and test sets.

In [57]:
# Rebuild the results table and print, per model family, the top-scoring row
# together with its tuned hyper-parameters (the estimator object is left out).
results_df = pd.DataFrame(grid_search.cv_results_)
results_df['model_name'] = [type(m).__name__ for m in results_df['param_model']]

for model in results_df['model_name'].unique():
    ranked = results_df.loc[results_df['model_name'].eq(model)].sort_values(
        'mean_test_score', ascending=False
    )
    best = ranked.iloc[0]
    tuned_params = {
        key: best['params'][key] for key in best['params'] if key != 'model'
    }
    print(f"{model} → R2: {best['mean_test_score']:.4f}, Tuned Params: {tuned_params}")

LinearRegression → R2: 0.5267, Tuned Params: {'model__fit_intercept': True, 'model__positive': False}
RandomForestRegressor → R2: 0.7527, Tuned Params: {'model__max_depth': 20, 'model__min_samples_leaf': 1, 'model__min_samples_split': 10, 'model__n_estimators': 300}
XGBRegressor → R2: 0.7560, Tuned Params: {'model__colsample_bytree': 0.6, 'model__learning_rate': 0.1, 'model__max_depth': 5, 'model__n_estimators': 300, 'model__subsample': 1.0}


In [47]:
# Re-create the two strongest candidates with the hyper-parameters reported by
# the grid search, then fit them on the transformed training features.
rf_best = RandomForestRegressor(
    n_estimators=300,
    max_depth=20,
    min_samples_split=10,
    min_samples_leaf=1,
    random_state=42,
)
xgb_best = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=300,
    learning_rate=0.1,
    max_depth=5,
    subsample=1.0,
    colsample_bytree=0.6,
    verbosity=0,
    random_state=42,
)

rf_best.fit(X_train_t, y_train)
xgb_best.fit(X_train_t, y_train)

In [49]:
# Predict the validation and test splits with both fitted models.
rf_val_pred, rf_test_pred = (
    rf_best.predict(features) for features in (X_val_t, X_test_t)
)
xgb_val_pred, xgb_test_pred = (
    xgb_best.predict(features) for features in (X_val_t, X_test_t)
)

In [67]:
# R2 of both models on the validation split, then on the test split.
for caption, truth, guess in (
    ("RF Validation R2:", y_val, rf_val_pred),
    ("XGB Validation R2:", y_val, xgb_val_pred),
    ("\nRF Test R2:", y_test, rf_test_pred),
    ("XGB Test R2:", y_test, xgb_test_pred),
):
    print(caption, r2_score(truth, guess))

RF Validation R2: 0.7687001747771234
XGB Validation R2: 0.7985448837280273

RF Test R2: 0.7979481963884375
XGB Test R2: 0.7913675904273987


* XGBRegressor performs better on the validation set, but RandomForestRegressor is slightly better on the test set.
* Let's confirm with mean absolute error (MAE) and root mean squared error (RMSE).

In [66]:
def _rmse(truth, guess):
    """Root mean squared error between the true and predicted values."""
    return np.sqrt(mean_squared_error(truth, guess))


# MAE then RMSE for both models, first on validation and then on test.
for caption, metric, truth, guess in (
    ("RF Validation MAE:", mean_absolute_error, y_val, rf_val_pred),
    ("XGB Validation MAE:", mean_absolute_error, y_val, xgb_val_pred),
    ("\nRF Validation RMSE:", _rmse, y_val, rf_val_pred),
    ("XGB Validation RMSE:", _rmse, y_val, xgb_val_pred),
    ("\nRF Test MAE:", mean_absolute_error, y_test, rf_test_pred),
    ("XGB Test MAE:", mean_absolute_error, y_test, xgb_test_pred),
    ("\nRF Test RMSE:", _rmse, y_test, rf_test_pred),
    ("XGB Test RMSE:", _rmse, y_test, xgb_test_pred),
):
    print(caption, metric(truth, guess))

RF Validation MAE: 1318.6812853755825
XGB Validation MAE: 1331.37548828125

RF Validation RMSE: 2177.0415384709445
XGB Validation RMSE: 2031.7402639117038

RF Test MAE: 1342.1795309298584
XGB Test MAE: 1378.3333740234375

RF Test RMSE: 2044.4556192776452
XGB Test RMSE: 2077.481768873075


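* From the above, RandomForestRegressor is better in the majority of cases: it wins 4 of the 6 comparisons (validation MAE and all three test metrics), while XGBRegressor wins validation R2 and validation RMSE.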

In [60]:
# Keep the tuned random forest as the model to persist.
final_model = rf_best

In [54]:
# Recap of the selected random forest's R2 on the validation and test splits.
for caption, truth, guess in (
    ("val r2 score", y_val, rf_val_pred),
    ("test r2 score", y_test, rf_test_pred),
):
    print(caption, r2_score(truth, guess))

val r2 score 0.7687001747771234
test r2 score 0.7979481963884375


In [61]:
# Persist the selected estimator to the current working directory.
# NOTE(review): rf_best was fit on preprocessor-transformed features (X_train_t),
# so this pickle cannot score raw rows on its own — confirm the fitted
# `preprocessor` is saved or rebuilt wherever the model is served.
joblib.dump(final_model, "rf-model.pkl")

['rf-model.pkl']