In [1]:
!pip install xgboost




In [2]:
!pip install feature-engine



In [3]:
import numpy as np

import pandas as pd

import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
	OneHotEncoder,
	OrdinalEncoder,
	StandardScaler,
	MinMaxScaler,
	PowerTransformer,
	FunctionTransformer
)
!pip install feature-engine
from feature_engine.outliers import Winsorizer
from feature_engine.datetime import DatetimeFeatures
from feature_engine.selection import SelectBySingleFeaturePerformance
from feature_engine.encoding import (
	RareLabelEncoder,
	MeanEncoder,
	CountFrequencyEncoder
)

import matplotlib.pyplot as plt

import warnings



In [4]:
pd.set_option("display.max_columns", None)

In [5]:
sklearn.set_config(transform_output="pandas")


In [6]:
warnings.filterwarnings("ignore")

## Train Data

In [7]:
path_1 = r"C:\Users\User\Desktop\Learnabay Training\My Portfolio projects for resume\Flight Price Prediction\Data\train.csv"

train = pd.read_csv(path_1)
train

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-05-27,Delhi,Cochin,20:55:00,12:35:00,940,1,In-flight meal not included,12898
1,Jet Airways,2019-06-12,Kolkata,Banglore,18:55:00,16:20:00,1285,1,No Info,13044
2,Air India,2019-05-18,Delhi,Cochin,09:45:00,09:25:00,1420,2,No Info,10975
3,Indigo,2019-06-03,Mumbai,Hyderabad,21:20:00,22:50:00,90,0,No Info,2227
4,Jet Airways,2019-04-01,Mumbai,Hyderabad,02:55:00,04:20:00,85,0,No Info,5678
...,...,...,...,...,...,...,...,...,...,...
6689,Spicejet,2019-06-09,Kolkata,Banglore,11:35:00,18:50:00,435,1,No Info,8479
6690,Multiple Carriers,2019-05-09,Delhi,Cochin,10:00:00,01:30:00,930,1,No Info,15078
6691,Air India,2019-05-18,Delhi,Cochin,12:00:00,07:40:00,1180,2,No Info,8603
6692,Air Asia,2019-05-18,Delhi,Cochin,07:55:00,13:25:00,330,1,No Info,8759


## Validation Data

In [8]:
path_2 = r"C:\Users\User\Desktop\Learnabay Training\My Portfolio projects for resume\Flight Price Prediction\Data\val.csv"

val = pd.read_csv(path_2)
val

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-05-27,Delhi,Cochin,09:00:00,19:00:00,600,1,In-flight meal not included,10675
1,Jet Airways,2019-05-24,Kolkata,Banglore,18:55:00,10:05:00,910,1,In-flight meal not included,8586
2,Jet Airways,2019-03-18,Banglore,Delhi,21:25:00,09:30:00,725,1,No Info,13555
3,Spicejet,2019-06-27,Chennai,Kolkata,17:45:00,20:05:00,140,0,No check-in baggage included,3543
4,Air Asia,2019-05-15,Kolkata,Banglore,07:35:00,19:25:00,710,1,No Info,5192
...,...,...,...,...,...,...,...,...,...,...
1669,Vistara,2019-05-06,Kolkata,Banglore,07:10:00,22:40:00,930,1,No Info,8452
1670,Indigo,2019-04-03,Delhi,Cochin,21:05:00,00:20:00,195,0,No Info,5021
1671,Air India,2019-03-01,Banglore,Delhi,17:00:00,19:45:00,165,0,No Info,25913
1672,Air India,2019-06-18,Mumbai,Hyderabad,06:20:00,07:40:00,80,0,No Info,3100


## Test Data

In [9]:
path_3 = r"C:\Users\User\Desktop\Learnabay Training\My Portfolio projects for resume\Flight Price Prediction\Data\test.csv"

test = pd.read_csv(path_3)
test

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-03-06,Banglore,Delhi,08:00:00,08:15:00,1455,1,No Info,17996
1,Spicejet,2019-06-06,Kolkata,Banglore,22:20:00,00:40:00,140,0,No Info,3873
2,Indigo,2019-03-18,Kolkata,Banglore,05:30:00,08:20:00,170,0,No Info,4462
3,Jet Airways,2019-03-24,Mumbai,Hyderabad,15:50:00,17:20:00,90,0,In-flight meal not included,2228
4,Spicejet,2019-04-27,Banglore,Delhi,09:30:00,12:20:00,170,0,No Info,4991
...,...,...,...,...,...,...,...,...,...,...
2088,Jet Airways,2019-05-27,Delhi,Cochin,19:15:00,12:35:00,1040,1,In-flight meal not included,12898
2089,Jet Airways,2019-05-27,Delhi,Cochin,02:15:00,19:00:00,1005,1,In-flight meal not included,12898
2090,Jet Airways,2019-06-03,Delhi,Cochin,02:15:00,04:25:00,1570,1,In-flight meal not included,11627
2091,Multiple Carriers,2019-06-06,Delhi,Cochin,15:15:00,01:30:00,615,1,No Info,6795


## Preprocessing Operations

In [10]:

# Airline
air_transformer = Pipeline(steps=[
	("imputer", SimpleImputer(strategy="most_frequent")),
	("grouper", RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2)),
	("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))
])




feature_to_extract = ["year", "month", "week", "day_of_week","day_of_month","weekend", "day_of_year","month_start","month_end", "quarter"]

doj_transformer = Pipeline(steps=[
	("dt", DatetimeFeatures(features_to_extract=feature_to_extract, yearfirst=True, format="mixed")),
	("scaler", MinMaxScaler())
])



location_pipe1 = Pipeline(steps=[
	("encoder", MeanEncoder()),
	("scaler", PowerTransformer())
])



def is_south(X):
	columns = X.columns.to_list()
	south_cities = ['Banglore', 'Chennai', 'Cochin', 'Hyderabad']
	return (
		X
		.assign(**{
			f"{col}_is_south": X.loc[:, col].isin(south_cities).astype(int)
			for col in columns
		})
		.drop(columns=columns)
	)




location_transformer = FeatureUnion(transformer_list=[
	("part1", location_pipe1),
	("part2", FunctionTransformer(func=is_south))
])




time_pipe1 = Pipeline(steps=[
	("dt", DatetimeFeatures(features_to_extract=["hour", "minute"])),
	("scaler", MinMaxScaler())
])


def part_of_day(X, morning=4, noon=12, eve=16, night=20):
	columns = X.columns.to_list()
	X_temp = X.assign(**{
		col: pd.to_datetime(X.loc[:, col]).dt.hour
		for col in columns
	})

	return (
		X_temp
		.assign(**{
			f"{col}_part_of_day": np.select(
				[X_temp.loc[:, col].between(morning, noon, inclusive="left"),
				 X_temp.loc[:, col].between(noon, eve, inclusive="left"),
				 X_temp.loc[:, col].between(eve, night, inclusive="left")],
				["morning", "afternoon", "evening"],
				default="night"
			)
			for col in columns
		})
		.drop(columns=columns)
	)

time_pipe2 = Pipeline(steps=[
	("part", FunctionTransformer(func=part_of_day)),
	("encoder", CountFrequencyEncoder()),
	("scaler", MinMaxScaler())
])

time_transformer = FeatureUnion(transformer_list=[
	("part1", time_pipe1),
	("part2", time_pipe2)
])





class RBFPercentileSimilarity(BaseEstimator, TransformerMixin):
	def __init__(self, variables=None, percentiles=[0.25, 0.5, 0.75], gamma=0.1):
		self.variables = variables
		self.percentiles = percentiles
		self.gamma = gamma


	def fit(self, X, y=None):
		if not self.variables:
			self.variables = X.select_dtypes(include="number").columns.to_list()

		self.reference_values_ = {
			col: (
				X
				.loc[:, col]
				.quantile(self.percentiles)
				.values
				.reshape(-1, 1)
			)
			for col in self.variables
		}

		return self


	def transform(self, X):
		objects = []
		for col in self.variables:
			columns = [f"{col}_rbf_{int(percentile * 100)}" for percentile in self.percentiles]
			obj = pd.DataFrame(
				data=rbf_kernel(X.loc[:, [col]], Y=self.reference_values_[col], gamma=self.gamma),
				columns=columns
			)
			objects.append(obj)
		return pd.concat(objects, axis=1)





def duration_category(X, short=180, med=400):
	return (
		X
		.assign(duration_cat=np.select([X.duration.lt(short),
									    X.duration.between(short, med, inclusive="left")],
									   ["short", "medium"],
									   default="long"))
		.drop(columns="duration")
	)



def is_over(X, value=1000):
	return (
		X
		.assign(**{
			f"duration_over_{value}": X.duration.ge(value).astype(int)
		})
		.drop(columns="duration")
	)

duration_pipe1 = Pipeline(steps=[
	("rbf", RBFPercentileSimilarity()),
	("scaler", PowerTransformer())
])

duration_pipe2 = Pipeline(steps=[
	("cat", FunctionTransformer(func=duration_category)),
	("encoder", OrdinalEncoder(categories=[["short", "medium", "long"]]))
])

duration_union = FeatureUnion(transformer_list=[
	("part1", duration_pipe1),
	("part2", duration_pipe2),
	("part3", FunctionTransformer(func=is_over)),
	("part4", StandardScaler())
])

duration_transformer = Pipeline(steps=[
	("outliers", Winsorizer(capping_method="iqr", fold=1.5)),
	("imputer", SimpleImputer(strategy="median")),
	("union", duration_union)
])




def is_direct(X):
	return X.assign(is_direct_flight=X.total_stops.eq(0).astype(int))


total_stops_transformer = Pipeline(steps=[
	("imputer", SimpleImputer(strategy="most_frequent")),
	("", FunctionTransformer(func=is_direct))
])


info_pipe1 = Pipeline(steps=[
	("group", RareLabelEncoder(tol=0.1, n_categories=2, replace_with="Other")),
	("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])



def have_info(X):
	return X.assign(additional_info=X.additional_info.ne("No Info").astype(int))

info_union = FeatureUnion(transformer_list=[
	("part1", info_pipe1),
	("part2", FunctionTransformer(func=have_info))
])

info_transformer = Pipeline(steps=[
	("imputer", SimpleImputer(strategy="constant", fill_value="unknown")),
	("union", info_union)
])

column_transformer = ColumnTransformer(transformers=[
	("air", air_transformer, ["airline"]),
	("doj", doj_transformer, ["date_of_journey"]),
	("location", location_transformer, ["source", 'destination']),
	("time", time_transformer, ["dep_time", "arrival_time"]),
	("dur", duration_transformer, ["duration"]),
	("stops", total_stops_transformer, ["total_stops"]),
	("info", info_transformer, ["additional_info"])
], remainder="passthrough")


estimator = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42)

selector = SelectBySingleFeaturePerformance(
	estimator=estimator,
	scoring="r2",
	threshold=0.1
) 



preprocessor = Pipeline(steps=[
	("ct", column_transformer),
	("selector", selector)
])













## Splitting Data

In [11]:
# Split into features and target
X_train = train.drop(columns=["price"])
y_train = train["price"]

X_val = val.drop(columns=["price"])
y_val = val["price"]

X_test = test.drop(columns=["price"])
y_test = test["price"]

# Optional: print shapes to confirm
print("Train:", X_train.shape, y_train.shape)
print("Validation:", X_val.shape, y_val.shape)
print("Test:", X_test.shape, y_test.shape)

Train: (6694, 9) (6694,)
Validation: (1674, 9) (1674,)
Test: (2093, 9) (2093,)


## Base-Level Prediction on Train Data

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.base import clone



In [13]:
# Clone your preprocessor for reuse
rf_pipeline = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("regressor", RandomForestRegressor(random_state=42))
])

xgb_pipeline = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("regressor", XGBRegressor(random_state=42, verbosity=0))
])

In [14]:
from sklearn.metrics import mean_squared_error, r2_score

def evaluate_model(pipeline, X, y, model_name="Model"):
    y_pred = pipeline.predict(X)
    mse = mean_squared_error(y, y_pred)
    r2 = r2_score(y, y_pred)
    adj_r2 = 1 - (1 - r2) * ((len(y) - 1) / (len(y) - X.shape[1] - 1))
    
    print(f"\n{model_name} Results on Training Set:")
    print(f"  MSE     : {mse:.4f}")
    print(f"  R2      : {r2:.4f}")
    print(f"  Adj R2  : {adj_r2:.4f}")
    
    return {"model": model_name, "mse": mse, "r2": r2, "adj_r2": adj_r2}

In [15]:
# Train and evaluate Random Forest
rf_pipeline.fit(X_train, y_train)
rf_results = evaluate_model(rf_pipeline, X_train, y_train, "Random Forest")

# Train and evaluate XGBoost
xgb_pipeline.fit(X_train, y_train)
xgb_results = evaluate_model(xgb_pipeline, X_train, y_train, "XGBoost")



Random Forest Results on Training Set:
  MSE     : 1647554.8064
  R2      : 0.9245
  Adj R2  : 0.9244

XGBoost Results on Training Set:
  MSE     : 2162472.0000
  R2      : 0.9009
  Adj R2  : 0.9007


## Optuna tuning on Train + val

In [16]:
from sklearn.base import clone
from sklearn.pipeline import Pipeline
import pandas as pd

# Clone and fit preprocessor on train
fitted_preprocessor = clone(preprocessor)
fitted_preprocessor.fit(X_train,y_train)

# Transform train and val
X_train_trans = fitted_preprocessor.transform(X_train)
X_val_trans = fitted_preprocessor.transform(X_val)

# Combine train and val sets
X_full_train = pd.DataFrame(
    data=np.vstack([X_train_trans, X_val_trans])
)
y_full_train = pd.concat([y_train, y_val]).reset_index(drop=True)

print("✅ Combined training data shape:", X_full_train.shape, y_full_train.shape)

✅ Combined training data shape: (8368, 13) (8368,)


In [43]:
import optuna
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error, make_scorer
import numpy as np

def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 300),
        "max_depth": trial.suggest_int("max_depth", 3, 20),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "max_features": trial.suggest_categorical("max_features", [ "sqrt", "log2"]),
        "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
        "random_state": 42,
        "n_jobs": -1
    }

    model = RandomForestRegressor(**params)

    kf = KFold(n_splits=10, shuffle=True, random_state=42)
    mse_scores = cross_val_score(model, X_full_train, y_full_train,
                                 cv=kf,
                                 scoring=make_scorer(mean_squared_error, greater_is_better=False))
    
    return -np.mean(mse_scores)


In [44]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50, show_progress_bar=True)

print("\n🔍 Best Trial:")
print(f"  MSE     : {study.best_value}")
print(f"  Params  : {study.best_params}")

[I 2025-05-30 22:07:45,286] A new study created in memory with name: no-name-48027cb9-9ff0-4ac5-9fb4-3c06a6f3d64e


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-05-30 22:07:48,314] Trial 0 finished with value: 5471311.893504297 and parameters: {'n_estimators': 296, 'max_depth': 15, 'min_samples_split': 7, 'min_samples_leaf': 10, 'max_features': 'sqrt', 'bootstrap': False}. Best is trial 0 with value: 5471311.893504297.
[I 2025-05-30 22:07:50,178] Trial 1 finished with value: 5514177.181631086 and parameters: {'n_estimators': 154, 'max_depth': 20, 'min_samples_split': 5, 'min_samples_leaf': 7, 'max_features': 'sqrt', 'bootstrap': True}. Best is trial 0 with value: 5471311.893504297.
[I 2025-05-30 22:07:52,195] Trial 2 finished with value: 5665765.903161746 and parameters: {'n_estimators': 199, 'max_depth': 16, 'min_samples_split': 9, 'min_samples_leaf': 10, 'max_features': 'log2', 'bootstrap': True}. Best is trial 0 with value: 5471311.893504297.
[I 2025-05-30 22:07:54,595] Trial 3 finished with value: 6959037.852924156 and parameters: {'n_estimators': 283, 'max_depth': 5, 'min_samples_split': 6, 'min_samples_leaf': 5, 'max_features': '

## Training and Prediction with best parameters on Train+val

In [17]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Best parameters
best_params = {
    'n_estimators': 242,
    'max_depth': 13,
    'min_samples_split': 9,
    'min_samples_leaf': 1,
    'max_features': 'sqrt',
    'bootstrap': True,
    'random_state': 42,
    'n_jobs': -1
}

# Train model on preprocessed train+val data
model = RandomForestRegressor(**best_params)
model.fit(X_full_train, y_full_train)

# Predict on train+val
y_trainval_pred = model.predict(X_full_train)

# Evaluate
mse = mean_squared_error(y_full_train, y_trainval_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_full_train, y_trainval_pred)
r2 = r2_score(y_full_train, y_trainval_pred)

# Adjusted R²
n = X_full_train.shape[0]
p = X_full_train.shape[1]
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)

# Output
print("🔍 Training + Validation Set Metrics:")
print(f"  MSE     : {mse:,.2f}")
print(f"  RMSE    : {rmse:,.2f}")
print(f"  MAE     : {mae:,.2f}")
print(f"  R²      : {r2:.4f}")
print(f"  Adj R²  : {adj_r2:.4f}")

🔍 Training + Validation Set Metrics:
  MSE     : 3,681,142.24
  RMSE    : 1,918.63
  MAE     : 1,234.70
  R²      : 0.8292
  Adj R²  : 0.8289


## Prediction and Final Results on Test Data

In [53]:
# Step 1: Transform X_test
X_test_transformed = fitted_preprocessor.transform(X_test)

# Step 2: Predict using trained model
y_test_pred = model.predict(X_test_transformed)

# Step 3: Evaluate
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)
mae_test = mean_absolute_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)

# Adjusted R²
n_test = X_test_transformed.shape[0]
p_test = X_test_transformed.shape[1]
adj_r2_test = 1 - (1 - r2_test) * (n_test - 1) / (n_test - p_test - 1)

# Optional: MAPE
mape_test = np.mean(np.abs((y_test - y_test_pred) / y_test)) * 100 if np.all(y_test != 0) else None

# Step 4: Print
print("\n🔍 Test Set Evaluation Metrics (after transformation):")
print(f"  MSE     : {mse_test:,.2f}")
print(f"  RMSE    : {rmse_test:,.2f}")
print(f"  MAE     : {mae_test:,.2f}")
print(f"  R²      : {r2_test:.4f}")
print(f"  Adj R²  : {adj_r2_test:.4f}")
if mape_test is not None:
    print(f"  MAPE    : {mape_test:.2f}%")
else:
    print("  MAPE    : Skipped (y_test contains zeros)")



🔍 Test Set Evaluation Metrics (after transformation):
  MSE     : 4,278,251.73
  RMSE    : 2,068.39
  MAE     : 1,406.11
  R²      : 0.7932
  Adj R²  : 0.7919
  MAPE    : 16.61%


# Final Pipeline

In [18]:
# Final pipeline: Preprocessing + Model
final_pipeline = Pipeline([
    ('preprocessor', fitted_preprocessor),  # already fitted or will be fitted here
    ('model', model)
])

In [19]:
import joblib
joblib.dump(final_pipeline, "random_forest_pipeline.pkl")

['random_forest_pipeline.pkl']

In [12]:
import joblib
joblib.dump(X_train, "X_train.pkl")

['X_train.pkl']

In [1]:
joblib.dump(preprocessor, "preprocessor.pkl")

NameError: name 'joblib' is not defined

In [13]:
import joblib
joblib.dump(y_train, "y_train.pkl")

['y_train.pkl']