# Dynamic Flight Pricing Model


In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random

In [48]:
df=pd.read_csv('flight.csv')

In [49]:
df.head()

Unnamed: 0.1,Unnamed: 0,From,To,Booking_date,Journey_date,Airline,Code,Departure,Arrival,Duration,Stops,Price
0,0,Delhi,Bangalore,2022-06-23,2022-06-23,Air India,AI-504,21:15,00:05\r\n+1D,2h 50m,non-stop,"₹9,420"
1,1,Delhi,Bangalore,2022-06-23,2022-06-23,IndiGo,6E-2131,21:30,00:15\r\n+1D,2h 45m,non-stop,"₹9,419"
2,2,Delhi,Bangalore,2022-06-23,2022-06-23,IndiGo,6E-6565,22:55,01:45\r\n+1D,2h 50m,non-stop,"₹9,419"
3,3,Delhi,Bangalore,2022-06-23,2022-06-23,Air India,AI-605,21:15,07:50\r\n+1D,10h 35m,1 stop,"₹14,775"
4,4,Delhi,Bangalore,2022-06-23,2022-06-23,Air India,AI-605,21:15,18:20\r\n+1D,21h 5m,1 stop,"₹14,985"


In [50]:
df.shape

(7548, 12)

In [51]:
df.isnull().sum()

Unnamed: 0         0
From               0
To                 0
Booking_date       0
Journey_date       0
Airline            0
Code               0
Departure          0
Arrival            0
Duration           0
Stops              0
Price           1486
dtype: int64

In [52]:
df = df.dropna(subset=['Price']).copy()

In [53]:
df['Price'] = df['Price'].replace({'₹': '', ',': ''}, regex=True).astype(int)

In [54]:
df.shape

(6062, 12)

In [55]:
df.head(2)

Unnamed: 0.1,Unnamed: 0,From,To,Booking_date,Journey_date,Airline,Code,Departure,Arrival,Duration,Stops,Price
0,0,Delhi,Bangalore,2022-06-23,2022-06-23,Air India,AI-504,21:15,00:05\r\n+1D,2h 50m,non-stop,9420
1,1,Delhi,Bangalore,2022-06-23,2022-06-23,IndiGo,6E-2131,21:30,00:15\r\n+1D,2h 45m,non-stop,9419


**Convert Booking and Journey Date to datetime**


In [56]:
df['Booking_date'] = pd.to_datetime(df['Booking_date'], format='%Y-%m-%d')
df['Journey_date'] = pd.to_datetime(df['Journey_date'], format='%Y-%m-%d')

In [57]:
df.head(2)

Unnamed: 0.1,Unnamed: 0,From,To,Booking_date,Journey_date,Airline,Code,Departure,Arrival,Duration,Stops,Price
0,0,Delhi,Bangalore,2022-06-23,2022-06-23,Air India,AI-504,21:15,00:05\r\n+1D,2h 50m,non-stop,9420
1,1,Delhi,Bangalore,2022-06-23,2022-06-23,IndiGo,6E-2131,21:30,00:15\r\n+1D,2h 45m,non-stop,9419


**Simulating booking hour**

In [58]:
np.random.seed(42)
df['Booking_Hour'] = np.random.randint(0, 24, size=len(df))
df['Booking_DateTime'] = df['Booking_date'] + pd.to_timedelta(df['Booking_Hour'], unit='h')

In [59]:
df.head(2)

Unnamed: 0.1,Unnamed: 0,From,To,Booking_date,Journey_date,Airline,Code,Departure,Arrival,Duration,Stops,Price,Booking_Hour,Booking_DateTime
0,0,Delhi,Bangalore,2022-06-23,2022-06-23,Air India,AI-504,21:15,00:05\r\n+1D,2h 50m,non-stop,9420,6,2022-06-23 06:00:00
1,1,Delhi,Bangalore,2022-06-23,2022-06-23,IndiGo,6E-2131,21:30,00:15\r\n+1D,2h 45m,non-stop,9419,19,2022-06-23 19:00:00


**Define booking slot function**

In [60]:
def get_booking_slot(hour):
    if 5 <= hour < 12:
        return 'Morning'
    elif 12 <= hour < 17:
        return 'Afternoon'
    elif 17 <= hour < 21:
        return 'Evening'
    else:
        return 'Night'

In [61]:
df['Booking_Slot'] = df['Booking_Hour'].apply(get_booking_slot)

In [62]:
df.head(2)

Unnamed: 0.1,Unnamed: 0,From,To,Booking_date,Journey_date,Airline,Code,Departure,Arrival,Duration,Stops,Price,Booking_Hour,Booking_DateTime,Booking_Slot
0,0,Delhi,Bangalore,2022-06-23,2022-06-23,Air India,AI-504,21:15,00:05\r\n+1D,2h 50m,non-stop,9420,6,2022-06-23 06:00:00,Morning
1,1,Delhi,Bangalore,2022-06-23,2022-06-23,IndiGo,6E-2131,21:30,00:15\r\n+1D,2h 45m,non-stop,9419,19,2022-06-23 19:00:00,Evening


**Calculation of lead time in days**


In [63]:
df['Lead_Time_Days'] = (df['Journey_date'] - df['Booking_date']).dt.days

In [64]:
df.head(2)

Unnamed: 0.1,Unnamed: 0,From,To,Booking_date,Journey_date,Airline,Code,Departure,Arrival,Duration,Stops,Price,Booking_Hour,Booking_DateTime,Booking_Slot,Lead_Time_Days
0,0,Delhi,Bangalore,2022-06-23,2022-06-23,Air India,AI-504,21:15,00:05\r\n+1D,2h 50m,non-stop,9420,6,2022-06-23 06:00:00,Morning,0
1,1,Delhi,Bangalore,2022-06-23,2022-06-23,IndiGo,6E-2131,21:30,00:15\r\n+1D,2h 45m,non-stop,9419,19,2022-06-23 19:00:00,Evening,0


**Saving Cleaned Dataset**

In [65]:
df.to_csv("flight_cleaned.csv", index=False)
print("Cleaned dataset saved as flight_cleaned.csv")

Cleaned dataset saved as flight_cleaned.csv


# Feature Engineering

**Converting Duration in Minutes**

In [66]:
def convert_duration(duration):
    if 'h' in duration and 'm' in duration:
        h, m = duration.replace('h', '').replace('m', '').split()
        return int(h) * 60 + int(m)
    elif 'h' in duration:
        h = duration.replace('h', '').strip()
        return int(h) * 60
    elif 'm' in duration:
        m = duration.replace('m', '').strip()
        return int(m)
    else:
        return 0

In [67]:
df['Duration_mins'] = df['Duration'].apply(convert_duration)

In [68]:
df.head(2)

Unnamed: 0.1,Unnamed: 0,From,To,Booking_date,Journey_date,Airline,Code,Departure,Arrival,Duration,Stops,Price,Booking_Hour,Booking_DateTime,Booking_Slot,Lead_Time_Days,Duration_mins
0,0,Delhi,Bangalore,2022-06-23,2022-06-23,Air India,AI-504,21:15,00:05\r\n+1D,2h 50m,non-stop,9420,6,2022-06-23 06:00:00,Morning,0,170
1,1,Delhi,Bangalore,2022-06-23,2022-06-23,IndiGo,6E-2131,21:30,00:15\r\n+1D,2h 45m,non-stop,9419,19,2022-06-23 19:00:00,Evening,0,165


**Converting Stops to numerical variables**

In [69]:
def encode_stops(stop):
    stop = stop.lower().strip()
    if 'non-stop' in stop:
        return 0
    elif '1 stop' in stop:
        return 1
    elif '2 stops' in stop:
        return 2
    elif '3 stops' in stop:
        return 3
    else:
        return -1 

In [70]:
df['Stops_num'] = df['Stops'].apply(encode_stops)

In [71]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,From,To,Booking_date,Journey_date,Airline,Code,Departure,Arrival,Duration,Stops,Price,Booking_Hour,Booking_DateTime,Booking_Slot,Lead_Time_Days,Duration_mins,Stops_num
0,0,Delhi,Bangalore,2022-06-23,2022-06-23,Air India,AI-504,21:15,00:05\r\n+1D,2h 50m,non-stop,9420,6,2022-06-23 06:00:00,Morning,0,170,0
1,1,Delhi,Bangalore,2022-06-23,2022-06-23,IndiGo,6E-2131,21:30,00:15\r\n+1D,2h 45m,non-stop,9419,19,2022-06-23 19:00:00,Evening,0,165,0
2,2,Delhi,Bangalore,2022-06-23,2022-06-23,IndiGo,6E-6565,22:55,01:45\r\n+1D,2h 50m,non-stop,9419,14,2022-06-23 14:00:00,Afternoon,0,170,0
3,3,Delhi,Bangalore,2022-06-23,2022-06-23,Air India,AI-605,21:15,07:50\r\n+1D,10h 35m,1 stop,14775,10,2022-06-23 10:00:00,Morning,0,635,1
4,4,Delhi,Bangalore,2022-06-23,2022-06-23,Air India,AI-605,21:15,18:20\r\n+1D,21h 5m,1 stop,14985,7,2022-06-23 07:00:00,Morning,0,1265,1


**Drop unnecessary columns**

In [72]:
df.drop(['Unnamed: 0','Code', 'Departure', 'Arrival', 'Duration', 'Stops', 'Booking_DateTime'], axis=1, inplace=True)

In [73]:
df.head(2)

Unnamed: 0,From,To,Booking_date,Journey_date,Airline,Price,Booking_Hour,Booking_Slot,Lead_Time_Days,Duration_mins,Stops_num
0,Delhi,Bangalore,2022-06-23,2022-06-23,Air India,9420,6,Morning,0,170,0
1,Delhi,Bangalore,2022-06-23,2022-06-23,IndiGo,9419,19,Evening,0,165,0


**Simulating Competitor Price**

In [74]:
def simulate_competitor_price(row):
    base = row['Price']
    variation = np.random.uniform(0.9, 1.1)  # 10% range
    simulated_price = base * variation
    if row['Stops_num'] > 0:
        simulated_price += random.randint(200, 500)
    return round(simulated_price)

In [75]:
df['Competitor_Price'] = df.apply(simulate_competitor_price, axis=1)

In [76]:
df['Price_Diff_vs_Competitor'] = df['Price'] - df['Competitor_Price']

In [77]:
df.head(2)

Unnamed: 0,From,To,Booking_date,Journey_date,Airline,Price,Booking_Hour,Booking_Slot,Lead_Time_Days,Duration_mins,Stops_num,Competitor_Price,Price_Diff_vs_Competitor
0,Delhi,Bangalore,2022-06-23,2022-06-23,Air India,9420,6,Morning,0,170,0,9286,134
1,Delhi,Bangalore,2022-06-23,2022-06-23,IndiGo,9419,19,Evening,0,165,0,8997,422


# Model Training

In [78]:
x = df.drop(columns=['Price', 'Booking_date', 'Journey_date'])
y = df['Price']


In [79]:
from sklearn.model_selection import train_test_split

In [80]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [81]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Random Forest

In [82]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)

In [83]:
rf.fit(x_train, y_train)

ValueError: could not convert string to float: 'Delhi'

In [None]:
rf_pred = rf.predict(x_test)

In [None]:
rf_mae = mean_absolute_error(y_test, rf_pred)
rf_r2 = r2_score(y_test, rf_pred)

In [None]:
print("Random Forest")
print(f"MAE: ₹{rf_mae:.2f} | R² Score: {rf_r2:.3f}")


# XGBoost

In [None]:
xgb = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)

In [None]:
xgb.fit(x_train, y_train)

In [None]:
xgb_pred = xgb.predict(x_test)

In [None]:
xgb_mae = mean_absolute_error(y_test, xgb_pred)
xgb_r2 = r2_score(y_test, xgb_pred)

In [None]:
print("XGBoost")
print(f"MAE: ₹{xgb_mae:.2f} | R² Score: {xgb_r2:.3f}")

**Saving the model**

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
import joblib

In [84]:
features = ['Airline', 'From', 'To', 'Stops_num', 'Duration_mins', 'Lead_Time_Days',
    'Booking_Slot', 'Competitor_Price', 'Price_Diff_vs_Competitor','Booking_Hour']
target = ['Price']

In [85]:
x= df[features]
y = df[target]

In [86]:
cat_features = ['Airline', 'From', 'To', 'Booking_Slot']
num_features = ['Stops_num', 'Duration_mins', 'Lead_Time_Days', 'Competitor_Price', 'Price_Diff_vs_Competitor','Booking_Hour']

In [87]:
preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
], remainder='passthrough')

In [88]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42))
])

In [89]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [90]:
pipeline.fit(x_train, y_train)
score = pipeline.score(x_test, y_test)

  return fit_method(estimator, *args, **kwargs)


In [91]:
print(f"✅ Pipeline Evaluation R² Score: {score:.4f}")

✅ Pipeline Evaluation R² Score: 0.9985


In [92]:
joblib.dump(pipeline, 'flight_price_model.pkl')
print("Model trained on full data and saved")

Model trained on full data and saved


In [93]:
x.columns

Index(['Airline', 'From', 'To', 'Stops_num', 'Duration_mins', 'Lead_Time_Days',
       'Booking_Slot', 'Competitor_Price', 'Price_Diff_vs_Competitor',
       'Booking_Hour'],
      dtype='object')

**--------------------------------------------------------------------------END-----------------------------------------------------------------------------**