In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score,mean_squared_error,root_mean_squared_error
from math import sqrt
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LinearRegression ,Lasso,Ridge

# Load dataset
df = pd.read_csv("flights_data.csv")

# Drop unnecessary columns
df.drop(['flight'], axis=1, inplace=True)

# Features & target
x =  df.drop('price', axis=1)
y = df['price']

# Categorical & numerical features
categorical_features = ['airline', 'source_city', 'destination_city', 'class', 'stops', 'departure_time', 'arrival_time']
numerical_features = ['duration', 'days_left']


In [2]:
# Numerical features
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical features
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformations
preprocessor = ColumnTransformer([
    ('num', num_transformer, numerical_features),
    ('cat', cat_transformer, categorical_features)
])


In [3]:
# Sample 50k rows for tuning
df_sample = df.sample(n=10000, random_state=42)
X_sample = df_sample.drop('price', axis=1)
y_sample = df_sample['price']


In [4]:

models = {
    'LinearRegression' : LinearRegression(),
    'Lasso': Lasso(random_state=42,alpha=0.05),
    'ridge_regression': Ridge(random_state=42,alpha=0.05),
    'RandomForest': RandomForestRegressor(random_state=42),
    'GradientBoosting': GradientBoostingRegressor(random_state=42),
    'XGBoost': XGBRegressor(objective='reg:squarederror', random_state=42)
}

param_grid = {

    'LinearRegression': {},
    'Lasso':
      {},
    'ridge_regression' :
      {},
    'RandomForest': {
        'model__n_estimators': [100, 150, 200],
        'model__max_depth': [i for i in range(50)],
        'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    'model__max_features': ['sqrt', 'log2']
    },
    'GradientBoosting': {
        'model__n_estimators': [100, 150],
        'model__learning_rate': [0.05, 0.1],
        'model__max_depth': [3, 5]
    },
    'XGBoost': {
        'model__n_estimators': [100, 150],
        'model__learning_rate': [0.05, 0.1],
        'model__max_depth': [i for i in range(50)]
    }
}


In [5]:
best_models = {}

for name, model in models.items():
    print(f"Training {name} on sample data...")
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    
    search = RandomizedSearchCV(
        pipeline,
        param_distributions=param_grid[name],
        n_iter=10,        # 5 random combinations
        cv=5,              # 3-fold CV
        scoring='r2',       # Use R² as metric
        n_jobs=-1,
        random_state=42
    )
    
    search.fit(X_sample, y_sample)
    
    best_models[name] = search.best_estimator_
    print(f"{name} best R² score (CV): {search.best_score_:.4f}")
    print(f"{name} best params: {search.best_params_}\n")
    mse = mean_squared_error(y_sample, search.predict(X_sample))
    rmse = mse ** 0.5
    print(f"mean square error: {mse:.4f}")
    print(f"{name} RMSE on sample data: {rmse:.4f}\n")



Training LinearRegression on sample data...




LinearRegression best R² score (CV): 0.9102
LinearRegression best params: {}

mean square error: 46749591.8249
LinearRegression RMSE on sample data: 6837.3673

Training Lasso on sample data...




Lasso best R² score (CV): 0.9102
Lasso best params: {}

mean square error: 46749593.0024
Lasso RMSE on sample data: 6837.3674

Training ridge_regression on sample data...




ridge_regression best R² score (CV): 0.9102
ridge_regression best params: {}

mean square error: 46749599.4754
ridge_regression RMSE on sample data: 6837.3679

Training RandomForest on sample data...
RandomForest best R² score (CV): 0.9626
RandomForest best params: {'model__n_estimators': 100, 'model__min_samples_split': 2, 'model__min_samples_leaf': 2, 'model__max_features': 'sqrt', 'model__max_depth': 40}

mean square error: 9442401.5468
RandomForest RMSE on sample data: 3072.8491

Training GradientBoosting on sample data...




GradientBoosting best R² score (CV): 0.9599
GradientBoosting best params: {'model__n_estimators': 150, 'model__max_depth': 5, 'model__learning_rate': 0.1}

mean square error: 14618335.0337
GradientBoosting RMSE on sample data: 3823.3931

Training XGBoost on sample data...
XGBoost best R² score (CV): 0.9638
XGBoost best params: {'model__n_estimators': 150, 'model__max_depth': 7, 'model__learning_rate': 0.1}

mean square error: 6638689.0799
XGBoost RMSE on sample data: 2576.5654



In [124]:
best_models

{'LinearRegression': Pipeline(steps=[('preprocessor',
                  ColumnTransformer(transformers=[('num',
                                                   Pipeline(steps=[('imputer',
                                                                    SimpleImputer(strategy='median')),
                                                                   ('scaler',
                                                                    StandardScaler())]),
                                                   ['duration', 'days_left']),
                                                  ('cat',
                                                   Pipeline(steps=[('imputer',
                                                                    SimpleImputer(strategy='most_frequent')),
                                                                   ('onehot',
                                                                    OneHotEncoder(handle_unknown='ignore'))]),
                        

In [None]:

# Split full data
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=42)

# Choose best model from tuning, e.g., XGBoost
final_model = best_models['XGBoost']

# Train on full training set
final_model.fit(X_train, y_train)

# Predict on test set
y_pred = final_model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_absolute_error(y_test,y_pred)
rmse =sqrt(mean_absolute_error(y_test,y_pred))


print(f"R² score on test set: {r2:.4f}")
print(f"mse  score on test set: {mse:.4f}")
print(f"rmse  score on test set: {rmse:.4f}")


R² score on test set: 0.9764
mse  score on test set: 1954.6390
rmse  score on test set: 44.2113


In [126]:
import joblib
joblib.dump(final_model, 'flight_price_model.pkl')


['flight_price_model.pkl']