In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import joblib

In [3]:
# Perform stratified sampling by 'startingAirport' and 'destinationAirport'
df = pd.read_csv('/Users/thusondube/Downloads/combined_itineraries.csv', low_memory=False)
sample_fraction = 0.20
sampled_df = df.groupby(['startingAirport', 'destinationAirport'], group_keys=False).apply(lambda x: x.sample(frac=sample_fraction))

  sampled_df = df.groupby(['startingAirport', 'destinationAirport'], group_keys=False).apply(lambda x: x.sample(frac=sample_fraction))


In [57]:
sampled_df.head()


Unnamed: 0,legId,searchDate,flightDate,startingAirport,destinationAirport,travelDuration,isBasicEconomy,isRefundable,isNonStop,totalFare,...,segmentsArrivalTimeEpochSeconds,segmentsArrivalTimeRaw,segmentsArrivalAirportCode,segmentsDepartureAirportCode,segmentsAirlineName,segmentsAirlineCode,segmentsEquipmentDescription,segmentsDurationInSeconds,segmentsDistance,segmentsCabinCode
4346043,30e1a0a9e8578d55af97ed9171600e77,2022-05-13,2022-06-17,ATL,BOS,PT4H35M,False,False,False,411.6,...,1655468940||1655477400,2022-06-17T08:29:00.000-04:00||2022-06-17T10:5...,EWR||BOS,ATL||EWR,United||United,UA||UA,Airbus A319||Boeing 737-700,8040||4860,762||185,coach||coach
4431074,204fdd5df0bc2cbc73e9651d50f39666,2022-05-01,2022-05-17,ATL,BOS,PT6H56M,False,False,False,198.6,...,1652823300||1652840160,2022-05-17T17:35:00.000-04:00||2022-05-17T22:1...,EWR||BOS,ATL||EWR,United||United,UA||UA,Boeing 737-800||Airbus A320,8100||5100,762||185,coach||coach
4313364,f761e7b895cf3326e213d63ff33d2a8c,2022-05-01,2022-05-22,ATL,BOS,PT2H35M,False,False,True,190.59,...,1653224400,2022-05-22T09:00:00.000-04:00,BOS,ATL,Spirit Airlines,NK,Airbus A319,9300,,coach
4041545,2e59877138961e0a89a2307c0637d1b1,2022-04-22,2022-05-18,ATL,BOS,PT6H56M,False,False,False,214.6,...,1652909700||1652926560,2022-05-18T17:35:00.000-04:00||2022-05-18T22:1...,EWR||BOS,ATL||EWR,United||United,UA||UA,Boeing 737-800||Airbus A320,8100||5100,762||185,coach||coach
4384964,a8aa7f4aebdbfe1a5af2b20657c12aab,2022-04-19,2022-05-19,ATL,BOS,PT4H56M,False,False,False,240.1,...,1652969100||1652982000,2022-05-19T10:05:00.000-04:00||2022-05-19T13:4...,CLT||BOS,ATL||CLT,American Airlines||American Airlines,AA||AA,Airbus A319||Boeing 737-800,4860||8400,228||728,coach||coach


In [58]:
sampled_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2704001 entries, 4346043 to 11081564
Data columns (total 23 columns):
 #   Column                             Dtype  
---  ------                             -----  
 0   legId                              object 
 1   searchDate                         object 
 2   flightDate                         object 
 3   startingAirport                    object 
 4   destinationAirport                 object 
 5   travelDuration                     object 
 6   isBasicEconomy                     bool   
 7   isRefundable                       bool   
 8   isNonStop                          bool   
 9   totalFare                          float64
 10  totalTravelDistance                float64
 11  segmentsDepartureTimeEpochSeconds  object 
 12  segmentsDepartureTimeRaw           object 
 13  segmentsArrivalTimeEpochSeconds    object 
 14  segmentsArrivalTimeRaw             object 
 15  segmentsArrivalAirportCode         object 
 16  segmentsDepartur

In [4]:
# Load and preprocess data
def load_and_preprocess_data(file_path):
    """
    Load the dataset, rename relevant columns, and select only the necessary columns.
    """
    # Load dataset
    data = file_path
    
    # Rename columns for clarity
    data = data.rename(columns={
        'startingAirport': 'origin_airport', 
        'destinationAirport': 'destination_airport',
        'totalFare': 'fare'
        
    })
    
    
    # Extract cabin type
    data['departure_time_seconds'] = data['segmentsDepartureTimeEpochSeconds'].apply(lambda x: x.split('||')[0] if '||' in str(x) else x)
    data['cabin_type'] = data['segmentsCabinCode'].apply(lambda x: x.split('||')[0] if '||' in str(x) else x)
    data['departure_time_seconds'] = data['departure_time_seconds'].astype('int64')
    # Select and return only the relevant columns
    data = data[['origin_airport', 'destination_airport','departure_time_seconds', 'cabin_type', 'fare']]
    return data

In [5]:

# Encode categorical columns and apply convert_to_epoch_seconds function
def encode_data(data):
    """
    Encode categorical columns in the dataset using Label Encoding.
    Apply convert_to_epoch_seconds function to 'segmentsDepartureTimeRaw' column.
    """
    # Initialize LabelEncoders
    label_encoders = {
        'origin_airport': LabelEncoder(),
        'destination_airport': LabelEncoder(),
        'cabin_type': LabelEncoder()
    }
    
    # Apply encoding and save encoders for each column
    for column, encoder in label_encoders.items():
        data[column] = encoder.fit_transform(data[column])
    
    return data, label_encoders

In [6]:
# Split the data
def split_data(data, test_size=0.2, random_state=42):
    """
    Sample a fraction of the data, separate features and target, and split into train/test sets.
    """
    # Take a random sample for quicker training
    
    
    # Define features and target
    X = data[['origin_airport', 'destination_airport', 'departure_time_seconds', 'cabin_type']]
    y = data['fare']
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    
    return X_train, X_test, y_train, y_test

In [7]:
# Train model
def train_model(X_train, y_train):

    # Initialize Random Forest with parameters for reduced model size
    model  = xgb.XGBRegressor(n_estimators=50, max_depth=3, learning_rate=0.1, n_jobs=-1, random_state=0)
    
    # Train the model
    model.fit(X_train, y_train)
    
    return model

In [8]:
# Evaluate model
def evaluate_model(model, X_test, y_test):
    """
    Predict and evaluate the model on test data.
    """
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Calculate evaluation metrics
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
    print("Mean Absolute Error:", mae)
    print("R² Score:", r2)
    print("RMSE:", rmse)
    
    return mae, r2, rmse

In [9]:
# Save model and encoders
def save_model_and_encoders(model, label_encoders, model_path, encoders_path_prefix):
    """
    Save the trained model and label encoders.
    """
    #Save model with compression
    joblib.dump(model, model_path, compress=3)
    print(f"Model saved at {model_path}")
    
    # Save each label encoder
    for column, encoder in label_encoders.items():
        encoder_path = f"{encoders_path_prefix}_{column}_label_encoder.pkl"
        joblib.dump(encoder, encoder_path)
        print(f"{column} encoder saved at {encoder_path}")

In [10]:
# Define main pipeline function
def main_pipeline(file_path, model_path, encoders_path_prefix):
    """
    Main pipeline for loading data, preprocessing, training, and saving the model.
    """
    # Load and preprocess data
    data = load_and_preprocess_data(file_path)
    
    # Encode categorical data
    data, label_encoders = encode_data(data)
    
    # Split data
    X_train, X_test, y_train, y_test = split_data(data)
    
    # Train model
    model = train_model(X_train, y_train)
    
    # Evaluate model
    evaluate_model(model, X_test, y_test)
    
    # Save model and encoders
    save_model_and_encoders(model, label_encoders, model_path, encoders_path_prefix)

In [11]:
# Execute the pipeline
file_path = sampled_df
model_path = '../models/xgboost_model.pkl'
encoders_path_prefix = '../models/'

main_pipeline(file_path, model_path, encoders_path_prefix)

Mean Absolute Error: 128.1879413199339
R² Score: 0.31093446118469836
RMSE: 172.6284840403228
Model saved at ../models/xgboost_model.pkl
origin_airport encoder saved at ../models/_origin_airport_label_encoder.pkl
destination_airport encoder saved at ../models/_destination_airport_label_encoder.pkl
cabin_type encoder saved at ../models/_cabin_type_label_encoder.pkl


In [12]:
X_train, X_test, y_train, y_test = split_data(data)

NameError: name 'data' is not defined