In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import joblib

In [2]:
# Load and preprocess data
def _first_segment(value):
    """Return the text before the first '||' separator; non-string values pass through unchanged."""
    # Blank CSV cells are read by pandas as float NaN, which has no .split().
    return value.split('||', 1)[0] if isinstance(value, str) else value


def load_and_preprocess_data(file_path):
    """
    Load the itinerary CSV and reduce it to the columns used for fare modelling.

    Steps:
      * rename the raw airport/date/fare columns to descriptive names;
      * derive ``departure_time_HHMM`` from the first '||'-separated entry of
        ``segmentsDepartureTimeRaw``, parsed as a timestamp, converted to UTC
        and formatted as ``HH:MM`` (unparseable or missing values stay missing);
      * derive ``cabin_type`` from the first '||'-separated entry of
        ``segmentsCabinCode`` (missing values stay missing).

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file, passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        An independent frame with the columns ``origin_airport``,
        ``destination_airport``, ``departure_date``, ``departure_time_HHMM``,
        ``cabin_type`` and ``fare``.

    Raises
    ------
    KeyError
        If a required source column is absent from the CSV.
    """
    data = pd.read_csv(file_path)

    # Rename columns for clarity
    data = data.rename(columns={
        'startingAirport': 'origin_airport',
        'destinationAirport': 'destination_airport',
        'flightDate': 'departure_date',
        'totalFare': 'fare'
    })

    # Only the first flight segment determines the departure time.
    # NOTE(review): utc=True normalises every offset to UTC, so HH:MM is the UTC
    # clock time rather than the local airport time — confirm this is intended.
    first_departure = data['segmentsDepartureTimeRaw'].map(_first_segment)
    data['departure_time_HHMM'] = pd.to_datetime(
        first_departure,
        utc=True,
        errors='coerce'
    ).dt.strftime('%H:%M')

    # Cabin of the first segment; NaN-safe, unlike calling x.split() on every cell.
    data['cabin_type'] = data['segmentsCabinCode'].map(_first_segment)

    relevant_columns = [
        'origin_airport', 'destination_airport', 'departure_date',
        'departure_time_HHMM', 'cabin_type', 'fare'
    ]
    # Explicit copy: callers may assign to columns without a SettingWithCopyWarning.
    return data[relevant_columns].copy()

In [3]:
# Encode categorical columns
def encode_data(data):
    """
    Label-encode the categorical feature columns of the dataset.

    The columns ``origin_airport``, ``destination_airport``, ``departure_date``,
    ``departure_time_HHMM`` and ``cabin_type`` are each fitted with their own
    ``LabelEncoder`` and replaced by the resulting integer codes.

    The input frame is NOT modified: encoding is applied to a copy, so the
    caller keeps the raw values and re-running this step is idempotent.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the five categorical columns listed above.

    Returns
    -------
    tuple
        ``(encoded, label_encoders)`` where ``encoded`` is the encoded copy of
        ``data`` and ``label_encoders`` maps each column name to its fitted
        encoder (in the column order listed above).
    """
    categorical_columns = (
        'origin_airport',
        'destination_airport',
        'departure_date',
        'departure_time_HHMM',
        'cabin_type',
    )

    # Copy first so the caller's DataFrame is left untouched.
    encoded = data.copy()

    label_encoders = {}
    for column in categorical_columns:
        encoder = LabelEncoder()
        encoded[column] = encoder.fit_transform(encoded[column])
        # Keep the fitted encoder so the same mapping can be reused at inference time.
        label_encoders[column] = encoder

    return encoded, label_encoders

In [4]:
# Split the data
def split_data(data, test_size=0.2, sample_fraction=0.1, random_state=42):
    """
    Draw a random subsample of ``data`` and split it into train and test sets.

    ``sample_fraction`` of the rows are sampled (seeded by ``random_state``),
    the five feature columns and the ``fare`` target are separated, and the
    result is partitioned with ``train_test_split`` using ``test_size`` and the
    same ``random_state``.

    Returns the tuple ``(X_train, X_test, y_train, y_test)``.
    """
    feature_columns = [
        'origin_airport',
        'destination_airport',
        'departure_date',
        'departure_time_HHMM',
        'cabin_type',
    ]

    # Subsample first: training on a fraction keeps the run time manageable.
    subset = data.sample(frac=sample_fraction, random_state=random_state)

    features = subset[feature_columns]
    target = subset['fare']

    partitions = train_test_split(
        features,
        target,
        test_size=test_size,
        random_state=random_state,
    )
    # Unpack into a plain 4-tuple: (X_train, X_test, y_train, y_test).
    X_train, X_test, y_train, y_test = partitions
    return X_train, X_test, y_train, y_test

In [5]:
# Train model
def train_model(X_train, y_train):
    """
    Fit a Random Forest regressor on the training data and return it.

    The forest is deliberately small (50 trees, depth capped at 10, at least
    2 samples per leaf) to limit the size of the saved model; the seed is
    fixed at 42 so training is repeatable.
    """
    hyperparameters = {
        'n_estimators': 50,
        'max_depth': 10,
        'min_samples_leaf': 2,
        'random_state': 42,
    }

    forest = RandomForestRegressor(**hyperparameters)
    forest.fit(X_train, y_train)
    return forest

In [6]:
# Evaluate model
def evaluate_model(model, X_test, y_test):
    """
    Score the model on held-out data and report the results.

    Predictions for ``X_test`` are compared against ``y_test`` using mean
    absolute error, the R² score and the root mean squared error. Each metric
    is printed on its own line and the three values are returned as
    ``(mae, r2, rmse)``.
    """
    predictions = model.predict(X_test)

    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_test, predictions))

    report = (
        ("Mean Absolute Error:", mae),
        ("R² Score:", r2),
        ("RMSE:", rmse),
    )
    for label, value in report:
        print(label, value)

    return mae, r2, rmse

In [7]:
# Save model and encoders
def save_model_and_encoders(model, label_encoders, model_path, encoders_path_prefix):
    """
    Persist the trained model and every label encoder with joblib.

    The model is written to ``model_path`` with compression level 3. Each
    encoder is written to ``"{encoders_path_prefix}_{column}_label_encoder.pkl"``.
    The prefix is concatenated as plain text, not joined as a directory: if a
    directory path such as ``.../models`` is passed, the files are created
    beside that directory as ``.../models_<column>_label_encoder.pkl``.

    Missing parent directories of any target file are created first, so saving
    into a fresh output folder does not fail.

    Parameters
    ----------
    model : object
        Fitted estimator to serialise.
    label_encoders : dict
        Mapping of column name to fitted encoder.
    model_path : str or path-like
        Destination file for the model.
    encoders_path_prefix : str or path-like
        Text prefix used to build each encoder's file name.
    """
    import os  # local import: this is the only place in the file that needs it

    def _ensure_parent_dir(path):
        # Create the directory that will contain `path`, if it has one and it is missing.
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    # Save model with compression
    _ensure_parent_dir(model_path)
    joblib.dump(model, model_path, compress=3)
    print(f"Model saved at {model_path}")

    # Save each label encoder
    for column, encoder in label_encoders.items():
        encoder_path = f"{encoders_path_prefix}_{column}_label_encoder.pkl"
        _ensure_parent_dir(encoder_path)
        joblib.dump(encoder, encoder_path)
        print(f"{column} encoder saved at {encoder_path}")

In [8]:
# Define main pipeline function
def main_pipeline(file_path, model_path, encoders_path_prefix):
    """
    Run the full workflow: load, encode, split, train, evaluate and save.

    ``file_path`` is the input CSV; ``model_path`` and ``encoders_path_prefix``
    are forwarded unchanged to ``save_model_and_encoders``. Nothing is returned;
    the evaluation metrics are printed by ``evaluate_model``.
    """
    # Stage 1: raw CSV -> cleaned frame -> integer-encoded frame (+ fitted encoders)
    prepared = load_and_preprocess_data(file_path)
    encoded, label_encoders = encode_data(prepared)

    # Stage 2: subsample and partition into train/test sets
    X_train, X_test, y_train, y_test = split_data(encoded)

    # Stage 3: fit on the training part, report metrics on the held-out part
    fitted_model = train_model(X_train, y_train)
    evaluate_model(fitted_model, X_test, y_test)

    # Stage 4: persist the artefacts needed for inference
    save_model_and_encoders(fitted_model, label_encoders, model_path, encoders_path_prefix)

In [9]:
# Execute the pipeline
if __name__ == "__main__":
    import os  # local import: only this entry point needs it

    # The project root defaults to the original author's checkout. Set the
    # AT3_PROJECT_ROOT environment variable to run the notebook from another
    # machine or location without editing the paths below.
    project_root = os.environ.get(
        'AT3_PROJECT_ROOT',
        '/Users/bananavodka/Projects/at3_mla/at3-model-experimentation',
    )

    file_path = os.path.join(project_root, 'data', 'combined_itineraries.csv')
    model_path = os.path.join(project_root, 'models', 'rf_model.pkl')
    # NOTE(review): save_model_and_encoders appends "_<column>_label_encoder.pkl"
    # directly to this prefix, so the encoder files are written next to the
    # models/ directory (".../models_<column>_label_encoder.pkl"), not inside it.
    # Kept as-is so existing file locations do not change — confirm whether a
    # prefix inside models/ was intended.
    encoders_path_prefix = os.path.join(project_root, 'models')

    main_pipeline(file_path, model_path, encoders_path_prefix)

Mean Absolute Error: 108.0273729140025
R² Score: 0.47180901173968803
RMSE: 149.96541783242972
Model saved at /Users/bananavodka/Projects/at3_mla/at3-model-experimentation/models/rf_model.pkl
origin_airport encoder saved at /Users/bananavodka/Projects/at3_mla/at3-model-experimentation/models_origin_airport_label_encoder.pkl
destination_airport encoder saved at /Users/bananavodka/Projects/at3_mla/at3-model-experimentation/models_destination_airport_label_encoder.pkl
departure_date encoder saved at /Users/bananavodka/Projects/at3_mla/at3-model-experimentation/models_departure_date_label_encoder.pkl
departure_time_HHMM encoder saved at /Users/bananavodka/Projects/at3_mla/at3-model-experimentation/models_departure_time_HHMM_label_encoder.pkl
cabin_type encoder saved at /Users/bananavodka/Projects/at3_mla/at3-model-experimentation/models_cabin_type_label_encoder.pkl
