In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import joblib


In [16]:
#Load Processed Datasets

data = pd.read_csv('/Users/thusondube/Downloads/sample_flights.csv')

In [19]:

    
# Rename columns for clarity
data = data.rename(columns={
        'startingAirport': 'origin_airport', 
        'destinationAirport': 'destination_airport',
        'totalFare': 'fare'
    })
    
def convert_to_epoch_seconds(time_str):
    try:
        # Extract the first time segment and convert to datetime
        time = pd.to_datetime(time_str.split('||')[0], utc=True, errors='coerce')
        # Round to the nearest 30 minutes
        time = (time.floor('30T') + pd.Timedelta(minutes=15)).floor('30T')
        # Convert to epoch seconds
        return int(time.timestamp())
    except Exception as e:
        # Handle any errors and return None or a default value
        return None

# Apply the function to the DataFrame column
data['departure_time_seconds'] = data['segmentsDepartureTimeRaw'].apply(convert_to_epoch_seconds)
    
    # Extract cabin type
data['cabin_type'] = data['segmentsCabinCode'].apply(lambda x: x.split('||')[0])
    
    # Select and return only the relevant columns
data = data[['origin_airport', 'destination_airport', 'departure_time_seconds', 'cabin_type', 'fare']]
   

  time = (time.floor('30T') + pd.Timedelta(minutes=15)).floor('30T')


In [26]:
# Encode categorical columns
def encode_data(data):
    """
    Encode categorical columns in the dataset using Label Encoding.
    """
    # Initialize LabelEncoders
    label_encoders = {
        'origin_airport': LabelEncoder(),
        'destination_airport': LabelEncoder(),
        'cabin_type': LabelEncoder()
    }
    
    # Apply encoding and save encoders for each column
    for column, encoder in label_encoders.items():
        data[column] = encoder.fit_transform(data[column])
    
    return data, label_encoders

In [27]:
# Split the data
def split_data(data, test_size=0.2, sample_fraction=0.1, random_state=42):
    """
    Sample a fraction of the data, separate features and target, and split into train/test sets.
    """
    # Take a random sample for quicker training
    sampled_data = data.sample(frac=sample_fraction, random_state=random_state)
    
    # Define features and target
    X = sampled_data[['origin_airport', 'destination_airport', 'departure_time_seconds', 'cabin_type']]
    y = sampled_data['fare']
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    
    return X_train, X_test, y_train, y_test

In [32]:
# Train model
def train_model(X_train, y_train):
    """
    Train a  Adaboost model with specified parameters.
    """
    # Initialize Random Forest with parameters for reduced model size
    model  = AdaBoostRegressor(random_state=0, n_estimators=100)
    
    # Train the model
    model.fit(X_train, y_train)
    
    return model

In [33]:
# Evaluate model
def evaluate_model(model, X_test, y_test):
    """
    Predict and evaluate the model on test data.
    """
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Calculate evaluation metrics
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
    print("Mean Absolute Error:", mae)
    print("R² Score:", r2)
    print("RMSE:", rmse)
    
    return mae, r2, rmse

In [None]:
# Execute the pipeline
if __name__ == "__main__":
    file_path = '/Users/bananavodka/Projects/at3_mla/at3-model-experimentation/data/combined_itineraries.csv'
    model_path = '/Users/bananavodka/Projects/at3_mla/at3-model-experimentation/models/rf_model.pkl'
    encoders_path_prefix = '/Users/bananavodka/Projects/at3_mla/at3-model-experimentation/model'

    main_pipeline(file_path, model_path, encoders_path_prefix)