In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from scikeras.wrappers import KerasRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error, root_mean_squared_error
import joblib
import os
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense, Dropout, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping


2024-11-06 04:14:30.309484: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-06 04:14:30.727580: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Open master df

In [2]:
# Load the pre-built train and test tables from the processed-data folder
df_train, df_test = (
    pd.read_feather(feather_path)
    for feather_path in (
        '../data/processed/train_data.feather',
        '../data/processed/test_data.feather',
    )
)


In [3]:
# Mean fare for every (origin, destination) pair present in the training data
route_keys = ['startingAirport', 'destinationAirport']
route_avg_price = (
    df_train.groupby(route_keys)['totalFare']
    .mean()
    .rename('average_price')
    .reset_index()
)

# Attach the route-level mean fare to every row of both frames.
# The lookup table comes from df_train only, so a test route that never
# appears in training ends up with NaN in average_price.
df_train = df_train.merge(route_avg_price, how='left', on=route_keys)
df_test = df_test.merge(route_avg_price, how='left', on=route_keys)


In [4]:
# Mean travel distance for every (origin, destination) pair in the training data
route_keys = ['startingAirport', 'destinationAirport']
route_avg_distance = (
    df_train.groupby(route_keys)['totalTravelDistance']
    .mean()
    .rename('average_distance')
    .reset_index()
)

# Attach the route-level mean distance to both frames (NaN for routes
# that do not occur in df_train).
df_train = df_train.merge(route_avg_distance, how='left', on=route_keys)
df_test = df_test.merge(route_avg_distance, how='left', on=route_keys)


In [5]:
# Calculate unique routes in the dataset
# Create a new column to represent the unique route, combining airports alphabetically, store as string
def unique_route(df):
    """Add a direction-independent ``route`` label to ``df`` and return it.

    The label is ``str(tuple(sorted((startingAirport, destinationAirport))))``,
    e.g. ``"('ATL', 'LAX')"`` for both ATL→LAX and LAX→ATL.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``startingAirport`` and ``destinationAirport``.
        Modified in place: a ``route`` column is added (or overwritten).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    endpoints = list(zip(df['startingAirport'], df['destinationAirport']))
    # Format each distinct pair once instead of running a Python lambda through
    # DataFrame.apply(axis=1) for every row; the resulting strings are identical.
    labels = {pair: str(tuple(sorted(pair))) for pair in set(endpoints)}
    # Positional assignment keeps the frame's own index (and works when empty).
    df['route'] = [labels[pair] for pair in endpoints]
    return df

# Tag every train and test row with its direction-independent route label
df_train, df_test = (unique_route(frame) for frame in (df_train, df_test))

# Modelling

Steps
1. Encode the categories
2. Normalise
3. Split into training and validation sets
4. Train
5. Evaluate

### Preprocess data

In [6]:

# Preprocess the features
# Define the features and target
def preprocess_drop(df):
    """Return a new frame without the raw date and arrival-airport-code columns.

    The input frame is left unchanged. A ``KeyError`` is raised by pandas if
    any of the three columns is missing.
    """
    unused_columns = ['searchDate', 'flightDate', 'segmentsArrivalAirportCode']
    return df.drop(columns=unused_columns)

# Label encode for airports
def process_airports(df):
    """Label-encode both airport columns of ``df`` with one shared encoder.

    The encoder is fitted on the union of the values found in
    ``startingAirport`` and ``destinationAirport``, so an airport maps to the
    same integer in either column. ``df`` is modified in place and the
    airport → integer mapping is printed.

    Returns
    -------
    tuple
        ``(df, le)`` — the same frame and the fitted ``LabelEncoder``.
    """
    known_codes = set(df['startingAirport']) | set(df['destinationAirport'])
    le = LabelEncoder()
    le.fit(sorted(known_codes))

    for airport_col in ('startingAirport', 'destinationAirport'):
        df[airport_col] = le.transform(df[airport_col])

    # Show which integer each airport code was assigned
    code_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
    print(code_mapping)

    return df, le

# Label encode for routes
def process_routes(df):
    """Label-encode the ``route`` column of ``df`` in place.

    Returns
    -------
    tuple
        ``(df, le_route)`` — the same frame and the ``LabelEncoder`` fitted on
        its route labels.
    """
    le_route = LabelEncoder().fit(df['route'])
    df['route'] = le_route.transform(df['route'])
    return df, le_route
    


# Feature groups consumed by the preprocessing helpers
boolean_cols = ['isNonStop']
ohe_cols = ['AirlineNameScore', 'CabinCode']
scale_cols = ['DepartureTimeHour','date_diff_days', 'CabinCode','average_distance', 'average_price']
# Columns reserved for one-hot encoding are not scaled. Filter with a
# comprehension rather than a set difference: set iteration order for strings
# varies between interpreter runs, which would change the column order the
# scaler is fitted on from one kernel session to the next.
scale_cols = [col for col in scale_cols if col not in ohe_cols]

# Process boolean columns
def process_boolean(df, boolean_cols):
    """Cast the listed columns of ``df`` to integers, in place, and return ``df``."""
    as_integers = df[boolean_cols].astype(int)
    df[boolean_cols] = as_integers
    return df

# Scale data
def process_scale(df, scale_cols):
    """Standardise ``scale_cols`` of ``df`` in place with a freshly fitted scaler.

    Prints the column list being scaled and the frame's columns before scaling.

    Returns
    -------
    tuple
        ``(df, scaler)`` — the same frame and the fitted ``StandardScaler``,
        so the same transformation can be applied to other frames.
    """
    print('scale_cols',scale_cols)
    print('df[scale_cols]',df.columns)
    scaler = StandardScaler().fit(df[scale_cols])
    df[scale_cols] = scaler.transform(df[scale_cols])
    return df, scaler

# One-hot encode categorical columns
def process_ohe(df, ohe_cols, ohe=None):
    """One-hot encode ``ohe_cols`` and return the widened frame plus the encoder.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``ohe_cols``. Not modified; a new frame is returned.
    ohe_cols : list of str
        Columns to replace by their one-hot indicator columns.
    ohe : fitted encoder, optional
        Object exposing ``transform`` and ``get_feature_names_out``. When
        omitted, a dense ``OneHotEncoder(handle_unknown='ignore')`` is fitted
        on ``df[ohe_cols]``.

    Returns
    -------
    tuple
        ``(encoded_df, ohe)``.
    """
    if ohe is None:
        ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        ohe.fit(df[ohe_cols])
    encoded = pd.DataFrame(
        ohe.transform(df[ohe_cols]),
        columns=ohe.get_feature_names_out(ohe_cols),
        # Reuse the caller's index: concat(axis=1) aligns on index labels, so a
        # default RangeIndex here would misalign rows (and add NaN rows) for any
        # frame whose index is not 0..n-1, e.g. after train_test_split.
        index=df.index,
    )
    remaining = df.drop(columns=ohe_cols)
    return pd.concat([remaining, encoded], axis=1), ohe

# Process test data
def process_test_data(df, le, le_route, scaler,  boolean_cols, scale_cols, ohe_cols):
    """Apply already-fitted preprocessing objects to a held-out frame, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to transform; its columns are overwritten.
    le : fitted encoder used for ``startingAirport`` and ``destinationAirport``.
    le_route : fitted encoder used for ``route``.
    scaler : fitted scaler applied to ``scale_cols``.
    boolean_cols : columns cast to int via ``process_boolean``.
    scale_cols : columns passed through ``scaler.transform``.
    ohe_cols : accepted but unused — the one-hot step is disabled.

    Returns
    -------
    pandas.DataFrame
        The transformed ``df``.
    """
    print('le',le)
    for airport_col in ('startingAirport', 'destinationAirport'):
        df[airport_col] = le.transform(df[airport_col])

    print('le_route',le_route)
    df['route'] = le_route.transform(df['route'])

    df = process_boolean(df, boolean_cols)
    print('processboolean',df.columns)

    scaled_values = scaler.transform(df[scale_cols])
    df[scale_cols] = scaled_values
    return df

# # weekday to get cos and sine
# def process_weekday(df):
#     df['weekday_sin'] = np.sin(2 * np.pi * df['weekday'] / 7)
#     df['weekday_cos'] = np.cos(2 * np.pi * df['weekday'] / 7)
#     df.drop('weekday', axis=1, inplace=True)
#     return df



# Preprocess the features
def preprocess_features(df):
    """Intended one-call preprocessing of a training frame (boolean cast,
    weekday features, one-hot encoding, scaling), printing the columns
    after each stage.

    NOTE(review): this function cannot run as written, and no visible cell
    calls it:
      * ``process_boolean``, ``process_ohe`` and ``process_scale`` are called
        without the column-list argument their definitions require;
      * ``process_weekday`` exists only as commented-out code in the visible
        part of the notebook;
      * ``process_ohe`` returns a ``(df, encoder)`` tuple, which is assigned
        to ``df`` here;
      * ``le`` is returned but never assigned, because the
        ``process_airports`` call is commented out.
    Either repair it to match the step-by-step processing cell or delete it.
    """
    # df, le = process_airports(df)
    # print the columns in df 
    print('processairport',df.columns)
    df = process_boolean(df)  # NOTE(review): missing boolean_cols argument
    print('processboolean',df.columns)
    df = process_weekday(df)  # NOTE(review): not defined in the visible notebook
    print('processweekday',df.columns)
    df = process_ohe(df)  # NOTE(review): missing ohe_cols; result is a tuple
    print('processohe',df.columns)
    df, scaler = process_scale(df)  # NOTE(review): missing scale_cols argument
    print('processscale',df.columns)
    
    return df, le, scaler

def preprocess_features_test(df):
    """Intended test-set counterpart of ``preprocess_features``.

    NOTE(review): this function cannot run as written, and no visible cell
    calls it: ``process_boolean`` and ``process_ohe`` are called without their
    required column-list argument, ``process_ohe`` returns a tuple, and neither
    ``process_weekday`` nor ``process_scale_test`` is defined in the visible
    part of the notebook. Repair or delete.
    """
    # df = process_airports_test(df)
    df = process_boolean(df)  # NOTE(review): missing boolean_cols argument
    df = process_weekday(df)  # NOTE(review): not defined in the visible notebook
    df = process_ohe(df)  # NOTE(review): missing ohe_cols; result is a tuple
    df = process_scale_test(df)  # NOTE(review): not defined in the visible notebook
    return df

def prepar_data_set(data_df):
    """Label-encode the categorical model inputs of ``data_df`` in place.

    Returns
    -------
    tuple
        ``(data_df, categoy_features, numerique_features, encoders)`` — the
        same frame, the categorical and numeric feature-name lists, and a dict
        mapping each categorical column to its fitted ``LabelEncoder``.
    """
    categoy_features = ['startingAirport', 'destinationAirport', 'AirlineNameScore', 'CabinCode', 'route']
    numerique_features = ['DepartureTimeHour', 'date_diff_days', 'average_distance', 'average_price', 'weekday']

    # One independent encoder per categorical column
    encoders = {name: LabelEncoder() for name in categoy_features}
    for name, column_encoder in encoders.items():
        data_df[name] = column_encoder.fit_transform(data_df[name])

    return data_df,categoy_features,numerique_features, encoders

def prepare_test_set(data_df, categoy_features, encoders=None):
    """Encode the categorical columns of ``data_df`` in place with fitted encoders.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Frame whose ``categoy_features`` columns are overwritten.
    categoy_features : list of str
        Columns to encode; each name is printed as it is processed.
    encoders : dict, optional
        Maps column name to a fitted encoder exposing ``transform`` (e.g. the
        dict returned by ``prepar_data_set``). When omitted, the module-level
        ``encoders`` variable is used, which is what this function previously
        relied on implicitly.

    Returns
    -------
    pandas.DataFrame
        The same ``data_df`` object.

    Raises
    ------
    NameError
        If ``encoders`` is not passed and no module-level ``encoders`` exists.
    """
    if encoders is None:
        # Backward-compatible fallback to the notebook-global variable.
        try:
            encoders = globals()['encoders']
        except KeyError:
            raise NameError("name 'encoders' is not defined") from None

    for col in categoy_features:
        print(col)
        data_df[col] = encoders[col].transform(data_df[col])
    return data_df



In [7]:
# Hold out 10% of the training rows as a validation frame (fixed seed, so the split is repeatable).
# NOTE(review): this rebinds df_train, so re-running only this cell shrinks df_train again.
# NOTE(review): average_price / average_distance were merged in before this split, so the
# route means were computed from rows that now sit in df_val — confirm that is acceptable.
df_train, df_val = train_test_split(df_train, test_size=0.1, random_state=42)

In [8]:
# Work on copies so the merged frames stay untouched, and drop the columns the model does not use
train, val, test = (
    preprocess_drop(frame.copy()) for frame in (df_train, df_val, df_test)
)

In [9]:
# Process training data: fit the airport/route encoders and the scaler on `train`.
# Each helper writes into the frame it receives and returns it, so `df_train`
# is rebound here to the encoded and scaled `train` frame.
df_train, le_airports = process_airports(train)
df_train, le_route = process_routes(df_train)
df_train = process_boolean(df_train, boolean_cols)
df_train, scaler = process_scale(df_train, scale_cols)
# One-hot encoding is currently disabled:
# df_train, ohe = process_ohe(df_train, ohe_cols)

# Process test data with the objects fitted on the training frame.
# NOTE(review): `val` is not encoded or scaled in this cell.
df_test = process_test_data(test, le_airports, le_route, scaler,  boolean_cols, scale_cols, ohe_cols)


{'ATL': 0, 'BOS': 1, 'CLT': 2, 'DEN': 3, 'DFW': 4, 'DTW': 5, 'EWR': 6, 'IAD': 7, 'JFK': 8, 'LAX': 9, 'LGA': 10, 'MIA': 11, 'OAK': 12, 'ORD': 13, 'PHL': 14, 'SFO': 15}
scale_cols ['average_price', 'average_distance', 'DepartureTimeHour', 'date_diff_days']
df[scale_cols] Index(['startingAirport', 'destinationAirport', 'isNonStop', 'totalFare',
       'totalTravelDistance', 'DepartureTimeHour', 'CabinCode',
       'AirlineNameScore', 'date_diff_days', 'weekday', 'average_price',
       'average_distance', 'route'],
      dtype='object')
le LabelEncoder()
le_route LabelEncoder()
processboolean Index(['startingAirport', 'destinationAirport', 'isNonStop', 'totalFare',
       'totalTravelDistance', 'DepartureTimeHour', 'CabinCode',
       'AirlineNameScore', 'date_diff_days', 'weekday', 'average_price',
       'average_distance', 'route'],
      dtype='object')


In [10]:
# Create tensorflow nn
def create_nn():
    """Build and compile the fare-regression network.

    Reads the notebook-level ``categoy_features``, ``numerique_features`` and
    ``df_train`` at call time. Every categorical feature gets its own
    one-value input followed by a 20-dimensional embedding whose table size is
    the largest code found in ``df_train`` plus one; every numeric feature is
    fed in as a plain one-value input. All branches are concatenated and passed
    through three ReLU dense layers (192, 288, 192 units) to a single linear
    output. The model is compiled with Adam (learning rate 0.001) and mean
    squared error.

    Returns
    -------
    tensorflow.keras.Model
        The compiled, untrained model.
    """
    model_inputs = []
    branches = []

    # Categorical features: one embedding table per column
    for col in categoy_features:
        cat_input = Input(shape=(1,), name=col)
        vocab_size = int(df_train[col].max()) + 1
        embedded = Embedding(input_dim=vocab_size, output_dim=20)(cat_input)
        embedded = Flatten()(embedded)
        model_inputs.append(cat_input)
        branches.append(embedded)

    # Numeric features go straight into the concatenation
    for col in numerique_features:
        num_input = Input(shape=(1,), name=col)
        model_inputs.append(num_input)
        branches.append(num_input)

    # Shared dense trunk on top of all branches
    x = Concatenate()(branches)
    for units in (192, 288, 192):
        x = Dense(units, activation='relu')(x)
    x = Dense(1)(x)

    model = Model(inputs=model_inputs, outputs=x)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='mean_squared_error',
    )
    return model

# Early stopping: watch the validation loss, tolerate 5 epochs without
# improvement, then stop and roll the model back to its best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [11]:
# Features to use
categoy_features = ['startingAirport', 'destinationAirport', 'AirlineNameScore', 'CabinCode', 'route']
numerique_features = ['DepartureTimeHour', 'date_diff_days', 'average_distance', 'average_price', 'weekday']

# The model's inputs are named after these columns, so the input dicts are
# derived from the two lists instead of being spelled out key by key. Adding
# or removing a feature now only requires editing the lists above.
model_features = categoy_features + numerique_features
input_dict = {col: df_train[col] for col in model_features}
input_dict_test = {col: df_test[col] for col in model_features}

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable across runs.
tf.keras.utils.set_random_seed(42)

model = create_nn()
# Train the model with early stopping
history = model.fit(
    input_dict,
    df_train['totalFare'],
    epochs=100,  # Upper bound; early stopping normally ends training sooner
    batch_size=64,
    validation_split=0.2,
    callbacks=[early_stopping]  # Include the EarlyStopping callback
)

# Evaluate the model on the held-out test frame
y_pred = model.predict(input_dict_test)
mae = mean_absolute_error(df_test['totalFare'], y_pred)
rmse = root_mean_squared_error(df_test['totalFare'], y_pred)
print(f'Mean Absolute Error: {mae}')
print(f'Root Mean Squared Error: {rmse}')

# Save the model. Create the target folder first so a missing directory
# cannot throw away a long training run at the very last step.
model_path = '../models/nns/nn_model.keras'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.save(model_path)




2024-11-06 04:15:37.021513: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-11-06 04:15:37.156574: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


Epoch 1/100




[1m57450/57450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 2ms/step - loss: 18843.9355 - val_loss: 15424.2109
Epoch 2/100
[1m57450/57450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 2ms/step - loss: 15184.3828 - val_loss: 14770.0430
Epoch 3/100
[1m57450/57450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 2ms/step - loss: 14616.9707 - val_loss: 14665.5166
Epoch 4/100
[1m57450/57450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 2ms/step - loss: 14199.7510 - val_loss: 14639.1143
Epoch 5/100
[1m57450/57450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 2ms/step - loss: 14025.7236 - val_loss: 14428.8584
Epoch 6/100
[1m57450/57450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 2ms/step - loss: 13998.5625 - val_loss: 14151.4658
Epoch 7/100
[1m57450/57450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 2ms/step - loss: 13732.3525 - val_loss: 13825.8857
Epoch 8/100
[1m57450/57450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

In [None]:
# NOTE(review): this unexecuted cell duplicates the preceding cell's
# train -> evaluate -> save sequence and writes to the same model path.
# Running it retrains from scratch and overwrites the saved model; delete it
# unless a second run is intended.

# Features to use
categoy_features = ['startingAirport', 'destinationAirport', 'AirlineNameScore', 'CabinCode', 'route']
numerique_features = ['DepartureTimeHour', 'date_diff_days', 'average_distance', 'average_price', 'weekday']
# 

# Training inputs, keyed by the model's input-layer names
input_dict={
    'startingAirport': df_train['startingAirport'],
    'destinationAirport': df_train['destinationAirport'],
    'AirlineNameScore': df_train['AirlineNameScore'],
    'CabinCode': df_train['CabinCode'],
    'DepartureTimeHour': df_train['DepartureTimeHour'],
    'date_diff_days': df_train['date_diff_days'],
    'average_distance': df_train['average_distance'],
    'average_price': df_train['average_price'],
    'weekday': df_train['weekday'],
    'route': df_train['route']
}

model = create_nn()
# Train the model with early stopping
history = model.fit(
    input_dict, 
    df_train['totalFare'], 
    epochs=100,  # Set a high number of epochs
    batch_size=64, 
    validation_split=0.2, 
    callbacks=[early_stopping]  # Include the EarlyStopping callback
)

# Evaluate the model
# predict
# Test inputs, built with the same keys as the training inputs
input_dict_test={
    'startingAirport': df_test['startingAirport'],
    'destinationAirport': df_test['destinationAirport'],
    'AirlineNameScore': df_test['AirlineNameScore'],
    'CabinCode': df_test['CabinCode'],
    'DepartureTimeHour': df_test['DepartureTimeHour'],
    'date_diff_days': df_test['date_diff_days'],
    'average_distance': df_test['average_distance'],
    'average_price': df_test['average_price'],
    'weekday': df_test['weekday'],
    'route': df_test['route']
}

y_pred = model.predict(input_dict_test)
mae = mean_absolute_error(df_test['totalFare'], y_pred)
rmse = root_mean_squared_error(df_test['totalFare'], y_pred)
print(f'Mean Absolute Error: {mae}')
print(f'Root Mean Squared Error: {rmse}')

# Save the model
# NOTE(review): the target directory is assumed to exist — confirm.
model.save('../models/nns/nn_model.keras')




2024-11-06 04:15:37.021513: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-11-06 04:15:37.156574: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


Epoch 1/100




[1m57450/57450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 2ms/step - loss: 18843.9355 - val_loss: 15424.2109
Epoch 2/100
[1m57450/57450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 2ms/step - loss: 15184.3828 - val_loss: 14770.0430
Epoch 3/100
[1m57450/57450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 2ms/step - loss: 14616.9707 - val_loss: 14665.5166
Epoch 4/100
[1m57450/57450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 2ms/step - loss: 14199.7510 - val_loss: 14639.1143
Epoch 5/100
[1m57450/57450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 2ms/step - loss: 14025.7236 - val_loss: 14428.8584
Epoch 6/100
[1m57450/57450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 2ms/step - loss: 13998.5625 - val_loss: 14151.4658
Epoch 7/100
[1m57450/57450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 2ms/step - loss: 13732.3525 - val_loss: 13825.8857
Epoch 8/100
[1m57450/57450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

In [None]:
# Test set predictions and metrics
# Process the test set

