In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from scikeras.wrappers import KerasRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error, root_mean_squared_error
import joblib
import os
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense, Dropout, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras import layers



2024-11-05 23:19:12.676996: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-05 23:19:13.124577: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Open master df

In [2]:
# Open master
df_train= pd.read_feather('../data/processed/train_data.feather')
df_test= pd.read_feather('../data/processed/test_data.feather')


In [3]:
# Calculate average price per route
route_avg_price = (
    df_train.groupby(['startingAirport', 'destinationAirport'])['totalFare']
    .mean()
    .reset_index()
    .rename(columns={'totalFare': 'average_price'})
)


# Merge this back to the original dataset
df_train = df_train.merge(route_avg_price, on=['startingAirport', 'destinationAirport'], how='left')
df_test = df_test.merge(route_avg_price, on=['startingAirport', 'destinationAirport'], how='left')


In [4]:
# Calculate average distance to the dataset
route_avg_distance = (
    df_train.groupby(['startingAirport', 'destinationAirport'])['totalTravelDistance']
    .mean()
    .reset_index()
    .rename(columns={'totalTravelDistance': 'average_distance'})
)

df_train = df_train.merge(route_avg_distance, on=['startingAirport', 'destinationAirport'], how='left')
df_test = df_test.merge(route_avg_distance, on=['startingAirport', 'destinationAirport'], how='left')


In [5]:
# Calculate unique routes in the dataset
# Create a new column to represent the unique route, combining airports alphabetically, store as string
def unique_route(df):
    df['route'] = df[['startingAirport', 'destinationAirport']].apply(
        lambda x: str(tuple(sorted(x))), axis=1
    )
    return df

df_train = unique_route(df_train)
df_test = unique_route(df_test)

# Modelling

Steps
1. Encode the categories
2. Normalise
3. Split and train
4. train 
5. Eval

### Preprocess data

In [6]:

# Preprocess the features
# Define the features and target
def preprocess_drop(df):
    df = df.drop([ 'searchDate', 'flightDate','segmentsArrivalAirportCode'], axis=1)
    
    return df

# #Label encode for airports
# def process_airports(df):
#     le = LabelEncoder()
#     all_airports = sorted(set(df['startingAirport']).union(df['destinationAirport']))
#     le.fit(all_airports)
    
#     df['startingAirport'] = le.transform(df['startingAirport'])
#     df['destinationAirport'] = le.transform(df['destinationAirport'])
#     # print dictionary of the label encoder for airports with original values and the encoded values
#     print(dict(zip(le.classes_, le.transform(le.classes_))))

#     return df, le

# def process_airports_test(df):

#     df['startingAirport'] = le.transform(df['startingAirport'])
#     df['destinationAirport'] = le.transform(df['destinationAirport'])
#     return df
    


#Features to process
boolean_cols = ['isNonStop']
ohe_cols = ['AirlineNameScore', 'CabinCode']
scale_cols = ['DepartureTimeHour','date_diff_days', 'CabinCode','average_distance', 'average_price']
scale_cols = list(set(scale_cols) - set(ohe_cols))

# Encode the boolean column
def process_boolean(df):
    df[boolean_cols] = df[boolean_cols].astype(int)
    return df

# scale data
def process_scale(df):
    scaler = StandardScaler()
    df[scale_cols] = scaler.fit_transform(df[scale_cols])
    return df, scaler

def process_scale_test(df):
    df[scale_cols] = scaler.transform(df[scale_cols])
    return df

# weekday to get cos and sine
def process_weekday(df):
    df['weekday_sin'] = np.sin(2 * np.pi * df['weekday'] / 7)
    df['weekday_cos'] = np.cos(2 * np.pi * df['weekday'] / 7)
    df.drop('weekday', axis=1, inplace=True)
    return df

# onehot encode cabin code
def process_ohe(df):
    print("Columns before one-hot encoding:", df.columns)
    print("Columns to one-hot encode:", ohe_cols)
    df = pd.get_dummies(df, columns=ohe_cols)
    return df

# Preprocess the features
def preprocess_features(df):
    # df, le = process_airports(df)
    # print the columns in df 
    print('processairport',df.columns)
    df = process_boolean(df)
    print('processboolean',df.columns)
    df = process_weekday(df)
    print('processweekday',df.columns)
    df = process_ohe(df)
    print('processohe',df.columns)
    df, scaler = process_scale(df)
    print('processscale',df.columns)
    
    return df, le, scaler

def preprocess_features_test(df):
    # df = process_airports_test(df)
    df = process_boolean(df)
    df = process_weekday(df)
    df = process_ohe(df)
    df = process_scale_test(df)
    return df

def prepar_data_set(data_df):
    categoy_features = ['startingAirport', 'destinationAirport', 'AirlineNameScore', 'CabinCode', 'route']
    numerique_features = ['DepartureTimeHour', 'date_diff_days', 'average_distance', 'average_price', 'weekday']
    encoders = {}
    for col in categoy_features:
        encoder = LabelEncoder()
        data_df[col] = encoder.fit_transform(data_df[col])
        encoders[col] = encoder
    return data_df,categoy_features,numerique_features, encoders

def prepare_test_set(data_df):
    categoy_features = ['startingAirport', 'destinationAirport', 'AirlineNameScore', 'CabinCode', 'route']
    
    for col in categoy_features:
        print(col)
        data_df[col] = encoders[col].transform(data_df[col])
    return data_df



In [7]:
df_train, df_val = train_test_split(df_train, test_size=0.9, random_state=42)

In [8]:
train = preprocess_drop(df_train.copy())
val = preprocess_drop(df_val.copy())
test = preprocess_drop(df_test.copy())

In [9]:
#trainset
data_df,categoy_features,numerique_features, encoders = prepar_data_set(train)
scaler = StandardScaler()
data_df[numerique_features] = scaler.fit_transform(data_df[numerique_features])

data_df_test = prepare_test_set(test)
data_df_test[numerique_features] = scaler.transform(data_df_test[numerique_features])


startingAirport
destinationAirport
AirlineNameScore
CabinCode
route


In [None]:
# Create tensorflow nn
def create_nn():
    # Define the input layers
    input_layers = []
    output_layers = []
    for col in categoy_features:
        input_layer = Input(shape=(1,), name=col)
        embedding = Embedding(input_dim=int(data_df[col].max()) + 1, output_dim=10)(input_layer)
        embedding = Flatten()(embedding)
        input_layers.append(input_layer)
        output_layers.append(embedding)
    for col in numerique_features:
        input_layer = Input(shape=(1,), name=col)
        input_layers.append(input_layer)
        output_layers.append(input_layer)
    # Concatenate the layers
    x = Concatenate()(output_layers)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.5)(x)   
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.5)(x)
    x = Dense(1)(x)
    model = Model(inputs=input_layers, outputs=x)
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

input_dict={
    'startingAirport': data_df['startingAirport'],
    'destinationAirport': data_df['destinationAirport'],
    'AirlineNameScore': data_df['AirlineNameScore'],
    'CabinCode': data_df['CabinCode'],
    'DepartureTimeHour': data_df['DepartureTimeHour'],
    'date_diff_days': data_df['date_diff_days'],
    'average_distance': data_df['average_distance'],
    'average_price': data_df['average_price'],
    'weekday': data_df['weekday'],
    'route': data_df['route']
}

model = create_nn()
history = model.fit(input_dict, data_df['totalFare'], epochs=10, batch_size=32, validation_split=0.2)
# Evaluate the model
# predict
input_dict_test={
    'startingAirport': data_df_test['startingAirport'],
    'destinationAirport': data_df_test['destinationAirport'],
    'AirlineNameScore': data_df_test['AirlineNameScore'],
    'CabinCode': data_df_test['CabinCode'],
    'DepartureTimeHour': data_df_test['DepartureTimeHour'],
    'date_diff_days': data_df_test['date_diff_days'],
    'average_distance': data_df_test['average_distance'],
    'average_price': data_df_test['average_price'],
    'weekday': data_df_test['weekday'],
    'route': data_df_test['route']
}

y_pred = model.predict(input_dict_test)
mae = mean_absolute_error(data_df_test['totalFare'], y_pred)
rmse = root_mean_squared_error(data_df_test['totalFare'], y_pred)
print(f'Mean Absolute Error: {mae}')
print(f'Root Mean Squared Error: {rmse}')

# Save the model
model.save('../models/nn/nn_model')





2024-11-05 23:19:45.648336: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-11-05 23:19:45.896231: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


Epoch 1/10




[1m12767/12767[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 2ms/step - loss: 32554.7109 - val_loss: 20135.9668
Epoch 2/10
[1m12767/12767[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2ms/step - loss: 24085.2871 - val_loss: 19876.9297
Epoch 3/10
[1m12767/12767[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2ms/step - loss: 23465.6895 - val_loss: 19979.5762
Epoch 4/10
[1m12767/12767[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step - loss: 23422.5977 - val_loss: 19255.8301
Epoch 5/10
[1m12767/12767[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step - loss: 23077.3574 - val_loss: 19816.5469
Epoch 6/10
[1m12767/12767[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 2ms/step - loss: 22766.9531 - val_loss: 18943.2285
Epoch 7/10
[1m12767/12767[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2ms/step - loss: 22510.0566 - val_loss: 18827.3926
Epoch 8/10
[1m12767/12767[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0

ValueError: Invalid filepath extension for saving. Please add either a `.keras` extension for the native Keras format (recommended) or a `.h5` extension. Use `model.export(filepath)` if you want to export a SavedModel for use with TFLite/TFServing/etc. Received: filepath=../models/nn/nn_model.

In [None]:
# Test set predictions and metrics
# Process the test set

