In [1]:
import pandas as pd
import numpy as np
import datetime as dt
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import minmax_scale

from pullDate_FullPeriod import pull_data , pull_seas 
from utility import *


In [2]:
# Python local connection to Oracle (herccrt) and Teradata (mosaic)
def connect_to_servers():
    from config import  herccrt,mosaic, azure
    hcrt = herccrt().con()
    mos = mosaic().con()
    az = azure().con()
    return hcrt, mos, az

# jupyter notebook settings
import warnings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 20) # DON't Use None, it will show every row --> resulting in CRASH

hcrt, mos, az = connect_to_servers()

## Kronos .Sina Model

_SUMMARY_


In [3]:
# Kronos Model
import os
from keras import backend as K 
import tensorflow as tf
from tensorflow.keras import Input
from tensorflow.keras.layers import Reshape , BatchNormalization, LSTM, Concatenate, Dense, Activation, Flatten, Conv2D, Conv1D, ConvLSTM2D, Conv3D
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, History, ModelCheckpoint
import random


def kronos_32s_model(para_epochs, para_early_stop, para_model_name, para_sea_len, para_sea_dense, window, train_list, val_list, test_list):
        
    # Set Random Seed 
    seed_value = 44
    random.seed(seed_value)
    np.random.seed(seed_value)
    tf.random.set_seed(seed_value)

    
    # extract train/val/test datasets  ** since TF.keras only accepts channels_last we have to move axix **
    train_fc, train_sea, train_traf_time, train_output = train_list
    val_fc, val_sea, val_traf_time, val_output = val_list
    test_fc, test_sea, test_traf_time, test_gold_output = test_list
    
    # Reshape for Conv3d with channel 1.
    train_fc, val_fc, test_fc = train_fc.reshape(list(train_fc.shape)+[1]), val_fc.reshape(list(val_fc.shape)+[1]), test_fc.reshape(list(test_fc.shape)+[1]) 
    
    
    # Model Parameters:
    batch_size = 100
    early_stop = 100
    filter_num = 8
    rec_act = 'relu'
    rec_dropout = 0.1
    dropout = 0.1
    lstm_size = 64
    patience= early_stop
    batch_size_nbr = batch_size
    Error_Method = 'MC_dropout'
    B = batch_size
    # 128, 64,64

    # Input sizes: 
    K.clear_session()
    input_tensor_closure = Input(shape = (2,7,10,1)) # lets use 3D  with Channel of 1
    input_tensor_seas = Input(shape = (window ,para_sea_len))
    input_tensor_seas = Input(shape = (para_sea_len))
    input_tensor_trrafic_tseries = Input(shape = (window ,2,7,10))

    # Feed the TF-t-series to ConvLSTM:
    cnn_lstm_tf_time = ConvLSTM2D(filters = filter_num , kernel_size = 3, padding = 'same', data_format = 'channels_first', 
                                  recurrent_activation = rec_act, recurrent_dropout = rec_dropout, dropout=dropout ,  return_sequences = False)(input_tensor_trrafic_tseries)
    cnn_lstm_tf_time = BatchNormalization(axis = 1)(cnn_lstm_tf_time)
    # cnn_lstm_tf_time = ConvLSTM2D(filters = filter_num//2 , kernel_size = 2, padding = 'same', data_format = 'channels_first', 
    #                               recurrent_activation = rec_act, recurrent_dropout = rec_dropout, dropout=dropout ,  return_sequences = False)(cnn_lstm_tf_time)
    # cnn_lstm_tf_time = BatchNormalization(axis = 1)(cnn_lstm_tf_time)
    cnn_lstm_tf_time = Flatten()(cnn_lstm_tf_time)
    cnn_lstm_tf_time = Dense(32, activation=rec_act)(cnn_lstm_tf_time)

    # print(cnn_lstm_tf_time)


    # # layer for seasonality Using a RNN (Here GRU instead of LSTM)
    # rnn_seas = GRU(lstm_size, recurrent_dropout = rec_dropout, dropout=dropout  , return_sequences = False  )(input_tensor_seas)
    # # rnn_seas = Bidirectional(GRU(lstm_size//2, recurrent_dropout = rec_dropout, dropout=dropout  , return_sequences = False ))(rnn_seas)
    # rnn_seas = BatchNormalization()(rnn_seas)
    # rnn_seas = Dense(64, activation=rec_act)(rnn_seas)

    # Seas No RNN:
    rnn_seas = Dense(128, activation=rec_act)(input_tensor_seas)
    rnn_seas = Dense(32, activation=rec_act)(rnn_seas)


    # print(rnn_seas)


    # layer for FC:
    cnn_FC = Conv3D(filters = filter_num , kernel_size = 3, strides = (1, 1, 1), padding = 'same', data_format = 'channels_last')(input_tensor_closure)
    cnn_FC = Conv3D(filters = filter_num//2 , kernel_size = 3, strides = (1, 1, 1), padding = 'same', data_format = 'channels_last')(cnn_FC)
    cnn_FC = BatchNormalization()(cnn_FC)
    cnn_FC = Flatten()(cnn_FC)
    cnn_FC = Dense(32, activation=rec_act)(cnn_FC)

    # print(cnn_FC)


    # concat frac_closure and seasonality and TF_sries
    FF_concat_all = Concatenate()([cnn_lstm_tf_time, rnn_seas, cnn_FC])
    # FF_concat_all = Dense(256, activation=rec_act)(FF_concat_all)
    FF_concat_all = Dense(140, activation=rec_act)(FF_concat_all)

    # # Output format:
    output_tensor = Reshape((2, 7, 10))(FF_concat_all)

    # print(output_tensor)


    # Kronos model setting
    Kronos_model = Model(inputs = [ input_tensor_closure, input_tensor_seas, input_tensor_trrafic_tseries], outputs = output_tensor)
    Kronos_model.compile(loss = 'mean_squared_error', optimizer = 'adam')
    
    if para_early_stop: # with early stop
        
        os.chdir(r"\\corpaa.aa.com\campusshared\HDQ\HDQ_REVMGMT_Share\RMDEPT")
        path = "BFox/Kronos/Prototype/Output/FullPeriod/"
        market_nm = orig + dest
        # if not os.path.exists(path):
            # os.mkdir(path)
        
        # history = History()
        metric = 'val_accuracy'
        es = EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=patience , restore_best_weights=True)
        mc = ModelCheckpoint(path + '//' + market_nm + '_' + f'{para_model_name}.h5', monitor='val_loss', mode='auto', verbose=0, save_best_only=True, save_freq='epoch')
        base_hist = Kronos_model.fit([train_fc, train_sea, train_traf_time], train_output, epochs = para_epochs, 
                                 batch_size = batch_size_nbr, validation_data = ([ val_fc, val_sea, val_traf_time], val_output), verbose = 0, callbacks=[es,mc])
    else: # no early stop
        base_hist = Kronos_model.fit([train_fc, train_sea, train_traf_time], train_output, epochs = para_epochs, 
                                 batch_size = batch_size_nbr, validation_data = ([ val_fc, val_sea, val_traf_time], val_output) , verbose = 0)
         
    
    # use the trained model to predict
    test_pred = Kronos_model.predict([ test_fc, test_sea, test_traf_time])

    return test_pred, base_hist, Kronos_model


def test_acc(prediction_results, gold_labels):
    results = pd.DataFrame()
    test_size = gold_labels.shape[0]

    gold_tr_reshaped = gold_labels.reshape(test_size*14,10)
    pred_tr_reshaped = prediction_results.reshape(test_size*14,10)

    # Gold, top, mid, bot
    results['gold_top_tr'] = gold_tr_reshaped[:,:3].sum(1)
    results['gold_mid_tr'] = gold_tr_reshaped[:,3:7].sum(1)
    results['gold_bot_tr'] = gold_tr_reshaped[:,7:].sum(1)
    results['gold_sum_tr'] = gold_tr_reshaped.sum(1)

    # Pred Top, Mid, Bot
    results['pred_top_tr'] = pred_tr_reshaped[:,:3].sum(1)
    results['pred_mid_tr'] = pred_tr_reshaped[:,3:7].sum(1)
    results['pred_bot_tr'] = pred_tr_reshaped[:,7:].sum(1)
    results['pred_sum_tr'] = pred_tr_reshaped.sum(1)

    # all FvT errors
    results['top_FvT'] = results['pred_top_tr'] - results['gold_top_tr']
    results['mid_FvT'] = results['pred_mid_tr'] - results['gold_mid_tr']
    results['bot_FvT'] = results['pred_bot_tr'] - results['gold_bot_tr']
    results['sum_FvT'] = results['pred_sum_tr'] - results['gold_sum_tr']

    # squared FvT errors for MSE
    results['top_FvT_sqr'] = results['top_FvT']**2
    results['mid_FvT_sqr'] = results['mid_FvT']**2
    results['bot_FvT_sqr'] = results['bot_FvT']**2
    results['sum_FvT_sqr'] = results['sum_FvT']**2
    
    result_sum = { 
                    "top_FvT" : [results['top_FvT'].mean(), results['top_FvT'].std(), results['top_FvT_sqr'].mean()],
                    "mid_FvT" : [results['mid_FvT'].mean(), results['mid_FvT'].std(), results['mid_FvT_sqr'].mean()],
                    "bot_FvT" : [results['bot_FvT'].mean(), results['bot_FvT'].std(), results['bot_FvT_sqr'].mean()],
                    "sum_FvT" : [results['sum_FvT'].mean(), results['sum_FvT'].std(), results['sum_FvT_sqr'].mean()]
                }
    return results , result_sum



## Training:

_SUMMARY_

Run the model for only one leg:

In [4]:
import warnings
warnings.filterwarnings('ignore')

print("Ignoring the warnings")



In [5]:
from tqdm.notebook import trange, tqdm
from collections import defaultdict
from sklearn.utils import shuffle

orig = "DFW"
dest = "TUS"

DOW = True

# If test_random_masking is False -> Means you have to give it a date for creating "fake today"
test_random_masking = True
test_today = '2022-06-01'

yesterday =  datetime.today() - timedelta(days=2)
next_year_today = datetime.today() + timedelta(days=365)

pull_start = '2017-09-01'
pull_end = next_year_today.strftime("%Y-%m-%d")

# Pre: pre-covid period, used for train and validation
Pre_start, Pre_end = '2017-09-01', '2020-01-30'
# Post: post-covid period, used for test
Post_start, Post_end = '2021-07-01',  yesterday.strftime("%Y-%m-%d")
# Train on All:

# Future: Today till one year in future:
future_start , future_end = datetime.today().strftime("%Y-%m-%d") ,   next_year_today.strftime("%Y-%m-%d")

new_market = False # change this to True if it is a new market
ulcc_list = ['NK','SY','F9'] # Spirit SunCountry Frontier 

# Extracting for Seas:
sea_col_fcst = ['week_x', 'week_y', 'forecastDayOfWeek','avgrasm','dowavgrasm','seats_AA_fcst', 'holiday', 'forecastId'] #+ forecastDayOfWeek, FCST
sea_col_Cap = ['week_x', 'week_y','dow_x', 'dow_y', 'avgrasm','seats_AA_fcst','seats_OA_fcst','seats_ulcc_fcst' , 'seats_AA' , 'seats_OA' , 'seats_ulcc']
sea_col = ['week_x', 'week_y', 'dow_x', 'dow_y','avgrasm','dowavgrasm']

# Data reshaping parameters:
train_val_percentage = .9
time_series = False
seasenality_one_dimension = False 
window = 0 

# Model parameters;
epochs = 150
early_stop = 10
sea_dense = 128




kronos_3_timeseries = defaultdict()

hcrt, mos, az = connect_to_servers()


print(f" Flights from {orig} to {dest}:")
# Pull all the FCSTs
fcst_id_df = get_fcst_given_leg(orig, dest, hcrt )   

# Pull OAG:
oag_df = get_oag_data(orig, dest, pull_start, pull_end, ulcc_list, mos)

#Pull prdMaps:
prdMaps = get_prdMaps(orig, dest, hcrt)


# Processing: OAG Per Day:
oag_kl_total_Per_Day_and_AA = oag_per_day(oag_df)

all_tensors = [[] for i in range(12)] # 4 (Number of tensors - TF (output) , FC, SEA, TF_Time) * 3 (Train, Val, Test) 

for _,_ , fcst_id , fcst_start , fcst_end in tqdm(fcst_id_df.values):

    hcrt, mos, az = connect_to_servers()

    print(f" ------------- ****** {orig}-{dest}-{fcst_id}) ****** ------------- ")

    # print( fcst_id , fas, adf )
    print( f"fcst_start and fcst_end for {orig}-{dest} at FCST_ID {fcst_id} are: {fcst_start}, {fcst_end}") 

    #  Processing: OAG per FCST:
    oag_kl =  oag_per_fcst(oag_df, fcst_start, fcst_end )

    # Merge and Normalize: OAG per FCST and OAG per Day:
    oag_kl_fcst_total = pd.merge(oag_kl,oag_kl_total_Per_Day_and_AA ,on = "adj_dep_date", how='left',suffixes=('_fcst', '_day'))
    oag_kl_fcst_total = normalize_oag_kl_fcst_total(oag_kl_fcst_total)


    # Pull data from the file: pullData_FullPeriod.py
    df = pull_data(orig,dest,fcst_id,new_market)
    df = pull_seas(df, orig, dest)

    if len(df) < 100:
        print(f"insufficent data for market ({orig}-{dest}-{fcst_id}), IGNORED")
        continue


    df['flightDepartureDate'] = pd.to_datetime(df['flightDepartureDate'], format='%Y/%m/%d')

    # Merge new features (including the total day seats) into current Kronos dataset by dep_date
    df = pd.merge(df,oag_kl_fcst_total, left_on=['flightDepartureDate'],\
          right_on=['adj_dep_date'], how='left')
    df.dropna(inplace=True)

    # processing: Group and pad the DF:
    post = group_and_pad(df)

    # Cut the 'post' data into Pre-covid and Post-covid parts
    Data_PRE = post[ (post['flightDepartureDate']>=Pre_start) & (post['flightDepartureDate']<=Pre_end) ]
    Data_POST = post[ (post['flightDepartureDate']>=Post_start) & (post['flightDepartureDate']<=Post_end) ]
    Data_FUTURE = post[ (post['flightDepartureDate']>=future_start) & (post['flightDepartureDate']<=future_end) ]
    
    Data_PRE = Data_PRE.reset_index(drop = True)
    Data_POST = Data_POST.reset_index(drop = True)
    Data_FUTURE = Data_FUTURE.reset_index(drop = True)
    
    if test_today and not test_random_masking:
        if DOW:
            split_date = (datetime.strptime(test_today,'%Y-%m-%d') - timedelta(days=window*7)).strftime("%Y-%m-%d")
        else:
            split_date = (datetime.strptime(test_today,'%Y-%m-%d') - timedelta(days=window)).strftime("%Y-%m-%d")
        
        Data_PAST = pd.concat([Data_PRE,Data_POST])
        Data_PAST =  Data_PAST[(Data_PAST['flightDepartureDate']>=Post_start)]
        
        Data_PRE = Data_PAST[Data_PAST['flightDepartureDate']<split_date]
        Data_POST = Data_PAST[Data_PAST['flightDepartureDate']>=split_date]     
        
        Data_PRE = Data_PRE.reset_index(drop = True)
        Data_POST = Data_POST.reset_index(drop = True)
    
    print(f" For market ({orig}-{dest}-{fcst_id}) , we have {Data_PRE.shape[0]/14} Pre-Covid data, {Data_POST.shape[0]/14} Post-Covid data, and {Data_FUTURE.shape[0]/14} future data (from now to one year from now)")

    # TODO: This can be edited later, so we use the POST data to train....
    if len(Data_PRE) <= len(Data_POST):
        print(f" *** The Pre Covid data is less than the post covid data, so we ignore {orig}-{dest}-{fcst_id} market ***")
        continue

#         # TODO: When merging models together this can be useefull for training porposes, but for now not usefull.
    if len(Data_FUTURE)//14 <= 5 * 7:
        print(f" *** There is no future flights for the {orig}-{dest}-{fcst_id} market, so ignored! *** ")
        continue

    if any([Data_PRE.shape[0]/14 <= 10 * 7 , Data_POST.shape[0]/14 <= 10 * 7]):
        print(f" *** Low amount of data for either PRE, or POST of {orig}-{dest}-{fcst_id} market, so ignored! *** ")
        continue
    
    # MErge FCSTs:

    # Train Kronos 2 (No Additional Feat (using sea_col)):
    use_channels = True
    seasenality_one_dimension = True
    DOW= True
    FC_time_series = False
    traffic_time_series = True 
    window = 10
    test_random_masking = False
    # test_today = '2022-07-01'
    train, val, test = get_train_test_samples2(Data_PRE, Data_POST, Data_FUTURE, sea_col_fcst, prdMaps , DOW= DOW , train_val_percentage = train_val_percentage ,  FC_time_series = FC_time_series , traffic_time_series = traffic_time_series,  use_channels = use_channels , seasenality_one_dimension = seasenality_one_dimension ,  window = window, test_random_masking = test_random_masking, test_today = test_today )
    break
#     [all_tensors[i].append(item) for i,item in enumerate( train + val +test)]

# if len(all_tensors[0]) == 0:
#     print(f"No data for {orig}-{dest} -> No Model!")

# # Now concatinate and shuffle the data:
# train = [np.concatenate(all_tensors[i]) for i in range(0,4)] 
# val = [np.concatenate(all_tensors[i]) for i in range(4,8)] 
# test = [np.concatenate(all_tensors[i]) for i in range(8,12)] 

# # Shuffle:
# train = shuffle(train[0],train[1],train[2],train[3])
# val = shuffle(val[0],val[1],val[2],val[3])


# kronos32s_test_results3 , kronos32s_hist3,  kronos32s_model3 = kronos_32s_model(para_epochs = 500, para_early_stop =  True, para_model_name = 'kronos32s', para_sea_len = len(sea_col_fcst), para_sea_dense =  160, window = window ,
#          train_list = train , val_list = val, test_list = test)

# _ , results_summary = test_acc(kronos32s_test_results3, test[3])

# kronos_3_timeseries[f"{orig}-{dest}"] = results_summary

# print(f"val plot {orig}-{dest}")
# plt.plot(kronos32s_hist3.history['loss'])
# plt.plot(kronos32s_hist3.history['val_loss'])
# plt.title('loss')
# plt.ylabel('MSE')
# plt.xlabel('epoch')
# plt.legend(['train','val'] , loc = "upper left")
# plt.show()
        

        

 Flights from DFW to TUS:


  0%|          | 0/8 [00:00<?, ?it/s]

 ------------- ****** DFW-TUS-1) ****** ------------- 
fcst_start and fcst_end for DFW-TUS at FCST_ID 1 are: 180, 593
 For market (DFW-TUS-1) , we have 812.0 Pre-Covid data, 418.0 Post-Covid data, and 147.0 future data (from now to one year from now)


Unnamed: 0,snapshotDate,origin,destination,forecastId,forecastDepartureDate,forecastDayOfWeek,poolCode,cabinCode,forecastPeriod,localFlowIndicator,flightDepartureDate,fracClosure_1,fracClosure_2,fracClosure_3,fracClosure_4,fracClosure_5,fracClosure_6,fracClosure_7,fracClosure_8,fracClosure_9,fracClosure_10,trafficActual_1,trafficActual_2,trafficActual_3,trafficActual_4,trafficActual_5,trafficActual_6,trafficActual_7,trafficActual_8,trafficActual_9,trafficActual_10,trafficActualAadv_1,trafficActualAadv_2,trafficActualAadv_3,trafficActualAadv_4,trafficActualAadv_5,trafficActualAadv_6,trafficActualAadv_7,trafficActualAadv_8,trafficActualAadv_9,trafficActualAadv_10,holiday,H1,H2,H3,HL,weekNumber,week_x,week_y,dow_x,dow_y,avgtraffic,avgtrafficopenness,avgrasm,dowavgtraffic,dowavgtrafficopenness,dowavgrasm,adj_dep_date,fcst_start,fcst_end,seats_AA_fcst,seats_OA_fcst,seats_ulcc_fcst,seats_All_fcst,flt_ct_AA_fcst,flt_ct_OA_fcst,flt_ct_ulcc_fcst,flt_ct_All_fcst,asm_AA_fcst,asm_All_fcst,seats_AA,seats_OA,seats_ulcc,seats_All,flt_ct_AA,flt_ct_OA,flt_ct_ulcc,flt_ct_All,asm_AA,asm_All,groupID,fullHistory,real
70,2022-11-18,DFW,TUS,1.0,2022-11-23,3.0,H3,Y,1.0,F,2022-11-23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-11-23,180.0,593.0,0.459821,0.0,0.0,0.459821,0.0,0.0,0.0,0.0,0.459821,0.459821,0.711297,0.0,0.0,0.711297,0.571429,0.0,0.0,0.571429,0.711297,0.711297,8,14,0.0
71,2022-11-18,DFW,TUS,1.0,2022-11-23,3.0,H3,Y,2.0,F,2022-11-23,0.0,0.019,0.051,0.07,0.577,0.632,0.918,0.964,1.0,1.0,0.0,0.0,0.0,0.8,0.0,3.2,0.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,47.0,0.886688,0.182648,0.381,0.987,0.168,0.103,0.367,0.064,0.0,0.086,2022-11-23,180.0,593.0,0.459821,0.0,0.0,0.459821,0.0,0.0,0.0,0.0,0.459821,0.459821,0.711297,0.0,0.0,0.711297,0.571429,0.0,0.0,0.571429,0.711297,0.711297,8,14,1.0
72,2022-11-18,DFW,TUS,1.0,2022-11-23,3.0,H3,Y,3.0,F,2022-11-23,0.0,0.014,0.041,0.058,0.069,0.107,0.775,0.886,0.954,1.0,0.0,0.0,0.8,0.8,0.8,1.6,0.0,4.0,2.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,47.0,0.886688,0.182648,0.381,0.987,0.168,0.103,0.367,0.064,0.0,0.086,2022-11-23,180.0,593.0,0.459821,0.0,0.0,0.459821,0.0,0.0,0.0,0.0,0.459821,0.459821,0.711297,0.0,0.0,0.711297,0.571429,0.0,0.0,0.571429,0.711297,0.711297,8,14,1.0
73,2022-11-18,DFW,TUS,1.0,2022-11-23,3.0,H3,Y,4.0,F,2022-11-23,0.0,0.012,0.041,0.06,0.099,0.242,0.29,0.411,0.959,0.999,0.0,0.0,0.0,0.0,0.8,0.0,2.4,3.2,1.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,47.0,0.886688,0.182648,0.381,0.987,0.168,0.103,0.367,0.064,0.0,0.086,2022-11-23,180.0,593.0,0.459821,0.0,0.0,0.459821,0.0,0.0,0.0,0.0,0.459821,0.459821,0.711297,0.0,0.0,0.711297,0.571429,0.0,0.0,0.571429,0.711297,0.711297,8,14,1.0
74,2022-11-18,DFW,TUS,1.0,2022-11-23,3.0,H3,Y,5.0,F,2022-11-23,0.0,0.0,0.035,0.041,0.062,0.136,0.337,0.446,0.979,1.0,0.8,0.0,0.0,0.0,5.6,0.8,6.4,4.8,1.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,47.0,0.886688,0.182648,0.381,0.987,0.168,0.103,0.367,0.064,0.0,0.086,2022-11-23,180.0,593.0,0.459821,0.0,0.0,0.459821,0.0,0.0,0.0,0.0,0.459821,0.459821,0.711297,0.0,0.0,0.711297,0.571429,0.0,0.0,0.571429,0.711297,0.711297,8,14,1.0
75,2022-11-18,DFW,TUS,1.0,2022-11-23,3.0,H3,Y,6.0,F,2022-11-23,0.0,0.002,0.011,0.036,0.088,0.199,0.427,0.913,1.0,1.0,0.0,0.0,1.6,1.6,2.4,11.2,5.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.8,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,47.0,0.886688,0.182648,0.381,0.987,0.168,0.103,0.367,0.064,0.0,0.086,2022-11-23,180.0,593.0,0.459821,0.0,0.0,0.459821,0.0,0.0,0.0,0.0,0.459821,0.459821,0.711297,0.0,0.0,0.711297,0.571429,0.0,0.0,0.571429,0.711297,0.711297,8,14,1.0
76,2022-11-18,DFW,TUS,1.0,2022-11-23,3.0,H3,Y,7.0,F,2022-11-23,0.001,0.015,0.024,0.071,0.22,0.271,0.354,0.766,0.988,1.0,0.0,0.0,0.0,0.8,0.0,0.0,1.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,47.0,0.886688,0.182648,0.381,0.987,0.168,0.103,0.367,0.064,0.0,0.086,2022-11-23,180.0,593.0,0.459821,0.0,0.0,0.459821,0.0,0.0,0.0,0.0,0.459821,0.459821,0.711297,0.0,0.0,0.711297,0.571429,0.0,0.0,0.571429,0.711297,0.711297,8,14,1.0
77,2022-11-18,DFW,TUS,1.0,2022-11-23,3.0,H3,Y,1.0,L,2022-11-23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-11-23,180.0,593.0,0.459821,0.0,0.0,0.459821,0.0,0.0,0.0,0.0,0.459821,0.459821,0.711297,0.0,0.0,0.711297,0.571429,0.0,0.0,0.571429,0.711297,0.711297,8,14,0.0
78,2022-11-18,DFW,TUS,1.0,2022-11-23,3.0,H3,Y,2.0,L,2022-11-23,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.8,0.0,0.0,0.0,1.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.6,0.0,1.0,0.0,0.0,1.0,0.0,47.0,0.886688,0.182648,0.381,0.987,0.686,0.599,0.367,0.408,0.32,0.086,2022-11-23,180.0,593.0,0.459821,0.0,0.0,0.459821,0.0,0.0,0.0,0.0,0.459821,0.459821,0.711297,0.0,0.0,0.711297,0.571429,0.0,0.0,0.571429,0.711297,0.711297,8,14,1.0
79,2022-11-18,DFW,TUS,1.0,2022-11-23,3.0,H3,Y,3.0,L,2022-11-23,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,47.0,0.886688,0.182648,0.381,0.987,0.686,0.599,0.367,0.408,0.32,0.086,2022-11-23,180.0,593.0,0.459821,0.0,0.0,0.459821,0.0,0.0,0.0,0.0,0.459821,0.459821,0.711297,0.0,0.0,0.711297,0.571429,0.0,0.0,0.571429,0.711297,0.711297,8,14,1.0


In [11]:
if DOW:
    PRE_FC, PRE_Seas, PRE_Traf, PRE_TF_timeseries = dow_get_tensors2(
        Data_PRE,
        sea_col_Cap,
        prdMaps,
        FC_time_series=FC_time_series,
        traffic_time_series=traffic_time_series,
        use_channels=use_channels,
        seasenality_one_dimension=seasenality_one_dimension,
        window=window,
        random_masking=True,
        test_today=None,
    )
    POST_FC, POST_Seas, POST_Traf, POST_TF_timeseries = dow_get_tensors2(
        Data_POST,
        sea_col_Cap,
        prdMaps,
        FC_time_series=FC_time_series,
        traffic_time_series=traffic_time_series,
        use_channels=use_channels,
        seasenality_one_dimension=seasenality_one_dimension,
        window=window,
        random_masking=test_random_masking,
        test_today=test_today,
    )
    # FUTURE_FC , FUTURE_Seas , FUTURE_Traf ,FUTUR_TF_timeseries = dow_get_tensors2(Data_FUTURE , sea_col_Cap, prdMaps  , FC_time_series = False , traffic_time_series = True ,  use_channels = True , seasenality_one_dimension = True ,   window = window)

else:
    PRE_FC, PRE_Seas, PRE_Traf, PRE_TF_timeseries = get_tensors2(
        Data_PRE,
        sea_col_Cap,
        prdMaps,
        FC_time_series=FC_time_series,
        traffic_time_series=traffic_time_series,
        use_channels=use_channels,
        seasenality_one_dimension=seasenality_one_dimension,
        window=window,
    )

    if test_random_masking:
        POST_FC, POST_Seas, POST_Traf, POST_TF_timeseries = get_tensors2(
            Data_POST,
            sea_col_Cap,
            prdMaps,
            FC_time_series=FC_time_series,
            traffic_time_series=traffic_time_series,
            use_channels=use_channels,
            seasenality_one_dimension=seasenality_one_dimension,
            window=window,
        )
    else:
        masked_df = create_masking_based_on_given_day(Data_POST, test_today, prdMaps)
        POST_FC, POST_Seas, POST_Traf, POST_TF_timeseries = get_tensors2_faketoday(
            Data_POST, masked_df, sea_col_Cap, use_channels, seasenality_one_dimension, window
        )

    # FUTURE_FC , FUTURE_Seas , FUTURE_Traf , FUTURE_TF_timeseries = get_tensors2(Data_FUTURE, sea_col_Cap, prdMaps , FC_time_series = False , traffic_time_series = True ,  use_channels = True , seasenality_one_dimension = True ,   window = window)

# Train/Val Spilit:
# TODO: THIS SHOULD BE CHANGED TO RANDOMIZED.
train_val_cutoff = round(PRE_FC.shape[0] * train_val_percentage)

# prepare train/val/test datasets
PRE_FC_train = PRE_FC[:train_val_cutoff, :]
PRE_FC_val = PRE_FC[train_val_cutoff:, :]

PRE_Seas_train = PRE_Seas[:train_val_cutoff, :]
PRE_Seas_val = PRE_Seas[train_val_cutoff:, :]

PRE_Traf_train = PRE_Traf[:train_val_cutoff, :]
PRE_Traf_val = PRE_Traf[train_val_cutoff:, :]

PRE_TF_timeseries_train = PRE_TF_timeseries[:train_val_cutoff, :]
PRE_TF_timeseries_val = PRE_TF_timeseries[train_val_cutoff:, :]

train = [PRE_FC_train, PRE_Seas_train, PRE_TF_timeseries_train, PRE_Traf_train]
val = [PRE_FC_val, PRE_Seas_val, PRE_TF_timeseries_val, PRE_Traf_val]
test = [POST_FC, POST_Seas, POST_TF_timeseries, POST_Traf]


In [16]:
# def dow_get_tensors2(
#     DataFarame,
#     sea_col_Cap,
#     prdMaps=None,
#     FC_time_series=False,
#     traffic_time_series=False,
#     use_channels=False,
#     seasenality_one_dimension=True,
#     window=10,
#     random_masking=True,
#     test_today=None,
# ):
DOW = True
DataFarame = Data_PRE
random_masking = True
FC_dow, Seasenality_dow, Traffic_dow, TF_time_dow = defaultdict(), defaultdict(), defaultdict(), defaultdict()

if not random_masking:
    masked_df = create_masking_based_on_given_day(DataFarame, test_today, prdMaps)

for i in DataFarame.loc[:, ["forecastDayOfWeek"]].drop_duplicates().values:
    # filter_y = DataFarame['dow_y' ] == i[1]
    # filter_x = DataFarame['dow_x'] == i[0]
    filter_dow = DataFarame["forecastDayOfWeek"] == i[0]
    # print(filter_y.shape , filter_x.shape)
    # print(i)
    Data_dow = DataFarame[filter_dow]
    # print(Data_dow.shape)
    if random_masking:
        FC, Seasenality, Traffic, TF_time = get_tensors2(
            Data_dow,
            sea_col_Cap,
            prdMaps,
            FC_time_series,
            traffic_time_series,
            use_channels,
            seasenality_one_dimension,
            window,
            DOW,
        )
    else:
        Data_dow_masked = masked_df[filter_dow]
        FC, Seasenality, Traffic, TF_time = get_tensors2_faketoday(
            Data_dow, Data_dow_masked, sea_col_Cap, use_channels, seasenality_one_dimension, window
        )

    # FC, Seasenality, Traffic, TF_time= get_tensors2_faketoday(Data_dow, Data_dow_masked ,  sea_col_Cap , use_channels , seasenality_one_dimension ,  window)
    for i, j in enumerate(Data_dow.index[::14][window:]):
        FC_dow[j] = FC[i]
        Seasenality_dow[j] = Seasenality[i]
        Traffic_dow[j] = Traffic[i]
        if traffic_time_series:
            TF_time_dow[j] = TF_time[i]

FC_dow = np.stack(list(dict(sorted(FC_dow.items())).values()))
Seasenality_dow = np.stack(list(dict(sorted(Seasenality_dow.items())).values()))
Traffic_dow = np.stack(list(dict(sorted(Traffic_dow.items())).values()))
if traffic_time_series:
    TF_time_dow = np.stack(list(dict(sorted(TF_time_dow.items())).values()))



In [19]:
# def get_tensors2(
#     DataFarame,
#     sea_col_Cap,
#     prdMaps=None,
#     FC_time_series=True,
#     traffic_time_series=False,
#     use_channels=False,
#     seasenality_one_dimension=True,
#     window=10,
#     DOW=False,
# ):
DataFrame = Data_dow

len_sea_cap = len(sea_col_Cap)

# fractional closure
PRE_FC_L = DataFarame[["fracClosure_" + str(i + 1) for i in range(10)]].values.astype("float32")
# seasonality
PRE_Sea_L = DataFarame[sea_col_Cap].values.astype("float32")
# actual traffic
PRE_Traf_L = DataFarame[["trafficActual_" + str(i + 1) for i in range(10)]].values.astype("float32")

# reshape the data for CNNLSTM model
FC = PRE_FC_L.reshape(int(PRE_FC_L.shape[0] / 14), 1, 14, 10)
Seasenality = PRE_Sea_L.reshape(int(PRE_Sea_L.shape[0] / 14), 1, 14, len_sea_cap)
Traffic = PRE_Traf_L.reshape(int(PRE_Traf_L.shape[0] / 14), 1, 14, 10)

# Remove Duplicates (from 2d to 1d vector)
if seasenality_one_dimension:
    Seasenality = np.delete(Seasenality, slice(13), 2).reshape(Seasenality.shape[0], len_sea_cap)

if use_channels:
    FC = FC.reshape(len(FC), 2, 7, 10)
    Traffic = Traffic.reshape(len(Traffic), 2, 7, 10)

# Change FC shape to refelect time series:
# print(FC.shape)
if FC_time_series:
    time_series_widow = list()
    Seasenality_times = list()
    for i in range(window, len(FC)):
        # print(FC[i-window:i].shape)
        time_series_widow.append(FC[i - window : i].reshape(window, 2, 7, 10))
        # print((Seasenality[i-window:i].shape))
        Seasenality_times.append(Seasenality[i - window : i])
    FC = np.array(time_series_widow)
    Seasenality = np.array(Seasenality_times)

    # Since the 1st window size data points are removed:
    # Seasenality = Seasenality[window:]
    Traffic = Traffic[window:]

elif traffic_time_series:
    traffic_time_series_window = list()
    Seasenality_times = list()
    for i in range(window, len(Traffic)):
        # Find Random period and random day:
        if DOW:
            tf_window_masked = tf_timeseries_masking_DOW(Traffic, i, prdMaps, window)
        else:
            tf_window_masked = tf_timeseries_masking(Traffic, i, prdMaps, window)
        traffic_time_series_window.append(tf_window_masked)
        # Seasenality_times.append(Seasenality[i-window:i])
    TF_time = np.array(traffic_time_series_window)
    # Seasenality = np.array(Seasenality_times)
    Seasenality = Seasenality[window:]
    FC = FC[window:]

    Traffic = Traffic[window:]

    # return FC, Seasenality, Traffic, TF_time

# return FC, Seasenality, Traffic, None

In [36]:
# def tf_timeseries_masking_DOW(tf_tensors, data_index, prdMaps, window):
    # """This function will generate masked time-seried terrafic data, and is based on DOW."""
tf_tensors = Traffic
data_index = 15
    
random_period, random_day_to_dept = randPeriod(prdMaps)
arr = prdMaps.iloc[:, 3].values
test_tensors = tf_tensors.copy()

day_to_dept = random_day_to_dept
current_index = data_index
print(day_to_dept)
for i in range(0, window):
    # Move back 7 days in each iter.
    day_to_dept = random_day_to_dept - i * 7
    # Get the period of that day to dept.
    flrs = floorSearch(arr, 0, 6, day_to_dept)
    current_period = flrs + 1
    # If we get today, will break the loop. and use all the values (no masking)
    if current_period == 0:
        break
    # mask the values
    test_tensors[
        current_index,
        :,
        :current_period,
    ] = -1

    # Update index:
    current_index -= 1

    # return test_tensors[data_index + 1 - window : data_index + 1]

26


In [39]:
test_tensors[data_index + 1 - window : data_index + 1]

array([[[[ 0.,  0.,  0., ...,  2.,  1.,  0.],
         [ 0.,  0.,  1., ...,  2.,  4.,  7.],
         [ 0.,  0.,  0., ...,  6.,  0.,  5.],
         ...,
         [ 1.,  0.,  0., ...,  0.,  8.,  3.],
         [ 0.,  0.,  0., ...,  0., 10.,  3.],
         [ 0.,  6.,  0., ...,  0.,  1.,  0.]],

        [[ 0.,  0.,  5., ...,  3.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  1.],
         ...,
         [ 0.,  0.,  0., ...,  0.,  0.,  4.],
         [ 0.,  0.,  0., ...,  0.,  0.,  1.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.]]],


       [[[ 0.,  0.,  0., ...,  0.,  4.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  1.],
         [ 2.,  0.,  0., ...,  0.,  3.,  2.],
         ...,
         [ 0.,  0.,  1., ...,  0.,  7.,  4.],
         [ 0.,  0.,  0., ...,  0., 11.,  6.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.]],

        [[ 0.,  0., 10., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  3., ..., 

In [35]:
floorSearch(arr, 0, 6, 2)

0

In [25]:
prdMaps

Unnamed: 0,ORIGIN,DESTINATION,FORECASTPERIOD,RRD_START,RRD_END
0,DFW,TUS,1,2,6
1,DFW,TUS,2,7,13
2,DFW,TUS,3,14,20
3,DFW,TUS,4,21,29
4,DFW,TUS,5,30,49
5,DFW,TUS,6,50,149
6,DFW,TUS,7,150,331


In [30]:
randPeriod(prdMaps)

(5, 40)

In [21]:
Traffic

array([[[[ 0.,  0.,  0., ...,  0.,  1.,  0.],
         [ 0.,  0.,  0., ...,  1.,  2.,  2.],
         [ 0.,  0.,  0., ...,  2.,  0.,  0.],
         ...,
         [ 0.,  1.,  0., ...,  0.,  5.,  5.],
         [ 0.,  0.,  1., ...,  0.,  1.,  7.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.]],

        [[ 0.,  0.,  6., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         ...,
         [ 0.,  0.,  1., ...,  3.,  0.,  1.],
         [ 0.,  0.,  0., ...,  2.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.]]],


       [[[ 0.,  0.,  0., ...,  1.,  2.,  0.],
         [ 0.,  0.,  0., ...,  0.,  6.,  3.],
         [ 0.,  0.,  0., ...,  0.,  4.,  0.],
         ...,
         [ 0.,  0.,  0., ...,  3., 17.,  8.],
         [ 2.,  0.,  0., ...,  0., 26.,  5.],
         [ 0.,  0.,  0., ...,  0.,  1.,  0.]],

        [[ 0.,  0.,  1., ...,  1.,  0.,  0.],
         [ 0.,  0.,  1., ...,  2.,  0.,  0.],
         [ 0.,  0.,  0., ..., 

In [14]:


def randPeriod(prdMaps):
    """_summary_

    Args:
        prdMaps (Dataframe): Dataframe that shows the time to departure where the period class of a given flight gets closed.

    Returns:
        random_period (int): Random Class to departure. (between 1-7)
        random_day (int): Random Day to Departure (it should be between 2 to 331- When the fist class opens up.)
    """
    random_period = np.random.randint(1, 7)  # Gets a class between 1 to 7, these are our period to departure classes.
    rrd_start, rrt_end = prdMaps[prdMaps["FORECASTPERIOD"] == random_period].loc[:, ["RRD_START", "RRD_END"]].values[0]
    random_day = np.random.randint(rrd_start, rrt_end)
    return random_period, random_day


def tf_timeseries_masking(tf_tensors, data_index, prdMaps, window):
    """This function will generate masked time-seried terrafic data."""

    random_period, random_day_to_dept = randPeriod(prdMaps)
    # print(random_period , random_day_to_dept )
    arr = prdMaps.iloc[:, 3].values

    # output = tf_tensors[data_index].copy()
    test_tensors = tf_tensors.copy()
    test_tensors[data_index][
        :,
        :random_period,
    ] = -1

    max_bond_period = random_day_to_dept
    min_bond_period = arr[random_period - 1]
    remaining_window = window - 1
    current_index = data_index
    max_min_range = max_bond_period - min_bond_period
    current_period = random_period

    if max_min_range < remaining_window:
        while max_min_range <= remaining_window:
            # print(current_index-max_min_range,current_index)
            test_tensors[
                current_index - max_min_range : current_index,
                :,
                :current_period,
            ] = -1
            current_period -= 1
            if current_period == 0:
                break
            current_index -= max_min_range
            remaining_window -= max_min_range
            max_bond_period -= max_min_range
            min_bond_period = arr[current_period - 1]
            max_min_range = max_bond_period - min_bond_period
            # reaching Today date:

    if max_min_range >= remaining_window:
        # print(current_index-max_min_range,current_index)
        test_tensors[
            current_index - remaining_window : current_index,
            :,
            :current_period,
        ] = -1

    return test_tensors[data_index + 1 - window : data_index + 1]


def get_tensors2(
    DataFarame,
    sea_col_Cap,
    prdMaps=None,
    FC_time_series=True,
    traffic_time_series=False,
    use_channels=False,
    seasenality_one_dimension=True,
    window=10,
    DOW=False,
):

    len_sea_cap = len(sea_col_Cap)

    # fractional closure
    PRE_FC_L = DataFarame[["fracClosure_" + str(i + 1) for i in range(10)]].values.astype("float32")
    # seasonality
    PRE_Sea_L = DataFarame[sea_col_Cap].values.astype("float32")
    # actual traffic
    PRE_Traf_L = DataFarame[["trafficActual_" + str(i + 1) for i in range(10)]].values.astype("float32")

    # reshape the data for CNNLSTM model
    FC = PRE_FC_L.reshape(int(PRE_FC_L.shape[0] / 14), 1, 14, 10)
    Seasenality = PRE_Sea_L.reshape(int(PRE_Sea_L.shape[0] / 14), 1, 14, len_sea_cap)
    Traffic = PRE_Traf_L.reshape(int(PRE_Traf_L.shape[0] / 14), 1, 14, 10)

    # Remove Duplicates (from 2d to 1d vector)
    if seasenality_one_dimension:
        Seasenality = np.delete(Seasenality, slice(13), 2).reshape(Seasenality.shape[0], len_sea_cap)

    if use_channels:
        FC = FC.reshape(len(FC), 2, 7, 10)
        Traffic = Traffic.reshape(len(Traffic), 2, 7, 10)

    # Change FC shape to refelect time series:
    # print(FC.shape)
    if FC_time_series:
        time_series_widow = list()
        Seasenality_times = list()
        for i in range(window, len(FC)):
            # print(FC[i-window:i].shape)
            time_series_widow.append(FC[i - window : i].reshape(window, 2, 7, 10))
            # print((Seasenality[i-window:i].shape))
            Seasenality_times.append(Seasenality[i - window : i])
        FC = np.array(time_series_widow)
        Seasenality = np.array(Seasenality_times)

        # Since the 1st window size data points are removed:
        # Seasenality = Seasenality[window:]
        Traffic = Traffic[window:]

    elif traffic_time_series:
        traffic_time_series_window = list()
        Seasenality_times = list()
        for i in range(window, len(Traffic)):
            # Find Random period and random day:
            if DOW:
                tf_window_masked = tf_timeseries_masking_DOW(Traffic, i, prdMaps, window)
            else:
                tf_window_masked = tf_timeseries_masking(Traffic, i, prdMaps, window)
            traffic_time_series_window.append(tf_window_masked)
            # Seasenality_times.append(Seasenality[i-window:i])
        TF_time = np.array(traffic_time_series_window)
        # Seasenality = np.array(Seasenality_times)
        Seasenality = Seasenality[window:]
        FC = FC[window:]

        Traffic = Traffic[window:]

        return FC, Seasenality, Traffic, TF_time

    return FC, Seasenality, Traffic, None


def floorSearch(arr, low, high, x):

    # If low and high cross each other
    if low > high:
        return -1

    # If last element is smaller than x
    if x >= arr[high]:
        return high

    # Find the middle point
    mid = int((low + high) / 2)

    # If middle point is floor.
    if arr[mid] == x:
        return mid

    # If x lies between mid-1 and mid
    if mid > 0 and arr[mid - 1] <= x and x < arr[mid]:
        return mid - 1

    # If x is smaller than mid,
    # floor must be in left half.
    if x < arr[mid]:
        return floorSearch(arr, low, mid - 1, x)

    # If mid-1 is not floor and x is greater than
    # arr[mid],
    return floorSearch(arr, mid + 1, high, x)


def tf_timeseries_masking_DOW(tf_tensors, data_index, prdMaps, window):
    """This function will generate masked time-seried terrafic data, and is based on DOW."""

    random_period, random_day_to_dept = randPeriod(prdMaps)
    arr = prdMaps.iloc[:, 3].values
    test_tensors = tf_tensors.copy()

    day_to_dept = random_day_to_dept
    current_index = data_index

    for i in range(0, window):
        # Move back 7 days in each iter.
        day_to_dept = random_day_to_dept - i * 7
        # Get the period of that day to dept.
        flrs = floorSearch(arr, 0, 6, day_to_dept)
        current_period = flrs + 1
        # If we get today, will break the loop. and use all the values (no masking)
        if current_period == 0:
            break
        # mask the values
        test_tensors[
            current_index,
            :,
            :current_period,
        ] = -1

        # Update index:
        current_index -= 1

    return test_tensors[data_index + 1 - window : data_index + 1]


def get_prdMaps(orig, dest, hcrt):
    """TODO: add the lcl_flw_ind and change the data to mask diffrent between the local and Flow Traffics"""

    prdMaps = pd.read_sql(
        f"""select DISTINCT leg_orig as origin, leg_dest as destination, fcst_period as forecastPeriod, rrd_band_start_i as rrd_start, rrd_band_end_i as rrd_end
                            -- , lcl_flw_ind
                            from market_xref a
                            join FCST.FCST_PERIOD_REF b
                            on a.infl_period_id = b.FCST_PERIOD_ID
                            where 1=1
                            and cabin_code = 'Y'
                            and leg_orig = '{orig}'
                            and leg_dest = '{dest}'
                            and lcl_flw_ind = 'L'
                            ORDER BY forecastPeriod
                            """,
        con=hcrt,
    )
    return prdMaps


# def dow_get_tensors2(DataFarame , sea_col_Cap, prdMaps= None  ,  test = False, time_series = True,  use_channels = False , window = 10):
# def dow_get_tensors2(DataFarame , sea_col_Cap, prdMaps= None  ,  FC_time_series = True , traffic_time_series = False ,  use_channels = False , seasenality_one_dimension = True ,  window = 10):
#     DOW = True
#     FC_dow , Seasenality_dow, Traffic_dow ,  TF_time_dow  = list(), list(), list(), list()

#     for i in DataFarame.loc[ :,	['forecastDayOfWeek' ]].drop_duplicates().values:
#         # filter_y = DataFarame['dow_y' ] == i[1]
#         # filter_x = DataFarame['dow_x'] == i[0]
#         filter_dow =  DataFarame['forecastDayOfWeek'] == i[0]
#         # print(filter_y.shape , filter_x.shape)
#         # print(i)
#         Data_dow =DataFarame[filter_dow]
#         # print(Data_dow.shape)
#         FC, Seasenality, Traffic, TF_time= get_tensors2(Data_dow, sea_col_Cap, prdMaps  , FC_time_series  , traffic_time_series ,  use_channels  , seasenality_one_dimension  ,  window, DOW )
#         FC_dow.append(FC)
#         Seasenality_dow.append(Seasenality)
#         Traffic_dow.append(Traffic)
#         if traffic_time_series:
#             TF_time_dow.append(TF_time)

#     # Then Concat together, now each datapoint is based on DOW.
#     FC_dow = [ i  for i in FC_dow if i.shape!=(0,)]
#     Seasenality_dow = [ i  for i in Seasenality_dow if i.shape!=(0,)]
#     # Traffic_dow = [ i  for i in Traffic_dow if i.shape!=(0,)]

#     if traffic_time_series:
#         TF_time_dow = [ i  for i in TF_time_dow if i.shape!=(0,)]
#         TF_time_dow = np.concatenate(TF_time_dow)

#     FC_dow = np.concatenate(FC_dow)
#     Seasenality_dow = np.concatenate(Seasenality_dow)
#     Traffic_dow = np.concatenate(Traffic_dow)

#     return FC_dow , Seasenality_dow, Traffic_dow , TF_time_dow


def dow_get_tensors2(
    DataFarame,
    sea_col_Cap,
    prdMaps=None,
    FC_time_series=False,
    traffic_time_series=False,
    use_channels=False,
    seasenality_one_dimension=True,
    window=10,
    random_masking=True,
    test_today=None,
):
    DOW = True
    FC_dow, Seasenality_dow, Traffic_dow, TF_time_dow = defaultdict(), defaultdict(), defaultdict(), defaultdict()

    if not random_masking:
        masked_df = create_masking_based_on_given_day(DataFarame, test_today, prdMaps)

    for i in DataFarame.loc[:, ["forecastDayOfWeek"]].drop_duplicates().values:
        # filter_y = DataFarame['dow_y' ] == i[1]
        # filter_x = DataFarame['dow_x'] == i[0]
        filter_dow = DataFarame["forecastDayOfWeek"] == i[0]
        # print(filter_y.shape , filter_x.shape)
        # print(i)
        Data_dow = DataFarame[filter_dow]
        # print(Data_dow.shape)
        if random_masking:
            FC, Seasenality, Traffic, TF_time = get_tensors2(
                Data_dow,
                sea_col_Cap,
                prdMaps,
                FC_time_series,
                traffic_time_series,
                use_channels,
                seasenality_one_dimension,
                window,
                DOW,
            )
        else:
            Data_dow_masked = masked_df[filter_dow]
            FC, Seasenality, Traffic, TF_time = get_tensors2_faketoday(
                Data_dow, Data_dow_masked, sea_col_Cap, use_channels, seasenality_one_dimension, window
            )

        # FC, Seasenality, Traffic, TF_time= get_tensors2_faketoday(Data_dow, Data_dow_masked ,  sea_col_Cap , use_channels , seasenality_one_dimension ,  window)
        for i, j in enumerate(Data_dow.index[::14][window:]):
            FC_dow[j] = FC[i]
            Seasenality_dow[j] = Seasenality[i]
            Traffic_dow[j] = Traffic[i]
            if traffic_time_series:
                TF_time_dow[j] = TF_time[i]

    FC_dow = np.stack(list(dict(sorted(FC_dow.items())).values()))
    Seasenality_dow = np.stack(list(dict(sorted(Seasenality_dow.items())).values()))
    Traffic_dow = np.stack(list(dict(sorted(Traffic_dow.items())).values()))
    if traffic_time_series:
        TF_time_dow = np.stack(list(dict(sorted(TF_time_dow.items())).values()))

    return FC_dow, Seasenality_dow, Traffic_dow, TF_time_dow


def create_masking_based_on_given_day(DataFrame, test_today, prdMaps):
    """This function maskes the data absed on a given date."""
    arr = prdMaps.iloc[:, 3].values
    try:
        test_today_index = int(DataFrame[DataFrame["forecastDepartureDate"] >= test_today].index[0] / 14)
    except Exception:
        print("No Date after the set fake today date")
        return DataFrame
    test_df = DataFrame.copy()

    current_period = 1
    day_from_today = 0
    for current_index in range(test_today_index, len(test_df) // 14):

        if current_period < 7:
            if day_from_today == arr[current_period]:
                current_period += 1
                # print(current_period)
        else:
            current_period = 7

        day_data_df = test_df[current_index * 14 : (current_index + 1) * 14]
        day_data_df.loc[day_data_df["forecastPeriod"] <= current_period, "trafficActual_1":"trafficActualAadv_10"] = -1

        day_from_today = day_from_today + 1
        # break
    return test_df


def get_tensors2_faketoday(
    DataFarame, DataFarame_Masked, sea_col_Cap, use_channels=False, seasenality_one_dimension=True, window=10
):
    """This function uses a masekd dataframe. (it is used when we want to set a fake_today for our test set"""

    len_sea_cap = len(sea_col_Cap)

    # fractional closure
    PRE_FC_L = DataFarame[["fracClosure_" + str(i + 1) for i in range(10)]].values.astype("float32")
    # seasonality
    PRE_Sea_L = DataFarame[sea_col_Cap].values.astype("float32")
    # actual traffic
    PRE_Traf_L = DataFarame[["trafficActual_" + str(i + 1) for i in range(10)]].values.astype("float32")
    # Masked Traffic
    PRE_Traf_L_Masked = DataFarame_Masked[["trafficActual_" + str(i + 1) for i in range(10)]].values.astype("float32")

    # reshape the data for CNNLSTM model
    FC = PRE_FC_L.reshape(int(PRE_FC_L.shape[0] / 14), 1, 14, 10)
    Seasenality = PRE_Sea_L.reshape(int(PRE_Sea_L.shape[0] / 14), 1, 14, len_sea_cap)
    Traffic = PRE_Traf_L.reshape(int(PRE_Traf_L.shape[0] / 14), 1, 14, 10)
    Traffic_Masked = PRE_Traf_L_Masked.reshape(int(PRE_Traf_L_Masked.shape[0] / 14), 1, 14, 10)

    # Remove Duplicates (from 2d to 1d vector)
    if seasenality_one_dimension:
        Seasenality = np.delete(Seasenality, slice(13), 2).reshape(Seasenality.shape[0], len_sea_cap)

    if use_channels:
        FC = FC.reshape(len(FC), 2, 7, 10)
        Traffic = Traffic.reshape(len(Traffic), 2, 7, 10)
        Traffic_Masked = Traffic_Masked.reshape(len(Traffic_Masked), 2, 7, 10)

    traffic_time_series_window = list()
    # Seasenality_times = list()
    for i in range(window, len(Traffic)):
        # Get Masked Matrix
        tf_window_masked = Traffic_Masked[i + 1 - window : i + 1]
        traffic_time_series_window.append(tf_window_masked)
        # Seasenality_times.append(Seasenality[i-window:i])
    TF_time = np.array(traffic_time_series_window)
    # Seasenality = np.array(Seasenality_times)
    Seasenality = Seasenality[window:]
    FC = FC[window:]

    Traffic = Traffic[window:]

    return FC, Seasenality, Traffic, TF_time


def get_train_test_samples2(
    Data_PRE,
    Data_POST,
    Data_FUTURE,
    sea_col_Cap,
    prdMaps,
    DOW=False,
    train_val_percentage=0.9,
    FC_time_series=True,
    traffic_time_series=False,
    use_channels=False,
    seasenality_one_dimension=True,
    window=10,
    test_random_masking=True,
    test_today=None,
):
    """
    test_today format =  yyyy-mm-dd
    """

    if DOW:
        PRE_FC, PRE_Seas, PRE_Traf, PRE_TF_timeseries = dow_get_tensors2(
            Data_PRE,
            sea_col_Cap,
            prdMaps,
            FC_time_series=FC_time_series,
            traffic_time_series=traffic_time_series,
            use_channels=use_channels,
            seasenality_one_dimension=seasenality_one_dimension,
            window=window,
            random_masking=True,
            test_today=None,
        )
        POST_FC, POST_Seas, POST_Traf, POST_TF_timeseries = dow_get_tensors2(
            Data_POST,
            sea_col_Cap,
            prdMaps,
            FC_time_series=FC_time_series,
            traffic_time_series=traffic_time_series,
            use_channels=use_channels,
            seasenality_one_dimension=seasenality_one_dimension,
            window=window,
            random_masking=test_random_masking,
            test_today=test_today,
        )
        # FUTURE_FC , FUTURE_Seas , FUTURE_Traf ,FUTUR_TF_timeseries = dow_get_tensors2(Data_FUTURE , sea_col_Cap, prdMaps  , FC_time_series = False , traffic_time_series = True ,  use_channels = True , seasenality_one_dimension = True ,   window = window)

    else:
        PRE_FC, PRE_Seas, PRE_Traf, PRE_TF_timeseries = get_tensors2(
            Data_PRE,
            sea_col_Cap,
            prdMaps,
            FC_time_series=FC_time_series,
            traffic_time_series=traffic_time_series,
            use_channels=use_channels,
            seasenality_one_dimension=seasenality_one_dimension,
            window=window,
        )

        if test_random_masking:
            POST_FC, POST_Seas, POST_Traf, POST_TF_timeseries = get_tensors2(
                Data_POST,
                sea_col_Cap,
                prdMaps,
                FC_time_series=FC_time_series,
                traffic_time_series=traffic_time_series,
                use_channels=use_channels,
                seasenality_one_dimension=seasenality_one_dimension,
                window=window,
            )
        else:
            masked_df = create_masking_based_on_given_day(Data_POST, test_today, prdMaps)
            POST_FC, POST_Seas, POST_Traf, POST_TF_timeseries = get_tensors2_faketoday(
                Data_POST, masked_df, sea_col_Cap, use_channels, seasenality_one_dimension, window
            )

        # FUTURE_FC , FUTURE_Seas , FUTURE_Traf , FUTURE_TF_timeseries = get_tensors2(Data_FUTURE, sea_col_Cap, prdMaps , FC_time_series = False , traffic_time_series = True ,  use_channels = True , seasenality_one_dimension = True ,   window = window)

    # Train/Val Spilit:
    # TODO: THIS SHOULD BE CHANGED TO RANDOMIZED.
    train_val_cutoff = round(PRE_FC.shape[0] * train_val_percentage)

    # prepare train/val/test datasets
    PRE_FC_train = PRE_FC[:train_val_cutoff, :]
    PRE_FC_val = PRE_FC[train_val_cutoff:, :]

    PRE_Seas_train = PRE_Seas[:train_val_cutoff, :]
    PRE_Seas_val = PRE_Seas[train_val_cutoff:, :]

    PRE_Traf_train = PRE_Traf[:train_val_cutoff, :]
    PRE_Traf_val = PRE_Traf[train_val_cutoff:, :]

    PRE_TF_timeseries_train = PRE_TF_timeseries[:train_val_cutoff, :]
    PRE_TF_timeseries_val = PRE_TF_timeseries[train_val_cutoff:, :]

    train = [PRE_FC_train, PRE_Seas_train, PRE_TF_timeseries_train, PRE_Traf_train]
    val = [PRE_FC_val, PRE_Seas_val, PRE_TF_timeseries_val, PRE_Traf_val]
    test = [POST_FC, POST_Seas, POST_TF_timeseries, POST_Traf]

    return train, val, test


(855, 2, 7, 10)

### Training Loop:

Run the model given one Hub:


In [None]:
from tqdm.notebook import trange, tqdm
from collections import defaultdict
from sklearn.utils import shuffle

orig = "DFW" #HUB
switch_orig_dist = False # When you want to Check flights to your HUb (VS. From your hub)

DOW= True

# If test_random_masking is False -> Means you have to give it a date for creating a fake today
test_random_masking = False
test_today = '2022-06-01'

yesterday =  datetime.today() - timedelta(days=2)
next_year_today = datetime.today() + timedelta(days=365)

pull_start = '2017-09-01'
pull_end = next_year_today.strftime("%Y-%m-%d")

# Pre: pre-covid period, used for train and validation
Pre_start, Pre_end = '2017-09-01', '2020-01-30'
# Post: post-covid period, used for test
Post_start, Post_end = '2021-07-01',  yesterday.strftime("%Y-%m-%d")
# Train on All:

# Future: Today till one year in future:
future_start , future_end = datetime.today().strftime("%Y-%m-%d") ,   next_year_today.strftime("%Y-%m-%d")

new_market = False # change this to True if it is a new market
ulcc_list = ['NK','SY','F9'] # Spirit SunCountry Frontier 

# Extracting for Seas:
sea_col_fcst = ['week_x', 'week_y', 'forecastDayOfWeek','avgrasm','dowavgrasm','seats_AA_fcst', 'holiday', 'forecastId'] #+ forecastDayOfWeek, FCST
sea_col_Cap = ['week_x', 'week_y','dow_x', 'dow_y', 'avgrasm','seats_AA_fcst','seats_OA_fcst','seats_ulcc_fcst' , 'seats_AA' , 'seats_OA' , 'seats_ulcc']
sea_col = ['week_x', 'week_y', 'dow_x', 'dow_y','avgrasm','dowavgrasm']

# Data reshaping parameters:
train_val_percentage = .9
time_series = False
seasenality_one_dimension = False 
window = 0 

# Model parameters;
epochs = 150
early_stop = 10
sea_dense = 128


kronos_3_timeseries = defaultdict()

hcrt, mos, az = connect_to_servers()

all_dest = find_all_dest_given_leg(orig , hcrt)
all_dest.pop(0) # Remove Austin

print(f"There are {len(all_dest)} FCSTs from {orig}")

if switch_orig_dist:
    main_dest = orig

for dest in tqdm(all_dest):
    if switch_orig_dist:
        dest , orig = main_dest, dest
    print(f" Flights from {orig} to {dest}:")
    # Pull all the FCSTs
    fcst_id_df = get_fcst_given_leg(orig, dest, hcrt )   

    # Pull OAG:
    oag_df = get_oag_data(orig, dest, pull_start, pull_end, ulcc_list, mos)

    #Pull prdMaps:
    prdMaps = get_prdMaps(orig, dest, hcrt)


    # Processing: OAG Per Day:
    oag_kl_total_Per_Day_and_AA = oag_per_day(oag_df)

    all_tensors = [[] for i in range(12)]

    for _,_ , fcst_id , fcst_start , fcst_end in tqdm(fcst_id_df.values):

        hcrt, mos, az = connect_to_servers()

        print(f" ------------- ****** {orig}-{dest}-{fcst_id}) ****** ------------- ")

        # print( fcst_id , fas, adf )
        print( f"fcst_start and fcst_end for {orig}-{dest} at FCST_ID {fcst_id} are: {fcst_start}, {fcst_end}") 

        #  Processing: OAG per FCST:
        oag_kl =  oag_per_fcst(oag_df, fcst_start, fcst_end )

        # Merge and Normalize: OAG per FCST and OAG per Day:
        oag_kl_fcst_total = pd.merge(oag_kl,oag_kl_total_Per_Day_and_AA ,on = "adj_dep_date", how='left',suffixes=('_fcst', '_day'))
        oag_kl_fcst_total = normalize_oag_kl_fcst_total(oag_kl_fcst_total)


        # Pull data from the file: pullData_FullPeriod.py
        df = pull_data(orig,dest,fcst_id,new_market)
        df = pull_seas(df, orig, dest)

        if len(df) < 100:
            print(f"insufficent data for market ({orig}-{dest}-{fcst_id}), IGNORED")
            continue


        df['flightDepartureDate'] = pd.to_datetime(df['flightDepartureDate'], format='%Y/%m/%d')

        # Merge new features (including the total day seats) into current Kronos dataset by dep_date
        df = pd.merge(df,oag_kl_fcst_total, left_on=['flightDepartureDate'],\
              right_on=['adj_dep_date'], how='left')
        df.dropna(inplace=True)

        # processing: Group and pad the DF:
        post = group_and_pad(df)

        # Cut the 'post' data into Pre-covid and Post-covid parts
        Data_PRE = post[ (post['flightDepartureDate']>=Pre_start) & (post['flightDepartureDate']<=Pre_end) ]
        Data_POST = post[ (post['flightDepartureDate']>=Post_start) & (post['flightDepartureDate']<=Post_end) ]
        Data_FUTURE = post[ (post['flightDepartureDate']>=future_start) & (post['flightDepartureDate']<=future_end) ]

        Data_PRE = Data_PRE.reset_index(drop = True)
        Data_POST = Data_POST.reset_index(drop = True)
        Data_FUTURE = Data_FUTURE.reset_index(drop = True)

        if test_today and not test_random_masking:
            if DOW:
                split_date = (datetime.strptime(test_today,'%Y-%m-%d') - timedelta(days=window*7)).strftime("%Y-%m-%d")
            else:
                split_date = (datetime.strptime(test_today,'%Y-%m-%d') - timedelta(days=window)).strftime("%Y-%m-%d")

            Data_PAST = pd.concat([Data_PRE,Data_POST])
            Data_PAST =  Data_PAST[(Data_PAST['flightDepartureDate']>=Post_start)]

            Data_PRE = Data_PAST[Data_PAST['flightDepartureDate']<split_date]
            Data_POST = Data_PAST[Data_PAST['flightDepartureDate']>=split_date]     

            Data_PRE = Data_PRE.reset_index(drop = True)
            Data_POST = Data_POST.reset_index(drop = True)

        print(f" For market ({orig}-{dest}-{fcst_id}) , we have {Data_PRE.shape[0]/14} Pre-Covid data, {Data_POST.shape[0]/14} Post-Covid data, and {Data_FUTURE.shape[0]/14} future data (from now to one year from now)")

        # TODO: This can be edited later, so we use the POST data to train....
        if len(Data_PRE) <= len(Data_POST):
            print(f" *** The Pre Covid data is less than the post covid data, so we ignore {orig}-{dest}-{fcst_id} market ***")
            continue

    #         # TODO: When merging models together this can be useefull for training porposes, but for now not usefull.
        if len(Data_FUTURE)//14 <= 5 * 7:
            print(f" *** There is no future flights for the {orig}-{dest}-{fcst_id} market, so ignored! *** ")
            continue

        if any([Data_PRE.shape[0]/14 <= 10 * 7 , Data_POST.shape[0]/14 <= 10 * 7]):
            print(f" *** Low amount of data for either PRE, or POST of {orig}-{dest}-{fcst_id} market, so ignored! *** ")
            continue

        # MErge FCSTs:

        # Train Kronos 2 (No Additional Feat (using sea_col)):
        use_channels = True
        seasenality_one_dimension = True
        DOW= True
        FC_time_series = False
        traffic_time_series = True 
        window = 10
        test_random_masking = False
        # test_today = '2022-07-01'
        train, val, test = get_train_test_samples2(Data_PRE, Data_POST, Data_FUTURE, sea_col_fcst, prdMaps , DOW= DOW , train_val_percentage = train_val_percentage ,  FC_time_series = FC_time_series , traffic_time_series = traffic_time_series,  use_channels = use_channels , seasenality_one_dimension = seasenality_one_dimension ,  window = window, test_random_masking = test_random_masking, test_today = test_today )

        [all_tensors[i].append(item) for i,item in enumerate( train + val +test)]

    if len(all_tensors[0]) == 0:
        print(f"No data for {orig}-{dest} -> No Model!")

    # Now concatinate and shuffle the data:
    train = [np.concatenate(all_tensors[i]) for i in range(0,4)] 
    val = [np.concatenate(all_tensors[i]) for i in range(4,8)] 
    test = [np.concatenate(all_tensors[i]) for i in range(8,12)] 

    # Shuffle:
    train = shuffle(train[0],train[1],train[2],train[3])
    val = shuffle(val[0],val[1],val[2],val[3])


    kronos32s_test_results3 , kronos32s_hist3,  kronos32s_model3 = kronos_32s_model(para_epochs = 500, para_early_stop =  True, para_model_name = 'kronos32s', para_sea_len = len(sea_col_fcst), para_sea_dense =  160, window = window ,
             train_list = train , val_list = val, test_list = test)

    _ , results_summary = test_acc(kronos32s_test_results3, test[3])

    kronos_3_timeseries[f"{orig}-{dest}"] = results_summary

    print(f"val plot {orig}-{dest}")
    plt.plot(kronos32s_hist3.history['loss'])
    plt.plot(kronos32s_hist3.history['val_loss'])
    plt.title('loss')
    plt.ylabel('MSE')
    plt.xlabel('epoch')
    plt.legend(['train','val'] , loc = "upper left")
    plt.show()
        

        