In [1]:
from datetime import timedelta
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Airport identifiers, in the same four-letter form as the values of the
# `airport` column in the tables loaded below. The cells visible in this
# notebook work on a single entry at a time (see `airport`).
airports = [
    "KATL",
    "KCLT",
    "KDEN",
    "KDFW",
    "KJFK",
    "KMEM",
    "KMIA",
    "KORD",
    "KPHX",
    "KSEA",
]

In [2]:
# Airport whose label file and ETD feature file are loaded by the cells below.
airport = "KATL"

In [3]:
# Directory (relative to the working directory) that holds the per-airport
# `prescreened_train_labels_<airport>.csv.bz2` files.
DATA_DIRECTORY = Path('prescreened train labels')

In [4]:
# Load the training labels for the selected airport. Parse `timestamp` as a
# datetime so this frame is typed consistently with the ETD and submission
# tables loaded later (both of which use `parse_dates`).
pushback = pd.read_csv(
    DATA_DIRECTORY / f"prescreened_train_labels_{airport}.csv.bz2",
    parse_dates=["timestamp"],
)

In [5]:
# Preview the label table (pandas truncates the display of long frames).
pushback

Unnamed: 0,gufi,timestamp,airport,minutes_until_pushback
0,AAL1008.ATL.DFW.210403.1312.0051.TFM_TFDM,2021-04-03 19:30:00,KATL,114
1,AAL1008.ATL.DFW.210403.1312.0051.TFM_TFDM,2021-04-03 19:45:00,KATL,99
2,AAL1008.ATL.DFW.210403.1312.0051.TFM_TFDM,2021-04-03 20:00:00,KATL,84
3,AAL1008.ATL.DFW.210403.1312.0051.TFM_TFDM,2021-04-03 20:15:00,KATL,69
4,AAL1008.ATL.DFW.210403.1312.0051.TFM_TFDM,2021-04-03 20:30:00,KATL,54
...,...,...,...,...
3627352,XSR729.ATL.TLH.210426.2354.0037.TFM,2021-04-27 17:45:00,KATL,75
3627353,XSR729.ATL.TLH.210426.2354.0037.TFM,2021-04-27 18:00:00,KATL,60
3627354,XSR729.ATL.TLH.210426.2354.0037.TFM,2021-04-27 18:15:00,KATL,45
3627355,XSR729.ATL.TLH.210426.2354.0037.TFM,2021-04-27 18:30:00,KATL,30


# Pushback Prediction, 1 Airport & 1 Row

## ETD Data
- `gufi`: GUFI (Global Unique Flight Identifier)
- `timestamp`: The time that the prediction was generated
- `departure_runway_estimated_time`: Estimated time that the flight will depart from the runway

In [6]:
# Directory (relative to the working directory) that holds one sub-folder of
# feature files per airport plus `submission_format.csv`.
EXEC_DIRECTORY = Path('code execution development data')

In [7]:
# Read the ETD messages for the selected airport; both time columns are
# converted to datetimes at load time.
etd = pd.read_csv(
    EXEC_DIRECTORY.joinpath(airport, f"{airport}_etd.csv.bz2"),
    parse_dates=["timestamp", "departure_runway_estimated_time"],
)
etd

Unnamed: 0,gufi,timestamp,departure_runway_estimated_time
0,DAL295.ATL.HSH.201112.1558.0055.TFM,2020-11-14 00:00:00,2020-11-13 16:19:00
1,SWA581.ATL.DEN.201112.1453.0101.TFM,2020-11-13 18:00:10,2020-11-13 15:29:00
2,DAL1149.ATL.DTW.201112.1738.0072.TFM,2020-11-13 18:00:21,2020-11-13 17:53:00
3,DAL1950.ATL.STT.201112.1438.0064.TFM,2020-11-13 18:00:21,2020-11-13 14:51:00
4,DAL1600.ATL.MCO.201112.1738.0059.TFM,2020-11-13 18:00:32,2020-11-13 17:53:00
...,...,...,...
27977,DAL1314.ATL.MCI.201113.2322.0056.TFM,2020-11-15 00:59:30,2020-11-14 23:32:00
27978,DAL1134.ATL.DTW.201114.0053.0003.TFM,2020-11-15 00:59:37,2020-11-15 01:01:00
27979,DAL340.ATL.SEA.201114.0037.0049.TFM,2020-11-15 00:59:42,2020-11-15 00:53:00
27980,EDV5477.ATL.SBN.201114.0007.0052.TFM,2020-11-15 00:59:43,2020-11-15 00:15:00


In [8]:
# Read the table of (flight, prediction time, airport) rows that need a
# prediction; `timestamp` is parsed as a datetime.
submission_format = pd.read_csv(
    EXEC_DIRECTORY.joinpath("submission_format.csv"),
    parse_dates=["timestamp"],
)
submission_format

Unnamed: 0,gufi,timestamp,airport,minutes_until_pushback
0,AAL1227.ATL.MIA.201114.1242.0052.TFM,2020-11-15 11:15:00,KATL,0
1,AAL1227.ATL.MIA.201114.1242.0052.TFM,2020-11-15 12:00:00,KATL,0
2,AAL153.ATL.CLT.201114.1137.0016.TFM,2020-11-15 10:30:00,KATL,0
3,AAL153.ATL.CLT.201114.1137.0016.TFM,2020-11-15 11:15:00,KATL,0
4,AAL1567.ATL.DFW.201114.1257.0271.TFM,2020-11-15 12:00:00,KATL,0
...,...,...,...,...
2041,SKW4444.SEA.RDM.201114.0322.0013.TFM,2020-11-15 02:15:00,KSEA,0
2042,SKW4444.SEA.RDM.201114.0322.0013.TFM,2020-11-15 03:00:00,KSEA,0
2043,UAL214.SEA.SFO.201114.0242.0045.TFM,2020-11-15 01:30:00,KSEA,0
2044,UAL214.SEA.SFO.201114.0242.0045.TFM,2020-11-15 02:15:00,KSEA,0


In [9]:
# Pick one request (positional row 200, an arbitrary choice) as the worked
# example. Its `gufi` and `timestamp` drive the single-flight cells below.
row = submission_format.iloc[200]
row

gufi                      DAL1785.ATL.GSP.201114.0307.0041.TFM
timestamp                                  2020-11-15 02:15:00
airport                                                   KATL
minutes_until_pushback                                       0
Name: 200, dtype: object

In [10]:
# Every ETD message recorded for the example flight.
etd[etd["gufi"] == row["gufi"]]

Unnamed: 0,gufi,timestamp,departure_runway_estimated_time
13766,DAL1785.ATL.GSP.201114.0307.0041.TFM,2020-11-14 03:07:52,2020-11-15 03:14:00
13973,DAL1785.ATL.GSP.201114.0307.0041.TFM,2020-11-14 03:12:53,2020-11-15 03:14:00
24694,DAL1785.ATL.GSP.201114.0307.0041.TFM,2020-11-15 03:13:09,2020-11-15 03:16:00
24699,DAL1785.ATL.GSP.201114.0307.0041.TFM,2020-11-15 03:13:32,2020-11-15 03:16:00
24702,DAL1785.ATL.GSP.201114.0307.0041.TFM,2020-11-15 03:13:33,2020-11-15 03:16:00
24865,DAL1785.ATL.GSP.201114.0307.0041.TFM,2020-11-15 03:23:36,2020-11-15 03:16:00
25032,DAL1785.ATL.GSP.201114.0307.0041.TFM,2020-11-15 03:38:06,2020-11-15 03:16:00
25034,DAL1785.ATL.GSP.201114.0307.0041.TFM,2020-11-15 03:38:09,2020-11-15 03:16:00
25118,DAL1785.ATL.GSP.201114.0307.0041.TFM,2020-11-15 03:44:57,2020-11-15 03:16:00
25123,DAL1785.ATL.GSP.201114.0307.0041.TFM,2020-11-15 03:45:27,2020-11-15 03:16:00


In [11]:
# ETD messages for the example flight that were issued in the 30 hours up to
# (and including) the prediction time -- nothing from the future is used.
now_etd = etd[
    (etd["gufi"] == row["gufi"])
    & (etd["timestamp"] <= row["timestamp"])
    & (etd["timestamp"] > row["timestamp"] - timedelta(hours=30))
]
now_etd

Unnamed: 0,gufi,timestamp,departure_runway_estimated_time
13766,DAL1785.ATL.GSP.201114.0307.0041.TFM,2020-11-14 03:07:52,2020-11-15 03:14:00
13973,DAL1785.ATL.GSP.201114.0307.0041.TFM,2020-11-14 03:12:53,2020-11-15 03:14:00
26214,DAL1785.ATL.GSP.201114.0307.0041.TFM,2020-11-15 01:08:54,2020-11-15 03:14:00
26221,DAL1785.ATL.GSP.201114.0307.0041.TFM,2020-11-15 01:09:19,2020-11-15 03:14:00


In [12]:
# Estimate pushback as the most recent estimated runway departure time minus a
# fixed 15 minutes. Order by `timestamp` first: `etd` has not been sorted at
# this point (its displayed head is not chronological), so the last row by
# position is not guaranteed to be the latest message.
flight_pushback = now_etd.sort_values("timestamp").iloc[
    -1
].departure_runway_estimated_time - timedelta(minutes=15)
flight_pushback

Timestamp('2020-11-15 02:59:00')

In [13]:
# Whole minutes between the prediction time and the estimated pushback,
# rounded to the nearest integer.
flight_minutes_to_pushback = np.rint(
    (flight_pushback - row["timestamp"]).total_seconds() / 60
).astype(int)
flight_minutes_to_pushback

44

# Pushback Prediction, 1 Airport & All Rows

In [14]:
# All requests for the selected airport that share the example row's
# prediction time, re-indexed from 0.
now_submission_format = submission_format[
    (submission_format["airport"] == airport)
    & (submission_format["timestamp"] == row["timestamp"])
].reset_index(drop=True)
now_submission_format

Unnamed: 0,gufi,timestamp,airport,minutes_until_pushback
0,DAL1084.ATL.MSP.201114.0307.0032.TFM,2020-11-15 02:15:00,KATL,0
1,DAL1113.ATL.PHX.201114.0252.0062.TFM,2020-11-15 02:15:00,KATL,0
2,DAL1131.ATL.DTW.201114.0312.0019.TFM,2020-11-15 02:15:00,KATL,0
3,DAL1307.ATL.EWR.201114.0257.0069.TFM,2020-11-15 02:15:00,KATL,0
4,DAL1334.ATL.PHL.201114.0252.0052.TFM,2020-11-15 02:15:00,KATL,0
5,DAL1335.ATL.MKE.201114.0222.0054.TFM,2020-11-15 02:15:00,KATL,0
6,DAL1374.ATL.ORD.201114.0337.0018.TFM,2020-11-15 02:15:00,KATL,0
7,DAL1415.ATL.FLL.201114.0252.0053.TFM,2020-11-15 02:15:00,KATL,0
8,DAL1422.ATL.MIA.201114.0317.0018.TFM,2020-11-15 02:15:00,KATL,0
9,DAL1471.ATL.IAD.201114.0257.0081.TFM,2020-11-15 02:15:00,KATL,0


In [15]:
# Order the ETD messages chronologically so that `.last()` below returns the
# most recent estimate per flight. The sorted frame is bound back to `etd`
# (instead of sorting in place) because later code that takes
# `groupby("gufi").last()` on `etd` relies on this ordering.
etd = etd.sort_values("timestamp")

# Keep only messages issued in the 30 hours up to the prediction time.
now_etd = etd.loc[
    (etd.timestamp > row.timestamp - timedelta(hours=30))
    & (etd.timestamp <= row.timestamp)
]

# Latest estimated runway departure time per flight, indexed by `gufi`.
latest_now_etd = now_etd.groupby("gufi").last().departure_runway_estimated_time

In [16]:
# Attach the latest ETD to each requested flight. The left join keeps every
# request, so flights without an ETD in the window get a missing value.
departure_runway_estimated_time = now_submission_format.merge(
    latest_now_etd, on="gufi", how="left"
)["departure_runway_estimated_time"]
departure_runway_estimated_time

0    2020-11-15 03:16:00
1    2020-11-15 03:06:00
2    2020-11-15 03:24:00
3    2020-11-15 03:16:00
4    2020-11-15 02:55:00
5    2020-11-15 02:35:00
6    2020-11-15 03:40:00
7    2020-11-15 02:58:00
8    2020-11-15 03:23:00
9    2020-11-15 03:08:00
10   2020-11-15 02:53:00
11   2020-11-15 03:10:00
12   2020-11-15 02:34:00
13   2020-11-15 03:22:00
14   2020-11-15 03:37:00
15   2020-11-15 02:36:00
16   2020-11-15 03:26:00
17   2020-11-15 03:29:00
18   2020-11-15 03:43:00
19   2020-11-15 02:56:00
20   2020-11-15 03:17:00
21   2020-11-15 03:42:00
22   2020-11-15 03:41:00
23   2020-11-15 03:40:00
24   2020-11-15 03:01:00
25   2020-11-15 03:23:00
26   2020-11-15 03:05:00
27   2020-11-15 03:39:00
28   2020-11-15 03:21:00
29   2020-11-15 03:22:00
30   2020-11-15 03:14:00
31   2020-11-15 03:13:00
32   2020-11-15 02:55:00
33   2020-11-15 03:47:00
34   2020-11-15 02:29:00
35   2020-11-15 03:04:00
36   2020-11-15 03:05:00
37   2020-11-15 03:05:00
38   2020-11-15 02:58:00
39   2020-11-15 03:01:00


In [17]:
# Minutes from the prediction time to the estimated runway departure, less a
# fixed 15 minutes, as the estimate of minutes until pushback.
estimated_pushback = (
    (departure_runway_estimated_time - now_submission_format["timestamp"])
    .dt.total_seconds()
    .div(60)
    .sub(15)
)

In [18]:
# Floor negative estimates at 0, then round to the nearest minute before the
# integer cast so fractional minutes are rounded (as in the single-flight
# example) rather than truncated.
# NOTE(review): the cast raises if any flight has no ETD (missing value) --
# confirm whether that can occur for this airport's requests.
estimated_pushback = estimated_pushback.clip(lower=0).round().astype(int)
estimated_pushback

0     46
1     36
2     54
3     46
4     25
5      5
6     70
7     28
8     53
9     38
10    23
11    40
12     4
13    52
14    67
15     6
16    56
17    59
18    73
19    26
20    47
21    72
22    71
23    70
24    31
25    53
26    35
27    69
28    51
29    52
30    44
31    43
32    25
33    77
34     0
35    34
36    35
37    35
38    28
39    31
40    26
41    26
42    40
43    27
44    41
45    34
46    31
47    27
dtype: int32

In [32]:
def estimate_pushback(
    now: pd.Timestamp,
    submission: "pd.DataFrame | None" = None,
    etd_data: "pd.DataFrame | None" = None,
) -> pd.DataFrame:
    """Estimate minutes until pushback for every flight requested at ``now``.

    For each requested flight, the most recent ETD message issued within the
    30 hours up to ``now`` is looked up, and the estimate is the time from
    ``now`` to that estimated runway departure minus a fixed 15 minutes,
    floored at 0.

    Parameters
    ----------
    now
        Prediction time; only submission rows with exactly this ``timestamp``
        are returned, and only ETD messages with ``timestamp <= now`` are used.
    submission
        Table with ``gufi`` and ``timestamp`` columns listing the requested
        predictions. Defaults to the notebook-level ``submission_format``.
    etd_data
        Table with ``gufi``, ``timestamp`` and
        ``departure_runway_estimated_time`` columns. Defaults to the
        notebook-level ``etd``. It does not need to be pre-sorted.

    Returns
    -------
    pd.DataFrame
        Copy of the matching submission rows with ``minutes_until_pushback``
        replaced by the estimate (float). Flights with no ETD message in the
        window get NaN -- this includes rows for other airports when
        ``etd_data`` only covers a single airport.
    """
    if submission is None:
        submission = submission_format
    if etd_data is None:
        etd_data = etd

    # subset submission format to the current prediction time
    now_submission_format = submission.loc[
        submission.timestamp == now
    ].reset_index(drop=True)

    # filter features to 30 hours before prediction time to prediction time
    in_window = (etd_data.timestamp > now - timedelta(hours=30)) & (
        etd_data.timestamp <= now
    )

    # Sort here rather than trusting the caller: `.last()` below only means
    # "most recent" when rows are in chronological order. A stable sort keeps
    # the incoming order for messages that share a timestamp.
    now_etd = etd_data.loc[in_window].sort_values("timestamp", kind="mergesort")

    # get the latest ETD for each flight
    latest_now_etd = now_etd.groupby("gufi").last().departure_runway_estimated_time

    # merge the latest ETD with the flights we are predicting
    departure_runway_estimated_time = now_submission_format.merge(
        latest_now_etd, how="left", on="gufi"
    ).departure_runway_estimated_time

    minutes_until_pushback = (
        (departure_runway_estimated_time - now_submission_format.timestamp).dt.total_seconds()
        / 60
    ) - 15

    now_prediction = now_submission_format.copy()
    # A negative value means the estimated pushback is already in the past;
    # report 0 instead. NaN (no ETD available) is left untouched by clip.
    now_prediction["minutes_until_pushback"] = minutes_until_pushback.clip(lower=0)

    return now_prediction

In [33]:
# ! pip install tqdm

In [34]:
from tqdm import tqdm
from tqdm.contrib.concurrent import process_map

In [44]:
# Distinct prediction times requested for the selected airport.
# `airport_submission_format` is not defined anywhere in the visible notebook
# (NameError on a fresh kernel), so the per-airport subset is taken directly
# from `submission_format` here.
pd.to_datetime(
    submission_format.loc[submission_format.airport == airport, "timestamp"].unique()
)

DatetimeIndex(['2020-11-15 11:15:00', '2020-11-15 12:00:00',
               '2020-11-15 10:30:00', '2020-11-15 09:45:00',
               '2020-11-15 00:00:00', '2020-11-15 00:45:00',
               '2020-11-15 01:30:00', '2020-11-15 02:15:00',
               '2020-11-15 03:00:00', '2020-11-15 03:45:00'],
              dtype='datetime64[ns]', freq=None)

In [47]:
# Run the baseline for one of the requested prediction times.
estimate_pushback(pd.Timestamp("2020-11-15 11:15:00"))

Unnamed: 0,gufi,timestamp,airport,minutes_until_pushback
0,AAL1227.ATL.MIA.201114.1242.0052.TFM,2020-11-15 11:15:00,KATL,90.0
1,AAL153.ATL.CLT.201114.1137.0016.TFM,2020-11-15 11:15:00,KATL,23.0
2,AAL251.ATL.ORD.201114.1122.0010.TFM,2020-11-15 11:15:00,KATL,11.0
3,DAL1143.ATL.DTW.201114.1202.0055.TFM,2020-11-15 11:15:00,KATL,45.0
4,DAL1492.ATL.MCO.201114.1207.0044.TFM,2020-11-15 11:15:00,KATL,52.0
...,...,...,...,...
79,UAL1754.ORD.IAH.201114.1158.0068.TFM,2020-11-15 11:15:00,KORD,
80,UAL2822.ORD.NRT.201114.1227.0135.TFM,2020-11-15 11:15:00,KORD,
81,UAL2824.ORD.NRT.201114.1157.0035.TFM,2020-11-15 11:15:00,KORD,
82,UPS859.PHX.SDF.201114.2109.0014.TFM,2020-11-15 11:15:00,KPHX,
