In [39]:
import os
import pandas as pd
import numpy as np
import glob
import datetime
from datetime import datetime
import h5py
import bz2
import re
import glob
from tqdm import tqdm

In [13]:
def load_actual_arrivals(airport_code, data_dir='data'):
    # runways_files = glob.glob(os.path.join(data_dir, 'FUSER_test', airport_code, '**', f'{airport_code}_*runways_data_set.csv'), recursive=True)
    runways_files = glob.glob(os.path.join(data_dir, airport_code, '**', f'{airport_code}_*runways_data_set.csv'), recursive=True)

    arrival_times = []

    for file in runways_files:
        df = pd.read_csv(file, parse_dates=['arrival_runway_actual_time'])
        df = df[['arrival_runway_actual_time']].dropna()
        arrival_times.append(df)

    arrivals_df = pd.concat(arrival_times)
    arrivals_df['arrival_runway_actual_time'] = pd.to_datetime(arrivals_df['arrival_runway_actual_time'])
    arrivals_df.set_index('arrival_runway_actual_time', inplace=True)

    return arrivals_df

actual_arrival_df = load_actual_arrivals('KJFK')

2023-01-07 00:01:37
2023-01-07 00:03:36
2023-01-07 00:05:52
2023-01-07 00:09:12
2023-01-07 00:11:14
...
2022-12-12 23:54:47
2022-12-12 23:55:00
2022-12-12 23:56:33
2022-12-12 23:57:18
2022-12-12 23:58:06


In [16]:
def load_estimated_arrivals(airport_code, data_dir='data'):
    tfm_files = glob.glob(os.path.join(data_dir, airport_code, '**', f'{airport_code}_*TFM_track_data_set.csv'), recursive=True)
    est_arrivals = []

    for file in tqdm(tfm_files, desc='Loading TFM track data [estimated arrivals]'):
        df = pd.read_csv(file, parse_dates=['timestamp', 'arrival_runway_estimated_time'])
        df = df[['timestamp', 'arrival_runway_estimated_time']].dropna()
        est_arrivals.append(df)

    est_arrivals_df = pd.concat(est_arrivals)
    est_arrivals_df['timestamp'] = pd.to_datetime(est_arrivals_df['timestamp'])
    est_arrivals_df['arrival_runway_estimated_time'] = pd.to_datetime(est_arrivals_df['arrival_runway_estimated_time'])

    return est_arrivals_df
est_arrival_df = load_estimated_arrivals('KJFK')

100%|██████████| 279/279 [01:27<00:00,  3.19it/s]


Unnamed: 0,timestamp,arrival_runway_estimated_time
0,2023-07-03 00:00:00,2023-07-03 00:26:48
1,2023-07-03 08:00:00,2023-07-03 10:03:28
2,2023-07-03 08:00:03,2023-07-03 11:03:08
3,2023-07-03 08:00:05,2023-07-03 12:08:26
4,2023-07-03 08:00:05,2023-07-03 11:31:30
...,...,...
175501,2023-07-22 01:59:55,2023-07-22 02:10:31
175502,2023-07-22 01:59:56,2023-07-22 02:13:15
175503,2023-07-22 01:59:57,2023-07-22 02:27:37
175504,2023-07-22 01:59:57,2023-07-22 02:46:04


In [89]:
from metar import Metar

def parse_metar_string(metar_str):
    try:
        m = Metar.Metar(metar_str)
        wind_speed = m.wind_speed.value() if m.wind_speed else np.nan
        wind_dir = m.wind_dir.value() if m.wind_dir else np.nan
        visibility = m.vis.value() if m.vis else np.nan
        temperature = m.temp.value(units='C') if m.temp else np.nan
        dewpoint = m.dewpt.value(units='C') if m.dewpt else np.nan
        pressure = m.press.value('hPa') if m.press else np.nan
        try:
            weather = m.present_weather() if m.weather else ''
        except KeyError as e:
            print(f'Error parsing weather data {e}, setting to None')
            weather = None
        # cloud = ';'.join([','.join([str(i) for i in alt]) for alt in m.sky])
        cloud = m.sky
        return {
            'wind_speed': wind_speed,
            'wind_dir': wind_dir,
            'visibility': visibility,
            'temperature': temperature,
            'dewpoint': dewpoint,
            'pressure': pressure,
            'weather': weather,
            'cloud': cloud
        }
    except Metar.ParserError:
        return {
            'wind_speed': None,
            'wind_dir': None,
            'visibility': None,
            'temperature': None,
            'dewpoint': None,
            'pressure': None,
            'weather': None,
            'cloud': None
        }

def filter_file_time(files, start, end):
    file_times = []
    # TODO: move this to an __init__
    for filename in files:
        match =  re.match(r'.*(\d{8})\.(\d{2})Z.*', os.path.basename(filename))
        date_part = match.group(1)  # '20221011'
        time_part = match.group(2)  # '17'
        dt = datetime.strptime(f"{date_part}{time_part}", "%Y%m%d%H")
        file_times.append((filename, dt))

    return [filename for filename, dt in file_times if start <= dt <= end]

def get_df(data, start, end):
    df = pd.DataFrame(data)
    df.dropna(subset=['timestamp'], inplace=True)
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df = df[(df['timestamp'] >= start) & (df['timestamp'] <= end)]
    df.set_index('timestamp', inplace=True)
    return df

def load_metar_data(airport_code, start, end, data_dir='data'):
    metar_files = glob.glob(os.path.join(data_dir, 'METAR_train', '**', '*.txt'))


    metar_files = filter_file_time(metar_files, start, end)

    metar_data = []
    for file in (pbar := tqdm(metar_files, desc='LOADING METAR DATA')):
        pbar.set_postfix_str(f'LOADING METAR DATA: {file}')
        # encoding = detect_file_encoding(file)
        encoding = 'utf-8'
        try:
            with open(file, 'r', encoding=encoding , errors='ignore') as f:
                lines = f.readlines()
                for i in range(0, len(lines), 3):
                    line = i
                    if i+1 >= len(lines):
                        # Skip if there's an incomplete pair
                        continue
                    date_line = lines[i].strip()
                    data_line = lines[i+1].strip()
                    # print(date_line, data_line, sep='\n')
                    date = pd.to_datetime(date_line, format='%Y/%m/%d %H:%M')
                    if airport_code in data_line:
                        parsed_data = parse_metar_string(data_line)
                        parsed_data['timestamp'] = date
                        metar_data.append(parsed_data)
        except Exception as e:
            print(f"Error reading file {file} with encoding {encoding}: {e}")
            continue

    return get_df(metar_data, start, end)


metar_df = load_metar_data("JFK", datetime(2022, 9, 1, 10, 0), datetime(2022, 9, 1, 10, 30))
metar_df

LOADING METAR DATA: 100%|██████████| 1/1 [00:05<00:00,  5.56s/it, LOADING METAR DATA: data/METAR_train/METAR_train_part_1/metar.20220901.10Z.txt]


Unnamed: 0_level_0,wind_speed,wind_dir,visibility,temperature,dewpoint,pressure,weather,cloud
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2022-09-01 10:00:00,6.0,130.0,10000.0,27.0,24.0,1011.0,,"[(FEW, 1500 feet, None), (SCT, 2500 feet, None..."
2022-09-01 10:00:00,6.0,130.0,10000.0,27.0,24.0,1011.0,,"[(FEW, 1500 feet, None), (SCT, 2500 feet, None..."
2022-09-01 10:00:00,6.0,130.0,10000.0,27.0,24.0,1011.0,,"[(FEW, 1500 feet, None), (SCT, 2500 feet, None..."
2022-09-01 10:00:00,6.0,130.0,10000.0,27.0,24.0,1011.0,,"[(FEW, 1500 feet, None), (SCT, 2500 feet, None..."
2022-09-01 10:00:00,6.0,130.0,10000.0,27.0,24.0,1011.0,,"[(FEW, 1500 feet, None), (SCT, 2500 feet, None..."
2022-09-01 10:00:00,6.0,130.0,10000.0,27.0,24.0,1011.0,,"[(FEW, 1500 feet, None), (SCT, 2500 feet, None..."


In [None]:
def load_taf_data(airport_code, start, end, data_dir):
    taf_files = glob.glob(os.path.join(data_dir, 'taf.*.txt'))
    filter_file_time(taf_files, start, end)
    taf_data = []

    for file in tqdm(taf_files):
        with open(file, 'r') as f:
            lines = f.readlines()
            for i in range(0, len(lines), 3):
                try:
                    date = pd.to_datetime(lines[i].strip(), format='%Y/%m/%d %H:%M')
                except Exception as e:
                    pass

                data_line = lines[i+1].strip()
                if airport_code in data_line:
                    # Extract relevant forecast info
                    # Parsing TAF is complex; for this example, store raw data
                    taf_data.append({'timestamp': date, 'taf': data_line})

    return get_df(taf_data, start, end)

load_taf_data('KJFK', datetime(2022, 9, 1, 10, 0), datetime(2022, 9, 1, 10, 30), './data/TAF_train')


  0%|          | 0/1108 [00:00<?, ?it/s]