In [2]:
import os
import pandas as pd
import numpy as np
import glob
import datetime
from datetime import timedelta
import h5py
import bz2
import re
import glob
from tqdm import tqdm

In [13]:
def load_actual_arrivals(airport_code, data_dir='data'):
    """Load the actual runway arrival times recorded for one airport.

    Searches ``<data_dir>/<airport_code>/`` recursively for files named
    ``<airport_code>_*runways_data_set.csv`` and collects the non-null values
    of their ``arrival_runway_actual_time`` column.

    Args:
        airport_code: Airport identifier, used both as the sub-directory name
            and as the file-name prefix (e.g. ``'KJFK'``).
        data_dir: Root directory containing the per-airport folders.

    Returns:
        A DataFrame with no data columns whose index, named
        ``arrival_runway_actual_time``, holds the arrival timestamps in the
        order the files were read (not sorted). The frame is empty when no
        matching file is found.
    """
    time_col = 'arrival_runway_actual_time'
    pattern = os.path.join(data_dir, airport_code, '**', f'{airport_code}_*runways_data_set.csv')
    runways_files = glob.glob(pattern, recursive=True)

    # pd.concat raises ValueError on an empty list, so return an empty frame
    # with the same index name when nothing matches.
    if not runways_files:
        return pd.DataFrame(index=pd.DatetimeIndex([], name=time_col))

    # Only one column is needed; usecols avoids parsing the rest of each CSV.
    frames = [
        pd.read_csv(path, usecols=[time_col], parse_dates=[time_col]).dropna()
        for path in runways_files
    ]

    arrivals_df = pd.concat(frames)
    # Safety net: read_csv can leave the column as object dtype when a file
    # contains values it could not parse as dates.
    arrivals_df[time_col] = pd.to_datetime(arrivals_df[time_col])

    return arrivals_df.set_index(time_col)

# Actual runway arrival timestamps for KJFK, read from the default 'data' directory.
actual_arrival_df = load_actual_arrivals('KJFK')

2023-01-07 00:01:37
2023-01-07 00:03:36
2023-01-07 00:05:52
2023-01-07 00:09:12
2023-01-07 00:11:14
...
2022-12-12 23:54:47
2022-12-12 23:55:00
2022-12-12 23:56:33
2022-12-12 23:57:18
2022-12-12 23:58:06


In [16]:
def load_estimated_arrivals(airport_code, data_dir='data'):
    """Load estimated runway arrival times from the TFM track files of one airport.

    Searches ``<data_dir>/<airport_code>/`` recursively for files named
    ``<airport_code>_*TFM_track_data_set.csv`` and keeps the rows where both
    ``timestamp`` and ``arrival_runway_estimated_time`` are present.

    Args:
        airport_code: Airport identifier, used both as the sub-directory name
            and as the file-name prefix (e.g. ``'KJFK'``).
        data_dir: Root directory containing the per-airport folders.

    Returns:
        A DataFrame with the datetime columns ``timestamp`` and
        ``arrival_runway_estimated_time`` (in that order). Row labels are the
        per-file labels carried over by ``pd.concat``. The frame is empty when
        no matching file is found.
    """
    columns = ['timestamp', 'arrival_runway_estimated_time']
    pattern = os.path.join(data_dir, airport_code, '**', f'{airport_code}_*TFM_track_data_set.csv')
    tfm_files = glob.glob(pattern, recursive=True)

    # pd.concat raises ValueError on an empty list, so return an empty,
    # correctly typed frame when nothing matches.
    if not tfm_files:
        return pd.DataFrame({name: pd.Series(dtype='datetime64[ns]') for name in columns})

    est_arrivals = []
    for file in tqdm(tfm_files, desc='Loading TFM track data [estimated arrivals]'):
        # usecols skips parsing every other column of these large CSVs.
        df = pd.read_csv(file, usecols=columns, parse_dates=columns)
        # usecols keeps the file's column order; re-select to fix the order.
        est_arrivals.append(df[columns].dropna())

    est_arrivals_df = pd.concat(est_arrivals)
    # Safety net: read_csv can leave a column as object dtype when a file
    # contains values it could not parse as dates.
    for name in columns:
        est_arrivals_df[name] = pd.to_datetime(est_arrivals_df[name])

    return est_arrivals_df
# Estimated runway arrival times for KJFK from the TFM track files under the default 'data' directory.
est_arrival_df = load_estimated_arrivals('KJFK')

100%|██████████| 279/279 [01:27<00:00,  3.19it/s]


Unnamed: 0,timestamp,arrival_runway_estimated_time
0,2023-07-03 00:00:00,2023-07-03 00:26:48
1,2023-07-03 08:00:00,2023-07-03 10:03:28
2,2023-07-03 08:00:03,2023-07-03 11:03:08
3,2023-07-03 08:00:05,2023-07-03 12:08:26
4,2023-07-03 08:00:05,2023-07-03 11:31:30
...,...,...
175501,2023-07-22 01:59:55,2023-07-22 02:10:31
175502,2023-07-22 01:59:56,2023-07-22 02:13:15
175503,2023-07-22 01:59:57,2023-07-22 02:27:37
175504,2023-07-22 01:59:57,2023-07-22 02:46:04


In [41]:
from metar import Metar

def parse_metar_string(metar_str):
    """Extract a handful of weather fields from one raw METAR report.

    Args:
        metar_str: The raw METAR text to decode with ``Metar.Metar``.

    Returns:
        A dict with the keys ``wind_speed``, ``visibility``, ``temperature``,
        ``dewpoint`` and ``pressure``. Temperature and dew point are requested
        in 'C' and pressure in 'hPa'; wind speed and visibility are taken with
        the library's default units. A field missing from the report is
        ``np.nan``, and every field is ``np.nan`` when the report raises
        ``Metar.ParserError``.
    """
    field_names = ('wind_speed', 'visibility', 'temperature', 'dewpoint', 'pressure')
    record = {name: np.nan for name in field_names}

    try:
        report = Metar.Metar(metar_str)
        decoded = dict(record)
        # Overwrite the NaN default only for fields the report provides.
        if report.wind_speed:
            decoded['wind_speed'] = report.wind_speed.value()
        if report.vis:
            decoded['visibility'] = report.vis.value()
        if report.temp:
            decoded['temperature'] = report.temp.value(units='C')
        if report.dewpt:
            decoded['dewpoint'] = report.dewpt.value(units='C')
        if report.press:
            decoded['pressure'] = report.press.value('hPa')
        return decoded
    except Metar.ParserError:
        # Undecodable report: fall back to the all-NaN record.
        return record

def load_metar_data(airport_code, data_dir='data'):
    """Load and decode the METAR reports that mention one airport.

    Reads every ``*.txt`` file below ``<data_dir>/METAR_train``. Each file is
    treated as a sequence of three-line records: a date line in
    ``%Y/%m/%d %H:%M`` format, the raw METAR line, and a blank line. Records
    whose METAR line contains ``airport_code`` are decoded with
    ``parse_metar_string``.

    Args:
        airport_code: Station identifier to look for in each METAR line
            (substring match).
        data_dir: Root directory containing the ``METAR_train`` folder.

    Returns:
        A DataFrame indexed by ``timestamp`` with the columns ``wind_speed``,
        ``visibility``, ``temperature``, ``dewpoint`` and ``pressure``. The
        frame is empty when there are no files or no matching records.
    """
    value_columns = ['wind_speed', 'visibility', 'temperature', 'dewpoint', 'pressure']

    def _empty_frame():
        # Same layout as a populated result, so callers need no special case.
        return pd.DataFrame(
            columns=value_columns,
            index=pd.DatetimeIndex([], name='timestamp'),
            dtype=float,
        )

    # '**' only descends into nested folders when recursive=True; without it
    # the pattern silently behaves like a single '*'.
    metar_files = glob.glob(os.path.join(data_dir, 'METAR_train', '**', '*.txt'), recursive=True)
    if not metar_files:
        return _empty_frame()

    metar_data = []

    for file in (pbar := tqdm(metar_files, desc='LOADING METAR DATA')):
        pbar.set_postfix_str(f'LOADING METAR DATA: {file}')
        encoding = 'utf-8'
        try:
            with open(file, 'r', encoding=encoding, errors='ignore') as f:
                lines = f.readlines()
            # Line 1 is the date, line 2 is the data, line 3 is blank.
            for i in range(0, len(lines) - 1, 3):
                data_line = lines[i + 1].strip()
                if airport_code not in data_line:
                    continue
                # Parse the date only for matching records: this skips the
                # costly conversion for every other station, and a malformed
                # date on an unrelated record no longer discards the file.
                parsed_data = parse_metar_string(data_line)
                parsed_data['timestamp'] = pd.to_datetime(lines[i].strip(), format='%Y/%m/%d %H:%M')
                metar_data.append(parsed_data)
        except Exception as e:
            # Best effort: report the problem and move on to the next file.
            print(f"Error reading file {file} with encoding {encoding}: {e}")
            continue

    # A DataFrame built from an empty list has no 'timestamp' column, which
    # would make the dropna below raise KeyError.
    if not metar_data:
        return _empty_frame()

    metar_df = pd.DataFrame(metar_data).dropna(subset=['timestamp'])
    metar_df = metar_df.assign(timestamp=pd.to_datetime(metar_df['timestamp']))

    return metar_df.set_index('timestamp')
# Decoded METAR observations mentioning KJFK, indexed by report timestamp.
metar_df = load_metar_data('KJFK')

LOADING METAR DATA:   0%|          | 13/6639 [00:50<7:08:09,  3.88s/it, LOADING METAR DATA: data/METAR_train/METAR_train_part_1/metar.20220909.19Z.txt]


KeyboardInterrupt: 