In [107]:
import pandas as pd
import numpy as np
import os
import tarfile
import urllib.request
import gzip
import shutil
import re
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler

In [112]:
airline_icao_codes = [
    # Star Alliance
    'AEE', 'ACA', 'CCA', 'AIC', 'ANZ', 'ANA', 'AAR', 'AUA', 'AVA', 'BEL', 'CMP', 'CTN', 
    'MSR', 'ETH', 'EVA', 'LOT', 'DLH', 'CSZ', 'SIA', 'SAA', 'SWR', 'TAP', 'THA', 'THY',

    # Oneworld
    'BAW', 'CPA', 'FJI', 'FIN', 'IBE', 'JAL', 'MAS', 'QFA', 'QTR', 'RAM', 'RJA', 'ALK',

    # SkyTeam
    'ARG', 'AMX', 'AEA', 'AFR', 'CAL', 'CES', 'GIA', 'KQA', 'KLM', 'KAL', 'MEA', 'SVA',
    'SAS', 'ROT', 'HVN', 'VIR', 'CXA', 'AFL', 

    # Other flag carriers
    'BBC', 'TAM', 'EIN', 'ELY', 'BWA', 'PIA', 'ETD', 'UAE', 'TUA', 'UZB', 'VCV', 'PAL', 
    'MGL', 'KZR', 'GFA', 'AUI', 'TAR', 'DAH',

    # Low cost carriers
    'RYR', 'IGO', 'EZY', 'AXM', 'GLO', 'NOZ', 'VLG', 'WZZ', 'JST'
]

In [14]:
#import pandas as pd
#import numpy as np
#import os
#import tarfile
#import urllib.request
#import gzip
#import shutil
#import re
#from sklearn.preprocessing import StandardScaler

def is_valid_callsign(callsign):
  if not isinstance(callsign, str):
    return False

  callsign = callsign.strip().upper()

  # must be at least 4 chrs (eg 'UAL1') and at most 8
  if not (4 <= len(callsign) <= 8):
    return False

  prefix = callsign[:3]
  suffix = callsign[3:]

  # prefix must be all letters (UAL, SIA, BAW)
  if not prefix.isalpha():
    return False

  # suffix: 1–4 digits, possibly ending in 1 letter
  if not re.fullmatch(r'\d{1,4}[A-Z]?', suffix):
    return False

  return True

BASE_URL = "https://s3.opensky-network.org/data-samples/states/.2019-07-15"
HOURS = [f"{h:02d}" for h in range(24)]
MASTER_DF = []

for h in tqdm(HOURS): 
    # define download URL
    #print(f"processing hour {h}...")
    filename = f"states_2019-07-15-{h}.csv.tar"
    url = f"{BASE_URL}/{h}/{filename}"
    local_tar = f"./temp/{filename}"
    
    # download tarball from OpenSky
    os.makedirs("./temp", exist_ok=True)
    urllib.request.urlretrieve(url, local_tar)

    # extract .csv.gz
    with tarfile.open(local_tar, "r") as tar:
        tar.extractall("./temp")
        
    for name in os.listdir("./temp"):
        if name.endswith(".csv.gz") and name.startswith("states_2019-07-15"):
            gz_path = f"./temp/{name}"
            csv_path = gz_path[:-3]
    
            # decompress .gz
            with gzip.open(gz_path, 'rb') as f_in:
                with open(csv_path, 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)
    
            # load to pandas and filter
            df = pd.read_csv(csv_path)
            df.dropna(subset=["time", "callsign", "lat", "lon", "velocity", "heading", "baroaltitude"], inplace=True)
            MASTER_DF.append(df)
            os.remove(gz_path)
            os.remove(csv_path)

    os.remove(local_tar)

master_df = pd.concat(MASTER_DF, ignore_index=True)

master_df["callsign"] = master_df["callsign"].str.strip()
    
# scale altitude and velocity globally
scaler = StandardScaler()
scaler.fit(master_df[["velocity", "baroaltitude"]].values)
velocity = master_df["velocity"].values.reshape(-1, 1)
altitude = master_df["baroaltitude"].values.reshape(-1, 1)
scaled = scaler.transform(np.hstack([velocity, altitude]))
velocity_z = scaled[:, 0]
altitude_z = scaled[:, 1]
master_df["velocity"] = velocity_z
master_df["altitude"] = altitude_z
    
# transform heading to sin/cos 
heading_rad = np.deg2rad(master_df["heading"].values)
sin_heading = np.sin(heading_rad)
cos_heading = np.cos(heading_rad)
master_df["sin_heading"] = sin_heading
master_df["cos_heading"] = cos_heading
    
cleaned_data = master_df[["time", "callsign", "lat", "lon", "velocity", "sin_heading", "cos_heading", "altitude"]]

valid_mask = cleaned_data['callsign'].apply(is_valid_callsign)
cleaned_data = cleaned_data[valid_mask].copy()

# save locally as parquet (~50M rows, ~2GB)
cleaned_data.to_parquet("opensky_2019-07-15_raw.parquet")
print("All hourly files processed and combined.")

processing hour 00...
processing hour 01...
processing hour 02...
processing hour 03...
processing hour 04...
processing hour 05...
processing hour 06...
processing hour 07...
processing hour 08...
processing hour 09...
processing hour 10...
processing hour 11...
processing hour 12...
processing hour 13...
processing hour 14...
processing hour 15...
processing hour 16...
processing hour 17...
processing hour 18...
processing hour 19...
processing hour 20...
processing hour 21...
processing hour 22...
processing hour 23...
All hourly files processed and combined.


In [67]:
MAX_FLIGHT_DURATION = 8 * 3600  # 8 hours in seconds
RESAMPLE_INTERVAL = 60  # seconds
TARGET_LENGTH = 200 # target vector dimension for similarity

def preprocess_flight(df_flight):
    df_flight = df_flight.sort_values("time")
    duration = df_flight["time"].iloc[-1] - df_flight["time"].iloc[0]
    
    if duration > MAX_FLIGHT_DURATION:
        return None
    
    start_time = df_flight["time"].iloc[0]
    df_flight["elapsed"] = df_flight["time"] - start_time
    # resample to 200 steps
    idxs = np.linspace(0, len(df_flight) - 1, TARGET_LENGTH).astype(int)
    df_flight = df_flight.iloc[idxs]
    
    origin_lat = df_flight.iloc[0]["lat"]
    origin_lon = df_flight.iloc[0]["lon"]
    df = df_flight.copy()
    df["delta_lat"] = df["lat"] - origin_lat
    df["delta_lon"] = df["lon"] - origin_lon

    return df[["delta_lat", "delta_lon", "velocity", "sin_heading", "cos_heading", "altitude"]]

def extract_flight(data, callsign):
    return data[data["callsign"] == callsign]

In [16]:
master_data = pd.read_parquet("opensky_2019-07-15_raw.parquet")

In [83]:
sfo_lax = ['UAL2757', 'DAL409', 'UAL1200', 'UAL257', 'DAL2861', 'UAL613', 'DAL664', 'UAL256', 'DAL2600', 'AAL1851']
den_ord = ['UAL1938', 'UAL781', 'UAL301', 'AAL2771', 'UAL532', 'UAL682', 'AAL2780', 'UAL336', 'AAL2470', 'AAL773']
dfw_atl = ['DAL2010', 'AAL2749', 'DAL1890', 'DAL1966', 'AAL1309', 'DAL2310', 'DAL2269', 'AAL333', 'DAL1513', 'AAL2403']
dca_bos = ['AAL2150', 'AAL2148', 'AAL2160', 'AAL2169', 'AAL2139', 'AAL2170', 'AAL2119', 'AAL2149', 'AAL2120', 'AAL2134']
ord_lga = ['UAL1823', 'UAL1606', 'AAL129', 'AAL398', 'AAL527', 'DAL379', 'DAL585', 'UAL509', 'DAL2775', 'UAL639'] 
lax_jfk = ['AAL10', 'DAL1908', 'DAL1436', 'AAL118', 'DAL1258', 'AAL2', 'DAL2164', 'AAL238', 'AAL4', 'DAL816']
iah_ord = ['UAL2131', 'UAL1854', 'UAL2246', 'ENY3331', 'UAL1835', 'ENY3621', 'UAL1403', 'UAL1160', 'AAL869', 'UAL1899']
sfo_sea = ['UAL800', 'DAL2787', 'UAL2161', 'DAL0856', 'UAL1074', 'UAL351', 'UAL618', 'DAL2490', 'DAL2429', 'DAL1470']
atl_mco = ['DAL1418', 'DAL863', 'DAL804', 'DAL1883', 'DAL1905', 'DAL2428', 'DAL897', 'DAL768', 'DAL1118', 'DAL186']
lax_atl = ['DAL1901', 'AAL1071', 'DAL2213', 'DAL2270', 'DAL954', 'DAL1592', 'DAL2714', 'DAL1954', 'DAL1140', 'DAL516']
routes = [sfo_lax, den_ord, dfw_atl, dca_bos, ord_lga, lax_jfk, iah_ord, sfo_sea, atl_mco, lax_atl]

In [85]:
def aggregate_route(data, callsigns):
    matrices = []
    for call in callsigns:
        df_flight = extract_flight(data, call)
        traj = preprocess_flight(df_flight)
        if traj is None:
            continue
        matrices.append(traj)

    if len(matrices) == 0:
        return None

    # shape: (num_flights, 200, 6)
    stacked = np.stack(matrices, axis=0)
    aggregate = stacked.mean(axis=0) 

    return aggregate.T 


In [86]:
# vectors for 10 trunk routes
route_vectors = []
for r in tqdm(routes, desc = 'Aggregating Routes'):
    agg = aggregate_route(master_data, r)
    if agg is not None:
        route_vectors.append(agg)

np.save('route_vectors', route_vectors)

In [103]:
np.unique(master_data['callsign'])

array(['AAB416', 'AAB553', 'AAB576', ..., 'ZAM47', 'ZAM49', 'ZAM53'],
      dtype=object)

In [111]:
# vectors for ~60k flights in dataset
flight_vectors = []
all_callsigns = np.unique(master_data['callsign'])
for c in tqdm(range(len(all_callsigns))):
    vec = preprocess_flight(extract_flight(master_data, all_callsigns[c]))
    flight_vectors.append(vec)

np.save('flight_vectors', flight_vectors)

  0%|▏                                   | 317/68211 [09:55<35:25:54,  1.88s/it]


KeyboardInterrupt: 

In [None]:
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean

all_callsigns = np.unique(master_data['callsign'])

for call in tqdm(all_callsigns):
    flight_df = preprocess_flight(extract_flight(master_data, call))
    