## Load API keys

In [1]:
import os
import yaml
import pandas as pd
import numpy as np
import requests
import json
from datetime import datetime, timedelta
import time
import random
from IPython.display import clear_output

In [2]:
def load_api_keys(yaml_path=None):
    """
    Load API keys from a YAML file.

    Only the top-level ``api_keys`` entry of the file is returned.

    :param yaml_path: Path to the YAML file (optional). A leading ``~`` is
        expanded to the user's home directory. Defaults to
        ``../project_keys.yml``, resolved against the current working directory.
    :return: A dictionary with API keys. An empty dictionary is returned when
        the file is empty, its top level is not a mapping, or the ``api_keys``
        entry is missing or null.
    :raises FileNotFoundError: If no file exists at ``yaml_path``.
    :raises Exception: If the file content cannot be parsed as YAML.
    """
    # Expand "~" for caller-supplied paths too, not only for the default.
    yaml_path = os.path.expanduser(yaml_path or "../project_keys.yml")

    try:
        with open(yaml_path, 'r') as file:
            data = yaml.safe_load(file)
    except FileNotFoundError as e:
        raise FileNotFoundError(f"API keys file not found at: {yaml_path}") from e
    except yaml.YAMLError as e:
        raise Exception(f"Error parsing YAML file: {e}") from e

    # safe_load yields None for an empty document and may yield a list or a
    # scalar; none of those support .get(), so treat them as "no keys".
    if not isinstance(data, dict):
        return {}

    # A bare "api_keys:" line parses to None; normalise it to an empty dict so
    # callers can always call .get() on the result.
    return data.get('api_keys') or {}

In [3]:
# Load API keys
# No path is passed, so load_api_keys() falls back to its default key file.
api_keys = load_api_keys()

# Access individual keys
# NOTE(review): .get() yields None when this entry is absent from the key file;
# flightaware_call_new would then send requests with a None 'x-apikey' header.
# Confirm the entry name matches the key file before running any query.
flightaware_key = api_keys.get('flightaware_flight-lab-01')

### Primary FlightAware Function & Helper Functions

In [4]:
def generate_date_ranges(start_date: datetime, end_date: datetime, interval_days: int = 7) -> list:
    """
    Generate a list of date ranges between a start and end date, spaced by a given interval.

    Consecutive ranges do not overlap: each range ends one second before the
    next one starts, and the last range is clipped to ``end_date``. The "Z"
    suffix is appended literally; no timezone conversion is performed.

    Parameters:
    start_date (datetime): The starting datetime.
    end_date (datetime): The ending datetime.
    interval_days (int): The number of days for each interval (default is 7).

    Returns:
    list: A list of strings representing date ranges in the format "yyyy-mm-ddThh:mm:ssZ---yyyy-mm-ddThh:mm:ssZ".
          Empty when ``start_date`` is not strictly before ``end_date``.

    Raises:
    ValueError: If ``interval_days`` describes a span shorter than one second
                (zero or negative included); such a step would never advance
                the window and the loop would not terminate.
    """
    one_second = timedelta(seconds=1)
    step = timedelta(days=interval_days)
    if step < one_second:
        raise ValueError(
            f"interval_days must describe a span of at least one second, got {interval_days!r}"
        )

    date_ranges = []
    current_start = start_date

    while current_start < end_date:
        # Each window is inclusive on both ends, hence the one-second pull-back.
        current_end = min(current_start + step - one_second, end_date)
        formatted_start = current_start.strftime("%Y-%m-%dT%H:%M:%SZ")
        formatted_end = current_end.strftime("%Y-%m-%dT%H:%M:%SZ")
        date_ranges.append(f"{formatted_start}---{formatted_end}")
        # Resume immediately after the inclusive end of the previous window.
        current_start = current_end + one_second

    return date_ranges

In [None]:
def _save_error_snapshot(df, path_root, sub_date_range, route, ident, label):
    """Write the rows collected so far to a timestamped CSV named after the failing query."""
    time_of_error = int(time.time())
    save_path = (
        path_root + str(sub_date_range) + "_" + str(route) + "_" + str(ident)
        + "_" + str(time_of_error) + "_" + label + ".csv"
    )
    df.to_csv(save_path, index=False)


def flightaware_call_new(path_to_fr24_df, query_identifier, start_date, end_date, max_ids=4,
                         target_save_path='default', verbose=True, request_timeout=30):
    """
    Queries the FlightAware AeroAPI to retrieve historical flight data based on flight identifiers 
    extracted from a given flight tracking dataset (FR24 data). For every date range and every route
    found in the FR24 file, up to ``max_ids`` randomly chosen identifiers are queried.

    Parameters:
    -----------
    path_to_fr24_df : str
        Path to the CSV file containing the FR24 flight tracking data. The file must provide the
        columns 'orig_icao', 'dest_icao', 'timestamp' (formatted YYYY-MM-DDTHH:MM:SSZ) and the
        column named by ``query_identifier``.
    query_identifier : str
        Column name in the FR24 dataset that contains flight identifiers (e.g., callsigns).
    start_date : datetime
        Start of the query period. Passed straight to ``generate_date_ranges``, which needs a
        datetime object (not a string).
    end_date : datetime
        End of the query period, as a datetime object.
    max_ids : int, optional, default=4
        Maximum number of flight identifiers to query per route and date range.
    target_save_path : str, optional, default='default'
        Path where the retrieved data will be saved. If 'default', the function uses a predefined
        directory structure under ../data/processed. Otherwise the text before '.csv' is used as the
        file-name root, and progress/error files go to a 'progress_and_errors' folder next to it.
    verbose : bool, optional, default=True
        Whether to print detailed progress updates.
    request_timeout : float, optional, default=30
        Seconds to wait for each HTTP request before giving up on it.

    Returns:
    --------
    pd.DataFrame
        A DataFrame containing flight data retrieved from the FlightAware AeroAPI.

    Notes:
    ------
    - The function reads the FR24 dataset, extracts flight routes, and defines query date ranges.
    - It iterates through routes (in sorted order) and dates, shuffling and selecting flight identifiers.
      Rows with a missing identifier are ignored.
    - API calls are made sequentially, with a rate-limiting delay (0.01s per request).
    - Query progress and errors are logged, with intermediate results saved.
    - Output directories are created if they do not exist yet.
    - The final dataset is saved to a CSV file.
    - The API token is read from the notebook-level ``flightaware_key`` variable at call time.

    Raises:
    -------
    - Per-request failures (HTTP errors, timeouts, anything else raised while querying or parsing)
      are caught, printed, and followed by a snapshot save; the run then continues.
    - Errors raised while reading the FR24 file or writing output files are not caught.
    """

    # Define your API token, the base URL and the Authorization header with your API token
    API_TOKEN = flightaware_key
    BASE_URL = 'https://aeroapi.flightaware.com/aeroapi'
    headers = {
        'x-apikey': API_TOKEN,
    }

    # Open FR24 dataframe & extract routes
    df_fr24 = pd.read_csv(path_to_fr24_df)
    df_fr24['routes'] = df_fr24.apply(lambda row: str(row['orig_icao']) + '---' + str(row['dest_icao']), axis=1)
    df_fr24['timestamp'] = pd.to_datetime(df_fr24['timestamp'], format='%Y-%m-%dT%H:%M:%SZ')
    # Sorted so the "Route i of n" progress output is stable from run to run.
    routes = sorted(set(df_fr24['routes']))

    # Dateranges and query tracker
    macro_date_range = generate_date_ranges(start_date, end_date)
    query_no = 0
    max_expected_queries = len(macro_date_range) * len(routes) * max_ids

    # Initialize the DataFrame to store data
    df_flightaware_by_callsign = pd.DataFrame()

    # Define the save path
    if target_save_path == 'default':
        save_path_root = f"../data/processed/df_flightaware_by_callsign_from_{start_date}_to_{end_date}_"
        save_path_root_pne = f"../data/processed/progress_and_errors/df_flightaware_by_callsign_from_{start_date}_to_{end_date}_"
    else:
        save_path_root = target_save_path.split('.csv')[0] + '_'
        # Works for any directory depth, unlike indexing into split('/').
        save_dir, save_stem = os.path.split(save_path_root)
        save_path_root_pne = os.path.join(save_dir, 'progress_and_errors', save_stem)

    # Make sure both output folders exist before any API quota is spent.
    for directory in (os.path.dirname(save_path_root), os.path.dirname(save_path_root_pne)):
        if directory:
            os.makedirs(directory, exist_ok=True)

    # Run loop for every date range
    for num_date, sub_date_range in enumerate(macro_date_range):
        # Define query date parameters
        sub_start_text, sub_end_text = sub_date_range.split('---')
        params = {
            "ident_type": "designator",
            "start": sub_start_text,
            "end": sub_end_text,
            "max_pages": 1,
        }
        # Reformat to datetime for extraction of active flight identifiers
        sub_start = datetime.strptime(sub_start_text, "%Y-%m-%dT%H:%M:%SZ")
        sub_end = datetime.strptime(sub_end_text, "%Y-%m-%dT%H:%M:%SZ")

        # The time filter does not depend on the route, so apply it once per date range.
        df_window = df_fr24[(df_fr24['timestamp'] >= sub_start) & (df_fr24['timestamp'] <= sub_end)]

        # Run loop for every route
        for num_route, route in enumerate(routes):
            # Pull the distinct, non-missing identifiers active on this route in this window,
            # shuffle them and keep at most max_ids.
            flight_identifiers = list(
                df_window.loc[df_window['routes'] == route, query_identifier].dropna().unique()
            )
            random.shuffle(flight_identifiers)
            flight_identifiers = flight_identifiers[:max_ids]

            # Run loop for every active flight identifier
            for num_ident, ident in enumerate(flight_identifiers):
                try:
                    # Define endpoint URL based on callsign identification
                    ENDPOINT = f"/history/flights/{ident}"
                    url = f"{BASE_URL}{ENDPOINT}"
                    query_no += 1

                    # API restriction
                    time.sleep(0.01)

                    # Debugging Line
                    if verbose:
                        clear_output(wait=True)
                        print(f"You can expect a maximum of {max_expected_queries} query calls for this package:")
                        print(f"\t-{len(macro_date_range)} date ranges between {start_date} and {end_date} \n\t-{len(routes)} routes \n\t-Maximum of {max_ids} callsigns per route and date range")
                        print(f"\nActive query no. {query_no} is requesting the following information:")
                        print(f"\t-Date range {num_date+1} of {len(macro_date_range)}")
                        print(f"\t\t-{sub_date_range}")
                        print(f"\t-Route {num_route+1} of {len(routes)}")
                        print(f"\t\t-{route}")
                        print(f"\t-Flight Identifier {num_ident+1} of {len(flight_identifiers)}")
                        print(f"\t\t-{ident}\n")

                    # Query the API response; the timeout keeps one stalled connection
                    # from blocking the whole run.
                    response = requests.get(url, params=params, headers=headers, timeout=request_timeout)
                    response.raise_for_status()
                    data = response.json()

                    # If data is found, append it to the DataFrame
                    # TODO: this check still needs validating against the known response types.
                    if isinstance(data, dict) and "flights" in data and isinstance(data["flights"], list) and data["flights"]:
                        # Debugging line
                        if verbose:
                            print(json.dumps(data, indent=4))

                        # Append to Dataframe
                        df_flightaware_by_callsign = pd.concat([df_flightaware_by_callsign, pd.json_normalize(data["flights"])], ignore_index=True)

                    # Otherwise let the user know
                    else:
                        print(f"\nUSER NOTICE: no data found on \n\t-Date range: {sub_date_range} \n\t-Route: {route} \n\t-Flight identifier: {ident}")

                # Exception handling: log, snapshot what has been collected, keep going
                except requests.exceptions.HTTPError as http_err:
                    print(f"HTTP error occurred: {http_err}")
                    _save_error_snapshot(df_flightaware_by_callsign, save_path_root_pne,
                                         sub_date_range, route, ident, "http_error")

                except Exception as err:
                    print(f"An error occurred: {err}")
                    _save_error_snapshot(df_flightaware_by_callsign, save_path_root_pne,
                                         sub_date_range, route, ident, "unknown_error")

        # Save in process
        time_of_progress_save = int(time.time())
        save_path = save_path_root_pne + str(sub_date_range) + "_" + str(time_of_progress_save) + "_inprogress.csv"
        df_flightaware_by_callsign.to_csv(save_path, index=False)

    # Save and return dataframe
    time_of_run = int(time.time())
    save_path = save_path_root + str(time_of_run) + "_final.csv"
    df_flightaware_by_callsign.to_csv(save_path, index=False)
    print("\nGREAT SUCCESS!!!!!!!!!!")

    return df_flightaware_by_callsign

### Usage Example

In [None]:
# --- Input and output files -------------------------------------------------
# Flightradar24 export that supplies the routes and target callsigns
path_to_fr24_df = '../data/full_v1/df_flight_position_full_v1.csv'
# Where the FlightAware result is written (pass 'default' to use the built-in location)
target_save_path = "../data/full_v1/df_flightaware_by_callsign_full_v1.csv"

# --- Query settings ---------------------------------------------------------
# Column of the FR24 file used as flight identifier ('flight' or 'callsign')
query_identifier = 'callsign'
# Upper bound on identifiers queried per route within one date range
max_ids = 4
# Print progress details while the query runs
verbose = True

# --- Query period (must match the period covered by the FR24 file) ----------
start_date = datetime(2022, 6, 28)
end_date = datetime(2024, 8, 31, 23, 59, 59)

# Call the flightaware API: UNCOMMENT, RUN, RECOMMENT TO ENSURE NO UNINTENDED CALLS ARE MADE!!!!
'''
# SERIOUSLY, TRIPLE CHECK YOUR FR24 FILEPATH AND DATES!!!!!!!!!!!!!
'''
# df_flightaware_by_callsign = flightaware_call_new(path_to_fr24_df,query_identifier,start_date,end_date,max_ids,target_save_path,verbose)

You can expect a maximum of 29184 query calls for this package:
	-114 date ranges between 2022-06-28 00:00:00 and 2024-08-31 23:59:59 
	-64 routes 
	-Maximum of 4 callsigns per route and date range

Active query no. 20041 is requesting the following information:
	-Date range 81 of 114
		-2024-01-09T00:00:00Z---2024-01-15T23:59:59Z
	-Route 32 of 64
		-EGLL---EIDW
	-Flight Identifier 4 of 4
		-EIN169



### Sanity Checks

In [13]:
# Load the FlightAware result and the FR24 query it was generated from
df_flightaware = pd.read_csv('../data/full_v1/df_flightaware_by_callsign_jan_2025_1737753215_final.csv')
df_fr24 = pd.read_csv('../data/full_v1/df_flight_position_jan_2025_final_1737751394.csv')

# FlightAware side: "ORIG-DEST" route label plus parsed off/on timestamps
df_flightaware['route'] = [
    str(origin) + '-' + str(destination)
    for origin, destination in zip(df_flightaware['origin.code_icao'], df_flightaware['destination.code_icao'])
]
# NOTE(review): the 'scheduled_*' columns are filled from the 'actual_*' columns.
# Confirm which of the two is intended; later cells read the 'scheduled_*' names.
for parsed_column, raw_column in (('scheduled_off', 'actual_off'), ('scheduled_on', 'actual_on')):
    df_flightaware[parsed_column] = pd.to_datetime(df_flightaware[raw_column], format='%Y-%m-%dT%H:%M:%SZ')
flightware_routes = set(df_flightaware['route'].tolist())

# FR24 side: same route label, and 'eta' parsed in place
df_fr24['route'] = [
    str(origin) + '-' + str(destination)
    for origin, destination in zip(df_fr24['orig_icao'], df_fr24['dest_icao'])
]
df_fr24['eta'] = pd.to_datetime(df_fr24['eta'], format='%Y-%m-%dT%H:%M:%SZ')
fr24_routes = set(df_fr24['route'].tolist())

# Compare how many distinct routes each source contains
print(f"\nFR24: There are {len(fr24_routes)} routes in the query")
print(f"FlightAware: There are {len(flightware_routes)} routes in the response")


FR24: There are 64 routes in the query
FlightAware: There are 125 routes in the response


In [14]:
# Verify that all routes from the FR24 query are covered in the Flightaware result
exceptions = [element for element in fr24_routes if element not in flightware_routes]

# Print the exceptions
if exceptions:
    print("ERROR: The following elements of the FR24-route set are not in the Flightware route-set:\n", exceptions)
else:
    print("SUCCESS: All individual FR24-routes are contained in the Flightaware route-set")

ERROR: The following elements of the FR24-route set are not in the Flightware route-set:
 ['RCTP-KSFO']


In [15]:
# Breakdown of intended query routes (FR24) by number of flights for the route, number of callsigns, earliest departure, latest arrival
df_intended_query_breakdown = df_flightaware[df_flightaware['route'].isin(fr24_routes)].groupby('route').agg(unique_flight_count=('fa_flight_id','nunique'),
                                                                                                             unique_callsign_count=('ident_icao','nunique'),
                                                                                                             unique_ac_type_count=('aircraft_type','nunique'),
                                                                                                             earliest_departure=('scheduled_off','min'),
                                                                                                             latest_arrival = ('scheduled_on','max')
                                                                                                            )
df_intended_query_breakdown

Unnamed: 0_level_0,unique_flight_count,unique_callsign_count,unique_ac_type_count,earliest_departure,latest_arrival
route,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BIKF-LFPG,43,3,6,2025-01-01 06:16:47,2025-01-21 11:02:00
CYUL-LFPG,65,6,6,2025-01-01 01:07:39,2025-01-22 05:42:29
CYVR-CYYZ,67,10,9,2025-01-01 06:49:26,2025-01-21 22:28:10
DAAG-LFPG,63,10,6,2025-01-01 07:23:46,2025-01-21 09:21:04
DAAG-LFPO,61,7,3,2025-01-01 08:35:06,2025-01-21 19:31:25
...,...,...,...,...,...
VTBS-VHHH,75,10,8,2025-01-01 01:45:12,2025-01-21 21:24:53
VTBS-WSSS,84,10,8,2025-01-01 01:39:13,2025-01-21 17:09:21
WIII-WMKK,82,10,6,2025-01-01 03:15:45,2025-01-21 23:41:36
WIII-WSSS,82,11,4,2025-01-01 04:28:28,2025-01-21 15:51:32


In [16]:
# Per-route summary of the FR24 query: distinct flights, callsigns and aircraft types,
# plus the latest 'eta' value seen on the route
df_fr24_query_breakdown = (
    df_fr24
    .groupby('route')
    .agg(**{
        'unique_flight_count': ('fr24_id', 'nunique'),
        'unique_callsign_count': ('callsign', 'nunique'),
        'unique_ac_type_count': ('type', 'nunique'),
        'latest_arrival': ('eta', 'max'),
    })
)
df_fr24_query_breakdown

Unnamed: 0_level_0,unique_flight_count,unique_callsign_count,unique_ac_type_count,latest_arrival
route,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BIKF-LFPG,24,3,5,2025-01-24 11:26:56
CYUL-LFPG,24,8,6,2025-01-24 09:23:12
CYVR-CYYZ,24,16,7,2025-01-24 11:52:32
DAAG-LFPG,24,12,4,2025-01-24 12:41:36
DAAG-LFPO,24,8,3,2025-01-24 13:14:00
...,...,...,...,...
VTBS-VHHH,24,17,9,2025-01-24 06:13:20
VTBS-WSSS,24,13,6,2025-01-24 14:34:40
WIII-WMKK,24,16,4,2025-01-23 00:57:36
WIII-WSSS,24,18,5,2025-01-24 10:57:04


In [12]:
# Preview the first rows of the FR24 frame to eyeball its columns and values
df_fr24.head()

Unnamed: 0,fr24_id,flight,callsign,timestamp,source,hex,type,reg,painted_as,operating_as,orig_iata,orig_icao,dest_iata,dest_icao,eta,route
0,382f010e,MS671,MSR671,2024-12-01T13:59:59Z,ADSB,01015E,A333,SU-GDV,MSR,MSR,CAI,HECA,JED,OEJN,2024-12-01 14:43:12,HECA-OEJN
1,38310526,SV308,SVA308,2024-12-02T01:59:58Z,ADSB,710051,A333,HZ-AQH,SVA,SVA,CAI,HECA,JED,OEJN,2024-12-02 02:27:12,HECA-OEJN
2,3835f687,SV330,SVA330,2024-12-03T14:59:59Z,ADSB,7100BD,A333,HZ-AQ19,SVA,SVA,CAI,HECA,JED,OEJN,NaT,HECA-OEJN
3,38396ee8,MS669,MSR669,2024-12-04T17:59:59Z,ADSB,01010B,B738,SU-GDC,MSR,MSR,CAI,HECA,JED,OEJN,2024-12-04 18:40:00,HECA-OEJN
4,383bdbba,F3756,FAD756,2024-12-05T10:59:58Z,ADSB,711506,A20N,HZ-FBJ,FAD,FAD,CAI,HECA,JED,OEJN,2024-12-05 11:39:44,HECA-OEJN


In [13]:
# Preview the first rows of the FlightAware frame to eyeball its columns and values
df_flightaware.head()

Unnamed: 0,ident,ident_icao,ident_iata,actual_runway_off,actual_runway_on,fa_flight_id,operator,operator_icao,operator_iata,flight_number,...,origin.airport_info_url,destination.code,destination.code_icao,destination.code_iata,destination.code_lid,destination.timezone,destination.name,destination.city,destination.airport_info_url,destination
0,APZ101,APZ101,YP101,33L,24R,APZ101-1733372573-schedule-111p,APZ,APZ,YP,101,...,/airports/RKSI,KLAX,KLAX,LAX,LAX,America/Los_Angeles,Los Angeles Intl,Los Angeles,/airports/KLAX,
1,APZ101,APZ101,YP101,33L,24R,APZ101-1733286645-schedule-920p,APZ,APZ,YP,101,...,/airports/RKSI,KLAX,KLAX,LAX,LAX,America/Los_Angeles,Los Angeles Intl,Los Angeles,/airports/KLAX,
2,APZ101,APZ101,YP101,,24R,APZ101-1733199944-schedule-1321p,APZ,APZ,YP,101,...,/airports/RKSI,KLAX,KLAX,LAX,LAX,America/Los_Angeles,Los Angeles Intl,Los Angeles,/airports/KLAX,
3,APZ101,APZ101,YP101,33L,24R,APZ101-1733113490-schedule-528p,APZ,APZ,YP,101,...,/airports/RKSI,KLAX,KLAX,LAX,LAX,America/Los_Angeles,Los Angeles Intl,Los Angeles,/airports/KLAX,
4,APZ101,APZ101,YP101,,24R,APZ101-1733027879-schedule-340p,APZ,APZ,YP,101,...,/airports/RKSI,KLAX,KLAX,LAX,LAX,America/Los_Angeles,Los Angeles Intl,Los Angeles,/airports/KLAX,
