# METAR Data from AVWX API

### Step 0: Importing **Libraries**

In [2]:
import pandas as pd
import os
import yaml
import requests
import json
from datetime import datetime

### Step 1: Loading **API Keys**

In [2]:
def load_api_keys(yaml_path=None):
    """
    Load API keys from a YAML file.

    The keys are read from the top-level ``api_keys`` entry of the file. An
    empty file, a document that is not a mapping, or a missing/empty
    ``api_keys`` entry all yield an empty dictionary, so the result can
    always be queried with ``.get()``.

    :param yaml_path: Path to the YAML file (optional). A leading ``~`` is
        expanded to the user's home directory. Defaults to
        ``../project_keys.yml``.
    :return: A dictionary with API keys (empty if none are defined).
    :raises FileNotFoundError: If the file does not exist.
    :raises Exception: If the file cannot be parsed as YAML.
    """
    # Fall back to the default keys file when no path is supplied
    if not yaml_path:
        yaml_path = "../project_keys.yml"
    yaml_path = os.path.expanduser(yaml_path)

    try:
        with open(yaml_path, 'r', encoding='utf-8') as file:
            data = yaml.safe_load(file)
    except FileNotFoundError as e:
        raise FileNotFoundError(f"API keys file not found at: {yaml_path}") from e
    except yaml.YAMLError as e:
        raise Exception(f"Error parsing YAML file: {e}") from e

    # safe_load gives None for an empty document and may give a list or scalar
    # for other content; only a mapping can carry an 'api_keys' entry.
    if not isinstance(data, dict):
        return {}

    # 'api_keys:' with no value parses to None; normalise that to an empty dict.
    return data.get('api_keys') or {}

Use the function to fetch API keys:

In [3]:
# Read the key file once
api_keys = load_api_keys()

# Look up the four credentials in one pass; the names on the left line up
# with the key names on the right, and a missing entry yields None.
(
    flightradar_sandbox_key,
    flightradar_key,
    avwx_sandbox_key,
    avwx_key,
) = (
    api_keys.get(key_name)
    for key_name in (
        'Flightradar24_sandbox',
        'Flightradar24_flight-lab-01',
        'AVWX_dev',
        'AVWX_flight-lab-avwx-01',
    )
)

### Step 2: Importing and Preparing **FlightAware Data**

First, let's load the FlightAware Data and drop unnecessary rows.

In [None]:
# Read the raw FlightAware export
df_flightaware_by_callsign_1_month_sample = pd.read_csv('yourcsv.csv', low_memory=False)

# Remove the rows flagged as 'position_only'; they carry no useful information for our analysis.
is_position_only = df_flightaware_by_callsign_1_month_sample['position_only'].eq(True)
position_only_indices = df_flightaware_by_callsign_1_month_sample.loc[is_position_only].index
df_drop_position_only = (
    df_flightaware_by_callsign_1_month_sample
    .drop(index=position_only_indices)
    .reset_index(drop=True)
)

# Remove the rows whose 'scheduled_on' time is missing.
scheduled_on_nan = df_drop_position_only.loc[df_drop_position_only['scheduled_on'].isna()].index
df_flightaware = df_drop_position_only.drop(index=scheduled_on_nan).reset_index(drop=True)

df_flightaware.head()

Next, we define a function which prepares the FlightAware DataFrame (df_flightaware) to be put into the METAR query.

In [5]:
def flightaware_to_metar_prep(df_flightaware_trial: pd.DataFrame) -> list:
    """Prepare the FlightAware DataFrame for METAR data retrieval.

    For every flight the best available departure and arrival time is picked
    (actual, then estimated, then scheduled). Each time is paired with its
    airport as an ``"<ICAO code> <YYYY-MM-DD>"`` label, and the latest time
    seen for every label is kept. The input DataFrame is not modified.

    Args:
        df_flightaware_trial (DataFrame): Flight data with the columns
            ``origin.code_icao``, ``destination.code_icao``, ``actual_off``,
            ``estimated_off``, ``scheduled_off``, ``actual_on``,
            ``estimated_on`` and ``scheduled_on``.

    Returns:
        list: ``(airport_date, latest_time)`` tuples, one per distinct
        airport_date label, ordered by label.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    off_cols = ['actual_off', 'estimated_off', 'scheduled_off']
    on_cols = ['actual_on', 'estimated_on', 'scheduled_on']

    # Copy only the columns this function reads; the source frame may be wide.
    df_copy = df_flightaware_trial[
        ['origin.code_icao', 'destination.code_icao'] + off_cols + on_cols
    ].copy()

    # Unparseable or missing timestamps become NaT instead of raising.
    for col in off_cols + on_cols:
        df_copy[col] = pd.to_datetime(df_copy[col], errors='coerce')

    # Use actual -> estimated -> scheduled times
    df_copy['departure_time'] = (
        df_copy['actual_off']
        .combine_first(df_copy['estimated_off'])
        .combine_first(df_copy['scheduled_off'])
    )
    df_copy['arrival_time'] = (
        df_copy['actual_on']
        .combine_first(df_copy['estimated_on'])
        .combine_first(df_copy['scheduled_on'])
    )

    # Build the "<ICAO code> <date>" labels for both ends of each flight
    df_copy['departure_airport_date'] = (
        df_copy['origin.code_icao'] + ' ' +
        df_copy['departure_time'].dt.date.astype(str)
    )
    df_copy['arrival_airport_date'] = (
        df_copy['destination.code_icao'] + ' ' +
        df_copy['arrival_time'].dt.date.astype(str)
    )

    # Stack departures and arrivals into one (airport_date, time) table
    airport_date_df = pd.concat([
        df_copy[['departure_airport_date', 'departure_time']].rename(
            columns={'departure_airport_date': 'airport_date', 'departure_time': 'time'}),
        df_copy[['arrival_airport_date', 'arrival_time']].rename(
            columns={'arrival_airport_date': 'airport_date', 'arrival_time': 'time'})
    ])

    # Rows without any usable time cannot be queried; drop them
    airport_date_df = airport_date_df.dropna(subset=['time'])

    # Keep the latest time for every airport_date label
    latest_time_df = airport_date_df.groupby('airport_date')['time'].max().reset_index()

    return list(latest_time_df.itertuples(index=False, name=None))

In [None]:
# Build the (airport_date, latest time) inputs for the AVWX query and preview the first five.
airport_date_tuples = flightaware_to_metar_prep(df_flightaware)
airport_date_tuples[:5]

There are two ways to query:

**1:** Select an airport -> Query dates

**2:** Select a date -> Query on airports

**Option 1**

Let's create a dictionary with airports as keys, and dates as values:

In [7]:
# Keep only the "<station> <date>" label of every tuple
airport_date_list = [pair[0] for pair in airport_date_tuples]

airport_date_dict = {}

for item in airport_date_list:
    # Whitespace separates the station code from the date
    station, date = item.split()
    if station not in airport_date_dict:
        airport_date_dict[station] = []
    airport_date_dict[station].append(date)

# Preview at most five airports together with their collected dates
for airport, date in list(airport_date_dict.items())[0:5]:
    print(f'Airport: {airport}, Dates: {date}')

**Option 2**

Let's create a dictionary with dates as keys, and airports as values:

In [None]:
# Turn the airport -> dates mapping around into date -> airports
inverted_dict = {}

for airport, date_list in airport_date_dict.items():
    for date in date_list:
        # Start an empty airport list the first time a date shows up
        inverted_dict.setdefault(date, []).append(airport)

### Step 3: Querying **AVWX**

Next, let's query the AVWX API using the dictionary from **Option 2**. Each month of data pulled from the API is stored in its own .csv file (a checkpoint).

In [None]:
API_TOKEN = avwx_key
BASE_URL = 'https://history.avwx.rest/api/metar/'
# Folder that receives every monthly checkpoint file; replace with your own data folder.
OUTPUT_DIR = "your_path_to_data_folder"
# Seconds to wait on a single request, so one stalled connection cannot hang the whole run.
REQUEST_TIMEOUT = 30


def _save_month_checkpoint(frames, month_year):
    """Combine one month's result frames, write them to OUTPUT_DIR and return them.

    :param frames: List of DataFrames collected for the month (may be empty).
    :param month_year: ``(month, year)`` tuple used to name the checkpoint file.
    :return: The combined DataFrame (empty when nothing was collected).
    """
    if not frames:
        return pd.DataFrame()
    month_df = pd.concat(frames, ignore_index=True)
    if not month_df.empty:
        month, year = month_year
        # Create the folder on demand so a missing directory cannot cost pulled data.
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        month_df.to_csv(
            os.path.join(OUTPUT_DIR, f"METAR_{month}_{year}.csv"), index=False
        )
    return month_df


# Holds the data of the last processed month once the run has finished
df = pd.DataFrame()

# Result frames of the month currently being pulled. They are concatenated once
# per month rather than after every request, which avoids re-copying the
# growing DataFrame on each API call.
month_frames = []

# Newest dates first. Zero-padded 'YYYY-MM-DD' strings sort in date order, so
# the dates of one month are adjacent and each month is flushed as a unit.
sorted_dates = sorted(inverted_dict, reverse=True)

# Track the current (month, year) as we iterate through dates
current_month_year = None

headers = {'Authorization': API_TOKEN}

# A session reuses the underlying connection across the many requests below.
with requests.Session() as session:
    for d_idx, date in enumerate(sorted_dates, start=1):
        try:
            # Parse 'YYYY-MM-DD' into a datetime
            dt = datetime.strptime(date, "%Y-%m-%d")
        except ValueError:
            print(f"Skipping invalid date format: {date}")
            continue
        month_year = (dt.month, dt.year)

        # When the month-year changes, write the finished month and start a new one.
        if current_month_year is None:
            current_month_year = month_year
        elif month_year != current_month_year:
            _save_month_checkpoint(month_frames, current_month_year)
            month_frames = []
            current_month_year = month_year

        # Process each airport for this date
        airports = inverted_dict[date]
        for a_idx, airport in enumerate(airports, start=1):
            url = f"{BASE_URL}{airport}?date={date}&remove=spoken,repr"
            try:
                response = session.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
                if response.status_code == 200:
                    data = response.json()
                    if 'results' in data:
                        month_frames.append(pd.json_normalize(data['results']))
                    else:
                        print(f"No 'results' key in API response for {url}")
                else:
                    print(f"API Error {response.status_code} for URL: {url}")
                    print(response.text)
            except Exception as e:
                # Deliberately broad: one failing airport/date must not abort the run.
                print(f"Error processing airport: {airport}, date: {date}: {e}")
            # Verbose progress (1-based, so the counters reach their totals)
            print(
                f"\rPulling date: {date}. Dates: {d_idx} out of {len(sorted_dates)}. "
                f"Airports: {a_idx} out of {len(airports)}.",
                end=""
            )

# After the loop, save any remaining data for the last month-year
df = _save_month_checkpoint(month_frames, current_month_year)