## Web Scraping & Adding Weather Data

Since the original motivation for this project was to see if we can generate a model to predict flight delays based on weather, we need to add the departure and arrival airport weather at the originally scheduled time of departure and arrival. Luckily, Iowa State University's Mesonet service allows users to download historical airport weather data as .csv files.

Rather than hammer their servers with ~10 million requests (two per row in our dataframe), I opted to download the weather for the entire year at each of the airports we're concerned with, then look up the weather for each flight ourselves.

We'll start by downloading the weather files we care about, using their Python example script. This script reads a text file listing one airport code per line, so we'll create that first.

In [4]:
# Import our data from the previous notebook
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
pd.set_option('display.max_columns', 35)

# Load the reduced flights table. The first CSV column is read as the index
# and then thrown away again by reset_index(drop=True), leaving a fresh
# 0..n-1 index. Narrow dtypes are requested up front, and the two schedule
# columns are parsed as datetimes.
flights = pd.read_csv(
    'flights_reduced.csv',
    index_col=0,
    parse_dates=['SCHEDULED_DEPARTURE', 'SCHEDULED_ARRIVAL'],
    dtype=dict(
        AIRLINE='category',
        ORIGIN_AIRPORT='category',
        DESTINATION_AIRPORT='category',
        DEPARTURE_DELAY='float32',
        ARRIVAL_DELAY='float32',
        DIVERTED='uint8',
        CANCELLED='uint8',
        CANCELLATION_REASON='category',
    ),
)
flights = flights.reset_index(drop=True)
#--------------------------------------------------------------------------------------------------------

# Every airport code that appears as an origin or a destination (sorted, unique).
all_airports = np.unique(np.append(flights['ORIGIN_AIRPORT'].unique(),
                                   flights['DESTINATION_AIRPORT'].unique()))

# This is because Mesonet uses ICAO codes for airports outside of the CONUS (namely, Hawaii,
# Alaska, and Puerto Rico) instead of IATA codes, and because Yuma's IATA code recently changed from YUM to NYL.
all_airports = np.append(all_airports, ['PANC', 'PHTO', 'PHKO', 'PHLI', 'PHOG', 'TSJS', 'NYL'])
# Drop the codes that were replaced above. The filtered array has to be
# assigned back: the expression on its own has no effect, so the replaced
# codes would otherwise still be written to the station list.
# NOTE(review): the two lists do not pair up one-to-one (HNL is removed but no
# PHNL is added; PHKO is added but KOA is not removed) - confirm which codes
# are intended before relying on the Hawaii stations.
all_airports = all_airports[np.isin(all_airports,
                                    ['ANC', 'HNL', 'LIH', 'ITO', 'OGG', 'SJU', 'YUM'],
                                    invert=True)]

import csv
# One station code per line, which is the layout the station-list reader in
# the download script expects. newline='' lets the csv module control line
# endings itself.
with open('all_airports.txt', 'w', newline='') as csvfile:
    station_writer = csv.writer(csvfile, lineterminator='\n')
    station_writer.writerows([code] for code in all_airports)

In [None]:

"""
Example script that scrapes data from the IEM ASOS download service
Source: https://github.com/akrherz/iem/blob/master/scripts/asos/iem_scraper_example.py
Utilizes this service: https://mesonet.agron.iastate.edu/request/download.phtml
"""
from __future__ import print_function
import json
import time
import datetime
# Python 2 and 3: alternative 4
try:
    from urllib.request import urlopen
except ImportError:
    from urllib2 import urlopen

# Number of attempts to download data
MAX_ATTEMPTS = 6
# HTTPS here can be problematic for installs that don't have Lets Encrypt CA
SERVICE = "http://mesonet.agron.iastate.edu/cgi-bin/request/asos.py?"


def download_data(uri, max_attempts=None, initial_delay=5):
    """Fetch the data from the IEM

    The IEM download service has some protections in place to keep the number
    of inbound requests in check.  This function implements an exponential
    backoff to keep individual downloads from erroring: after every failed
    attempt (an exception *or* a response body starting with ``ERROR``) it
    waits, doubling the wait each time, before trying again.

    Args:
      uri (string): URL to fetch
      max_attempts (int): number of attempts before giving up; defaults to
        the module-level MAX_ATTEMPTS
      initial_delay (number): seconds to wait after the first failed attempt;
        each later wait is twice the previous one

    Returns:
      string data, or an empty string when every attempt failed
    """
    # closing() works for both urllib.request and urllib2 response objects.
    from contextlib import closing

    if max_attempts is None:
        max_attempts = MAX_ATTEMPTS

    delay = initial_delay
    for attempt in range(1, max_attempts + 1):
        try:
            # Close the response explicitly so connections are not leaked
            # while looping over many stations.
            with closing(urlopen(uri, timeout=300)) as response:
                data = response.read().decode('utf-8')
            if not data.startswith('ERROR'):
                return data
            print("download_data(%s) returned an error response" % (uri,))
        except Exception as exp:
            # Deliberately broad: any failure of a single attempt is retried.
            print("download_data(%s) failed with %s" % (uri, exp))
        # No point in waiting once the last attempt has been used up.
        if attempt < max_attempts:
            time.sleep(delay)
            delay *= 2

    print("Exhausted attempts to download, returning empty data")
    return ""


def get_stations_from_filelist(filename):
    """Build a listing of stations from a simple file listing the stations.

    The file should simply have one station per line.  Surrounding whitespace
    is stripped and blank lines are skipped, so a trailing newline or an empty
    line never produces an empty station id.

    Args:
      filename (string): path of the station list file

    Returns:
      list of station id strings, in file order
    """
    # The with-block guarantees the file handle is closed.
    with open(filename) as handle:
        stripped = (line.strip() for line in handle)
        return [station for station in stripped if station]


def get_stations_from_networks():
    """Build a station list by using a bunch of IEM networks.

    For every network name ("AWOS" plus "<STATE>_ASOS" for each listed state)
    the network's GeoJSON metadata is fetched and the ``sid`` property of each
    feature is collected.

    Returns:
      list of station ids, in network order
    """
    # closing() works for both urllib.request and urllib2 response objects.
    from contextlib import closing

    states = """AK AL AR AZ CA CO CT DE FL GA HI IA ID IL IN KS KY LA MA MD ME
     MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT
     WA WI WV WY"""
    # IEM quirk to have Iowa AWOS sites in its own labeled network
    networks = ['AWOS'] + ["%s_ASOS" % (state,) for state in states.split()]

    stations = []
    for network in networks:
        # Get metadata
        uri = ("https://mesonet.agron.iastate.edu/"
               "geojson/network/%s.geojson") % (network,)
        # Close each response once parsed instead of leaving it open.
        with closing(urlopen(uri)) as response:
            jdict = json.load(response)
        stations.extend(site['properties']['sid'] for site in jdict['features'])
    return stations


def main():
    """Download one weather CSV per station listed in the station file.

    Builds the IEM request URL for the requested variables and date range,
    then fetches every station with download_data() and writes the response
    to ``Final Product Files/weather/<station>.csv``.  Stations whose download
    came back empty are reported and skipped instead of being written out as
    empty files.
    """
    import os

    # timestamps in UTC to request data for
    # NOTE(review): if the service treats the end date as exclusive, 31 Dec
    # observations are not part of the download - confirm against the IEM docs.
    startts = datetime.datetime(2015, 1, 1)
    endts = datetime.datetime(2015, 12, 31)

    service = SERVICE + ("data=sknt&data=p01i&data=vsby&data=gust&"
                         "data=skyc1&data=skyc2&data=skyc3&"
                         "data=skyl1&data=skyl2&data=skyl3&"
                         "data=wxcodes&tz=Etc/UTC&format=onlycomma&latlon=no&")

    service += startts.strftime('year1=%Y&month1=%m&day1=%d&')
    service += endts.strftime('year2=%Y&month2=%m&day2=%d&')

    # Two examples of how to specify a list of stations
    # stations = get_stations_from_networks()
    stations = get_stations_from_filelist("Final Product Files//all_airports.txt")
    for station in stations:
        uri = '%s&station=%s' % (service, station)
        print('Downloading: %s' % (station, ))
        data = download_data(uri)
        if not data:
            # download_data() returns "" after exhausting its attempts; an
            # empty CSV would only fail later when it is parsed.
            print('No data received for %s, skipping' % (station, ))
            continue
        # os.path.join picks the right separator for the platform; the old
        # literal hard-coded a backslash (and used the invalid escape "\%").
        outfn = os.path.join('Final Product Files', 'weather',
                             '%s.csv' % (station, ))
        with open(outfn, 'w') as out:
            out.write(data)


# Start the download only when this code runs as the main program, not when
# it is imported as a module.
if __name__ == '__main__':
    main()


Next, for each flight's departure and arrival airport, we have to get the weather at the scheduled takeoff and landing times. Unfortunately, for ~5 million items, there's no fast way to do this. The following script appends Visibility (Miles), Cloud Ceiling (ft), Wind Speed (knots), and Hourly Precipitation (inches) for the Departure and Arrival airports of each flight.

In [None]:
import datetime

# Uncomment this line to only work on a portion of the flights
#flights = flights.iloc[0:10000]

import os

# Create a list of all the airports in our flights dataframe
all_airports = np.unique(np.append(flights['ORIGIN_AIRPORT'].unique(),
                                   flights['DESTINATION_AIRPORT'].unique()))

# Create a dictionary where each key is a unique airport, and each value is a dataframe containing all the weather data for the
# entire year at that airport.
# NOTE(review): the files are looked up by the codes used in `flights`, while
# the download step requested ICAO codes (PANC, PHTO, ...) for some airports -
# confirm a file exists under every code that occurs in `flights`.
airport_wx_dict = {}
for i in all_airports:
    i_wx = pd.read_csv(
        # os.path.join instead of a hard-coded backslash, so the lookup is
        # not tied to Windows path separators.
        os.path.join('weather', '%s.csv' % i),
        parse_dates=['valid'],
        # The padded keys are assumed to match the raw header text before the
        # names are stripped below - TODO confirm against a downloaded file.
        dtype={'station': 'category',
               '  skyc1 ': 'category',
               '  skyc2 ': 'category',
               '  skyc3 ': 'category',
               '  wxcodes': 'category'},
    ).rename(columns=lambda x: x.strip())
    airport_wx_dict[i] = i_wx
    
   
def get_wx(airport, dt, wx_dict=None):
    """Return the weather observation closest in time to ``dt`` at ``airport``.

    Args:
        airport: key into the weather table (an airport code).
        dt: timestamp to look up.
            NOTE(review): the weather files are requested with tz=Etc/UTC;
            confirm that ``dt`` is expressed in UTC as well.
        wx_dict: optional mapping of airport code to its weather DataFrame.
            Defaults to the module-level ``airport_wx_dict``.

    Returns:
        A one-row DataFrame with the columns CEILING, VISIBILITY, WIND_SPEED
        and PRECIPITATION, indexed by the label of the matched observation.

    Raises:
        KeyError: ``airport`` is not in the weather table.
        ValueError: the airport has no observation with a usable timestamp.
    """
    if wx_dict is None:
        wx_dict = airport_wx_dict
    airport_wx_df = wx_dict[airport]

    # Vectorised nearest-timestamp search. Working on positions (reset_index)
    # keeps this independent of the frame's own index labels; idxmin returns
    # the first minimum, so ties go to the earlier row.
    deltas = (airport_wx_df['valid'] - dt).abs().reset_index(drop=True).dropna()
    if deltas.empty:
        raise ValueError('no weather observations available for %s' % (airport,))
    # Exactly one row, even when several observations share the timestamp.
    airport_wx = airport_wx_df.iloc[[deltas.idxmin()]]

    def numeric_or_default(column, default):
        # Anything that is not a number (e.g. the missing marker 'M') or is
        # NaN is replaced by the default before the unsigned downcast.
        values = pd.to_numeric(airport_wx[column], errors='coerce').fillna(default)
        return pd.to_numeric(values, downcast='unsigned')

    # Get the cloud ceiling (defined as the lowest Overcast or Broken cloud
    # layer); 25000 when no layer qualifies or its height is missing.
    ceiling = 25000
    for cover_col, level_col in (('skyc1', 'skyl1'),
                                 ('skyc2', 'skyl2'),
                                 ('skyc3', 'skyl3')):
        # .iloc[0] is positional; a plain [0] would be a label lookup and
        # fail for every row except the one labelled 0.
        cover = airport_wx[cover_col].iloc[0]
        if isinstance(cover, str) and cover in ('OVC', 'BKN'):
            ceiling = numeric_or_default(level_col, 25000)
            break

    # set missing wind values ('M') to 0 knots
    wind = numeric_or_default('sknt', 0)
    # set missing visibility values to 10 miles
    visibility = numeric_or_default('vsby', 10)
    # set missing precipitation values to 0 inches
    precip = numeric_or_default('p01i', 0)

    # Passing the index explicitly keeps construction valid even when a value
    # is a plain scalar (the default ceiling).
    return pd.DataFrame({'CEILING': ceiling,
                         'VISIBILITY': visibility,
                         'WIND_SPEED': wind,
                         'PRECIPITATION': precip},
                        index=airport_wx.index)

def get_2wx(df):
    """Look up origin and destination weather for one flight record.

    ``df`` is a single flight (indexable by column name). The weather at the
    origin airport is taken at the scheduled departure, the weather at the
    destination airport at the scheduled arrival. Both results get a
    distinguishing column prefix and a fresh index, and are returned side by
    side as one DataFrame.
    """
    dep_airport, dep_time = df['ORIGIN_AIRPORT'], df['SCHEDULED_DEPARTURE']
    arr_airport, arr_time = df['DESTINATION_AIRPORT'], df['SCHEDULED_ARRIVAL']

    dep_wx = get_wx(dep_airport, dep_time)
    dep_wx = dep_wx.add_prefix('ORIGIN_').reset_index(drop=True)

    arr_wx = get_wx(arr_airport, arr_time)
    arr_wx = arr_wx.add_prefix('DESTINATION_').reset_index(drop=True)

    # axis=1: place the two frames next to each other, matched on the index.
    return pd.concat([dep_wx, arr_wx], axis=1)

# Get the weather for the entire flights dataframe: one small frame per flight.
wx_data = [get_2wx(row) for _, row in flights.iterrows()]

# Convert weather data to a DataFrame. A single concat replaces the old
# append-in-a-loop, which copied the growing frame on every iteration and
# relies on DataFrame.append (removed in pandas 2.0).
wx_df = pd.concat(wx_data, ignore_index=True) if wx_data else pd.DataFrame()

# Concatenate the flight and weather dataframes. Both carry a fresh 0..n-1
# index at this point, so the column-wise concat lines the rows up.
flights.reset_index(drop=True, inplace=True)
flights_with_wx = pd.concat([flights, wx_df], axis=1)
flights_with_wx.to_csv('flights_with_wx.csv')
