Source of data -> https://www.londonair.org.uk/LondonAir/Default.aspx

The hourly API returns data in the following nested format (see `hourly.json`):
```
LocalAuthority (Borough)
    Site
        Species (e.g. NO2, PM10, O3)
```

Not all Boroughs have sites, and generally each site might have a different combination of species.

In [1]:
from datetime import timedelta
import requests
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from typing import List, Set, Dict, Tuple, Optional

In [2]:
# NOTE(review): not referenced anywhere else in this notebook; presumably the
# name of a configuration key used by the package - confirm.
CONF_LOCATIONS = "locations"

# Every local authority (borough) name, 33 in total, spelled exactly as it is
# compared against the "@LocalAuthorityName" field of the API response.
ALL_AUTHORITIES = [
     'Barking and Dagenham',
     'Barnet',
     'Bexley',
     'Brent',
     'Bromley',
     'Camden',
     'City of London',
     'Croydon',
     'Ealing',
     'Enfield',
     'Greenwich',
     'Hackney',
     'Hammersmith and Fulham',
     'Haringey',
     'Harrow',
     'Havering',
     'Hillingdon',
     'Hounslow',
     'Islington',
     'Kensington and Chelsea',
     'Kingston',
     'Lambeth',
     'Lewisham',
     'Merton',
     'Newham',
     'Redbridge',
     'Richmond',
     'Southwark',
     'Sutton',
     'Tower Hamlets',
     'Waltham Forest',
     'Wandsworth',
     'Westminster'
]

# Subset of ALL_AUTHORITIES restricted to the 27 boroughs that returned data
# when this list was written. Which boroughs report data may change over time,
# so this list needs to be re-checked against the API periodically.
AUTHORITIES = [
    "Barking and Dagenham",
    "Bexley",
    "Brent",
    "Camden",
    "City of London",
    "Croydon",
    "Ealing",
    "Enfield",
    "Greenwich",
    "Hackney",
    "Haringey",
    "Harrow",
    "Havering",
    "Hillingdon",
    "Islington",
    "Kensington and Chelsea",
    "Kingston",
    "Lambeth",
    "Lewisham",
    "Merton",
    "Redbridge",
    "Richmond",
    "Southwark",
    "Sutton",
    "Tower Hamlets",
    "Wandsworth",
    "Westminster",
]

# Endpoint queried for the hourly monitoring index of the "London" group, as JSON.
LAQ_HOURLY_URL = "http://api.erg.kcl.ac.uk/AirQuality/Hourly/MonitoringIndex/GroupName=London/Json"

# Default request timeout, in seconds, used by request_data().
TIMEOUT = 10

In [3]:
# Rebind AUTHORITIES to the full borough list so the parsing below is exercised
# against every borough, including those that return no data. This is an alias
# of ALL_AUTHORITIES (not a copy) and replaces the filtered list defined earlier.
AUTHORITIES = ALL_AUTHORITIES  # check functionality with all boroughs

In [4]:
len(AUTHORITIES)

33

Helper for making the request

In [5]:
def request_data(url : str, timeout : int = TIMEOUT) -> Dict:
    """
    Request data from a URL and return the decoded JSON body.

    Parameters
    ----------
    url : str
        Address to send the GET request to.
    timeout : int
        Number of seconds to wait for the server before giving up.
        Defaults to the module-level ``TIMEOUT``.

    Returns
    -------
    Dict
        The JSON payload of a response with status code 200.

    Raises
    ------
    LondonAirQualityException
        If the request times out, the connection fails, or the server
        answers with any status code other than 200.
    """
    try:
        # Honour the caller-supplied timeout (the error message below reports
        # this same value, so the two must agree).
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.Timeout as exc:
        raise LondonAirQualityException(
            f"Request timeout, current timeout is {timeout} seconds"
        ) from exc
    except requests.exceptions.ConnectionError as exc:
        raise LondonAirQualityException(
            f"Internet connection error: {exc}"
        ) from exc

    if response.status_code != 200:
        raise LondonAirQualityException(
            f"Status code {response.status_code} returned from {url}")

    # NOTE(review): a body that is not valid JSON makes .json() raise its own
    # decoding error, which is left to propagate unchanged, as before.
    return response.json()

Most of the functionality of this package is parsing this raw data. 

In [6]:
def parse_species(species_data):
    """
    Tidy the species readings reported by a single site.

    Readings whose "@AirQualityBand" is "No data" are dropped. Every remaining
    reading becomes a dict with the keys "description", "code", "quality",
    "index" and "summary" (e.g. "NO2 is Low").

    Returns a tuple ``(parsed_species_data, quality_list)`` where
    ``quality_list`` holds the quality band of each kept reading, in order.
    """
    parsed_species_data = [
        {
            "description": reading["@SpeciesDescription"],
            "code": reading["@SpeciesCode"],
            "quality": reading["@AirQualityBand"],
            "index": reading["@AirQualityIndex"],
            "summary": reading["@SpeciesCode"] + " is " + reading["@AirQualityBand"],
        }
        for reading in species_data
        if reading["@AirQualityBand"] != "No data"
    ]
    quality_list = [record["quality"] for record in parsed_species_data]
    return parsed_species_data, quality_list


def parse_site(entry_sites_data):
    """
    Iterate over all sites at a local authority and tidy the data.

    Parameters
    ----------
    entry_sites_data : list of dict
        Raw site entries as found under the "Site" key of a local authority.

    Returns
    -------
    list of dict
        One dict per site with the keys "updated", "latitude", "longitude",
        "site_code", "site_name", "site_type", "pollutants",
        "pollutants_status" and "number_of_pollutants".

        "pollutants" is the list produced by parse_species(), or the
        single-element list ["no_species_data"] when no reading is usable.
        "pollutants_status" is the most frequent quality band among the
        readings; on a tie the band that appears first wins.
    """
    authority_data = []
    for site in entry_sites_data:
        site_data = {
            "updated": site["@BulletinDate"],
            "latitude": site["@Latitude"],
            "longitude": site["@Longitude"],
            "site_code": site["@SiteCode"],
            # Keep only the text after the last "-" of the full site name,
            # e.g. "Barking and Dagenham - Rush Green" -> "Rush Green".
            "site_name": site["@SiteName"].split("-")[-1].lstrip(),
            "site_type": site["@SiteType"],
        }

        # A site with a single species is delivered as a bare dict rather
        # than a one-element list; normalise to a list.
        species_data = site["Species"]
        if isinstance(species_data, dict):
            species_data = [species_data]

        parsed_species_data, quality_list = parse_species(species_data)

        if not parsed_species_data:
            parsed_species_data.append("no_species_data")
        site_data["pollutants"] = parsed_species_data

        if quality_list:
            # dict.fromkeys() de-duplicates while keeping first-seen order, so
            # a tie between bands resolves the same way on every run (a set
            # would iterate in hash order, which varies between interpreter
            # sessions).
            site_data["pollutants_status"] = max(
                dict.fromkeys(quality_list), key=quality_list.count
            )
            site_data["number_of_pollutants"] = len(quality_list)
        else:
            site_data["pollutants_status"] = "no_species_data"
            site_data["number_of_pollutants"] = 0

        authority_data.append(site_data)
    return authority_data


def parse_hourly_response(hourly_response : Dict) -> Dict:
    """
    Return the parsed site data indexed by borough.

    Parameters
    ----------
    hourly_response : Dict
        Decoded JSON of the hourly endpoint; the local authority entries are
        read from ``["HourlyAirQualityIndex"]["LocalAuthority"]``.

    Returns
    -------
    Dict
        One key per name in ``AUTHORITIES``. The value is the list returned
        by parse_site() for that authority, or an empty dict ``{}`` when the
        authority is absent from the response or its entry cannot be parsed.
    """
    # Start every authority at the "no data" value so the result has a
    # uniform, iterable shape even for authorities the response never mentions.
    data = {authority: {} for authority in AUTHORITIES}

    try:
        entries = hourly_response["HourlyAirQualityIndex"]["LocalAuthority"]
    except (KeyError, TypeError):
        # Response does not have the expected envelope at all.
        return data

    # NOTE(review): "Site" and "Species" arrive as a bare dict when there is
    # only one element; the same is assumed here for "LocalAuthority" - confirm.
    if isinstance(entries, dict):
        entries = [entries]
    elif not isinstance(entries, (list, tuple)):
        return data

    # Single pass over the response. Each entry is handled on its own, so one
    # malformed entry cannot wipe out data already parsed for other authorities.
    for entry in entries:
        try:
            authority = entry["@LocalAuthorityName"]
            if authority not in data:
                continue
        except (KeyError, TypeError):
            # Entry carries no usable name; nothing to attribute it to.
            continue

        try:
            entry_sites_data = entry["Site"]
            if isinstance(entry_sites_data, dict):
                entry_sites_data = [entry_sites_data]
            data[authority] = parse_site(entry_sites_data)
        except Exception:
            # Deliberate best effort: misformatted or missing site data
            # downgrades this one authority to "no data".
            data[authority] = {}
    return data

class LondonAirQualityException(Exception):
    """Raised by request_data() on a timeout, a connection error or a non-200 status."""
    pass

In [7]:
# Fetch the raw hourly response once; every later cell works from this object.
try:
    hourly_data_raw = request_data(LAQ_HOURLY_URL)
except LondonAirQualityException as exc:
    # The failure is only reported here. hourly_data_raw is not assigned by
    # this cell in that case, so the cells below cannot run until it succeeds.
    print(exc)

## Demo inner workings

The API response lists 33 `LocalAuthority` entries, but only 27 of them return data, so applications should not offer the boroughs without data as configuration options.

In [8]:
len(hourly_data_raw['HourlyAirQualityIndex']['LocalAuthority'])

33

Show the raw data for a single `LocalAuthority`

In [9]:
hourly_data_raw['HourlyAirQualityIndex']['LocalAuthority'][0]

{'@LocalAuthorityCode': '1',
 '@LocalAuthorityName': 'Barking and Dagenham',
 '@LaCentreLatitude': '51.538435',
 '@LaCentreLongitude': '0.11467',
 '@LaCentreLatitudeWGS84': '6717095.01808',
 '@LaCentreLongitudeWGS84': '12765.0060093',
 'Site': [{'@BulletinDate': '2019-09-26 08:00:00',
   '@SiteCode': 'BG1',
   '@SiteName': 'Barking and Dagenham - Rush Green',
   '@SiteType': 'Suburban',
   '@Latitude': '51.563752',
   '@Longitude': '0.177891',
   '@LatitudeWGS84': '6721627.34498',
   '@LongitudeWGS84': '19802.7355367',
   '@OwnerID': '1',
   'Species': [{'@SpeciesCode': 'NO2',
     '@SpeciesDescription': 'Nitrogen Dioxide',
     '@AirQualityIndex': '1',
     '@AirQualityBand': 'Low',
     '@IndexSource': 'Measurement'},
    {'@SpeciesCode': 'SO2',
     '@SpeciesDescription': 'Sulphur Dioxide',
     '@AirQualityIndex': '1',
     '@AirQualityBand': 'Low',
     '@IndexSource': 'Measurement'}]},
  {'@BulletinDate': '2019-09-26 08:00:00',
   '@SiteCode': 'BG2',
   '@SiteName': 'Barking and 

Parse out the sites data for this local authority

In [10]:
sites_data = parse_site(hourly_data_raw['HourlyAirQualityIndex']['LocalAuthority'][0]['Site'])
sites_data

[{'updated': '2019-09-26 08:00:00',
  'latitude': '51.563752',
  'longitude': '0.177891',
  'site_code': 'BG1',
  'site_name': 'Rush Green',
  'site_type': 'Suburban',
  'pollutants': [{'description': 'Nitrogen Dioxide',
    'code': 'NO2',
    'quality': 'Low',
    'index': '1',
    'summary': 'NO2 is Low'},
   {'description': 'Sulphur Dioxide',
    'code': 'SO2',
    'quality': 'Low',
    'index': '1',
    'summary': 'SO2 is Low'}],
  'pollutants_status': 'Low',
  'number_of_pollutants': 2},
 {'updated': '2019-09-26 08:00:00',
  'latitude': '51.529389',
  'longitude': '0.132857',
  'site_code': 'BG2',
  'site_name': 'Scrattons Farm',
  'site_type': 'Suburban',
  'pollutants': [{'description': 'Nitrogen Dioxide',
    'code': 'NO2',
    'quality': 'Low',
    'index': '1',
    'summary': 'NO2 is Low'},
   {'description': 'PM10 Particulate',
    'code': 'PM10',
    'quality': 'Low',
    'index': '1',
    'summary': 'PM10 is Low'}],
  'pollutants_status': 'Low',
  'number_of_pollutants': 2

## Package Usage

In [11]:
hourly_data = parse_hourly_response(hourly_data_raw)

In [12]:
hourly_data['Barking and Dagenham']

[{'updated': '2019-09-26 08:00:00',
  'latitude': '51.563752',
  'longitude': '0.177891',
  'site_code': 'BG1',
  'site_name': 'Rush Green',
  'site_type': 'Suburban',
  'pollutants': [{'description': 'Nitrogen Dioxide',
    'code': 'NO2',
    'quality': 'Low',
    'index': '1',
    'summary': 'NO2 is Low'},
   {'description': 'Sulphur Dioxide',
    'code': 'SO2',
    'quality': 'Low',
    'index': '1',
    'summary': 'SO2 is Low'}],
  'pollutants_status': 'Low',
  'number_of_pollutants': 2},
 {'updated': '2019-09-26 08:00:00',
  'latitude': '51.529389',
  'longitude': '0.132857',
  'site_code': 'BG2',
  'site_name': 'Scrattons Farm',
  'site_type': 'Suburban',
  'pollutants': [{'description': 'Nitrogen Dioxide',
    'code': 'NO2',
    'quality': 'Low',
    'index': '1',
    'summary': 'NO2 is Low'},
   {'description': 'PM10 Particulate',
    'code': 'PM10',
    'quality': 'Low',
    'index': '1',
    'summary': 'PM10 is Low'}],
  'pollutants_status': 'Low',
  'number_of_pollutants': 2

## Dataframe
We can also process the data into a pandas dataframe

In [13]:
def get_hourly_dataframe(hourly_data : Dict) -> pd.DataFrame:
    """
    Flatten parsed hourly data into one DataFrame row per pollutant reading.

    Parameters
    ----------
    hourly_data : Dict
        Mapping of borough name to its list of site dicts, as returned by
        parse_hourly_response(). Boroughs whose value is empty or None
        contribute no rows. The input is not modified.

    Returns
    -------
    pd.DataFrame
        Each row holds the pollutant's own fields plus "borough",
        "site_code", "site_name", "latitude", "longitude" and "updated"
        copied from the site it was measured at.
    """
    all_data = []
    for authority, sites in hourly_data.items():
        for site in sites or []:
            try:
                pollutants = site['pollutants']
                site_fields = {
                    'borough': authority,
                    'site_code': site['site_code'],
                    'site_name': site['site_name'],
                    'latitude': site['latitude'],
                    'longitude': site['longitude'],
                    'updated': site['updated'],
                }
            except (KeyError, TypeError):
                # Site entry lacks a required field: it yields no rows.
                continue

            for pollutant in pollutants:
                # Sites without usable readings carry the placeholder string
                # "no_species_data" instead of a dict; there is nothing to emit.
                if not isinstance(pollutant, dict):
                    continue
                # Build a fresh row rather than writing the site fields into
                # the caller's pollutant dict.
                all_data.append({**pollutant, **site_fields})
    return pd.DataFrame(all_data)

In [14]:
df = get_hourly_dataframe(hourly_data)

In [15]:
df.head()

Unnamed: 0,borough,code,description,index,latitude,longitude,quality,site_code,site_name,summary,updated
0,Barking and Dagenham,NO2,Nitrogen Dioxide,1,51.563752,0.177891,Low,BG1,Rush Green,NO2 is Low,2019-09-26 08:00:00
1,Barking and Dagenham,SO2,Sulphur Dioxide,1,51.563752,0.177891,Low,BG1,Rush Green,SO2 is Low,2019-09-26 08:00:00
2,Barking and Dagenham,NO2,Nitrogen Dioxide,1,51.529389,0.132857,Low,BG2,Scrattons Farm,NO2 is Low,2019-09-26 08:00:00
3,Barking and Dagenham,PM10,PM10 Particulate,1,51.529389,0.132857,Low,BG2,Scrattons Farm,PM10 is Low,2019-09-26 08:00:00
4,Bexley,NO2,Nitrogen Dioxide,1,51.4946486813055,0.137279111232178,Low,BQ7,Belvedere West,NO2 is Low,2019-09-26 08:00:00


In [16]:
boroughs_with_data = df['borough'].unique()
print(boroughs_with_data)
len(boroughs_with_data)

['Barking and Dagenham' 'Bexley' 'Brent' 'Camden' 'City of London'
 'Croydon' 'Ealing' 'Enfield' 'Greenwich' 'Hackney' 'Haringey' 'Harrow'
 'Havering' 'Hillingdon' 'Islington' 'Kensington and Chelsea' 'Kingston'
 'Lambeth' 'Lewisham' 'Merton' 'Redbridge' 'Richmond' 'Southwark' 'Sutton'
 'Tower Hamlets' 'Wandsworth' 'Westminster']


27

In [17]:
boroughs_without_data = list(set(ALL_AUTHORITIES) - set(boroughs_with_data))
boroughs_without_data

['Waltham Forest',
 'Newham',
 'Hammersmith and Fulham',
 'Bromley',
 'Hounslow',
 'Barnet']

In [18]:
df['code'].value_counts()

NO2     77
PM10    61
PM25    18
O3      16
SO2      6
Name: code, dtype: int64

In [19]:
df['quality'].value_counts()

Low    178
Name: quality, dtype: int64

In [20]:
df['index'].value_counts()

1    156
2     22
Name: index, dtype: int64