# Overview

This notebook shows you how to put Landsat data into an AWS deployment of the Open Data Cube.

## 1. Credentials

We use the Boto3 library to work with AWS. Saving data to S3 requires credentials with permission to put objects into the target S3 bucket. If you contribute to this notebook, never commit credentials to the repository.

In [1]:
import boto3

In [2]:
# Open the target bucket. No credentials are passed explicitly here.
s3 = boto3.resource('s3')
bucket_name = 'test-odc-bucket'
bucket = s3.Bucket(bucket_name)

# Show the key of every object currently stored in the bucket.
for s3_object in bucket.objects.all():
    print(s3_object.key)

LT05_L1TP_043027_19860805_20161004_01_T1.xml
LT05_L1TP_043027_19860805_20161004_01_T1_ANG.txt
LT05_L1TP_043027_19860805_20161004_01_T1_GCP.txt
LT05_L1TP_043027_19860805_20161004_01_T1_MTL.txt
LT05_L1TP_043027_19860805_20161004_01_T1_VER.jpg
LT05_L1TP_043027_19860805_20161004_01_T1_VER.txt
LT05_L1TP_043027_19860805_20161004_01_T1_bqa.tif
LT05_L1TP_043027_19860805_20161004_01_T1_cfmask.tif
LT05_L1TP_043027_19860805_20161004_01_T1_cfmask_conf.tif
LT05_L1TP_043027_19860805_20161004_01_T1_sr_atmos_opacity.tif
LT05_L1TP_043027_19860805_20161004_01_T1_sr_band1.tif
LT05_L1TP_043027_19860805_20161004_01_T1_sr_band2.tif
LT05_L1TP_043027_19860805_20161004_01_T1_sr_band3.tif
LT05_L1TP_043027_19860805_20161004_01_T1_sr_band4.tif
LT05_L1TP_043027_19860805_20161004_01_T1_sr_band5.tif
LT05_L1TP_043027_19860805_20161004_01_T1_sr_band7.tif
LT05_L1TP_043027_19860805_20161004_01_T1_sr_cloud_qa.tif
LT05_L1TP_043027_19860805_20161004_01_T1_toa_band1.tif
LT05_L1TP_043027_19860805_20161004_01_T1_toa_band2.tif

## 2. Saving a Scene

In [3]:
import requests
import gzip
import tarfile
import io

In [4]:
def download(url):
    """Fetch ``url`` over HTTP and return the body as an in-memory file.

    :param url: address of the resource to download.
    :return: an ``io.BytesIO`` positioned at the start of the response body
        (with any gzip/deflate content encoding already decoded).
    :raises requests.HTTPError: if the server answers with an error status.
    """
    response = requests.get(url)
    # Fail loudly rather than handing an error page to the caller as "data".
    response.raise_for_status()
    # io.BytesIO needs a bytes-like object; the raw urllib3 stream is not one,
    # so wrap the fully-read, decoded body instead.
    return io.BytesIO(response.content)

In [5]:
def expand_and_put(path, target_bucket=None):
    """Upload every file stored in a tar archive to an S3 bucket.

    Each archive member that carries data is uploaded under its archive name
    as the object key, and that name is printed as progress output.

    :param path: location of the tar archive (any compression that
        ``tarfile.open`` detects automatically).
    :param target_bucket: object with a ``put_object(Key=..., Body=...)``
        method; defaults to the notebook-level ``bucket``.
    :return: ``None``.
    """
    destination = bucket if target_bucket is None else target_bucket
    with tarfile.open(path) as tar:
        for member in tar.getmembers():
            payload = tar.extractfile(member)
            # extractfile() returns None for members without data
            # (directories, device nodes, ...); there is nothing to upload.
            if payload is None:
                continue
            print(member.name)
            with payload:
                destination.put_object(Key=member.name, Body=payload.read())

In [None]:
# Local path of the scene archive to unpack into the bucket.
archive_path = 'data/foo.tar.gz'
# expand_and_put() returns nothing, so there is no result to keep.
expand_and_put(archive_path)

## 3. Build a Dataset YAML

In [None]:
import re
import uuid
from datetime import timedelta
from xml.etree import ElementTree

import dateutil
import dateutil.parser
ns = {
    'espa': 'http://espa.cr.usgs.gov/v2'
}

In [None]:
def get_s3_url(bucket_name, obj_key):
    """Return the ``s3://`` URL that addresses ``obj_key`` in ``bucket_name``."""
    template = 's3://{bucket_name}/{obj_key}'
    return template.format(bucket_name=bucket_name, obj_key=obj_key)

In [None]:
def get_defined_metadata(url):
    """Return the metadata fields that are fixed for every dataset.

    The dataset ``id`` is a version-5 UUID derived from ``url`` in the URL
    namespace, so the same ``url`` always yields the same id. The processing
    level and product type are constants.
    """
    dataset_id = uuid.uuid5(uuid.NAMESPACE_URL, url)
    return dict(
        id=str(dataset_id),
        processing_level='ARD',
        product_type='LS_USGS_ARD',
    )

In [None]:
_PRODUCT_ID_PATTERN = re.compile(
    r"(?P<code>LC08|LE07|LT05|LT04)_"
    r"(?P<processing_level>L1TP|L1GT|L1GS)_"
    r"(?P<path>[0-9]{3})(?P<row>[0-9]{3})_"
    r"(?P<acquisition_year>[0-9]{4})(?P<acquisition_month>[0-9]{2})(?P<acquisition_day>[0-9]{2})_"
    r"(?P<processing_year>[0-9]{4})(?P<processing_month>[0-9]{2})(?P<processing_day>[0-9]{2})_"
    r"(?P<collection_number>[0-9]{2})_"
    # Enumerated like the other coded groups: ``\w+`` also matches "_", so it
    # would swallow any suffix that follows the tier (e.g. "T1_sr_band1").
    r"(?P<tier>RT|T1|T2)"
)


def parse_product_id(key):
    """Split a Landsat product identifier into its named components.

    Only the start of ``key`` has to be a product identifier; anything after
    the tier (a band suffix, a file extension) is ignored.

    :param key: string beginning with a product identifier such as
        ``LT05_L1TP_043027_19860805_20161004_01_T1``.
    :return: dict of strings with the regex groups ``code``,
        ``processing_level``, ``path``, ``row``, ``acquisition_year/month/day``,
        ``processing_year/month/day``, ``collection_number`` and ``tier``, plus
        ``processing_date`` formatted as ``YYYY-MM-DD``.
    :raises ValueError: if ``key`` does not start with a product identifier.
    """
    match = _PRODUCT_ID_PATTERN.match(key)
    if match is None:
        raise ValueError(
            'Not a recognised Landsat product identifier: {!r}'.format(key))

    fields = match.groupdict()
    fields['processing_date'] = '{processing_year}-{processing_month}-{processing_day}'.format(**fields)
    return fields

In [None]:
def parse_mtl(xml_doc):
    """Return metadata that is meant to come from the scene's MTL file.

    Placeholder: the MTL file is not read yet, so the ground station code is
    always the dummy value ``'XXX'`` and ``xml_doc`` is not inspected.

    :param xml_doc: parsed ESPA metadata document (currently unused).
    :return: dict with the single key ``groundstation``.
    """
    # TODO: Actually read the MTL file. Its name is given by the
    # ``espa:lpgs_metadata_file`` element of ``xml_doc``.
    return {'groundstation': 'XXX'}

In [None]:
import rasterio
from rasterio.errors import RasterioIOError
import rasterio.features
import shapely.affinity
import shapely.geometry
import shapely.ops

def safe_valid_region(images, mask_value=None):
    """Best-effort variant of ``valid_region``.

    Returns whatever ``valid_region(images, mask_value)`` returns, or ``None``
    when that call raises ``OSError`` or ``RasterioIOError``. Any other
    exception propagates.
    """
    try:
        region = valid_region(images, mask_value)
    except (OSError, RasterioIOError):
        region = None
    return region


def _to_lists(x):
    """Recursively convert nested tuples into nested lists."""
    if isinstance(x, tuple):
        return [_to_lists(el) for el in x]
    return x


def valid_region(images, mask_value=None):
    """Return the footprint of the valid pixels across ``images``.

    Band 1 of every image is read and turned into a boolean mask; the masks
    are OR-ed together, polygonised, and reduced to a simplified convex hull.

    NOTE: nothing here checks that the images share one grid. The masks are
    combined element-wise and the affine transform of the *last* image is
    used for the result, so callers must pass images on the same grid.

    :param images: iterable of paths (anything ``str()`` turns into a path
        that ``rasterio.open`` accepts).
    :param mask_value: if given, a pixel is valid when all bits of
        ``mask_value`` are set in it; otherwise a pixel is valid when it
        differs from the dataset's nodata value.
    :return: GeoJSON-like geometry mapping in the images' CRS, with the
        coordinates as nested lists rather than tuples.
    :raises ValueError: if ``images`` is empty.
    """
    mask = None
    transform = None

    for fname in images:
        with rasterio.open(str(fname), 'r') as ds:
            transform = ds.transform
            img = ds.read(1)

            if mask_value is not None:
                # ``&`` binds tighter than ``==``: (img & mask_value) == mask_value
                new_mask = img & mask_value == mask_value
            else:
                new_mask = img != ds.nodata
            if mask is None:
                mask = new_mask
            else:
                mask |= new_mask

    if mask is None:
        # Without this guard an empty input fails below with an unhelpful
        # AttributeError on ``None``.
        raise ValueError('valid_region() needs at least one image')

    shapes = rasterio.features.shapes(mask.astype('uint8'), mask=mask)
    shape = shapely.ops.unary_union([shapely.geometry.shape(shape) for shape, val in shapes if val == 1])

    # Still in pixel space: take the hull, grow it by one pixel (bevelled
    # joins, square caps), simplify, and clip back to the raster extent.
    geom = shape.convex_hull
    geom = geom.buffer(1, join_style=3, cap_style=3)
    geom = geom.simplify(1)
    geom = geom.intersection(shapely.geometry.box(0, 0, mask.shape[1], mask.shape[0]))

    # transform from pixel space into CRS space
    geom = shapely.affinity.affine_transform(geom, (transform.a, transform.b, transform.d,
                                                    transform.e, transform.xoff, transform.yoff))

    output = shapely.geometry.mapping(geom)
    output['coordinates'] = _to_lists(output['coordinates'])
    return output

In [None]:
def parse_xml_metadata(xmlDoc):
    """Extract dataset fields from a parsed ESPA scene metadata document.

    :param xmlDoc: parsed XML exposing ``find`` (e.g. an ``ElementTree``).
    :return: dict holding ``product_id``, every field produced by
        ``parse_product_id``, ``satellite``, ``instrument``, ``start_time``,
        ``end_time``, ``center_dt``, ``creation_dt``, ``path`` and ``row``.
        ``path`` and ``row`` come from the ``espa:wrs`` element and replace
        the values parsed out of the product id.
    :raises AttributeError: if one of the expected elements is missing.
    """
    fields = {}
    fields['product_id'] = xmlDoc.find('.//espa:product_id', ns).text
    fields.update(parse_product_id(fields['product_id']))

    fields['satellite'] = xmlDoc.find('.//espa:satellite', ns).text
    fields['instrument'] = xmlDoc.find('.//espa:instrument', ns).text

    acquisition_date = xmlDoc.find('.//espa:acquisition_date', ns).text.replace("-", "")
    # Keep only HH:MM:SS; fractional seconds and any zone suffix are dropped.
    scene_center_time = xmlDoc.find('.//espa:scene_center_time', ns).text[:8]

    # The acquisition window is taken to be 24 seconds, centred on the scene
    # centre time. NOTE(review): 24 s is a fixed value in this code, not read
    # from the metadata - confirm it suits the sensor.
    scene_duration = timedelta(seconds=24)
    center_dt = dateutil.parser.parse(acquisition_date + "T" + scene_center_time)
    aos = center_dt - scene_duration / 2
    los = aos + scene_duration
    fields['start_time'] = str(aos)
    fields['end_time'] = str(los)
    fields['center_dt'] = str(center_dt)

    fields['creation_dt'] = str(dateutil.parser.parse(xmlDoc.find('.//espa:level1_production_date', ns).text))

    # Overrides the zero-padded path/row strings taken from the product id.
    pr = xmlDoc.find('.//espa:wrs', ns)
    fields['path'] = pr.attrib['path']
    fields['row'] = pr.attrib['row']

    return fields

In [None]:
def lookup_band_name(band):
    """Return the name under which ``band`` is recorded.

    ``band`` must expose an ``attrib`` mapping (e.g. an XML element); its
    ``name`` entry is returned unchanged.
    """
    # TODO: Simon wants to rename the bands. When the new names are decided,
    # map the current names to them here and fall back to the current name
    # for anything unmapped. Names seen so far:
    #   bqa, toa_band1 .. toa_band7, toa_band6_qa, toa_qa,
    #   sr_band1 .. sr_band5, sr_band7,
    #   sr_atmos_opacity, sr_cloud_qa, cfmask, cfmask_conf
    return band.attrib['name']


def get_bands(xml_doc):
    """Map every band listed in ``xml_doc`` to the S3 URL of its image file.

    Keys come from ``lookup_band_name``; values are built from the
    notebook-level ``bucket_name`` and the band's ``espa:file_name`` text.
    """
    band_urls = {}
    for band_el in xml_doc.findall('.//espa:band', ns):
        name = lookup_band_name(band_el)
        file_name = band_el.find('.//espa:file_name', ns).text
        band_urls[name] = get_s3_url(bucket_name, file_name)
    return band_urls

In [None]:
# NOTE(review): absolute path on the original author's machine - point this at
# a local copy of the scene metadata before running the cell.
scene_xml_path = '/home/andrew/Data/usgs/LT050430271984116-SC20170101123907/LT05_L1TP_043027_19840425_20161004_01_T1.xml'
xml_doc = ElementTree.parse(scene_xml_path)


def _bounding_coordinate(xpath):
    """Read the element at ``xpath`` in ``xml_doc`` and return it as a float."""
    return float(xml_doc.find(xpath, ns).text)


west = _bounding_coordinate('.//espa:bounding_coordinates/espa:west')
east = _bounding_coordinate('.//espa:bounding_coordinates/espa:east')
north = _bounding_coordinate('.//espa:bounding_coordinates/espa:north')
south = _bounding_coordinate('.//espa:bounding_coordinates/espa:south')
north, south, east, west

In [None]:
def get_projection(path):
    """Describe the CRS and corner points of the raster at ``path``.

    Returns a dict with ``spatial_reference`` (the CRS as a WKT string, taken
    from the dataset's ``crs_wkt`` attribute when it has a truthy one, else
    from ``crs.wkt``) and ``geo_ref_points`` (the ``ul``/``ur``/``ll``/``lr``
    corners of the dataset bounds as ``{'x': ..., 'y': ...}``).
    """
    with rasterio.open(str(path)) as img:
        left, bottom, right, top = img.bounds
        wkt = getattr(img, 'crs_wkt', None) or img.crs.wkt
        corners = {
            'ul': {'x': left, 'y': top},
            'ur': {'x': right, 'y': top},
            'll': {'x': left, 'y': bottom},
            'lr': {'x': right, 'y': bottom},
        }
        return {'spatial_reference': str(wkt), 'geo_ref_points': corners}

In [None]:
def get_coords(spatial_reference, geo_ref_points):
    """Convert projected corner points to longitude/latitude.

    The points are transformed from ``spatial_reference`` into the geographic
    coordinate system that this projection is based on (``CloneGeogCS``).

    :param spatial_reference: WKT string describing the projected CRS.
    :param geo_ref_points: dict mapping a corner name to ``{'x': ..., 'y': ...}``.
    :return: dict mapping the same corner names to ``{'lon': ..., 'lat': ...}``.
    """
    # ``osr`` is used below but not imported in the notebook cells shown, so
    # bring it into scope here.
    from osgeo import osr

    spatial_ref = osr.SpatialReference(spatial_reference)
    geographic_ref = spatial_ref.CloneGeogCS()

    # GDAL 3 follows the authority's axis order by default, which returns
    # (lat, lon) for many geographic CRSs. Ask for the traditional (x=lon,
    # y=lat) order where the option exists; older GDAL always behaves that way.
    traditional_order = getattr(osr, 'OAMS_TRADITIONAL_GIS_ORDER', None)
    if traditional_order is not None:
        spatial_ref.SetAxisMappingStrategy(traditional_order)
        geographic_ref.SetAxisMappingStrategy(traditional_order)

    t = osr.CoordinateTransformation(spatial_ref, geographic_ref)

    def transform(p):
        lon, lat, z = t.TransformPoint(p['x'], p['y'])
        return {'lon': lon, 'lat': lat}
    return {key: transform(p) for key, p in geo_ref_points.items()}

In [None]:
def make_metadata_doc(fields):
    """Arrange the flat ``fields`` dict into a nested dataset document.

    ``fields`` must provide: ``id``, ``processing_level``, ``product_type``,
    ``creation_dt``, ``satellite``, ``instrument``, ``groundstation``,
    ``start_time``, ``end_time``, ``center_dt``, ``coord``, ``projection``,
    ``path``, ``row`` and ``bands`` (a mapping of band name to location).
    A missing key raises ``KeyError``.
    """
    doc = {}
    for key in ('id', 'processing_level', 'product_type', 'creation_dt'):
        doc[key] = fields[key]

    doc['platform'] = {'code': fields['satellite']}
    doc['instrument'] = {'name': fields["instrument"]}

    doc['acquisition'] = {
        'groundstation': {'code': fields['groundstation']},
        'aos': fields["start_time"],
        'los': fields["end_time"],
    }
    doc['extent'] = {
        'from_dt': fields["start_time"],
        'to_dt': fields["end_time"],
        'center_dt': fields["center_dt"],
        'coord': fields['coord'],
    }
    doc['format'] = {'name': 'GeoTiff'}
    doc['grid_spatial'] = {'projection': fields['projection']}

    # Start and end reference points are the same path/row; they are kept as
    # two separate dicts, exactly as two literals would be.
    ref_point = {'x': int(fields["path"]), 'y': int(fields["row"])}
    doc['image'] = {
        'satellite_ref_point_start': ref_point,
        'satellite_ref_point_end': dict(ref_point),
        'bands': {name: {'path': location} for name, location in fields['bands'].items()},
    }

    doc['lineage'] = {'source_datasets': {}}
    return doc

In [None]:
def get_metadata_docs(bucket):
    """Yield a dataset document for every XML metadata file in ``bucket``.

    Objects whose key does not end in ``.xml`` (case-insensitive) are skipped.
    For each remaining object the XML is parsed, the band list and the
    projection of one band are read, and the document is assembled with
    ``make_metadata_doc``.

    NOTE(review): ``get_bands`` builds its URLs from the notebook-level
    ``bucket_name`` rather than from ``bucket.name``, so the two are assumed
    to refer to the same bucket - confirm before passing a different bucket.

    :param bucket: object exposing ``name`` and ``objects.all()``, whose items
        expose ``key`` and ``get()`` (e.g. a boto3 ``Bucket``).
    :return: generator of ``(object key, metadata document)`` pairs.
    :raises ValueError: if a metadata file lists no bands.
    """
    for obj in bucket.objects.all():
        # Match the extension, not merely the last three characters.
        if not obj.key.lower().endswith('.xml'):
            continue

        xml_doc = ElementTree.parse(obj.get()['Body'])

        fields = {}
        fields.update(get_defined_metadata(get_s3_url(bucket.name, obj.key)))
        fields.update(parse_xml_metadata(xml_doc))

        bands = get_bands(xml_doc)
        if not bands:
            # next() on an empty iterator would raise StopIteration inside
            # this generator, which either ends iteration silently or turns
            # into a RuntimeError depending on the Python version.
            raise ValueError(
                'No bands listed in metadata file {!r}'.format(obj.key))
        fields['bands'] = bands

        # All bands of a scene are assumed to share one grid, so any band
        # can supply the projection.
        sample_band = next(iter(bands.values()))
        projection = get_projection(sample_band)
        fields['projection'] = projection
        fields['coord'] = get_coords(**projection)
        fields.update(parse_mtl(xml_doc))

        yield obj.key, make_metadata_doc(fields)

## 4. Build an Index

In [None]:
!datacube system init --no-init-users

In [None]:
!datacube product add datacube-core/docs/config_samples/dataset_types/usgs_ard.yaml

In [None]:
!datacube product list

In [6]:
%matplotlib inline
import datacube
dc = datacube.Datacube()

In [7]:
dc.list_products()

Unnamed: 0_level_0,name,description,time,format,platform,lon,product_type,lat,instrument,crs,resolution,tile_size,spatial_dimensions
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,ls5_usgs_ard,Landsat 5 USGS ARD 30 metre tile,,GeoTiff,LANDSAT_5,,LS_USGS_ARD,,TM,"PROJCS[""Albers"", GEOGCS[""NAD83"", DATUM[""North_...","[-30, 30]",,"(y, x)"


In [8]:
from datacube.scripts.dataset import create_dataset, load_rules_from_types
index = dc.index

product = index.products.get_by_name('ls5_usgs_ard')
product

DatasetType(name='ls5_usgs_ard', id_=1)

In [None]:
# Add one dataset to the index for every metadata document found in the bucket.
for metadata_path, metadata_doc in get_metadata_docs(bucket):
    # NOTE(review): the dataset URI points at /tmp on the local filesystem
    # while the band paths inside the document are s3:// URLs - confirm this
    # is intended.
    uri = 'file:///tmp/{obj_key}'.format(obj_key=metadata_path)
    print(metadata_path)
    scene_dataset = datacube.model.Dataset(product, metadata_doc, local_uri=uri, sources={})
    index.datasets.add(scene_dataset)

## 5. Loading Some Data

In [17]:
data = dc.load('ls5_usgs_ard', dask_chunks={'time':1, 'x': 1000, 'y': 1000})

In [18]:
data.sizes

Frozen(SortedKeysDict({'time': 147, 'y': 5001, 'x': 5001}))

In [23]:
data.time[0].values, data.time[-1].values

(numpy.datetime64('1986-08-05T17:59:06.000000000'),
 numpy.datetime64('1992-12-11T17:58:23.000000000'))

In [24]:
import dask
import dask.multiprocessing

In [25]:
%%time
spaghetti1 = data.sr_band1[5:10,2500, 2500]
with dask.set_options(get=dask.async.get_sync):
    spaghetti1.load()

CPU times: user 1.12 s, sys: 1.06 s, total: 2.18 s
Wall time: 1min 18s


In [31]:
%%time
spaghetti2 = data.sr_band1[0:20,2500, 2500]
with dask.set_options(get=dask.multiprocessing.get):
    spaghetti2.load()

CPU times: user 1.44 s, sys: 68 ms, total: 1.5 s
Wall time: 5min 40s


In [32]:
ds = dc.find_datasets(product='ls5_usgs_ard')

In [33]:
dataset = ds[0]

In [34]:
dataset.metadata_doc

{'acquisition': {'aos': '1986-08-05 17:58:54',
  'groundstation': {'code': 'XXX'},
  'los': '1986-08-05 17:59:18'},
 'creation_dt': '2016-10-04 00:28:09+00:00',
 'extent': {'center_dt': '1986-08-05 17:59:06',
  'from_dt': '1986-08-05 17:58:54',
  'to_dt': '1986-08-05 17:59:18'},
 'format': {'name': 'GeoTiff'},
 'grid_spatial': {'projection': {'geo_ref_points': {'ll': {'x': -1815585.0,
     'y': 2864805.0},
    'lr': {'x': -1665585.0, 'y': 2864805.0},
    'ul': {'x': -1815585.0, 'y': 3014805.0},
    'ur': {'x': -1665585.0, 'y': 3014805.0}},
   'spatial_reference': 'PROJCS["unnamed",GEOGCS["NAD83",DATUM["North_American_Datum_1983",SPHEROID["GRS 1980",6378137,298.257222101,AUTHORITY["EPSG","7019"]],TOWGS84[0,0,0,0,0,0,0],AUTHORITY["EPSG","6269"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AUTHORITY["EPSG","4269"]],PROJECTION["Albers_Conic_Equal_Area"],PARAMETER["standard_parallel_1",29.5],PARAMETER["standard_parallel_2",45.5],

In [None]:
# The indexed document shown above has no 'coord' entry under 'extent'.
# Rebuild it from the stored projection, the same way get_metadata_docs()
# does. (Assigning the whole document to itself would create a circular
# reference.)
dataset.metadata_doc['extent']['coord'] = get_coords(**dataset.metadata_doc['grid_spatial']['projection'])