In [53]:
import matplotlib
import boto3
from botocore.exceptions import ClientError
from netCDF4 import Dataset
import datetime as dt
import json
import geopandas as gpd
import pandas as pd
from shapely.geometry import Polygon
import xmltodict
import yaml
import os
import subprocess
from datetime import date
import logging
logger = logging.getLogger()
import os
import netCDF4
import numpy as np

In [56]:
logger.setLevel(level=logging.INFO)

In [60]:
with open(r'config.yaml') as file:
    configuration =  yaml.load(file, Loader=yaml.FullLoader)

In [None]:
for config in configuration['configurations']:
    print(config)
    username = config['username']
    password = config['password']
    url = config['url']
    aoi = config['aoi']
    awss3bucket = config['awss3bucket']
    awskeyid = config['awskeyid']
    awskeypass = config['awskeypass']

In [3]:
def filter(sensors):

    for sensordict in sensors:
        
        filter_string = ''
        count = 0
        
        for sensor in sensordict.keys():
            filter_string = filter_string+'(sensor=%27'+sensor+'%27%20AND%20(product=%27'
            product_count = 0
            for product in sensordict[sensor]:
                filter_string = filter_string+product+'%27'
                if product_count < (len(sensordict[sensor])-1):
                    filter_string = filter_string+'%20OR%20product=%27'
                else:
                    filter_string = filter_string+'))' 
                product_count = product_count + 1
            if count < (len(sensordict.keys())-1):        
                filter_string = filter_string+'%20OR%20'
            count = count+1

    return(filter_string)

In [4]:
def get_polygon_from_gml(gml_dict):
    listoftuples = []
    for i in list(gml_dict.split(" ")):
        pair = (float(i.split(',')[1]), float(i.split(',')[0]))
        listoftuples.append(pair)
    return(listoftuples)

In [69]:
def load_hotspots(filter_string, time_period, bbox, max_features, min_confidence, to_date):
    y_max = bbox[0]
    x_min = bbox[1]
    y_min = bbox[2]
    x_max = bbox[3]
    if to_date is None:
        
        to_date = dt.datetime.now()
    
    
    from_date = (to_date - dt.timedelta(days=time_period)).strftime('%Y-%m-%d')
    
    # trim datetime to enable WFS 
    to_date = to_date.strftime('%Y-%m-%d')
    logger.info(str(from_date)+' '+str(to_date))
    
    # TODO - sort out paging - looks like there is a limit to WFS requests number returned per query
    logger.info(f"https://hotspots.dea.ga.gov.au/geoserver/public/wfs?service=WFS&version=1.1.0&request=GetFeature&typeName=public:hotspots&outputFormat=application/json&CQL_FILTER=({filter_string})%20AND%20datetime%20%3E%20%27{from_date}%27%20AND%20datetime%20%3C%20%27{to_date}%27%20AND%20INTERSECTS(location,%20POLYGON(({y_max}%20{x_min},%20{y_max}%20{x_max},%20{y_min}%20{x_max},%20{y_min}%20{x_min},%20{y_max}%20{x_min})))&maxFeatures={max_features}&startIndex=0&sortBy=sensor%20A")
    url = f"https://hotspots.dea.ga.gov.au/geoserver/public/wfs?service=WFS&version=1.1.0&request=GetFeature&typeName=public:hotspots&outputFormat=application/json&CQL_FILTER=({filter_string})%20AND%20datetime%20%3E%20%27{from_date}%27%20AND%20datetime%20%3C%20%27{to_date}%27%20AND%20INTERSECTS(location,%20POLYGON(({y_max}%20{x_min},%20{y_max}%20{x_max},%20{y_min}%20{x_max},%20{y_min}%20{x_min},%20{y_max}%20{x_min})))&maxFeatures={max_features}&startIndex=0&sortBy=sensor%20A"
    
    hotspots_gdf = gpd.read_file(url)
    #logger.info(str(hotspots_gdf['stop_dt']))
    
    # TODO - improved None value handling  -currently just look at first and apply that to all
    if hotspots_gdf['confidence'][0] == None:
        logger.info('Skipping confidence filter as confidence not populated')
    else:

        # Filter by confidence
        hotspots_gdf = hotspots_gdf.loc[hotspots_gdf.confidence >= min_confidence]

    # Fix datetime
    if hotspots_gdf['start_dt'][0] == None:
        logger.info('Start date field is not populated')
        hotspots_gdf['datetime'] = pd.to_datetime(hotspots_gdf['datetime'])
    else:
        hotspots_gdf['datetime'] = pd.to_datetime(hotspots_gdf['start_dt'])

    # Extract required columns
    hotspots_gdf = hotspots_gdf.loc[:, [
            'datetime', 'latitude', 'longitude', 'confidence', 'geometry', 'product', 'satellite', 'sensor', 'power'
            ]]
    hotspots_gdf.sort_values('datetime', ascending=True, inplace=True)
    logger.info('Hotspots loaded successfully '+str(hotspots_gdf.geometry.total_bounds))

    return(hotspots_gdf)

In [7]:
# Assess inventory against AWS bucket listing

s3 = boto3.resource('s3', aws_access_key_id=awskeyid,
                    aws_secret_access_key=awskeypass)

s3folderlist = []
s3geojsonlist = []
s3bucket = s3.Bucket('s3vtaustralia')

for bucket_object in s3bucket.objects.all():
    s3bucketobject = str(bucket_object.key).split("/")[2]
    if '.SEN3' in s3bucketobject:
        s3folderlist.append(s3bucketobject)
    if '.FRP.geojson' in s3bucketobject:
        s3geojsonlist.append(bucket_object.key)

print(len(s3geojsonlist), "S3 Hotspot files From ESA available in AWS S3") 

197 S3 Hotspot files From ESA available in AWS S3


In [8]:
# Read inventory to geopandas - write to geojson       
        
with open('s3vt_inventory.json') as inventory:
    frames = []
    for p in inventory:
        pages = json.loads(p)
                
        for page in pages:
            for entry in page['feed']['entry']:
                        
                df = pd.DataFrame.from_dict(entry, orient='index')
                        
                polygon = get_polygon_from_gml(xmltodict.parse(entry['str'][2]['content'])['gml:Polygon']['gml:outerBoundaryIs']['gml:LinearRing']['gml:coordinates'])
                
                df = df.transpose()
                df['Coordinates'] = Polygon(polygon)
                for d in entry['str']:
                    if d['name'] ==  'orbitdirection':
                        df['orbitdirection'] = d['content']
                    if d['name'] ==  'platformidentifier':
                        df['platformidentifier'] = d['content'] 
                    if d['name'] ==  'filename':
                        df['filename'] = d['content']
                    if d['name'] ==  'instrumentshortname':
                        df['instrumentshortname'] = d['content']
                    if d['name'] ==  'passnumber':
                        df['passnumber'] = d['content']        
                s3vtdf = gpd.GeoDataFrame(df, geometry='Coordinates')
                
                frames.append(s3vtdf) 
                    
s3vtgpd = pd.concat(frames)

# Not sure why we need to index but do it anyway
s3vtgpd = s3vtgpd.reset_index(drop=True)
s3vtgpd['date'] = pd.to_datetime(s3vtgpd.summary.str.split(",", expand= True)[0].str.split(' ', expand=True)[1])
# Some fields are lists and geojson translation doesn't like it

s3vtgpd = s3vtgpd.drop(['link', 'int', 'str', 'summary'], axis=1)
s3vtgpd.to_file('s3vt_geometry.geojson', driver='GeoJSON')

In [9]:
dataframelength = len(s3vtgpd)
# Add field to enable monitoring
s3vtgpd['hotspot'] = 0
s3vtgpd['download'] = 0
s3vtgpd['s3bucket'] = 0

s3vthostpotsgpdlist = []

In [10]:
# Check if folder already downloaded and flag in gpd
for i in range(dataframelength):
    if s3vtgpd.loc[i]['title']+'.SEN3' in set(s3folderlist):
        s3vtgpd.at[i, 'download'] = 1
    if s3vtgpd.loc[i]['title']+'.FRP.geojson' in set(s3folderlist):
        s3vtgpd.at[i, 'hotspot'] = 1
        s3vthostpotsgpdlist.append(s3hotspotsgpd)

In [11]:
# Copy all S3 geojson files locally and load to GPD

for i in s3geojsonlist:
    subprocess.call(['aws', 's3', 'cp', 's3://s3vtaustralia/'+i, i])

In [14]:
frames = []
for i in s3geojsonlist:
    df1 = gpd.read_file(i)
    if 'S3A' in i:
        df1['satellite'] = 'S3A'
    else:
        df1['satellite'] = 'S3B'
    df1['sensor'] = 'SLSTR'
    df2 = df1.query("FRP_MWIR>0")
    if len(df2) > 0:
        frames.append(df2)
        
s3vthotspots = pd.concat(frames)
#for i in s3geojsonlist:
#    s3vtdf = gpd.GeoDataFrame(df, geometry='Coordinates')

In [16]:
s3vthotspots['date'] = pd.to_datetime(netCDF4.num2date(s3vthotspots.time, units='microseconds since 2000-01-01T00:00:00Z'))

In [17]:
s3vthotspots.to_file('s3vt_hotspots.geojson', driver='GeoJSON')

In [20]:
bounds = list(s3vthotspots.geometry.total_bounds)
bbox = [bounds[3], bounds[0], bounds[1], bounds[2]]
bbox

[11.751792970329575,
 -171.17838656781612,
 -44.941232886534294,
 176.88213855341553]

In [22]:
# Get time bounds for DEA Hotspots query
# TODO - AHI hotspots

maxdate = s3vthotspots.date.max().to_datetime64()
mindate = s3vthotspots.date.min().to_datetime64()
to_date = dt.datetime.strptime(str(s3vthotspots.date.max().to_datetime64()), '%Y-%m-%dT%H:%M:%S.%f000') # '2018-01-01T00:00:00.000Z'

In [23]:
time_period = int(str(np.timedelta64(maxdate - mindate, 'D')).split(' ')[0])

In [70]:
for config in configuration['configurations']:
    
    hotspots_gdf = load_hotspots(filter(config['sensors']),
                                         time_period,
                                         bbox,
                                         config['max_features'], 
                                         config['min_confidence'],
                                         to_date)

INFO:root:2020-02-27 2020-03-01
INFO:root:https://hotspots.dea.ga.gov.au/geoserver/public/wfs?service=WFS&version=1.1.0&request=GetFeature&typeName=public:hotspots&outputFormat=application/json&CQL_FILTER=((sensor=%27AVHRR%27%20AND%20(product=%27SRSS%27%20OR%20product=%27GA%27))%20OR%20(sensor=%27MODIS%27%20AND%20(product=%27MOD14%27%20OR%20product=%27SRSS%27))%20OR%20(sensor=%27VIIRS%27%20AND%20(product=%27AFMOD%27%20OR%20product=%27EDR%27)))%20AND%20datetime%20%3E%20%272020-02-27%27%20AND%20datetime%20%3C%20%272020-03-01%27%20AND%20INTERSECTS(location,%20POLYGON((11.751792970329575%20-171.17838656781612,%2011.751792970329575%20176.88213855341553,%20-44.941232886534294%20176.88213855341553,%20-44.941232886534294%20-171.17838656781612,%2011.751792970329575%20-171.17838656781612)))&maxFeatures=500000&startIndex=0&sortBy=sensor%20A
INFO:root:Hotspots loaded successfully [101.06687927 -44.438      176.81         7.913     ]


In [25]:
os.remove('DEAHotspots_hotspots.geojson')
hotspots_gdf.to_file('DEAHotspots_hotspots.geojson', driver='GeoJSON')

In [73]:
hotspots_gdf.datetime.max()

Timestamp('2020-02-29 23:38:50')

In [34]:
s3vthotspots.date.max()

Timestamp('2020-03-01 15:24:53.078015')