In [None]:
import gc
import matplotlib.pyplot as plt
import seaborn
import boto3
from botocore.exceptions import ClientError
from netCDF4 import Dataset
import datetime as dt
import json
import geopandas as gpd
import pandas as pd
from shapely.geometry import Polygon
import shapely.speedups
shapely.speedups.enable()
import xmltodict
import yaml
import os
import subprocess
from datetime import date
from datetime import datetime
from datetime import timedelta
import logging
logger = logging.getLogger()
import os
import netCDF4
import numpy as np
from scipy.spatial import cKDTree
from shapely.geometry import Point, LineString
from geopy.distance import distance
from pathlib import Path
import shutil
import requests
from requests.auth import HTTPBasicAuth
import folium

In [None]:
# `logger` is the root logger created in the import cell. Set it to INFO and
# let basicConfig attach a file handler so records go to notebook.log in the
# current working directory.
logger.setLevel(level=logging.INFO)
logging.basicConfig(filename='notebook.log',level=logging.INFO)

# Open configuration file and read parameters

In [None]:
# Parse config.yaml (relative to the working directory) into `configuration`.
# NOTE(review): yaml.FullLoader accepts more than plain scalars/lists/mappings;
# if the file only contains plain data, yaml.safe_load would be enough - confirm.
with open(r'config.yaml') as file:
    configuration =  yaml.load(file, Loader=yaml.FullLoader)

In [None]:
# Copy the settings of each entry under 'configurations' into notebook-level
# variables. Every iteration overwrites the previous values, so after the loop
# the variables hold the settings of the LAST entry.
for config in configuration['configurations']:
    # Credentials passed to get_file_list()
    username = config['username']
    password = config['password']
    url = config['url']
    # Area of interest inserted into the footprint query of get_file_list()
    aoi = config['aoi']
    # AWS bucket name and credentials
    awss3bucket = config['awss3bucket']
    awskeyid = config['awskeyid']
    awskeypass = config['awskeypass']
    # DEA Hotspots WFS credentials
    hotspotslogin = config['hotspots_login']
    hotspotspassword = config['hotspots_password']


# Functions

In [None]:
def get_satellite_swaths(configuration, start, period, solar_day):
    """
    Generate the swath GeoJSON files for one solar day by running swathpredict.py.

    The files are written to ``output/<solar_day>``. If that directory already
    exists the swaths are assumed to be present and nothing is run.

    Parameters
    ----------
    configuration : str
        Value passed to swathpredict.py as ``--configuration``.
    start : str
        Value passed as ``--start``.
    period : str
        Value passed as ``--period``.
    solar_day : str
        Name of the sub-directory of ``output`` that receives the files.

    Returns
    -------
    bool
        True when the directory already existed or swathpredict.py exited with
        status 0; False when the process could not be started or exited with a
        non-zero status.
    """
    dirpath = Path('output') / solar_day

    if dirpath.exists():
        logger.info(str(solar_day)+" exists - skipping swath generation")
        return True

    dirpath.mkdir(parents=True, exist_ok=True)

    command = ['python', 'swathpredict.py', '--configuration', configuration, '--start', start, '--period', period, '--output_path', str(dirpath)]
    logger.info('Generating swaths '+str(command))

    try:
        # subprocess.call does not raise when the child fails; it returns the
        # exit status, which therefore has to be inspected explicitly.
        returncode = subprocess.call(command)
    except OSError as err:
        # Raised when the interpreter itself cannot be launched
        logger.info('Swath generation failed')
        logger.info('Could not start swathpredict.py: '+str(err))
        return False

    if returncode != 0:
        logger.info('Swath generation failed')
        logger.info('swathpredict.py exited with status '+str(returncode))
        # NOTE(review): the output directory now exists, so a later call for the
        # same solar day will skip generation - remove the directory to retry.
        return False

    return True

In [None]:
def pairwise_swath_intersect(satsensorsA, satsensorsB, solar_day):
    """
    Load the swath footprints of two groups of satellites for one solar day.

    Spaces in the satellite names are replaced by underscores and every file in
    ``output/<solar_day>`` whose name contains both the satellite name and
    'swath.geojson' is read.

    Returns a tuple (A, B) with the concatenated swath frames of each group.
    pd.concat raises ValueError when a group matches no file.
    """
    logging.info("Running intersection for "+str(satsensorsA)+' '+str(satsensorsB))

    swath_dir = Path('output') / solar_day

    def _matching_files(tokens):
        # One directory listing per satellite name, matches kept in listing order
        return [entry
                for token in tokens
                for entry in os.listdir(str(swath_dir))
                if token in entry and 'swath.geojson' in entry]

    def _read_all(names):
        return [gpd.read_file(swath_dir / name) for name in names]

    tokens_a = [name.replace(' ', '_') for name in satsensorsA]
    tokens_b = [name.replace(' ', '_') for name in satsensorsB]

    names_a = _matching_files(tokens_a)
    names_b = _matching_files(tokens_b)

    frames_a = _read_all(names_a)
    frames_b = _read_all(names_b)

    return (pd.concat(frames_a), pd.concat(frames_b))

In [None]:
def get_file_list(username, password, aoi, startrecord):
    """
    Download one page of the Sentinel-3 FRP product listing with wget.

    The JSON response is written to ``filelist.txt`` in the working directory,
    replacing any previous content. The query asks for 100 rows per request, so
    callers page through the listing by increasing ``startrecord`` in steps of 100.

    Parameters
    ----------
    username, password : str
        Hub credentials passed to wget.
    aoi : str
        Geometry text placed inside ``Intersects(...)`` in the footprint query.
    startrecord : str
        Index of the first record to return (concatenated into the URL, so it
        must already be a string).

    Returns
    -------
    bool
        True when wget exited with status 0, False when it could not be
        started or reported an error.
    """
    query_url = 'https://131.176.236.38/dhus/search?q=footprint:"Intersects('+aoi+')" AND platformname:Sentinel-3 AND producttype:SL_2_FRP___&rows=100&start='+startrecord+'&format=json'

    # NOTE(review): --no-check-certificate disables TLS verification and the
    # password is visible in the process list while wget runs - confirm this
    # is acceptable for the environment the notebook runs in.
    command = ['wget','--no-check-certificate', '--user='+username, '--password='+password, '--output-document=filelist.txt', query_url]

    try:
        # subprocess.call returns the exit status instead of raising, so a
        # failed download has to be detected from the return code.
        returncode = subprocess.call(command)
    except OSError:
        # wget is not installed or cannot be executed
        return False

    return returncode == 0

In [None]:
def filter(sensors):
    """
    Build the URL-encoded sensor/product part of a geoserver WFS CQL filter.

    Parameters
    ----------
    sensors : iterable of dict
        Each dict maps a sensor name to the list of product names wanted for
        that sensor.

    Returns
    -------
    str
        One ``(sensor='X' AND (product='A' OR product='B'))`` clause per
        sensor, joined with OR and URL-encoded (%27 is a quote, %20 a space).
        Clauses from ALL dicts in ``sensors`` are combined; previously only the
        last dict contributed. An empty ``sensors`` yields an empty string, and
        a sensor without products yields ``(sensor='X')`` rather than an
        unbalanced expression.

    Note: the name shadows the builtin ``filter``; it is kept because the
    notebook calls it under this name.
    """
    clauses = []

    for sensordict in sensors:
        for sensor, products in sensordict.items():
            if products:
                product_terms = '%20OR%20'.join(
                    'product=%27'+product+'%27' for product in products)
                clauses.append(
                    '(sensor=%27'+sensor+'%27%20AND%20('+product_terms+'))')
            else:
                clauses.append('(sensor=%27'+sensor+'%27)')

    return '%20OR%20'.join(clauses)

In [None]:
def get_polygon_from_gml(gml_dict):
    """
    Convert a GML coordinate string into a list of vertex tuples.

    ``gml_dict`` is a string of space-separated ``a,b`` pairs. Each pair is
    returned as ``(float(b), float(a))``, i.e. with the two values swapped.

    Returns a list of tuples representing the polygon for the imaging extent
    """
    split_pairs = (token.split(',') for token in gml_dict.split(" "))
    return [(float(parts[1]), float(parts[0])) for parts in split_pairs]

In [None]:
def _hotspots_wfs_url(filter_string, from_date, to_date, bbox, max_features, start_index):
    """Build the DEA Hotspots WFS GetFeature URL for one page (no credentials)."""
    y_max, x_min, y_min, x_max = bbox[0], bbox[1], bbox[2], bbox[3]
    return (
        "https://hotspots.dea.ga.gov.au/geoserver/public/wfs?service=WFS&version=1.1.0"
        "&request=GetFeature&typeName=public:hotspots&outputFormat=application/json"
        f"&CQL_FILTER=({filter_string})%20AND%20datetime%20%3E%20%27{from_date}%27"
        f"%20AND%20datetime%20%3C%20%27{to_date}%27"
        f"%20AND%20INTERSECTS(location,%20POLYGON(({y_max}%20{x_min},%20{y_max}%20{x_max},"
        f"%20{y_min}%20{x_max},%20{y_min}%20{x_min},%20{y_max}%20{x_min})))"
        f"&maxFeatures={max_features}&startIndex={start_index}&sortBy=sensor%20A"
    )


def _fetch_hotspots_page(url, login, password):
    """Request one WFS page with basic authentication and return the decoded JSON."""
    # Credentials are sent through HTTP basic auth and deliberately kept out
    # of the logged URL.
    logger.info(url)
    try:
        response = requests.get(url, auth=HTTPBasicAuth(login, password))
        response.raise_for_status()
    except requests.RequestException:
        logger.info("URL request rejected")
        raise
    return response.json()


def load_hotspots(filter_string, time_period, bbox, max_features, min_confidence, to_date, login, password):
    """
    Load DEA Hotspots from the WFS service for a bounding box, time range and
    set of sensors.

    Parameters
    ----------
    filter_string : str
        URL-encoded sensor/product CQL expression (see ``filter``).
    time_period : int
        Number of days before ``to_date`` at which the query starts.
    bbox : sequence
        ``[y_max, x_min, y_min, x_max]``.
    max_features : int
        Page size used for the paged WFS requests.
    min_confidence : number
        Hotspots with a lower confidence are dropped, unless the confidence of
        the first record is missing, in which case no filter is applied.
    to_date : datetime.datetime or None
        End of the query window; ``None`` means now.
    login, password : str
        Credentials for HTTP basic authentication.

    Returns
    -------
    geopandas.GeoDataFrame
        Columns datetime, latitude, longitude, confidence, geometry, product,
        satellite, sensor and power, sorted by datetime.

    Raises
    ------
    requests.RequestException
        When a request fails or the server answers with an HTTP error status.
    ValueError
        When the query matches no hotspots.
    """
    if to_date is None:
        to_date = dt.datetime.now()

    from_date = (to_date - dt.timedelta(days=time_period)).strftime('%Y-%m-%d')

    # trim datetime to enable WFS
    to_date = to_date.strftime('%Y-%m-%d')
    logger.info(str(from_date)+' '+str(to_date))

    # A single-feature request is enough to learn how many records match
    count_url = _hotspots_wfs_url(filter_string, from_date, to_date, bbox, 1, 0)
    totalfeatures = _fetch_hotspots_page(count_url, login, password)['totalFeatures']

    if totalfeatures == 0:
        raise ValueError('No hotspots found between '+str(from_date)+' and '+str(to_date))

    # Page through the results; range() stops before totalfeatures so no
    # empty trailing page is requested.
    gpd_list = []
    for start_index in range(0, totalfeatures, max_features):
        page_url = _hotspots_wfs_url(filter_string, from_date, to_date, bbox, max_features, start_index)
        page = _fetch_hotspots_page(page_url, login, password)
        gpd_list.append(gpd.read_file(json.dumps(page)))

    # Reset the index because the pages were concatenated
    hotspots_gdf = pd.concat(gpd_list).reset_index(drop=True)

    # TODO - improved None value handling - currently just look at first and apply that to all
    if pd.isna(hotspots_gdf['confidence'].iloc[0]):
        logger.info('Skipping confidence filter as confidence not populated')
    else:
        # Filter by confidence; copy so the column assignment below does not
        # write into a slice of the unfiltered frame
        hotspots_gdf = hotspots_gdf.loc[hotspots_gdf.confidence >= min_confidence].copy()

    # Fix datetime. Positional access is required here: after the confidence
    # filter the row labelled 0 may no longer exist.
    start_dt = hotspots_gdf['start_dt']
    if not start_dt.empty and not pd.isna(start_dt.iloc[0]):
        hotspots_gdf['datetime'] = pd.to_datetime(start_dt)
    else:
        if not start_dt.empty:
            logger.info('Start date field is not populated')
        hotspots_gdf['datetime'] = pd.to_datetime(hotspots_gdf['datetime'])

    # Extract required columns
    hotspots_gdf = hotspots_gdf.loc[:, [
            'datetime', 'latitude', 'longitude', 'confidence', 'geometry', 'product', 'satellite', 'sensor', 'power'
            ]]
    hotspots_gdf = hotspots_gdf.sort_values('datetime', ascending=True)
    logger.info('Hotspots loaded successfully '+str(hotspots_gdf.geometry.total_bounds))

    return(hotspots_gdf)

In [None]:
def solar_day_start_stop_period(longitude_east, longitude_west, solar_day):
    """
    Compute the UTC window that covers one solar day between two longitudes.

    The solar day starts at the eastern longitude and lasts 24 hours plus the
    time offset between the eastern and western longitudes (240 seconds per
    degree, truncated to whole seconds).

    Returns (start_utc, finish_utc, duration): two datetime objects and a
    timedelta that is a whole number of minutes.
    """
    seconds_per_degree = 240

    # Time offsets of the eastern and western limbs relative to UTC
    east_shift = np.timedelta64(int(longitude_east * seconds_per_degree), 's')
    west_shift = np.timedelta64(int(longitude_west * seconds_per_degree), 's')

    # One day plus the time between the two limbs
    window = np.timedelta64(1440, 'm') + abs(east_shift - west_shift)

    day_start = np.datetime64(solar_day)
    start_utc = (day_start - east_shift).astype(datetime)
    finish_utc = ((day_start + window) - east_shift).astype(datetime)

    # Duration expressed in minutes, handed back as a python timedelta
    duration = np.timedelta64(finish_utc - start_utc, 'm').astype(datetime)

    return (start_utc, finish_utc, duration)

In [None]:
# Example: UTC window of solar day 2020-05-02 between longitudes 150 (east) and 110 (west)
startutc, endutc, duration = solar_day_start_stop_period(150, 110, datetime(2020, 5, 2, 0))

In [None]:
# Display the example window computed in the previous cell
startutc, endutc, duration

In [None]:
def solar_day(utc, longitude):
    """
    Shift a UTC time by the time offset of a longitude.

    The offset is 240 seconds per degree of longitude, truncated to whole
    seconds, and is added to ``utc``.

    Returns datetime object representing solar day
    """
    seconds_per_degree = 240
    shift = np.timedelta64(int(longitude * seconds_per_degree), 's')
    local_time = np.datetime64(utc) + shift
    return local_time.astype(datetime)

In [None]:
# Check: the example UTC start time shifted to longitude 110
solar_day(startutc, 110)

In [None]:
def ckdnearest(gdA, gdB):
    """
    For every point in ``gdA`` find the nearest point in ``gdB``.

    Parameters
    ----------
    gdA, gdB : GeoDataFrame
        Frames whose ``geometry`` exposes ``x`` and ``y`` coordinates.

    Returns
    -------
    DataFrame
        One row per row of ``gdA``: the columns of ``gdA``, the columns of the
        matched ``gdB`` row prefixed with ``2_``, and ``dist``, the distance
        reported by the KD-tree in the units of the coordinates.
    """
    nA = np.array(list(zip(gdA.geometry.x, gdA.geometry.y)) )
    nB = np.array(list(zip(gdB.geometry.x, gdB.geometry.y)) )
    btree = cKDTree(nB)
    dist, idx = btree.query(nA, k=1)
    # idx holds POSITIONS within nB, so the matches must be selected with
    # iloc; loc would look the positions up as index labels and fail or pick
    # the wrong rows whenever gdB does not carry a 0..n-1 index.
    matched = gdB.iloc[idx].reset_index(drop=True).add_prefix('2_')
    gdf = pd.concat(
        [gdA.reset_index(drop=True), matched,
         pd.Series(dist, name='dist')], axis=1)
    return gdf

In [None]:
# Set the area of interest
# eastlon and westlon are also passed to solar_day_start_stop_period in the
# comparison loop further down.
westlon = 110.0
southlat = -50.0
eastlon = 160.0
northlat = -10.0
# Order is (west, south, east, north); used as the bbox= argument of gpd.read_file
bbox = (westlon, southlat, eastlon, northlat)

# Skip to "Load Hotspots to GeoPandas" if testing analytics

# Assess inventory against AWS bucket listing

In [None]:
# Get bucket listing of image granules and hotspot files

# Credentials are the values read from the last configuration entry
s3 = boto3.resource('s3', aws_access_key_id=awskeyid,
                    aws_secret_access_key=awskeypass)

s3folderlist = []
s3geojsonlist = []
# NOTE(review): the bucket name is hard-coded although `awss3bucket` is read
# from the configuration - confirm which one is intended.
s3bucket = s3.Bucket('s3vtaustralia')

for bucket_object in s3bucket.objects.all():
    # Third '/'-separated component of the key; raises IndexError for a key
    # with fewer than three components.
    s3bucketobject = str(bucket_object.key).split("/")[2]
    if '.SEN3' in s3bucketobject:
        # Only the component is kept, so it may repeat (set() is applied later)
        s3folderlist.append(s3bucketobject)
    if '.FRP.geojson' in s3bucketobject:
        # The full key is kept for hotspot files
        s3geojsonlist.append(bucket_object.key)

print(len(s3geojsonlist), "S3 Hotspot files From ESA available in AWS S3") 

In [None]:
# Number of distinct '.SEN3' names found in the bucket listing
print(len(set(s3folderlist)), "S3 Granules From ESA available in AWS S3") 

# Retrieve the file list inventory used to gather the S3 files

In [None]:
# Get File List from server
# Run this if no local inventory exists
if not os.path.exists('s3vt_inventory.json'):
    startrecord = 0

    # One decoded JSON response per line read from filelist.txt
    responselist = [] 

    # Determine number of records to retrieve
    # The return value of get_file_list is not checked; a failed download
    # shows up when filelist.txt is read or parsed.
    get_file_list(username, password, aoi, str(startrecord))
    with open('filelist.txt') as results:
        for i in results: 
            response = json.loads(i)
            responselist.append(response)

    upperlimit = int(response['feed']['opensearch:totalResults'])
    # NOTE(review): this overrides the total reported by the server with a
    # fixed 200 - looks like a testing limit, confirm before a full run.
    upperlimit = 200
    # Get the full list of records

    # startrecord is incremented BEFORE each request, so pages 100, 200, ...
    # are fetched up to and including upperlimit + 100.
    while startrecord <= upperlimit:
        startrecord = startrecord+100

        get_file_list(username, password, aoi, str(startrecord)) 
        with open('filelist.txt') as results:
            for i in results: responselist.append(json.loads(i))

            # Dump the results to an inventory file
            # The whole list is rewritten on every iteration; the file ends up
            # as one JSON array holding all responses.
            with open('s3vt_inventory.json', 'w') as f:
                json.dump(responselist, f)

# From the inventory - generate a vector footprint

In [None]:
# Read inventory to geopandas - write to geojson

# Each line of the inventory file is a JSON array of response pages. Every
# entry of every page becomes a one-row GeoDataFrame whose geometry is the
# polygon parsed from the entry's GML footprint.
with open('s3vt_inventory.json') as inventory:
    frames = []
    for p in inventory:
        pages = json.loads(p)

        # Visit each page exactly once. The previous doubly nested
        # "for page in pages" processed every page len(pages) times and so
        # duplicated every inventory record.
        for page in pages:
            try:

                for entry in page['feed']['entry']:

                    df = pd.DataFrame.from_dict(entry, orient='index')

                    # NOTE(review): assumes the GML footprint is always the third
                    # item of entry['str'] - confirm against the hub response.
                    polygon = get_polygon_from_gml(xmltodict.parse(entry['str'][2]['content'])['gml:Polygon']['gml:outerBoundaryIs']['gml:LinearRing']['gml:coordinates'])

                    df = df.transpose()
                    df['Coordinates'] = Polygon(polygon)
                    # Promote selected name/content pairs to their own columns
                    for d in entry['str']:
                        if d['name'] in ('orbitdirection', 'platformidentifier', 'filename',
                                         'instrumentshortname', 'passnumber'):
                            df[d['name']] = d['content']
                    s3vtdf = gpd.GeoDataFrame(df, geometry='Coordinates')

                    frames.append(s3vtdf)

            except KeyError:
                # Raised for pages without 'feed'/'entry' or entries without
                # the expected GML keys; the rest of that page is skipped.
                logger.info("KeyError exception for get_polygon_from_gml()")

s3vtgpd = pd.concat(frames)

# Every per-entry frame carries index 0, so renumber the rows; the inventory
# check below addresses rows by position label.
s3vtgpd = s3vtgpd.reset_index(drop=True)
# The acquisition date is the second space-separated token of the text before
# the first comma in the summary field
s3vtgpd['date'] = pd.to_datetime(s3vtgpd.summary.str.split(",", expand= True)[0].str.split(' ', expand=True)[1])
# Some fields are lists and geojson translation doesn't like it

s3vtgpd = s3vtgpd.drop(['link', 'int', 'str', 'summary'], axis=1)
s3vtgpd.to_file('s3vt_geometry.geojson', driver='GeoJSON')

# Set up for checking inventory against files on AWS

In [None]:
# Number of inventory records, used as the loop bound of the inventory check
dataframelength = len(s3vtgpd)
# Add field to enable monitoring
# All flags start at 0; the inventory check sets them to 1
s3vtgpd['hotspot'] = 0
s3vtgpd['download'] = 0
s3vtgpd['s3bucket'] = 0

s3vthostpotsgpdlist = []

# Run inventory check

In [None]:
# Check if folder already downloaded and flag in gpd
# Rows are addressed by label 0..dataframelength-1, which relies on the
# reset_index applied when s3vtgpd was built.
for i in range(dataframelength):
    if s3vtgpd.loc[i]['title']+'.SEN3' in set(s3folderlist):
        s3vtgpd.at[i, 'download'] = 1
    # NOTE(review): s3folderlist only receives names containing '.SEN3' (see the
    # bucket listing cell), so this '.FRP.geojson' test looks like it cannot
    # match; `s3hotspotsgpd` is also not defined anywhere in the visible
    # notebook and would raise NameError if the branch ran - confirm intent.
    if s3vtgpd.loc[i]['title']+'.FRP.geojson' in set(s3folderlist):
        s3vtgpd.at[i, 'hotspot'] = 1
        s3vthostpotsgpdlist.append(s3hotspotsgpd)

# Sync GeoJSON to local machine

In [None]:
# Copy all S3 geojson files locally and load to GPD
# TODO - fix the below subprocess as it doesn't seem to accept the exclude include parameters and syncs everything
#subprocess.call(['echo', 'aws', 's3', 'sync', 's3://s3vtaustralia/', '.', '--exclude', '\"*\"', '--include', '\"*.geojson\"', '--dryrun'])
# Going with this in the interim
# The files are synced into the current directory, keeping their bucket keys as
# relative paths; the next cell reads them by those keys.
!aws s3 sync s3://s3vtaustralia/ . --exclude "*" --include "*.geojson"

# Filter hotspots based on limitations stated by UCL

In [None]:
# Read every synced hotspot file restricted to the area of interest, tag it
# with satellite and sensor, and keep the rows with a positive FRP_MWIR.
frames = []
for i in s3geojsonlist:
    # `i` is a bucket key used as a path relative to the working directory
    df1 = gpd.read_file(i, bbox=bbox)
    # Any key without 'S3A' in it is labelled SENTINEL_3B
    if 'S3A' in i:
        df1['satellite'] = 'SENTINEL_3A'
    else:
        df1['satellite'] = 'SENTINEL_3B'
    df1['sensor'] = 'SLSTR'
    df2 = df1.query("FRP_MWIR>0")
    # Files without any remaining rows are left out
    if len(df2) > 0:
        frames.append(df2)
        
# pd.concat raises ValueError when no file contributed rows
s3vthotspots = pd.concat(frames)


# Cleanup

In [None]:
# Drop the references to the intermediate objects and run the garbage
# collector. The names stay defined (as None), so the cells above have to be
# re-run before these objects can be used again.
frames = None
s3geojsonlist = None
df = None
df1 = None
df2 = None
s3vtgpd = None
s3vthostpotsgpdlist = None
gc.collect()

# Convert netcdf CF time to something pandas understands

In [None]:
# The 'time' column is decoded with the units given below (microseconds since
# 2000-01-01T00:00:00Z) into python datetimes, then converted by pandas.
s3vthotspots['date'] = pd.to_datetime(netCDF4.num2date(s3vthotspots.time, units='microseconds since 2000-01-01T00:00:00Z', only_use_cftime_datetimes=False, only_use_python_datetimes=True))

# Add solar day column to enable group by function

In [None]:
# Row-wise: shift each hotspot's date by the time offset of its own longitude
s3vthotspots['solar_day'] = s3vthotspots.apply(lambda row: solar_day(row.date, row.longitude), axis = 1)

# Write S3 Hotspots to GeoJSON

In [None]:
# Persist the S3 hotspots; this file is read back in "Load Hotspots to GeoPandas"
s3vthotspots.to_file('s3vt_hotspots.geojson', driver='GeoJSON')

In [None]:
# total_bounds is (minx, miny, maxx, maxy); reorder it to
# [maxy, minx, miny, maxx], the order load_hotspots unpacks its bbox in.
bounds = list(s3vthotspots.geometry.total_bounds)
wfsbbox = [bounds[3], bounds[0], bounds[1], bounds[2]]
wfsbbox

In [None]:
#s3vthotspots = gpd.read_file('s3vt_hotspots.geojson', bbox=bbox)
#s3vthotspots['datetime'] = pd.to_datetime(s3vthotspots['date'])
#s3vthotspots['solar_day'] = pd.to_datetime(s3vthotspots['solar_day'])

In [None]:
# Get time bounds for DEA Hotspots query

maxdate = s3vthotspots.date.max().to_datetime64()
mindate = s3vthotspots.date.min().to_datetime64()
# The numpy timestamp is rendered as text and parsed back into a python
# datetime; the literal '000' after %f matches the last three of the nine
# fractional digits and only parses when they are zero.
to_date = dt.datetime.strptime(str(s3vthotspots.date.max().to_datetime64()), '%Y-%m-%dT%H:%M:%S.%f000') # '2018-01-01T00:00:00.000Z'
# Whole days between the first and last hotspot, taken from the leading number
# of the timedelta's text form
time_period = int(str(np.timedelta64(maxdate - mindate, 'D')).split(' ')[0])

# Cleanup

In [None]:
# Release the S3 hotspots frame; it is reloaded from s3vt_hotspots.geojson later
s3vthotspots = None
gc.collect()

In [None]:
# For testing DEA Hotspots load
#hotspots_gdf = load_hotspots('(sensor=%27AVHRR%27%20AND%20(product=%27SRSS%27%20OR%20product=%27GA%27))%20OR%20(sensor=%27MODIS%27%20AND%20(product=%27MOD14%27%20OR%20product=%27SRSS%27))%20OR%20(sensor=%27VIIRS%27%20AND%20(product=%27AFMOD%27%20OR%20product=%27AFIMG%27%20OR%20product=%27EDR%27%20OR%20product=%27SRSS%27))',88, [12.234104969111854, -179.98885754557182, -46.4811018826465, 179.9494145456714], 300000, 0 , dt.datetime.strptime('2020-05-01 02:23:39.493931', '%Y-%m-%d %H:%M:%S.%f'), hotspotslogin, hotspotspassword)

In [None]:
# Query DEA Hotspots once per configuration entry, over the spatial and
# temporal extent of the S3 hotspots. hotspots_gdf is overwritten on every
# iteration, so only the result of the last entry is kept.
for config in configuration['configurations']:
    
    hotspots_gdf = load_hotspots(filter(config['sensors']),
                                         time_period,
                                         wfsbbox,
                                         config['max_features'], 
                                         config['min_confidence'],
                                         to_date,
                                         config['hotspots_login'],
                                         config['hotspots_password'])

In [None]:
# Row-wise: shift each hotspot's datetime by the time offset of its own longitude
hotspots_gdf['solar_day'] = hotspots_gdf.apply(lambda row: solar_day(row.datetime, row.longitude), axis = 1)

In [None]:
# Delete any previous export before writing the new one.
# Only file-system errors (missing file, permissions) are tolerated; a bare
# except would also swallow KeyboardInterrupt and programming errors.
try:
    os.remove('DEAHotspots_hotspots.geojson')
except OSError:
    logger.info('DEA Hotspots geojson does not exist or cannot be deleted')

hotspots_gdf.to_file('DEAHotspots_hotspots.geojson', driver='GeoJSON')

# Load Hotspots to GeoPandas

In [None]:
# Reload the DEA hotspots export, restricted to the area of interest, and
# convert the two time columns back to pandas datetimes.
hotspots_gdf = gpd.read_file('DEAHotspots_hotspots.geojson', bbox=bbox)
hotspots_gdf['datetime'] = pd.to_datetime(hotspots_gdf['datetime'])
hotspots_gdf['solar_day'] = pd.to_datetime(hotspots_gdf['solar_day'])

In [None]:
# Reload the S3 hotspots export. A 'datetime' column is derived from 'date' so
# that both hotspot frames share the same time column name.
s3vthotspots = gpd.read_file('s3vt_hotspots.geojson', bbox=bbox)
s3vthotspots['datetime'] = pd.to_datetime(s3vthotspots['date'])
s3vthotspots['solar_day'] = pd.to_datetime(s3vthotspots['solar_day'])

# Clean up S3 Hotspots to allow single GeoDataFrame

In [None]:
# Rename in place so the column matches the 'power' column of the DEA hotspots
s3vthotspots.rename(columns={'F1_Fire_pixel_radiance':'power'}, inplace=True)

In [None]:
# Drop the listed S3 columns before the two frames are concatenated. 'date' is
# dropped too because 'datetime' now carries the same values. drop() raises
# KeyError if any listed column is missing.
s3vthotspots = s3vthotspots.drop(['FRP_MWIR', 'FRP_SWIR', 'FRP_uncertainty_MWIR',
       'FRP_uncertainty_SWIR', 'Glint_angle', 'IFOV_area', 'Radiance_window',
       'S7_Fire_pixel_radiance', 'TCWV', 'classification',  'i',
       'j', 'n_SWIR_fire', 'n_cloud', 'n_water',
       'n_window', 'time', 'transmittance_MWIR', 'transmittance_SWIR',
       'used_channel', 'date'], axis=1)

In [None]:
# Source label '<satellite>_<sensor>_ESA' for the S3 hotspots
s3vthotspots['satellite_sensor_product'] = s3vthotspots['satellite']+'_'+s3vthotspots['sensor']+'_ESA'

In [None]:
# Source label '<satellite>_<sensor>_<product>' for the DEA hotspots
hotspots_gdf['satellite_sensor_product'] = hotspots_gdf['satellite']+'_'+hotspots_gdf['sensor']+'_'+hotspots_gdf['product']

In [None]:
# 'product' is now part of satellite_sensor_product.
# Not idempotent: re-running this cell raises KeyError because the column is gone.
hotspots_gdf = hotspots_gdf.drop(['product'], axis=1)

In [None]:
# Stack DEA and S3 hotspots into one frame.
# Not idempotent: re-running this cell appends the S3 rows again.
hotspots_gdf = pd.concat([hotspots_gdf, s3vthotspots])

In [None]:
# Empty the S3 geodataframe object
# The rows now live in hotspots_gdf; the name stays defined as None
s3vthotspots = None
gc.collect()

In [None]:
# Concatenating doesn't update the index automatically
# Renumber the rows in place and discard the old labels
hotspots_gdf.reset_index(drop=True, inplace=True)

In [None]:
# Map plot hotspots
# One colour per source label, with the legend in the upper right corner
hotspots_gdf.plot(column='satellite_sensor_product', legend=True, legend_kwds={'loc': 'upper right'}, figsize=(20, 20))

# Index by solar day to enable groupby

In [None]:
# Inspect the index before it is replaced by the solar day
hotspots_gdf.index

In [None]:
# Use the solar day as a DatetimeIndex so rows can be sliced by date label;
# the 'solar_day' column itself is kept.
hotspots_gdf =  hotspots_gdf.set_index(pd.DatetimeIndex(hotspots_gdf.solar_day.values))

In [None]:
# Number of hotspots with a source label, before the temporal subset
hotspots_gdf['satellite_sensor_product'].count()

# Temporal subset (by solar day) to enable rapid testing

In [None]:
# Solar-day window used for the comparison
start_date = '2019-11-01'
end_date = '2020-05-01'
# Date part of the output file name; it repeats the two dates above by hand
comparison_prefix = '20191101_20200501'
# Label slice on the solar-day index.
# NOTE(review): hotspots_gdf is overwritten, so widening the window requires
# re-running the notebook from the load cells.
hotspots_gdf = hotspots_gdf.loc[start_date:end_date]

In [None]:
# Earliest and latest acquisition datetime within the selected solar days
hotspots_gdf.loc[start_date:end_date]['datetime'].min(), hotspots_gdf.loc[start_date:end_date]['datetime'].max()

In [None]:
# Number of hotspots with a source label, after the temporal subset
hotspots_gdf['satellite_sensor_product'].count()

# Run comparison matrix

In [None]:
# Comparison sets
# Distinct source labels in ascending order; the slice shows the labels that
# the comparison loop below uses as product A.
setA = set(hotspots_gdf['satellite_sensor_product'])
setAlist = sorted(setA)
setAlist[7:9]

In [None]:
# Prefix the output name with the labels being compared.
# Not idempotent: re-running this cell prepends the labels again.
comparison_prefix = '-'.join(setAlist[7:9])+'-'+comparison_prefix
    

In [None]:
# Display the resulting output file prefix
comparison_prefix

In [None]:
# Compare all hotspot sources to each other
#
# For every (productA, productB) pair and every solar day present in both, the
# hotspots of A are matched to their nearest hotspot of B. Unless one of the
# labels equals 'AHI' or 'INS1', both sets are first restricted to the area
# imaged by both satellites on that day.

appended_dataframe = []
satellite_sensor_product_intersections = {}

all_products = set(hotspots_gdf['satellite_sensor_product'])

#for productA in set(hotspots_gdf['satellite_sensor_product']):
for productA in setAlist[7:9]:
    gdfA = hotspots_gdf[(hotspots_gdf['satellite_sensor_product'] == productA)]

    for productB in all_products:

        gdfB = hotspots_gdf[(hotspots_gdf['satellite_sensor_product'] == productB)]

        # Group product B by solar day once, instead of resampling it again
        # for every solar day of product A
        Bgroups = {Bname: Bgroup for Bname, Bgroup in gdfB.resample('D', on='solar_day')}

        # For each solar day group in gdfA
        for Aname, Agroup in gdfA.resample('D', on='solar_day'):

            # Do where the solar days are the same in gdfA and B
            Bgroup = Bgroups.get(Aname)
            if Bgroup is None:
                continue

            minutctime, maxutctime, deltautctime = solar_day_start_stop_period(eastlon, westlon, Aname)

            logger.info(productA+' '+productB)

            satellite_sensor_product_intersections['solar_day'] = Aname

            # Log start, finish and duration (the start used to be logged twice)
            logger.info(str(Aname)+' '+str(minutctime)+' '+str(maxutctime)+' '+str(deltautctime))

            # Generate the GeoJSON for each satellite in s3vtconfig.yaml
            get_satellite_swaths('s3vtconfig.yaml', minutctime.strftime("%Y-%m-%dT%H:%M:%SZ"), str(int(deltautctime.total_seconds()/60)), str(Aname.date()))

            # Geostationary satellites need an exception
            # NOTE(review): this tests whether a label EQUALS 'AHI' or 'INS1';
            # the labels are '<satellite>_<sensor>_<product>' strings, so a
            # substring test may have been intended - confirm.
            if not (('AHI' in [productA, productB]) or ('INS1' in [productA, productB])):

                # Include a try except to counteract failures where swath intersect fails
                try:

                    # Get geometries for satellite sensors in gpdA and gpdB
                    gpd1, gpd2 = pairwise_swath_intersect(set(Agroup['satellite']), set(Bgroup['satellite']), str(Aname.date()))

                    # TODO - limit the swath geometries used for intersection

                    # Union before intersect
                    gpd1 = gpd1.unary_union
                    gpd2 = gpd2.unary_union

                    # Intersect geometries
                    intersection = gpd1.intersection(gpd2)
                    logger.info(str(intersection))

                    if intersection is None:
                        logger.info("Intersection is None")
                    else:
                        logger.info("Intersection successful")

                    # Use intersection results to subset points (compare common imaged area)
                    logger.info("Before intersection "+str(Aname)+' '+str(Agroup['satellite_sensor_product'].count())+' '+str(Bgroup['satellite_sensor_product'].count()))

                    Agroup = Agroup.loc[Agroup.within(intersection)].reset_index(drop=True)
                    Bgroup = Bgroup.loc[Bgroup.within(intersection)].reset_index(drop=True)

                    logger.info("After intersection "+str(Aname)+' '+str(Agroup['satellite_sensor_product'].count())+' '+str(Bgroup['satellite_sensor_product'].count()))

                    if (Agroup['solar_day'].count() == 0) or (Bgroup['solar_day'].count() == 0):
                        # Nothing to match for this day; move on instead of
                        # letting ckdnearest fail on empty input
                        logger.info("Nothing to input to ckdnearest")
                        continue

                    per_solarday_nearest_hotspots = ckdnearest(Agroup , Bgroup)

                    appended_dataframe.append(per_solarday_nearest_hotspots)

                except Exception as err:
                    # Best effort: a failed day is skipped, but the reason is recorded
                    logger.info('Skipping '+productA+' '+productB+' '+str(Aname)+': '+repr(err))
            else:
                # Himawari AHI or INS1 geostationary case
                # A better approach here is to check if either has a swath available
                # If not - defer to the intersection of the one with a geometry
                # TODO - improve for Himawari
                try:
                    # reset_index(inplace=True) returns None, which made this
                    # call fail every time; pass the re-indexed frames instead.
                    per_solarday_nearest_hotspots = ckdnearest(Agroup.reset_index(drop=True), Bgroup.reset_index(drop=True))
                    appended_dataframe.append(per_solarday_nearest_hotspots)
                except Exception as err:
                    logger.info('Skipping '+productA+' '+productB+' '+str(Aname)+': '+repr(err))

# pd.concat raises ValueError when no day produced any match
nearest_points = pd.concat(appended_dataframe)
appended_dataframe = None

# Add metres distance between two points
nearest_points['dist_m'] = nearest_points.apply(lambda row: distance((row.latitude, row.longitude),(row['2_latitude'], row['2_longitude'])).meters, axis = 1)
# Add time delta between points
nearest_points['timedelta'] = (abs(nearest_points['datetime'] - nearest_points['2_datetime']))
# Constant column that the results pivot tables aggregate over
nearest_points['count'] = 1

In [None]:
# List the columns of the matched-pairs frame
nearest_points.keys()

In [None]:
# Write the matched pairs to '<comparison_prefix>-nearest_points.geojson'
output = comparison_prefix+'-'+'nearest_points.geojson'
nearest_points.to_file(output, driver='GeoJSON')

In [None]:
# Index the matches by solar day, in place.
# Not idempotent: re-running raises KeyError because the column has become the index.
nearest_points.set_index('solar_day', inplace=True)

In [None]:
# Vector layer drawn as an outline in the debugging plot below
australia = gpd.GeoDataFrame.from_file('vectors/australia.geojson')

In [None]:
# DEBUGGING For testing intersect results for swath and hotspots
#
# Plots the swaths of two satellites for one solar day, their hotspots and the
# area the two swath sets have in common.

satelliteA = 'SENTINEL_3B' # NOAA 19, NOAA 20, TERRA, AQUA, SENTINEL_3A, SENTINEL_3B
satelliteB = 'NOAA 19'
solar_date = '2020-01-04'
# Requires the swath files of that day to exist under output/<solar_date>
gpd1, gpd2 = pairwise_swath_intersect([satelliteA], [satelliteB], solar_date)

# NOTE: this overwrites the start_date/end_date set in the temporal subset cell
start_date = solar_date
end_date = solar_date
# uncomment below to examine data prior to intersect
hotspots_sample = hotspots_gdf.loc[start_date:end_date]

# uncomment below to examine data post intersect
#nearest_sample = nearest_points[(nearest_points['satellite'] == satelliteA) & (nearest_points['2_satellite'] == satelliteB) ]
#nearest_sample = pd.concat([nearest_sample, nearest_points[(nearest_points['satellite'] == satelliteB) & (nearest_points['2_satellite'] == satelliteA) ]])
#hotspots_sample = nearest_sample.loc[start_date:end_date]

fig, ax = plt.subplots(figsize=(15, 15))
gpd1.plot(ax=ax, facecolor='red');
gpd2.plot(ax=ax, color='green');
hotspots_sample[hotspots_sample['satellite'] == satelliteA].plot(ax=ax, color='tomato', markersize=2)
hotspots_sample[hotspots_sample['satellite'] == satelliteB].plot(ax=ax, color='lime', markersize=2)

gpd1int = gpd1.unary_union
gpd2int = gpd2.unary_union

intersection = gpd1int.intersection(gpd2int)

#intersection = intersection[~intersection.is_empty]
australia.plot(ax=ax, facecolor="None", edgecolor='white')
#gpd.GeoDataFrame(intersection).rename(columns={0:'geometry'}).set_geometry('geometry').plot(ax=ax, facecolor="None", edgecolor='black', hatch="///")
gpd.GeoDataFrame(gpd.GeoSeries(intersection)).rename(columns={0:'geometry'}).set_geometry('geometry').plot(ax=ax, facecolor="None", edgecolor='gray', hatch="///")
# Prepare for labels on GPD1
gpd1['coords'] = gpd1['geometry'].apply(lambda x: x.representative_point().coords[:])
gpd1['coords'] = [coords[0] for coords in gpd1['coords']]
for idx, row in gpd1.iterrows():
    # The label is passed positionally: the keyword was renamed from `s` to
    # `text` in Matplotlib 3.3, and the positional form works with both.
    plt.annotate(row['Transit time                 :'], xy=row['coords'], horizontalalignment='center')
# Plot it
plt.tight_layout()


In [None]:
# Plot hotspot acquisition times against swath transit times to confirm they line up
# (not every swath will have hotspots - there are none over the sea, etc.)
transit_col = 'Transit time                 :'
gpd1[transit_col] = pd.to_datetime(gpd1[transit_col])

# Hotspots from satellite A and the satellite A swaths, each in chronological order
df = hotspots_sample[hotspots_sample['satellite'] == satelliteA].sort_values('datetime')
dfswath = gpd1.sort_values(transit_col)

# Time is plotted against itself, so both series fall on the diagonal
hotspot_times = df['datetime']
transit_times = dfswath[transit_col]
plt.plot(hotspot_times, hotspot_times, '*', color='blue', markersize=10)
plt.plot(transit_times, transit_times, 'o', color='red', markersize=5)
plt.xticks(rotation='vertical')

In [None]:
# Number of non-missing entries in the 'geometry' column of the matched pairs
nearest_points['geometry'].count()

# Results

In [None]:
# Count of hotspot matches < 2000m
# Rows are the '2_' (matched) product and columns the primary product - the
# same orientation as `denominator`. The table was previously transposed, which
# would misalign labels if it were subtracted from or divided by `denominator`.
numerator = pd.pivot_table(
    nearest_points[nearest_points['dist_m'] < 2000],
    values='count',
    index=['2_satellite_sensor_product'],
    columns=['satellite_sensor_product'],
    aggfunc={'count': len},
)

In [None]:
# Total number of matches for every product pairing, whatever the distance:
# sums the per-row 'count' of 1, with the '2_' product on the rows
denominator = pd.pivot_table(
    nearest_points,
    values='count',
    index=['2_satellite_sensor_product'],
    columns=['satellite_sensor_product'],
    aggfunc={'count': np.sum},
)

In [None]:
# Total-match matrix: general number format, shaded with a light-to-blue gradient
cm = seaborn.light_palette("blue", as_cmap=True)
denominatortable = denominator.style.format("{:g}")
s = denominatortable.background_gradient(cmap=cm)
# Bare last expression so the notebook renders the styled table
s

In [None]:
# Write the styled table currently held in `s` to '<comparison_prefix>-matches_count.html'
output = comparison_prefix+'-'+"matches_count.html"
# Styler.render() was removed in pandas 2.0; Styler.to_html() (pandas >= 1.3) replaces it
render_html = s.to_html if hasattr(s, "to_html") else s.render
# Render before opening so a rendering failure cannot leave a truncated file behind
content = render_html()
# The with-block closes the file even if the write raises
with open(output, "w") as html:
    html.write(content)

In [None]:
# Matches closer than 2000m, counted for every product pairing
# ('2_' product on the rows, primary product on the columns)
within_2000m = nearest_points['dist_m'] < 2000
numerator = pd.pivot_table(
    nearest_points[within_2000m],
    values='count',
    index=['2_satellite_sensor_product'],
    columns=['satellite_sensor_product'],
    aggfunc={'count': len},
)

In [None]:
# Under-2000m match matrix: general number format, shaded with a light-to-red gradient
cm = seaborn.light_palette("red", as_cmap=True)
numeratortable = numerator.style.format("{:g}")
s = numeratortable.background_gradient(cmap=cm)
# Bare last expression so the notebook renders the styled table
s

In [None]:
# Write the styled table currently held in `s` to '<comparison_prefix>-matches_2000m.html'
output = comparison_prefix+'-'+"matches_2000m.html"
# Styler.render() was removed in pandas 2.0; Styler.to_html() (pandas >= 1.3) replaces it
render_html = s.to_html if hasattr(s, "to_html") else s.render
# Render before opening so a rendering failure cannot leave a truncated file behind
content = render_html()
# The with-block closes the file even if the write raises
with open(output, "w") as html:
    html.write(content)

In [None]:
# Matches that are NOT closer than 2000m: total matches minus matches under 2000m.
# Pairings missing from `numerator` come out as NaN, not as the full total.
count_gap = denominator - numerator
difference = count_gap.style.format("{:g}")
# Shade with a light-to-red gradient
cm = seaborn.light_palette("red", as_cmap=True)
s = difference.background_gradient(cmap=cm)
# Bare last expression so the notebook renders the styled table
s

In [None]:
# Write the styled table currently held in `s` to '<comparison_prefix>-count_difference.html'
output = comparison_prefix+'-'+"count_difference.html"
# Styler.render() was removed in pandas 2.0; Styler.to_html() (pandas >= 1.3) replaces it
render_html = s.to_html if hasattr(s, "to_html") else s.render
# Render before opening so a rendering failure cannot leave a truncated file behind
content = render_html()
# The with-block closes the file even if the write raises
with open(output, "w") as html:
    html.write(content)

In [None]:
# Share of all matches that are closer than 2000m, shown as whole-number percentages
match_ratio = numerator.div(denominator)
percentage = match_ratio.style.format("{:.0%}")

In [None]:
# Percentage matrix shaded with a light-to-green gradient
cm = seaborn.light_palette("green", as_cmap=True)
s = percentage.background_gradient(cmap=cm)
# Bare last expression so the notebook renders the styled table
s

In [None]:
# Write the styled table currently held in `s` to '<comparison_prefix>-percentage.html'
output = comparison_prefix+'-'+"percentage.html"
# Styler.render() was removed in pandas 2.0; Styler.to_html() (pandas >= 1.3) replaces it
render_html = s.to_html if hasattr(s, "to_html") else s.render
# Render before opening so a rendering failure cannot leave a truncated file behind
content = render_html()
# The with-block closes the file even if the write raises
with open(output, "w") as html:
    html.write(content)

In [None]:
# Maximum time between matched points < 2000m
# Rows are the '2_' (matched) product and columns the primary product, matching
# `timemin` and the count matrices; this table was previously transposed
# relative to all of them, which made the exported reports hard to compare.
timemax = pd.pivot_table(
    nearest_points[nearest_points['dist_m'] < 2000],
    values='timedelta',
    index=['2_satellite_sensor_product'],
    columns=['satellite_sensor_product'],
    aggfunc={'timedelta': np.max},
)
# Set seaborn styling for matrix
timemaxtable = timemax.style.format("{:}")
cm = seaborn.light_palette("gray", as_cmap=True)
s = timemaxtable.background_gradient(cmap=cm)
# Bare last expression so the notebook renders the styled table
s

In [None]:
# Write the styled table currently held in `s` to '<comparison_prefix>-max_time_matched_points.html'
output = comparison_prefix+'-'+"max_time_matched_points.html"
# Styler.render() was removed in pandas 2.0; Styler.to_html() (pandas >= 1.3) replaces it.
# The former `annot=True` argument is dropped: it is not a Styler rendering
# option, so it was only forwarded to the template as an unused variable.
render_html = s.to_html if hasattr(s, "to_html") else s.render
# Render before opening so a rendering failure cannot leave a truncated file behind
content = render_html()
# The with-block closes the file even if the write raises
with open(output, "w") as html:
    html.write(content)

In [None]:
# Smallest time difference among matches closer than 2000m, per product pairing
within_2000m = nearest_points['dist_m'] < 2000
timemin = pd.pivot_table(
    nearest_points[within_2000m],
    values='timedelta',
    index=['2_satellite_sensor_product'],
    columns=['satellite_sensor_product'],
    aggfunc={'timedelta': np.min},
)

# Default value formatting, shaded with a light-to-purple gradient
cm = seaborn.light_palette("purple", as_cmap=True)
timemintable = timemin.style.format("{:}")
s = timemintable.background_gradient(cmap=cm)
# Bare last expression so the notebook renders the styled table
s

In [None]:
# Write the styled table currently held in `s` to '<comparison_prefix>-min_time_matched_points.html'
output = comparison_prefix+'-'+"min_time_matched_points.html"
# Styler.render() was removed in pandas 2.0; Styler.to_html() (pandas >= 1.3) replaces it
render_html = s.to_html if hasattr(s, "to_html") else s.render
# Render before opening so a rendering failure cannot leave a truncated file behind
content = render_html()
# The with-block closes the file even if the write raises
with open(output, "w") as html:
    html.write(content)

In [None]:
# Mean separation in metres among matches closer than 2000m, per product pairing
within_2000m = nearest_points['dist_m'] < 2000
averagedist = pd.pivot_table(
    nearest_points[within_2000m],
    values='dist_m',
    index=['2_satellite_sensor_product'],
    columns=['satellite_sensor_product'],
    aggfunc={'dist_m': np.mean},
)
# Default value formatting, shaded with a light-to-olive gradient
cm = seaborn.light_palette("olive", as_cmap=True)
averagedisttable = averagedist.style.format("{:}")
s = averagedisttable.background_gradient(cmap=cm)
# Bare last expression so the notebook renders the styled table
s

In [None]:
# Write the styled table currently held in `s` to '<comparison_prefix>-avg_distance_2000m.html'
output = comparison_prefix+'-'+"avg_distance_2000m.html"
# Styler.render() was removed in pandas 2.0; Styler.to_html() (pandas >= 1.3) replaces it
render_html = s.to_html if hasattr(s, "to_html") else s.render
# Render before opening so a rendering failure cannot leave a truncated file behind
content = render_html()
# The with-block closes the file even if the write raises
with open(output, "w") as html:
    html.write(content)

In [None]:
# Done - TODO polygon intersection (using satellite footprint tool and S3 geojson)
# TODO match available Landsat and Sentinel 2 datasets with hotspots
# TODO Use WCS to interact with GSKY for DEA data queries - https://gsky.readthedocs.io/en/latest/_notebook/Notebook_GSKY_WCS.html
# TODO run fire detection on Landsat and Sentinel 2 - update attributes
# Done - TODO rerun the nearest test with S3 as the primary, as well as DEA Hotspots.
# Done - TODO add Himawari Hotspots from SRSS and WFABBA.(Done - see config.yaml, added password and request handling, concatenated GPD added)
# Done - TODO add persistent hotspots and compare frequency of detection
# TODO - get feedback on the length of day between midnight on the east side and midnight on the west side

In [None]:
# Questions to answer
# How to define coincidence?
# How to constrain results - based on confidence? and minimum allowable radius?
# Perhaps groupby time should be a moving window of two days or only look at matching hotspots first detected early in the day?

In [None]:
# TODO - Add moving window function, i.e. shift the solar day by x hours forward or back and redo the analysis
# Done TODO - Add time delta for matches as well - average time delta might be a useful statistic
# Done TODO - Add product group attribute to DEA dataframe
# Done TODO roll this up to the nearest comparison to simplify product handling

In [None]:
# Dump the full matched-pairs table to CSV in the working directory.
# NOTE(review): unlike the other outputs this file name does not include
# comparison_prefix, so each run overwrites the previous one - confirm intended.
nearest_points.to_csv('nearest_points.csv')

In [None]:
# TODO - improve Results presentation
# Plot time against count of matches
# Plot vectors from one hotspot source to its match
# TODO add vectors linking spots to their nearest

In [None]:
# Persistent hotspot locations, read from the 'known_non_FHS' shapefile
persistent_hotspots = gpd.read_file('vectors/known_non_FHS.shp')

In [None]:
# TODO match DEA Hotspots to persistent hotspots
# TODO match S3VT Hotspots to persistent hotspots
# TODO how often do matched hotspots match with persistent hotspots
# TODO - improved logging

In [None]:
# Pair every hotspot with a persistent hotspot using ckdnearest.
# NOTE(review): presumably the nearest one, with the persistent hotspot's
# attributes prefixed '2_' (later cells read '2_Latitude', '2_Longitude',
# '2_Comment' and '2_geometry') - confirm against ckdnearest.
nearest_persistent = ckdnearest( hotspots_gdf, persistent_hotspots)

In [None]:
# Column of ones so the pivot tables below can count matches by summing it
nearest_persistent['count'] = 1

In [None]:
# Separation in metres between each hotspot and its paired persistent hotspot,
# computed row by row from the two latitude/longitude pairs
nearest_persistent['dist_m'] = nearest_persistent.apply(
    lambda row: distance(
        (row.latitude, row.longitude),
        (row['2_Latitude'], row['2_Longitude']),
    ).meters,
    axis=1,
)

In [None]:
# Number of hotspots paired with each persistent hotspot (rows: its '2_Comment'),
# broken down by satellite/sensor/product (columns)
persistentcount = pd.pivot_table(
    nearest_persistent,
    values='count',
    index=['2_Comment'],
    columns=['satellite_sensor_product'],
    aggfunc={'count': np.sum},
)
# General number format, shaded with a light-to-orange gradient
cm = seaborn.light_palette("orange", as_cmap=True)
persistentcounttable = persistentcount.style.format("{:g}")
s = persistentcounttable.background_gradient(cmap=cm)
# Bare last expression so the notebook renders the styled table
s

In [None]:
# Write the styled table currently held in `s` to '<comparison_prefix>-persistent_matches.html'
output = comparison_prefix+'-'+"persistent_matches.html"
# Styler.render() was removed in pandas 2.0; Styler.to_html() (pandas >= 1.3) replaces it
render_html = s.to_html if hasattr(s, "to_html") else s.render
# Render before opening so a rendering failure cannot leave a truncated file behind
content = render_html()
# The with-block closes the file even if the write raises
with open(output, "w") as html:
    html.write(content)

In [None]:
# Same count as above, restricted to hotspots closer than 2000m to their persistent hotspot
within_2000m = nearest_persistent['dist_m'] < 2000
persistentcountm = pd.pivot_table(
    nearest_persistent[within_2000m],
    values='count',
    index=['2_Comment'],
    columns=['satellite_sensor_product'],
    aggfunc={'count': np.sum},
)
# General number format, shaded with a light-to-blue gradient
cm = seaborn.light_palette("blue", as_cmap=True)
persistentcountmtable = persistentcountm.style.format("{:g}")
s = persistentcountmtable.background_gradient(cmap=cm)
# Bare last expression so the notebook renders the styled table
s

In [None]:
# Write the styled table currently held in `s` to '<comparison_prefix>-peristent_matches_2000m.html'
# NOTE(review): "peristent" in the file name looks like a typo for "persistent";
# kept as-is because anything consuming the file may rely on the current name.
output = comparison_prefix+'-'+"peristent_matches_2000m.html"
# Styler.render() was removed in pandas 2.0; Styler.to_html() (pandas >= 1.3) replaces it
render_html = s.to_html if hasattr(s, "to_html") else s.render
# Render before opening so a rendering failure cannot leave a truncated file behind
content = render_html()
# The with-block closes the file even if the write raises
with open(output, "w") as html:
    html.write(content)

In [None]:
# Straight line from each hotspot to its paired persistent hotspot.
# The LineString is stored directly. It used to be serialised to WKT and parsed
# back with shapely.wkt.loads, which yields an equivalent geometry but depends on
# the shapely.wkt submodule, which this notebook never imports explicitly.
nearest_persistent['line_geometry'] = nearest_persistent.apply(
    lambda row: LineString([row.geometry, row['2_geometry']]),
    axis=1,
)

In [None]:
# Make the connecting lines the active geometry so that `.geometry` (and
# therefore plotting) uses them instead of the hotspot points
nearest_persistent = nearest_persistent.set_geometry('line_geometry')

In [None]:
# Draw the active geometry (the hotspot-to-persistent-hotspot lines) on a 20x20 inch figure
nearest_persistent.geometry.plot(figsize=(20, 20))

In [None]:
# Reproject the persistent hotspot geometries to EPSG:4326, then serialise them
# to a GeoJSON string for the folium map
persistent_geoms_4326 = persistent_hotspots.geometry.to_crs(epsg='4326')
gjson = persistent_geoms_4326.to_json()

In [None]:
# Interactive map starting at [-26, 132], zoom level 4, on the 'Stamen Terrain' basemap
mapa = folium.Map(location=[-26, 132], zoom_start=4, tiles='Stamen Terrain')

# Overlay the persistent hotspots from the GeoJSON string built in the previous cell
points = folium.features.GeoJson(data=gjson)
mapa.add_child(points)

# Bare last expression so the notebook renders the map
mapa

# Discussion

# Conclusion