In [None]:
import os
import sys
import datetime
from pathlib import Path
import xml.etree.ElementTree as ET
import logging
import uuid
import time
import shapely
import pandas as pd
pd.options.plotting.backend = "plotly"
import geopandas as gpd
import seaborn
import dask
import numpy as np
import dask.dataframe as dd
from dask.distributed import Client

sys.path.insert(0, '..')
import src.hotspot_utils as util 
import src.process_nearest_hotspots as nearest_process
import src.xml_util as xutil

In [None]:
# Working directory for the nearest-hotspot CSV files and the result tables.
outdir = Path("/home/jovyan/s3vt_dask/s3vtdata/workdir_test1")

In [None]:
# Emit INFO-and-above records with a timestamp, level and logger name prefix.
logging.basicConfig(
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
)
# Module-level logger for this notebook.
_LOG = logging.getLogger(__name__)

In [None]:
# Start a Dask distributed client for the processing below.
# Keep n_workers equal to or less than the number of cores.
# To visualise the Dask status, append /user/<username>/proxy/8787/status to the hub URL.
# Alternative configurations tried previously:
#client = Client(memory_limit='7GB', n_workers=8, processes=False ) - no S3B low memory use
#client = Client(memory_limit='7GB', n_workers=8, threads_per_worker=2 )
client = Client( n_workers=4 )
# Last expression of the cell: displays the client summary.
client

# Processing Parameter used in Sub-setting Spatial Extent and Temporal Range for Area of Interest
##### The FRP data from NASA, ESA, EUMETSAT and Landgate are merged and subsetted, and the nearest-hotspots csv files are generated based on the parameters in `processing_parameters`.
##### The parameter `chunks` controls how the FRP data is split into blocks to enable multi-processing. If you encounter memory issues, increase the number.
##### The `start_time` and `end_time` can be used to subset for solar_day (3:00-22:00), solar_night (22:00-03:00 with 12 hours offset) and solar_all(0:00-24:00) hours.

In [None]:
# Keyword arguments passed to nearest_process.process_nearest_points in the processing cell.
processing_parameters = {
    # Input FRP hotspot sources and the Sentinel-3 swath file (GeoJSON on S3).
    "nasa_frp": "s3://s3vtaustralia/nasa_hotspots_gdf.geojson",
    "esa_frp": "s3://s3vtaustralia/s3vt_hotspots.geojson",
    "eumetsat_frp": "s3://s3vtaustralia/s3vt_eumetsat_hotspots.geojson",
    "landgate_frp": "s3://s3vtaustralia/landgate_hotspots_gdf.geojson",
    "sentinel3_swath_geojson": "s3://s3vtaustralia/sentinel3_swath_gdfs.geojson",
    "dea_frp": None,
    # Spatial extent of the area of interest. The trailing values are alternative
    # bounds; they match the NSW case-study window used at the end of the notebook.
    "lon_west": 113.0, #147.0,
    "lat_south": -44, #-38.0,
    "lon_east": 154.0,
    "lat_north": -10, #-27.,
    # Temporal range (trailing value: alternative start date).
    "start_date":  "2020-02-02", #"2019-11-01",
    "end_date": "2020-10-08",
    # Time-of-day window used to subset for solar day / solar night / all hours.
    "start_time": "20:00",
    "end_time": "03:00",
    # Blocking of the FRP data for multi-processing; increase on memory issues.
    "chunks": 250,
    # Also passed to util.csv_to_dataframe when the csv files are loaded.
    "compare_field": "solar_night",
    "swath_config_file": Path("/home/jovyan/s3vt_dask/s3vtdata/configs/s3vtconfig.yaml"),
    "outdir": outdir,
    # NOTE(review): the meaning of `test` is defined in process_nearest_points — confirm there.
    "test": False
}

In [None]:
# Generates the nearest-hotspot .csv files. Skip this cell if the .csv files already exist:
# it takes around 5-6 hours in this sandbox environment (2 cores, 16 GB RAM).
# The return value is not used by the later cells, which list the csv files from `outdir` instead.
nearest_hotspots_product_files = nearest_process.process_nearest_points(**processing_parameters)

## Nearest Hotspots DataFrame merged from nearest hotspots csv files

In [None]:
# Directory where the nearest-hotspot csv files are stored.
csv_directory = outdir
print(csv_directory)
# Collect every file whose name starts with `nearest_points` and ends with `csv`.
# The listing is sorted because directory iteration order is arbitrary, which
# would otherwise make the order of the loaded files vary between runs.
nearest_hotspots_csv_files = sorted(csv_directory.glob("nearest_points*csv"))

In [None]:
#optional if full run is broken into smaller timeperiods
#import glob
# relative path to search all text files
#files = glob.glob("/home/jovyan/s3vt_dask/s3vtdata/workdir_test1/20*/*")
#nearest_hotspots_csv_files = []
#for i in files:
#    nearest_hotspots_csv_files.append(Path(i))

In [None]:
#nearest_hotspots_csv_files

In [None]:
# Checking for bad csvs
#csv_directory = Path(f"/home/jovyan/s3vt_dask/s3vtdata/workdir_test1/20191101_20200201/")
#nearest_hotspots_csv_files2 = [
#    fp for fp in csv_directory.iterdir()
#    if (fp.name.startswith("nearest_points"))
#    and (fp.name.endswith("csv"))
#]

In [None]:
# Nearest-points csv files that will be used for the analysis from here on.
nearest_hotspots_csv_files

In [None]:
# Load the csv files through util.csv_to_dataframe, passing the `compare_field`
# from processing_parameters.
# Per the original note this returns a Dask DataFrame indexed on that column —
# NOTE(review): confirm against src.hotspot_utils.
nearest_points_ddf = util.csv_to_dataframe(nearest_hotspots_csv_files, processing_parameters["compare_field"])

In [None]:
# Compute the lazy DataFrame into memory, keeping the same name for later cells.
# The guard keeps the cell re-runnable: after the first run the object is
# already computed and calling `.compute()` on it again would fail.
if hasattr(nearest_points_ddf, "compute"):
    nearest_points_ddf = nearest_points_ddf.compute()

In [None]:
def _daily_pair_count(points, main_product, secondary_product):
    """Count, per day, the rows of `points` where `main_product` is paired with `secondary_product`."""
    is_pair = (
        (points['satellite_sensor_product'] == main_product)
        & (points['2_satellite_sensor_product'] == secondary_product)
    )
    return points[is_pair].resample("D", on='datetime').datetime.count()


# Build one daily table per ordered product pair (i, j), i != j, with the
# counts in both directions, their ratio and the z-score of that ratio.
satellite_sensor_product = nearest_points_ddf.satellite_sensor_product.unique()
satellite_sensor_product_daily_list = []
for i in satellite_sensor_product:
    for j in satellite_sensor_product:
        if i == j:
            continue
        Acount = _daily_pair_count(nearest_points_ddf, i, j).rename('main_count')
        Bcount = _daily_pair_count(nearest_points_ddf, j, i).rename('secondary_count')

        # Use the ratio of the main and secondary daily counts in the same observation
        # extent as a consistency measure
        # - theory being that there should be a proportional shift in both hotspot sources
        # - using ratio as a quick measure
        ratio = (Acount / Bcount).rename('ratio')

        # Standard score of the daily ratio. The parentheses matter: without them
        # only the mean is divided by the standard deviation.
        zscore = ((ratio - ratio.mean()) / ratio.std()).rename('zscore')

        satellite_sensor_product_daily = pd.concat([Acount, Bcount, ratio, zscore], axis=1)
        satellite_sensor_product_daily['satellite_sensor_product'] = i
        satellite_sensor_product_daily['2_satellite_sensor_product'] = j
        satellite_sensor_product_daily['pair'] = i+'|'+j
        satellite_sensor_product_daily_list.append(satellite_sensor_product_daily)
        print(i,j,ratio.max(),ratio.std(),ratio.mean())
# One long table with all pairs stacked; the index is the resampled day.
satellite_sensor_product_daily_pd = pd.concat(satellite_sensor_product_daily_list)

In [None]:
# Display the stacked daily counts, ratio and z-score for every product pair.
satellite_sensor_product_daily_pd

In [None]:
# Largest daily z-score over all product pairs.
satellite_sensor_product_daily_pd.zscore.max()

In [None]:
# Distinct 'main|secondary' pair labels, in order of first appearance.
pairs = [label for label in satellite_sensor_product_daily_pd['pair'].unique()]

In [None]:
# NOTE(review): this reads the frame left over from the last iteration of the
# daily-count loop, so it holds a single main product;
# `satellite_sensor_product_daily_pd` may have been intended — confirm.
satellite_sensor_product_daily.satellite_sensor_product.unique()

In [None]:
# Correlation between the daily main and secondary counts for every pair.
pair_dict = {}
for pair in pairs:
    daily_pair = satellite_sensor_product_daily_pd[satellite_sensor_product_daily_pd['pair'] == pair]
    main_product, secondary_product = pair.split('|')
    pair_dict[pair] = {
        # Series.corr gives the same pairwise Pearson coefficient as the
        # off-diagonal cell of DataFrame.corr(), without positional indexing
        # into a label-indexed Series (deprecated in recent pandas).
        'correlation': daily_pair['main_count'].corr(daily_pair['secondary_count']),
        'main': main_product,
        'secondary': secondary_product,
    }
# One row per pair with columns: correlation, main, secondary.
pair_pd = pd.DataFrame(pair_dict).transpose()

In [None]:
# For each main product, print its pair correlations from highest to lowest.
for main in pair_pd['main'].unique():
    correlations_for_main = pair_pd.loc[pair_pd['main'] == main, ['correlation']]
    print(correlations_for_main.sort_values(by=['correlation'], ascending=False))

In [None]:
# Daily table for one hard-coded pair label, kept for ad-hoc inspection.
sample = satellite_sensor_product_daily_pd[(satellite_sensor_product_daily_pd['pair'] == 'NOAA 20_VIIRS_NASA2.0NRT|SENTINEL_3A_SLSTR_EUMETSAT')]
# Alternative: select by the two product columns instead of the pair label.
#sample = satellite_sensor_product_daily_pd[(satellite_sensor_product_daily_pd['satellite_sensor_product'] == 'SUOMI NPP_VIIRS_LANDGATE') & (satellite_sensor_product_daily_pd['2_satellite_sensor_product'] == 'SUOMI NPP_VIIRS_NASA1')]#.plot()

In [None]:
#plotly.write_image('daily_count_comparison')

In [None]:
# Paste a pair label from the list above to visualise its daily counts
# (taken from the coincident observation area).
# The theory is that, within some range, the number of hotspots of both
# products should be roughly proportional from day to day.
pair = 'SENTINEL_3B_SLSTR_EUMETSAT|NOAA 20_VIIRS_NASA2.0NRT'

is_selected_pair = satellite_sensor_product_daily_pd['pair'] == pair
example = satellite_sensor_product_daily_pd[is_selected_pair]
ax = example.plot.scatter(
    x=example.index,
    y=['main_count', 'secondary_count'],
    title=pair,
)
# Replace the default axis definitions with explicit titles.
ax.layout.xaxis = dict(anchor='y', domain=[0.0, 1.0], title=dict(text='date'))
ax.layout.yaxis = dict(anchor='x', domain=[0.0, 1.0], title=dict(text='hotspots'))
#ax.layout.legend = {    'title': {'satellite_sensor_product': 'variable'}, 'tracegroupgap': 0}
ax


In [None]:
# For every pair, list the dates whose count ratio lies more than 3 standard
# deviations from the pair's mean ratio, together with the pair's correlation.
filter_candidates_list = []
for i in satellite_sensor_product_daily_pd['pair'].unique():
    candidate_pair = satellite_sensor_product_daily_pd[(satellite_sensor_product_daily_pd['pair'] == i)]
    main_product, secondary_product = i.split('|')
    pair_ratio = candidate_pair['ratio']
    # Distance from the mean, not the raw ratio, is compared with 3 sigma.
    is_outlier = (pair_ratio - pair_ratio.mean()).abs() > 3 * pair_ratio.std()
    filter_candidates_list.append(pd.DataFrame({
        'pair': i,
        'satellite_sensor_product': main_product,
        '2_satellite_sensor_product': secondary_product,
        'gt3std_date': candidate_pair[is_outlier].index,
        # Series.corr avoids positional indexing into a label-indexed Series.
        'correlation': candidate_pair['main_count'].corr(candidate_pair['secondary_count']),
    }))
# One row per (pair, outlier date).
filter_candidates_pd = pd.concat(filter_candidates_list)

In [None]:
# Display the candidate outlier dates per pair with each pair's count correlation.
filter_candidates_pd

# Confirm that the candidate_pair and results make sense. 
# A high positive or negative z-score indicates how far the sample is from the mean

In [None]:
# Rows of `candidate_pair` (the last pair handled by the filter-candidates loop)
# whose ratio lies more than 3 standard deviations from the mean ratio.
candidate_pair[(candidate_pair['ratio'] - candidate_pair['ratio'].mean()).abs() > 3*candidate_pair['ratio'].std()]

In [None]:
# Rows of `candidate_pair` (the last pair handled by the filter-candidates loop)
# whose ratio lies more than 2 standard deviations from the mean ratio.
candidate_pair[(candidate_pair['ratio'] - candidate_pair['ratio'].mean()).abs() > 2*candidate_pair['ratio'].std()]

In [None]:
# Rows of `candidate_pair` (the last pair handled by the filter-candidates loop)
# whose ratio lies more than 1 standard deviation from the mean ratio.
candidate_pair[(candidate_pair['ratio'] - candidate_pair['ratio'].mean()).abs() > 1*candidate_pair['ratio'].std()]

In [None]:
# Suggest doing this for each comparison for all combinations
# Scatter of daily main vs secondary counts for `candidate_pair`.
# The axis titles are read from the pair label of the plotted data, so they
# stay correct whichever pair `candidate_pair` currently holds.
main_product, secondary_product = candidate_pair['pair'].iloc[0].split('|')
ax = candidate_pair.plot.scatter(x=candidate_pair['main_count'], y=['secondary_count'])
ax.layout.xaxis = {'anchor': 'y', 'domain': [0.0, 1.0], 'title': {'text': main_product}}
ax.layout.yaxis = {'anchor': 'x', 'domain': [0.0, 1.0], 'title': {'text': secondary_product}}
ax

# Candidate dates should be excluded from the pairwise comparison results if they fail two tests:
### 1. the ratio of counts for the pair exceeds a threshold of 3 standard deviations from the mean ratio (assumes that the ratio of hotspots will be reasonably stable)
### 2. the threshold of 3 standard deviations from the mean ratio is also exceeded, on the same date, in at least 4 other pairwise comparisons that involve the same master / target satellite_sensor_product

### Use the results to drop candidates matching the time and satellite_sensor_product (Master) from the nearest match results - chances are they get excluded from the <5000m results anyway

In [None]:
# Print every (product, date) combination flagged in at least this many pairwise
# comparisons that share the same main product.
# NOTE(review): the accompanying text mentions "at least 4 other" comparisons,
# while the check has always used 3 — confirm the intended threshold.
MIN_FLAGGED_PAIRS = 3
candidate_products = filter_candidates_pd['satellite_sensor_product'].unique()
for date in filter_candidates_pd['gt3std_date'].unique():
    on_date = filter_candidates_pd['gt3std_date'] == date
    for i in candidate_products:
        # Number of rows (pairs) flagged for this product on this date; summing the
        # boolean mask avoids positional indexing into the label-indexed count() result.
        flagged_pairs = (on_date & (filter_candidates_pd['satellite_sensor_product'] == i)).sum()
        if flagged_pairs >= MIN_FLAGGED_PAIRS:
            print(i, date, flagged_pairs)

In [None]:
# Display the candidate table again for reference next to the printed list above.
filter_candidates_pd

In [None]:
# Plot the daily main and secondary counts of `candidate_pair`
# (the last pair handled by the filter-candidates loop).
# Alternative input: select a specific pair explicitly.
#sample = satellite_sensor_product_daily_pd[satellite_sensor_product_daily_pd['pair'] == 'SUOMI NPP_VIIRS_LANDGATE|SENTINEL_3B_SLSTR_EUMETSAT']
candidate_pair[['main_count', 'secondary_count']].plot()

In [None]:
# Line plot of the daily count ratio for one hard-coded pair.
import plotly.express as px
sample = satellite_sensor_product_daily_pd[satellite_sensor_product_daily_pd['pair'] == 'SENTINEL_3B_SLSTR_ESA|SENTINEL_3B_SLSTR_EUMETSAT']
# x is the frame index (the resampled day); the line is coloured by pair label.
fig = px.line(sample, x=sample.index, y=sample['ratio'], color=sample['pair'])
fig.show()

In [None]:
# Inspect the per-pair daily table stored at position 5 of the list built in the daily-count loop.
satellite_sensor_product_daily_list[5]

In [None]:
# Select the matches of one main/secondary product combination on one date.
on_date = pd.to_datetime(nearest_points_ddf['datetime']).dt.date == np.datetime64('2019-12-24')
is_main_product = nearest_points_ddf['satellite_sensor_product'] == 'TERRA_MODIS_NASA6.03'
is_secondary_product = nearest_points_ddf['2_satellite_sensor_product'] == 'SENTINEL_3B_SLSTR_ESA'
subset = nearest_points_ddf[on_date & is_main_product & is_secondary_product]
#hotspots_gdf.plot(column='satellite_sensor_product', legend=True, legend_kwds={'loc': 'upper right'}, figsize=(20, 20))

In [None]:
# Show the geometries of `subset` on an interactive map.
import folium
# Map centred on [-26, 132] at zoom level 4.
# NOTE(review): the built-in 'Stamen Terrain' tiles may not be available in
# newer folium releases — confirm against the installed version.
mapa = folium.Map([-26, 132],
                  zoom_start=4,
                  tiles='Stamen Terrain')
# `subset.geometry` is parsed as WKT and converted to GeoJSON for folium.
points = folium.features.GeoJson(gpd.GeoSeries.from_wkt(subset.geometry).to_json())
mapa.add_child(points)
mapa

# Results
## Co-occurrence metrics

In [None]:
# Label for the processed region, used in the output file names.
region_alias = "continental"
output_directory = processing_parameters["outdir"]
# Prefix for the result files: <start_date>_<end_date>_<start_time>_<end_time>_<region>,
# with the '-' and ':' separators removed from the dates and times.
comparison_prefix = "_".join(
    [
        processing_parameters["start_date"].replace("-", ""),
        processing_parameters["end_date"].replace("-", ""),
        processing_parameters["start_time"].replace(":", ""),
        processing_parameters["end_time"].replace(":", ""),
        region_alias,
    ]
)

In [None]:
# Distance threshold between two hotspots; the analysis below is confined to
# matches closer than this value (compared against the `dist_m` column).
dist_threshold = 5000  # units in meters

In [None]:
# Keep only the matches whose distance `dist_m` is strictly below dist_threshold.
nearest_ddf_dist_subset = nearest_points_ddf[nearest_points_ddf["dist_m"] < dist_threshold]

In [None]:
# Display the distance-filtered matches.
nearest_ddf_dist_subset

In [None]:
# Summed `count` of matches closer than dist_threshold, with the main product
# on the rows and the matched (2_) product on the columns.
# NOTE(review): util.pandas_pivot_table presumably wraps pandas' pivot_table — confirm in src.hotspot_utils.
numerator = util.pandas_pivot_table(
    nearest_ddf_dist_subset,
    index=["satellite_sensor_product"],
    columns=["2_satellite_sensor_product"],
    values=["count"],
    aggfunc={"count": np.sum}
    
)

In [None]:
# Save the within-threshold match counts, then display them.
matches_csv_path = output_directory / f"{comparison_prefix}_matches_{dist_threshold}.csv"
numerator.to_csv(matches_csv_path)
numerator

In [None]:
# Summed `count` of all nearest matches (no distance filter), with the main
# product on the rows and the matched (2_) product on the columns.
# NOTE(review): util.pandas_pivot_table presumably wraps pandas' pivot_table — confirm in src.hotspot_utils.
denominator = util.pandas_pivot_table(
    nearest_points_ddf,
    index=["satellite_sensor_product"],
    columns=["2_satellite_sensor_product"],
    values=["count"],
    aggfunc={"count": np.sum}
    
)

In [None]:
# Save the unfiltered match counts, then display them.
matches_count_csv_path = output_directory / f"{comparison_prefix}_matches_count.csv"
denominator.to_csv(matches_count_csv_path)
denominator

In [None]:
# Per product pair: all nearest matches minus the matches closer than
# dist_threshold, i.e. the number of matches at or beyond the threshold.
difference = denominator - numerator

In [None]:
# Save the count difference, then display it.
difference_csv_path = output_directory / f"{comparison_prefix}_count_difference.csv"
difference.to_csv(difference_csv_path)
difference

In [None]:
# Percentage of matched points closer than dist_threshold
# The numeric table (percent, two decimals) is kept under its own name. It was
# previously computed and then discarded when `percentage` was rebound to the
# styled view, which has no `to_csv` and cannot be used for arithmetic.
percentage_values = np.round((numerator / denominator) * 100, 2)
# Styled view (whole-percent formatting) consumed by the heat-map cell.
percentage = (numerator / denominator).style.format("{:.0%}")

In [None]:
# Disabled export (`percentage` is a styled view at this point):
#percentage.to_csv(output_directory.joinpath(f"{comparison_prefix}_percentage.csv"))
#percentage
# Render the percentage matrix as a heat map using seaborn's reversed "rocket" palette.
cm = seaborn.color_palette("rocket_r", as_cmap=True)
s = percentage.background_gradient(cmap=cm)
# Limit header width and rotate the column headings so the long product names fit.
s.set_table_styles(
    [
        {"selector": "th", "props": [("max-width", "200px")]},
        {
            "selector": "th.col_heading",
            "props": [
                ("writing-mode", "vertical-rl"),
                ("transform", "rotateZ(180deg)"),
            ],
        },
    ]
)
s

In [None]:
# Maximum `timedelta` between matched points closer than dist_threshold,
# with the main product on the rows and the matched (2_) product on the columns.
# NOTE(review): util.pandas_pivot_table presumably wraps pandas' pivot_table — confirm in src.hotspot_utils.
timemax = util.pandas_pivot_table(
    nearest_ddf_dist_subset,
    index=["satellite_sensor_product"],
    columns=["2_satellite_sensor_product"],
    values=["timedelta"],
    aggfunc={"timedelta": np.max}
    
)

In [None]:
# Save the maximum time difference per product pair.
max_time_csv_path = output_directory / f"{comparison_prefix}_max_time_matched_points.csv"
timemax.to_csv(max_time_csv_path)

In [None]:
# Render the maximum-time matrix as a heat map using seaborn's "rocket" palette.
timemaxtable = timemax.style.format("{:}")
cm = seaborn.color_palette("rocket", as_cmap=True)
s = timemaxtable.background_gradient(cmap=cm)
# Limit header width and rotate the column headings so the long product names fit.
# The call is the last expression of the cell, so its result is what gets displayed.
s.set_table_styles(
    [
        {"selector": "th", "props": [("max-width", "200px")]},
        {
            "selector": "th.col_heading",
            "props": [
                ("writing-mode", "vertical-rl"),
                ("transform", "rotateZ(180deg)"),
            ],
        },
    ]
)

In [None]:
# Minimum `timedelta` between matched points closer than dist_threshold,
# with the main product on the rows and the matched (2_) product on the columns.
# NOTE(review): util.pandas_pivot_table presumably wraps pandas' pivot_table — confirm in src.hotspot_utils.
timemin = util.pandas_pivot_table(
    nearest_ddf_dist_subset,
    index=["satellite_sensor_product"],
    columns=["2_satellite_sensor_product"],
    values=["timedelta"],
    aggfunc={"timedelta": np.min}
)

In [None]:
# Save the minimum time difference per product pair, then display it.
min_time_csv_path = output_directory / f"{comparison_prefix}_min_time_matched_points.csv"
timemin.to_csv(min_time_csv_path)
timemin

In [None]:
# Average distance `dist_m` (m) between matched points closer than dist_threshold,
# with the main product on the rows and the matched (2_) product on the columns.
# NOTE(review): util.pandas_pivot_table presumably wraps pandas' pivot_table — confirm in src.hotspot_utils.
averagedist = util.pandas_pivot_table(
    nearest_ddf_dist_subset,
    index=["satellite_sensor_product"],
    columns=["2_satellite_sensor_product"],
    values=["dist_m"],
    aggfunc={"dist_m": np.mean}
)

In [None]:
# Round the average distances to whole metres and save them.
averagedist = np.round(averagedist)
average_distance_csv_path = output_directory / f"{comparison_prefix}_average_distance_{dist_threshold}m.csv"
averagedist.to_csv(average_distance_csv_path)


In [None]:
# Render the average-distance matrix as a heat map using seaborn's "rocket" palette.
averagedisttable = averagedist.style.format("{:}")
cm = seaborn.color_palette("rocket", as_cmap=True)
s = averagedisttable.background_gradient(cmap=cm)
# Limit header width and rotate the column headings so the long product names fit.
s.set_table_styles(
    [
        {"selector": "th", "props": [("max-width", "200px")]},
        {
            "selector": "th.col_heading",
            "props": [
                ("writing-mode", "vertical-rl"),
                ("transform", "rotateZ(180deg)"),
            ],
        },
    ]
)
s

In [None]:
#client.close() # close dask.distributed client

In [None]:
# Subsetting for NSW Case Study
# For every main product, write the unique within-threshold hotspots that fall
# inside the NSW window (longitude 147..154, latitude -38..-27) to a csv file.
# Columns that describe the matched (2_) hotspot or the match itself are removed
# so that duplicates of the same main hotspot collapse to one row.
matched_point_columns = [
    'Unnamed: 0', '2_latitude', '2_longitude',
    '2_satellite', '2_sensor', '2_confidence', '2_power', '2_datetime',
    '2_solar_day', '2_satellite_sensor_product', '2_geometry',
    '2_solar_night', 'dist', 'dist_m', 'timedelta', 'count',
]
for product in satellite_sensor_product:
    product_matches = nearest_ddf_dist_subset[nearest_ddf_dist_subset['satellite_sensor_product'] == product]
    # Non-inplace drop: the filtered frame is a slice of `nearest_ddf_dist_subset`,
    # and dropping in place on a slice triggers SettingWithCopyWarning.
    unique_matched_hotspots = product_matches.drop(columns=matched_point_columns).drop_duplicates()
    unique_matched_hotspots = gpd.GeoDataFrame(
        unique_matched_hotspots,
        geometry=gpd.points_from_xy(unique_matched_hotspots.longitude, unique_matched_hotspots.latitude),
    )
    # NOTE(review): the file name says 5km while the filter uses `dist_threshold` —
    # keep them in step if the threshold is changed.
    unique_matched_hotspots.cx[147:154, -38:-27].to_csv(output_directory.joinpath(f"{product}_nsw_unique_within_5km.csv"))