## refs
- https://towardsdatascience.com/dbscan-algorithm-complete-guide-and-application-with-python-scikit-learn-d690cbae4c5d
- https://hdbscan.readthedocs.io/en/latest/comparing_clustering_algorithms.html#
- https://github.com/gboeing/2014-summer-travels/blob/master/clustering-scikitlearn.ipynb

In [None]:
import json
import time # unused?

import numpy as np # +dep
import pandas as pd # +dep
import matplotlib.pyplot as plt # interactive notebook
#import seaborn as sns # unused?
import sklearn.cluster as cluster # +dep
import hdbscan
# mappings, interactive notebook
#import cartopy
import folium
from shapely.geometry import MultiPoint # +optdep

from pylab import rcParams # interactive notebook
%matplotlib inline
rcParams['figure.figsize'] = (14,10)

#sns.set_context('poster')
#sns.set_color_codes()
plot_kwds = {'alpha' : 0.25, 's' : 80, 'linewidths':0}

In [None]:
# Load the raw NRT records from the CSV file in the working directory.
import pathlib

raw_csv_path = pathlib.Path("./hotspots_data_2019.csv").resolve()
data = pd.read_csv(str(raw_csv_path))
data.head()

In [None]:
# (our own) slightly processed data
# JSON text is UTF-8; pass the encoding explicitly so the file is not decoded
# with the platform's locale default (which breaks non-ASCII content on
# systems whose default is not UTF-8).
with open("./agni/nrt-20200618.json", "r", encoding="utf-8") as jf:
    json_data = json.load(jf)
# The payload is expected to be a list of dicts, which DataFrame accepts as-is
# (one dict per row).
jdf = pd.DataFrame(json_data)
jdf.head()

In [None]:
# The haversine metric works on angles in radians, so the search radius is
# expressed as a distance in km divided by the Earth's mean radius in km.
# NOTE(review): 0.375 * 1.5 is presumably the sensor pixel size in km times a
# safety margin -- confirm.
KMS_PER_RAD = 6371.0088
RADIUS_KM = 0.375 * 1.5
eps = RADIUS_KM / KMS_PER_RAD

# Keep only the records acquired on one day and pull out their coordinates
# as an (n, 2) array ordered [latitude, longitude].
date_mask = data['acq_date'] == '2019-04-02'
used_data = data[date_mask]
coords = used_data[['latitude', 'longitude']].to_numpy()
display(used_data.head(), coords.shape)

In [None]:
# The heart of the clustering: both estimators are fitted on the coordinates
# converted from degrees to radians.
coords_rad = np.radians(coords)

# NOTE(review): HDBSCAN is left on its default metric here, while DBSCAN below
# uses haversine on the same radian input -- confirm this is intended.
hdb = hdbscan.HDBSCAN(min_samples=3).fit(coords_rad)

db = cluster.DBSCAN(
    eps=eps,
    min_samples=3,
    algorithm='ball_tree',
    metric='haversine',
)
db.fit(coords_rad)  # fit() returns the estimator itself

# One label per input point, noise is -1; this is the same array that
# .fit_predict(x) would return.
cluster_labels = db.labels_
display(cluster_labels)

In [None]:
# Classify every point as 'core', 'edge' or 'noise' for turf.js compatibility.
#
# The column is built as an object array from the start instead of writing
# strings into an integer Series: that relied on pandas silently upcasting the
# dtype on assignment (deprecated in recent pandas) and on an isinstance(int)
# check whose outcome depends on how the remaining labels end up boxed.
labels_arr = np.asarray(cluster_labels)
point_types = np.full(labels_arr.shape[0], 'edge', dtype=object)  # default: non-core cluster member
point_types[labels_arr == -1] = 'noise'
# Core samples are assigned last so they take precedence, as before.
point_types[db.core_sample_indices_] = 'core'
dbccol = pd.Series(point_types)
display(dbccol)

In [None]:
# Count the real clusters. The noise label (-1) is removed only if it is
# actually present: unconditionally subtracting one would silently drop the
# last cluster whenever no point was classified as noise.
found_labels = set(cluster_labels)
found_labels.discard(-1)
num_clusters = len(found_labels)

# One entry per cluster (labels run 0 .. num_clusters - 1), each holding the
# coordinate rows of the points assigned to that cluster.
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
clusters.head()

In [None]:
def get_centroid(cluster):
    """Return ``(x, y, count)`` for one cluster of points.

    ``cluster`` is a sequence of coordinate pairs. ``x`` and ``y`` are the
    coordinates of the centroid of the ``MultiPoint`` built from those pairs,
    and ``count`` is the number of pairs in the cluster.

    Note: inside this function the parameter name hides the ``cluster``
    module alias imported at the top of the file.
    """
    center = MultiPoint(cluster).centroid
    return (center.x, center.y, len(cluster))


# One (x, y, count) triple per cluster.
centroids = clusters.map(get_centroid)

In [None]:
# Transpose the (x, y, count) centroid triples into three parallel columns.
centroid_columns = tuple(zip(*centroids))
c_lats, c_lons, c_count = centroid_columns

# Table of cluster centroids together with the point count of each cluster.
rs = pd.DataFrame(
    dict(zip(('latitude', 'longitude', 'count'), centroid_columns))
)

# Same transposition for the raw coordinate pairs.
r_lats, r_lons = zip(*coords)
rs.tail()

In [None]:
import shapefile
import geojson

In [None]:
# Interactive map: one default-styled marker per selected record, plus one
# red-filled marker per cluster centroid with the cluster size in its popup.
lmap = folium.Map(location=[13, 100.8], zoom_start=6)

for _, hotspot in used_data.iterrows():
    hotspot_marker = folium.CircleMarker(
        location=(hotspot['latitude'], hotspot['longitude']),
        radius=6,
    )
    hotspot_marker.add_to(lmap)

for _, centroid in rs.iterrows():
    # iterrows() hands back each row as a single-dtype Series, so the count
    # may arrive as a float; cast it back to int for display.
    cluster_size = int(centroid['count'])
    centroid_marker = folium.CircleMarker(
        location=(centroid['latitude'], centroid['longitude']),
        radius=6,
        fill_color='red',
        fill_opacity=1,
        popup="<b>count</b>: {}".format(cluster_size),
    )
    centroid_marker.add_to(lmap)

lmap