In [None]:
%%bash
# Fetch the GeoWave EMR quickstart environment script, then download the GDELT
# event archives whose names match TIME_REGEX and verify them against the
# published md5sums. Written so the cell can be re-run without failing or
# leaving stale/duplicate files behind.
cd /mnt/tmp

# -O overwrites an earlier copy; a plain re-download would be saved as
# geowave-env.sh.1 and the stale original would be the one that gets sourced.
wget -O geowave-env.sh s3.amazonaws.com/geowave/latest/scripts/emr/quickstart/geowave-env.sh
# TIME_REGEX is not set anywhere in this cell - presumably geowave-env.sh defines it.
source /mnt/tmp/geowave-env.sh

# -p: do not fail when the directory already exists from a previous run
mkdir -p gdelt
cd gdelt

# Always refresh the checksum list in place (same reasoning as -O above)
wget -O md5sums http://data.gdeltproject.org/events/md5sums

# The third space-delimited field of each md5sums line is taken as the file name
# (standard md5sum layout: hash, two spaces, name).
for file in $(cut -d' ' -f3 md5sums | grep "^${TIME_REGEX}") ; do
    # -nc skips archives that are already present instead of saving a ".1" duplicate
    wget -nc "http://data.gdeltproject.org/events/$file"
done

# Report the checksum result only for the files selected above
md5sum -c md5sums 2>&1 | grep "^${TIME_REGEX}"

In [None]:
%%bash

# We have to source here again because bash runs in a separate sub process each cell.
source /mnt/tmp/geowave-env.sh

# clear old potential runs; the names must match the store and index created below
# (the store is called "gdelt", so removing "gdelt-hbase" would leave the old one in place)
geowave config rmstore gdelt
geowave config rmindex gdelt-spatial

# configure geowave connection params for the hbase store "gdelt"
geowave config addstore gdelt --gwNamespace geowave.gdelt -t hbase --zookeeper $HOSTNAME:2181

# configure a spatial index
# NUM_PARTITIONS is not set in this cell - presumably it comes from geowave-env.sh
geowave config addindex -t spatial gdelt-spatial --partitionStrategy round_robin --numPartitions $NUM_PARTITIONS

# run the ingest of /mnt/tmp/gdelt into store "gdelt" with index "gdelt-spatial",
# restricted by CQL to a 10x10 deg bounding box over Europe
geowave ingest localtogw /mnt/tmp/gdelt gdelt gdelt-spatial -f gdelt \
--gdelt.cql "BBOX(geometry, 0, 50, 10, 60)"

In [None]:
# Resolve the GeoWave Java classes used by the rest of the notebook through the
# Spark context's JVM handle (sc._jvm).
import os

# Shared package roots so each class lookup below stays short
_geowave_pkg = sc._jvm.mil.nga.giat.geowave
_datastore_pkg = _geowave_pkg.datastore
_spark_pkg = _geowave_pkg.analytic.spark

# Required-options classes for the HBase and Accumulo datastores
hbase_options_class = _datastore_pkg.hbase.cli.config.HBaseRequiredOptions
accumulo_options_class = _datastore_pkg.accumulo.cli.config.AccumuloRequiredOptions

# Core store / index classes
query_options_class = _geowave_pkg.core.store.query.QueryOptions
byte_array_class = _geowave_pkg.core.index.ByteArrayId

# GeoWave Spark classes
geowave_rdd_class = _spark_pkg.GeoWaveRDD
rdd_loader_class = _spark_pkg.GeoWaveRDDLoader
rdd_options_class = _spark_pkg.RDDOptions
sf_df_class = _spark_pkg.sparksql.SimpleFeatureDataFrame

In [None]:
# Describe the HBase-backed GeoWave store under the 'geowave.gdelt' namespace
zookeeper_address = os.environ['HOSTNAME'] + ':2181'

input_store = hbase_options_class()
input_store.setGeowaveNamespace('geowave.gdelt')
input_store.setZookeeper(zookeeper_address)

# Plugin options built from the settings above; later cells pass these to GeoWave
input_store_plugin = input_store.createPluginOptions()

In [None]:
# Look up the data adapter stored under the id 'gdeltevent'
adapter_id = byte_array_class('gdeltevent')
adapter_store = input_store_plugin.createAdapterStore()
adapter = adapter_store.getAdapter(adapter_id)

# Min and max splits are pinned to the same value
split_count = 1000

rdd_opts = rdd_options_class()
query_options = query_options_class(adapter)
rdd_opts.setQueryOptions(query_options)
rdd_opts.setMinSplits(split_count)
rdd_opts.setMaxSplits(split_count)

# loadRDD is handed the JVM-side SparkContext obtained from sc._jsc.sc()
jvm_spark_context = sc._jsc.sc()
geowave_rdd = rdd_loader_class.loadRDD(jvm_spark_context, input_store_plugin, rdd_opts)

In [None]:
# Build a SimpleFeatureDataFrame for the store/adapter and get a DataFrame
# from the GeoWave RDD loaded earlier
jvm_spark_session = spark._jsparkSession
sf_df_gdelt = sf_df_class(jvm_spark_session)
sf_df_gdelt.init(input_store_plugin, adapter_id)
df_gdelt = sf_df_gdelt.getDataFrame(geowave_rdd)

# Convert the Java DataFrame into a Python DataFrame
import pyspark.mllib.common as convert
java_to_python = convert._java2py
py_df = java_to_python(sc, df_gdelt)

# Expose the GDELT data to Spark SQL as the temp view 'gdelt'
view_name = 'gdelt'
py_df.createOrReplaceTempView(view_name)

In [None]:
%%bash
# Point the GeoWave CLI at the GeoServer instance on this host (port 8000)
geoserver_address="$HOSTNAME:8000"
geowave config geoserver "$geoserver_address"

# Add the 'gdeltevent' layer from the "gdelt" store, then set its style to "point"
layer_id=gdeltevent
geowave gs addlayer gdelt -id "$layer_id"
geowave gs setls "$layer_id" --styleName point

In [None]:
import owslib
from owslib.wms import WebMapService

# WMS endpoint of the 'geowave' workspace on this host's GeoServer
geoserver_host = os.environ['HOSTNAME']
url = "http://" + geoserver_host + ":8000/geoserver/geowave/wms"
web_map_services = WebMapService(url)

# Print the name of every layer the WMS advertises, one per line
layer_names = web_map_services.contents.keys()
print('\n'.join(layer_names))

In [None]:
import folium

# WMS metadata for the GDELT event layer
layer = 'gdeltevent'
wms = web_map_services.contents[layer]

# Center the map on the midpoint of the layer's bounding box.
# NOTE(review): treats boundingBox as (min lon, min lat, max lon, max lat, ...) - confirm the layer CRS.
min_lon, min_lat, max_lon, max_lat = wms.boundingBox[:4]
lon = (min_lon + max_lon) / 2.
lat = (min_lat + max_lat) / 2.
center = [lat, lon]

m = folium.Map(location=center, zoom_start=3)

# Overlay the layer on the map as transparent PNG tiles served by the WMS
name = wms.title
wms_layer_options = dict(
    url=url,
    name=name,
    fmt='image/png',
    transparent=True,
    layers=layer,
    overlay=True,
    COLORSCALERANGE='1.2,28',
)
gdelt = folium.features.WmsTileLayer(**wms_layer_options)
gdelt.add_to(m)

# The bare last expression makes the notebook render the map
m