# GIS visualization of Wikipedia
## TL;DR

1. Start with a Wikipedia Dump, preprocessed into Spark by the notebook 0_spark_preprocessing_of_wikipedia.ipynb
2. Produce interactive maps similar to this screenshot: ![Map of every coordinate found in Wikipedia](all_wikipedia_coords.png "All Wikipedia coordinates")



## Install dependencies

This uses `%pip` rather than `pkg_resources.resolve()` because, on this environment's Spark cluster, `%pip` makes sure the libraries are also available on the Spark worker nodes.

In [1]:
required_packages = {"mwparserfromhell","geopandas","h3","geocoder","pydeck"}
import pkg_resources
for lib in required_packages - {pkg.key for pkg in pkg_resources.working_set}:
    print(f"installing {lib}")
    %pip install -q --upgrade pip
    %pip install -q $lib
    pkg_resources.require(lib)


### Set config

In [None]:
# Base directory for static files: the current directory by default, or the
# DBFS FileStore location when running on Databricks (detected through the
# DATABRICKS_RUNTIME_VERSION environment variable).
# `os` is imported here because this cell runs before the notebook's import cell.
import os

static_file_directory = '.'
if "DATABRICKS_RUNTIME_VERSION" in os.environ:
    static_file_directory = '/dbfs/FileStore/wikipedia'


In [2]:
import xml.sax
import json
import mwparserfromhell
import subprocess
import json
import time
import IPython

## Launch Spark (if running on a standalone environment)

* On Databricks clusters the Spark session already exists, so this step is skipped.

In [3]:
# Create a local SparkSession with Delta Lake support, unless a `spark`
# object is already defined in this namespace.
if "spark" not in locals():
    import pyspark

    MAX_MEMORY = "8g"  # 24 gives OOM here. # 6 gives "out of heap space"

    # Applied to the builder in this order.
    spark_settings = {
        "spark.jars.packages": "io.delta:delta-core_2.12:0.8.0",
        "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
        "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
        "spark.executor.memory": MAX_MEMORY,
        "spark.driver.memory": MAX_MEMORY,
        "spark.python.worker.reuse": False,
        "spark.task.maxFailures": 5,
    }

    session_builder = pyspark.sql.SparkSession.builder.appName("MyApp")
    for setting_name, setting_value in spark_settings.items():
        session_builder = session_builder.config(setting_name, setting_value)

    spark = session_builder.enableHiveSupport().getOrCreate()

# Last expression of the cell: display the session.
spark

In [4]:
# Count the distinct H3 cells that contain at least one Wikipedia coordinate,
# for H3 resolutions 3 through 9, plus the total number of exploded coordinates.
# About a half million h6 grids with information in wikipedia.
h3_cell_counts_sql = '''
  select count(distinct coord.h3[3]) as num_h3,
         count(distinct coord.h3[4]) as num_h4,
         count(distinct coord.h3[5]) as num_h5,
         count(distinct coord.h3[6]) as num_h6,
         count(distinct coord.h3[7]) as num_h7,
         count(distinct coord.h3[8]) as num_h8,
         count(distinct coord.h3[9]) as num_h9,
         count(*)
 from wikipedia_silver_structured_templates
 lateral view explode(coords) as coord
'''
spark.sql(h3_cell_counts_sql).toPandas()

Unnamed: 0,num_h3,num_h4,num_h5,num_h6,num_h7,num_h8,num_h9,count(1)
0,21799,72608,206136,498461,883727,1164699,1335738,1855143


In [5]:
try_kepler_gl = False
# kepler_gl doesn't play well in Jupyter Lab or Databricks notebooks.
# Embedding it in an iframe seems to kinda help - but many features are still broken


def html_to_data_uri_iframe(page_html, height_px=600):
    """Wrap an HTML document in an <iframe> whose source is a base64 data URI.

    Firefox will only display the keplergl element if the HTML is embedded
    into an iframe's source as base64.

    Parameters
    ----------
    page_html : str or bytes
        The complete HTML document; bytes are decoded as UTF-8.
    height_px : int
        Height of the iframe in pixels.

    Returns
    -------
    str
        A properly closed ``<iframe ...></iframe>`` element.
    """
    import base64

    if isinstance(page_html, bytes):
        page_html = page_html.decode('utf-8')
    b64 = base64.b64encode(page_html.encode('utf-8')).decode('ascii')
    src = f"data:text/html;base64,{b64}"
    # The closing tag matters: without it the rest of the page is parsed as
    # the iframe's fallback content.
    return f'<iframe src="{src}" style="width:100%; height: {height_px}px"></iframe>'


if try_kepler_gl:
    # One row per H3 resolution-2 cell: page count, its log, and up to two example titles.
    h3s = spark.sql('''
     select coord.h3[2] as h3,
            count(*) as num_pages,
            log(count(*)) as log_num_pages,
            array_join(array_distinct(array(first(title),last(title))),' ... ') as example_pages
     from wikipedia_silver_structured_templates
     lateral view explode(coords) as coord
     where size(coords)>0
     group by coord.h3[2]
     order by num_pages desc
    ''').limit(100000).toPandas()
    import keplergl
    from IPython.display import HTML
    map_1 = keplergl.KeplerGl(height=600)
    map_1.add_data(data=h3s,name='h3s')
    # _repr_html_() may hand back bytes or str; the helper accepts both.
    result = HTML(html_to_data_uri_iframe(map_1._repr_html_(), height_px=600))
# Shows the map when enabled, otherwise just `False`.
try_kepler_gl and result

False

In [6]:
# How the map cells present their result: 'iframe' or 'inline' or 'none'
#
# Displaying the maps inline is convenient for debugging but makes the notebook painfully large for git.
# (rerun with this set to 'iframe' before checking in)
#
# NOTE(review): the map cells in this notebook only test for 'inline'; any other
# value (including 'none') ends up showing an IFrame of the saved HTML file.
display_mode = 'iframe'  


In [19]:

# Flat deck.gl map: one hexagon per H3 cell, colored by log(number of pages).
render_in_deckgl = True
h3_lvl = 5
# Defined up front so the display expression at the bottom never touches an
# undefined (or stale, from another cell) `result` when rendering is skipped.
result = None
if render_in_deckgl:
    import pydeck
    # One row per H3 cell at resolution h3_lvl. num_pages is capped at 255;
    # log_n is the natural log of the uncapped page count.
    h3s = spark.sql(f'''
     select coord.h3[{h3_lvl}] as h3,
            array_min(array(count(*),255)) as num_pages,
            log(count(*)) as log_n,
            first(coord.lat) as lat,
            first(coord.lon) as lng,
            array_join(array_distinct(array(first(title),last(title))),' ... ') as example_pages
     from wikipedia_silver_structured_templates
     lateral view explode(coords) as coord
     where size(coords)>0
     group by coord.h3[{h3_lvl}]
     order by num_pages desc
    ''').limit(500000).cache() # 800,000 crashes the browser on my laptop.

    # Distribution of log_n, used to pick the top of the color ramp.
    log_n_stats = h3s.selectExpr('count(*) as count',
                                 'max(log_n) as max_log_n',
                                 'approx_percentile(log_n, 0.25, 100) as p25',
                                 'approx_percentile(log_n, 0.50, 100) as p50',
                                 'approx_percentile(log_n, 0.95, 100) as p95',
                                 'approx_percentile(log_n, 0.98, 200) as p98',
                                 'approx_percentile(log_n, 0.99, 500) as p99'
                               ).collect()[0]
    print(f'max_log_n = {log_n_stats}')

    # make everything above the 99% percentile white
    # Fall back to 1.0 when p99 is 0 or missing (no rows), so the color
    # expression below never divides by zero.
    max_log_n = log_n_stats['p99'] or 1.0

    # deck.gl expression evaluated per row, giving [red, green, blue, alpha]:
    #   ramp up red for the first 25%
    #   ramp up yellow for the middle 50%
    #   ramp up white for the final 25%
    # Each channel must be separated by a comma, and alpha is a byte (max 255).
    fill_color_expr = (
        f'[(log_n < {max_log_n}/4) ? 128 + log_n/{max_log_n}*4*256/2 : 255,'
        f' (log_n < {max_log_n}/4) ? 0 : (log_n > 3*{max_log_n}/4) ? 255 : (log_n/{max_log_n} - 0.25)*255*2,'
        f' log_n > {max_log_n}*3/4 ? (log_n/{max_log_n} - 0.75)*4*255 : 0,'
        f' 255]'
    )

    layer = pydeck.Layer(
        'H3HexagonLayer',
        h3s.toPandas(),
        get_hexagon = 'h3',
        auto_highlight = True,
        stroked=False,
        get_fill_color = fill_color_expr,
        get_line_color=[0, 255, 255],
        extruded=False,
        pickable=True,
        coverage=1)

    view_state = pydeck.ViewState(
        longitude=-1.415,
        latitude=52.2323,
        zoom=3,
        min_zoom=0,
        max_zoom=15,
        pitch=0,
        bearing=0)

    r = pydeck.Deck(layers=[layer], initial_view_state=view_state)
    result = r.to_html(f'wikipedia_map_at_h3_{h3_lvl}.html',iframe_height=600)

# Inline shows whatever to_html returned; otherwise link to the saved file via an IFrame.
result if (display_mode == 'inline' and result) else IPython.display.IFrame(f'wikipedia_map_at_h3_{h3_lvl}.html',800,600)


max_log_n = Row(count=206136, max_log_n=8.71308886823731, p25=0.0, p50=0.6931471805599453, p95=3.332204510175204, p98=3.9889840465642745, p99=4.574710978503383)


In [8]:
# Globe view: H3 hexagons colored by log(number of pages), drawn over a
# country-outline base map.
import pydeck as pdk
import pandas as pd
import pydeck
render_globe_view=True
# Defined up front so the display expression at the bottom never touches an
# undefined (or stale, from another cell) `result` when rendering is skipped.
result = None
if render_globe_view:
    lvl=6
    # One row per H3 cell at resolution lvl; log_n is the natural log of the page count.
    h3s = spark.sql(f'''
     select coord.h3[{lvl}] as h3,
            count(*) as num_pages,
            log(count(*)) as log_n,
            first(coord.lat) as lat,
            first(coord.lon) as lng,
            array_join(array_distinct(array(first(title),last(title))),' ... ') as example_pages
     from wikipedia_silver_structured_templates
     lateral view explode(coords) as coord
     where size(coords)>0
     group by coord.h3[{lvl}]
     order by num_pages desc
    ''').limit(500000).cache() # 800,000 crashes the browser on my laptop.

    # Distribution of log_n, used to pick the top of the color ramp.
    log_n_stats = h3s.selectExpr('count(*) as count',
                                 'max(log_n) as max_log_n',
                                 'approx_percentile(log_n, 0.25, 100) as p25',
                                 'approx_percentile(log_n, 0.50, 100) as p50',
                                 'approx_percentile(log_n, 0.95, 100) as p95',
                                 'approx_percentile(log_n, 0.98, 200) as p98',
                                 'approx_percentile(log_n, 0.99, 500) as p99'
                               ).collect()[0]
    print(f'max_log_n = {log_n_stats}')

    # make everything above the 99% percentile white
    # Fall back to 1.0 when p99 is 0 or missing (no rows), so the color
    # expression below never divides by zero.
    max_log_n = log_n_stats['p99'] or 1.0

    # deck.gl expression evaluated per row, giving [red, green, blue, alpha]:
    #   ramp up red for the first 25%
    #   ramp up yellow for the middle 50%
    #   ramp up white for the final 25%
    # Each channel must be separated by a comma, and alpha is a byte (max 255).
    fill_color_expr = (
        f'[(log_n < {max_log_n}/4) ? 128 + log_n/{max_log_n}*4*256/2 : 255,'
        f' (log_n < {max_log_n}/4) ? 0 : (log_n > 3*{max_log_n}/4) ? 255 : (log_n/{max_log_n} - 0.25)*255*2,'
        f' log_n > {max_log_n}*3/4 ? (log_n/{max_log_n} - 0.75)*4*255 : 0,'
        f' 255]'
    )

    COUNTRIES = "https://d2ad6b4ur7yvpq.cloudfront.net/naturalearth-3.3.0/ne_50m_admin_0_scale_rank.geojson"

    view_state = pdk.ViewState(latitude=0, longitude=0, zoom=2, min_zoom=0, max_zoom=15)

    # Globe projection; width and height of the view are set here.
    view = pdk.View(type="_GlobeView", controller=True, width=1000, height=700)

    layers = [
        # Country polygons as a dark base map.
        pdk.Layer(
            "GeoJsonLayer",
            id="base-map",
            data=COUNTRIES,
            stroked=False,
            filled=True,
            get_fill_color=[30,50,30],
        ),
        pydeck.Layer(
            'H3HexagonLayer',
            h3s.toPandas(),
            get_hexagon = 'h3',
            auto_highlight = True,
            get_fill_color = fill_color_expr,
            get_elevation='log_n',
            elevation_scale=50000,
            elevation_range=[1,200000],
            extruded=False,  # set to True to extrude the hexagons
            pickable=True,
            coverage=1)
    ]
    deck = pdk.Deck(
        views=[view],
        initial_view_state=view_state,
        layers=layers,
        # Note that this must be set for the globe to be opaque
        parameters={"cull": True},
    )

    result = deck.to_html("globe_view.html", css_background_color="black",iframe_height=600)

# Inline shows whatever to_html returned; otherwise link to the saved file via an IFrame.
result if (display_mode == 'inline' and result) else IPython.display.IFrame('globe_view.html',800,600)


max_log_n = Row(count=498461, max_log_n=7.984121958702927, p25=0.0, p50=0.6931471805599453, p95=2.3978952727983707, p98=3.044522437723423, p99=3.4011973816621555)


In [9]:
# Flat scatterplot of individual Wikipedia coordinates.
#
# Working blendfunc
#    https://deck.gl/docs/api-reference/json/conversion-reference

import pydeck
import pydeck as pdk

render_scatterplot = True
# Defined up front so the display expression at the bottom never touches an
# undefined (or stale, from another cell) `r` when rendering is skipped.
r = None
if render_scatterplot:

    # One row per coordinate: longitude, latitude and the page title.
    dfpts = spark.sql('''
      select coord.lon as lng, coord.lat as lat,title from wikipedia_silver_structured_templates 
      lateral view explode(coords) as coord 
      limit 1000000
      ''')  # 2,000,000 crashes firefox on a moderate desktop

    pdfpts = dfpts.toPandas()
    layer = pdk.Layer(
        "ScatterplotLayer",
        pdfpts,
        pickable=True,
        opacity=1,
        stroked=False,
        filled=True,
        radius_scale=10000,
        radius_min_pixels=1,
        radius_max_pixels=20,
        line_width_min_pixels=1,
        get_position=['lng', 'lat'],
        # NOTE(review): the query above selects only lng, lat and title, so
        # pdfpts has no 'exits_radius' column. This accessor looks copied from
        # an example; confirm, then replace it with a constant or a real column.
        get_radius="exits_radius",
        get_fill_color=[196, 16, 4],
        get_line_color=[0, 0, 0],
        parameters= {
            'blend': True,
            'depthTest': False,
            # WebGL enum values: 770 = SRC_ALPHA, 772 = DST_ALPHA.
            'blendFunc': [770, 772]   # pydeck makes it really hard to find these enums.
          }
    )

    # Set the viewport location
    view_state = pdk.ViewState(
        longitude=-0.2,
        latitude=51.5,
        zoom=3,
        min_zoom=0,
        max_zoom=18,
        pitch=0,
        bearing=0)

    # Combine all of it and write the map to a standalone HTML file.
    r = pdk.Deck(layers=[layer], initial_view_state=view_state)
    r.to_html('scatterplot.html')

# Inline shows the Deck object itself; otherwise link to the saved file via an IFrame.
r if (display_mode == 'inline' and r) else IPython.display.IFrame('scatterplot.html',800,600)


In [10]:
# Scatterplot of individual Wikipedia coordinates on a globe, drawn over a
# country-outline base map.
import pydeck as pdk
import pandas as pd
import pydeck
lvl=5  # not used in this cell

render_scatterplot_on_globe = True
# Defined up front so the display expression at the bottom never touches an
# undefined (or stale, from another cell) `deck` when rendering is skipped.
deck = None
if render_scatterplot_on_globe:

    COUNTRIES = "https://d2ad6b4ur7yvpq.cloudfront.net/naturalearth-3.3.0/ne_50m_admin_0_scale_rank.geojson"

    view_state = pdk.ViewState(latitude=0, longitude=0, zoom=2, min_zoom=0, max_zoom=15)

    # Globe projection; width and height of the view are set here.
    view = pdk.View(type="_GlobeView", controller=True, width=1000, height=700)

    # One row per coordinate: longitude, latitude and the page title.
    dfpts = spark.sql('''
      select coord.lon as lng, coord.lat as lat,title from wikipedia_silver_structured_templates 
      lateral view explode(coords) as coord 
      limit 1000000
      ''')  # 2,000,000 crashes firefox on a moderate desktop

    pdfpts = dfpts.toPandas()
    layer = pdk.Layer(
        "ScatterplotLayer",
        pdfpts,
        pickable=True,
        opacity=1,
        stroked=False,
        filled=True,
        radius_scale=10000,
        radius_min_pixels=1,
        radius_max_pixels=15,
        line_width_min_pixels=1,
        get_position=['lng', 'lat'],
        # NOTE(review): the query above selects only lng, lat and title, so
        # pdfpts has no 'exits_radius' column. This accessor looks copied from
        # an example; confirm, then replace it with a constant or a real column.
        get_radius="exits_radius",
        get_fill_color=[196, 16, 4],
        get_line_color=[0, 0, 0],
        parameters= {
            'blend': True,
            'depthTest': False,
            # WebGL enum values: 770 = SRC_ALPHA, 772 = DST_ALPHA.
            'blendFunc': [770, 772]   # pydeck makes it really hard to find these enums.
          }
    )
    layers = [
        # Country polygons as a dark base map.
        pdk.Layer(
            "GeoJsonLayer",
            id="base-map",
            data=COUNTRIES,
            stroked=False,
            filled=True,
            get_fill_color=[30,40,30],
        ),
        layer
    ]
    deck = pdk.Deck(
        views=[view],
        initial_view_state=view_state,
        layers=layers,
        # Note that this must be set for the globe to be opaque
        parameters={"cull": True},
    )
    deck.to_html("scatterplot_globe.html", css_background_color="black",iframe_height=600)

# Inline shows THIS cell's Deck (the original showed `r`, the flat scatterplot
# deck left over from another cell); otherwise link to the saved file.
deck if (display_mode == 'inline' and deck) else IPython.display.IFrame('scatterplot_globe.html',1000,600)


TODO

* http://mcburton.net/blog/static-files/

In [11]:
# Preview five rows of the bronze table: the title plus the first 40 characters
# of the body, with newlines shown as a literal "\n" so each row stays on one line.
bronze_preview_sql = r"select title,replace(substr(body,1,40)||'...','\n','\\n') as body from wikipedia_bronze limit 5"
spark.sql(bronze_preview_sql).show(n=5, truncate=False)

+-----------------------------+---------------------------------------------+
|title                        |body                                         |
+-----------------------------+---------------------------------------------+
|Black panther                |{{short description|Melanistic colour va...  |
|Great white shark            |{{other uses of|great white|Great White ...  |
|Melisende, Queen of Jerusalem|{{short description|Queen regnant of the...  |
|Carcharodon carcharias       |#REDIRECT [[Great white shark]] \n \n {{Re...|
|Method                       |{{Wiktionary|method}} \n '''Method''' ({{... |
+-----------------------------+---------------------------------------------+



In [12]:
# Render a clickable link to a local file in the notebook output.
# NOTE(review): no cell shown in this notebook writes 'demo.html' — confirm the
# file exists, or point this at one of the generated map files.
from IPython.display import FileLink, FileLinks
FileLink('demo.html')

In [13]:
# Exploration only: print the FileLink documentation to see which options
# (url_prefix, result_html_prefix, ...) are available.
# NOTE(review): leftover from interactive exploration; consider removing this
# cell and its long output before checking in.
help(FileLink('demo.html'))
# Attributes inspected while exploring:
#FileLink('demo.html').html_link_str
#FileLink('demo.html').path
#FileLink('demo.html').result_html_prefix
#FileLink('demo.html').result_html_suffix
#FileLink('demo.html').url_prefix





Help on FileLink in module IPython.lib.display object:

class FileLink(builtins.object)
 |  FileLink(path, url_prefix='', result_html_prefix='', result_html_suffix='<br>')
 |  
 |  Class for embedding a local file link in an IPython session, based on path
 |  
 |  e.g. to embed a link that was generated in the IPython notebook as my/data.txt
 |  
 |  you would do::
 |  
 |      local_file = FileLink("my/data.txt")
 |      display(local_file)
 |  
 |  or in the HTML notebook, just::
 |  
 |      FileLink("my/data.txt")
 |  
 |  Methods defined here:
 |  
 |  __init__(self, path, url_prefix='', result_html_prefix='', result_html_suffix='<br>')
 |      Parameters
 |      ----------
 |      path : str
 |          path to the file or directory that should be formatted
 |      url_prefix : str
 |          prefix to be prepended to all files to form a working link [default:
 |          '']
 |      result_html_prefix : str
 |          text to append to beginning to link [default: '']
 |      r