# Process Wikipedia in Spark

## TL/DR:

1. Start with a Wikipedia Dump parsed into spark dataframes from the other notebook.
2. Analyze links.

## Install dependencies

This is using `%pip` rather than `pkg_rsources.resolve()` because on databricks clusters, `%pip` will make sure the libraries are available on the spark worker nodes.

In [1]:
required_packages = {"mwparserfromhell","geopandas","h3","geocoder","pydeck"}

import pkg_resources
for lib in required_packages - {pkg.key for pkg in pkg_resources.working_set}:
    print(f"installing {lib}")
    %pip install -q --upgrade pip
    %pip install -q $lib
    pkg_resources.require(lib)


In [2]:
import json
import mwparserfromhell
import subprocess
import json
import time

## Launch Spark (if running on a standalone environment)

* On databricks clusters the Spark Context will already have existed.

In [3]:
if not "spark" in locals():
    import pyspark
    MAX_MEMORY = "8g"  # 24 gives OOM here. # 6 gives "out of heap space"
    spark = (pyspark.sql.SparkSession.builder.appName("MyApp") 
        .config("spark.jars.packages", "io.delta:delta-core_2.12:0.8.0") 
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") 
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") 
        .config("spark.executor.memory", MAX_MEMORY) 
        .config("spark.driver.memory", MAX_MEMORY) 
        .config("spark.python.worker.reuse",False)
        .config("spark.task.maxFailures",5)
        .enableHiveSupport() 
        .getOrCreate()        
        )
spark

In [4]:
spark.sql('''
  select *
  from wikipedia_silver_structured_templates 
  limit 10
''').printSchema()

root
 |-- title: string (nullable = true)
 |-- body: string (nullable = true)
 |-- infoboxes: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- params: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- body: string (nullable = true)
 |-- templates: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- params: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- body: string (nullable = true)
 |-- extlinks: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- title: string (nullable = true)
 |    |    |-- url: string (nullable = true)
 |-- wikilinks: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- text

In [6]:
test_timing = False
if test_timing:
    t0 = time.perf_counter()
    spark.sql('''
      DROP TABLE IF EXISTS wikipedia_links
      ''')
    spark.sql('''
      CREATE TABLE IF NOT EXISTS wikipedia_links AS
      SELECT DISTINCT
        title    as src,
        wl.title as dst
      FROM wikipedia_silver_structured_templates 
      LATERAL VIEW explode(wikilinks) AS wl
    ''')
    print(f"created flattened wikipedia_links table in {time.perf_counter() - t0}")

In [None]:
spark.sql("drop table if exists tmp_test_join")
spark.sql("create table if not exists tmp_test_join (a string, b string)")
spark.sql("insert into tmp_test_join values ('a','b'),('b','c')")
spark.sql("""
    WITH incoming as (SELECT a as title, collect_list(b) as incoming_links FROM tmp_test_join group by a),
         outgoing as (SELECT b as title, collect_list(a) as outgoing_links FROM tmp_test_join group by b)
    SELECT title,incoming_links,outgoing_links FROM incoming FULL OUTER JOIN outgoing using (title)
          """).show()

In [7]:
t0 = time.perf_counter()
spark.sql("""
    CREATE TABLE IF NOT EXISTS denormalized_wikipedia_links AS
    WITH incoming as (SELECT dst as title, collect_list(src) as incoming_links FROM wikipedia_links group by dst),
         outgoing as (SELECT src as title, collect_list(dst) as outgoing_links FROM wikipedia_links group by src)
    SELECT title,incoming_links,outgoing_links FROM incoming FULL OUTER JOIN outgoing using (title)
""")
print(f"created flattened wikipedia_links table in {time.perf_counter() - t0}")

created flattened wikipedia_links table in 0.0640717709902674


In [8]:
t0 = time.perf_counter()
df = spark.sql("""
  select count(*),count(distinct *) from wikipedia_links
""")
print(f'found {df.take(1)} in {time.perf_counter()-t0} seconds')

found [Row(count(1)=313514602, count(DISTINCT src, dst)=313500372)] in 133.93299864395522 seconds


In [9]:
t0 = time.perf_counter()
df = spark.sql("""
    select * from wikipedia_links where src = 'Doge of Venice' or dst = 'Doge of Venice'
    """)
results = df.collect()
print(f'found {len(results)} in {time.perf_counter()-t0} seconds')

found 676 in 30.05684120999649 seconds


In [10]:
t0 = time.perf_counter()
df = spark.sql("""
    select * from denormalized_wikipedia_links where title = 'Doge of Venice'
    """)
results = df.collect()
print(f'found {len(results)} in {time.perf_counter()-t0} seconds')

found 1 in 35.227666849968955 seconds


In [11]:
# spark.sql("""optimze denormalized_wikipedia_links zorder by (title)""")

In [7]:
def get_links(titles,max_links=200):
    escaped_titles = [title.replace("'","''") for title in titles]
    in_clause = ",".join([f"'{et}'" for et in escaped_titles])
    links = spark.sql(f"""select * from denormalized_wikipedia_links where title in ({in_clause}) and size(outgoing_links)<{max_links}""").collect()
    return [l.asDict(True) for l in links]

l1 = get_links(['Mount Emei'])
print(json.dumps(l1,indent=1))

[
 {
  "title": "Mount Emei",
  "incoming_links": [
   "World Heritage site No. 779",
   "Omei",
   "Index of Buddhism-related articles",
   "Mario Biondi (writer)",
   "Megophrys wawuensis",
   "COVID-19 pandemic in Sichuan",
   "Template:Sichuan topics",
   "O-mei Shan",
   "Leptobrachella oshanensis",
   "Nga My",
   "Mt. Emei",
   "Nan Huai-Chin",
   "Mount Omei",
   "AAAAA Tourist Attractions of China",
   "Chu-Yuan Lee",
   "Zhoushan",
   "Wikipedia:WikiProject Volcanoes/Assessment/Log April 2007",
   "List of tourist attractions in China",
   "Scutiger chintingensis",
   "Omei Shan",
   "The Legend of Zu",
   "Emeishan Traps",
   "Jiang Wei",
   "Emei Shan Protected Scenic Site",
   "Template:Chinese Buddhism",
   "Wikipedia:Files for deletion/2011 October 10",
   "EM Legend",
   "Rana omeimontis",
   "Scutiger (frog)",
   "Inoculation",
   "Transcendental whistling",
   "List of mountains in China",
   "Longdong stream salamander",
   "Hongchunping Temple",
   "Daxiangling",
  

In [22]:
spark.sql("""select title,size(wikilinks) from wikipedia_silver_structured_templates where title like '%Emei%' order by size(wikilinks) desc""").show()

+--------------------+---------------+
|               title|size(wikilinks)|
+--------------------+---------------+
|          Mount Emei|             56|
|Wannian Temple (M...|             55|
|Baoguo Temple (Mo...|             54|
|      Emeishan Traps|             34|
|               Emeis|             28|
|   Emei leaf warbler|             28|
|           Emei Sect|             25|
|            Emeiquan|             23|
|       Emei, Hsinchu|             21|
|     Emei music frog|             21|
|Emeigh, Pennsylvania|             21|
|Template:Emei–Pan...|             21|
| Cletus Komena Emein|             19|
| Emei Shan liocichla|             19|
|Emei railway station|             18|
|       Emeishan City|             17|
|Emeis Park and Go...|             12|
|              Emeici|              8|
|   Zhonghai Emeishan|              7|
|             Emeidae|              3|
+--------------------+---------------+
only showing top 20 rows



In [11]:
import string

def make_dot(nodes,edges):
    tmpl = string.Template('''
      digraph {
        rankdir=LR;
        graph [ranksep=0.25, overlap=prism, nodesep=0.25, splines=true];
        node [style="filled"; dir="none"];
        edge[arrowhead="none"arrowtail="none"];
        $__nodes__
        $__edges__
      }
    ''')
    d = tmpl.safe_substitute({
        '__nodes__':';\n'.join(nodes),
        '__edges__':';\n'.join(edges)
    })
    return d
 

In [12]:

import html
import IPython

def make_dot_viewer(dots):
    html_tmpl = string.Template(r"""
    <!DOCTYPE html>
    <meta charset="utf-8">
    <body>
    <script src="https://d3js.org/d3.v5.min.js"></script>
    <script src="https://unpkg.com/@hpcc-js/wasm@0.3.11/dist/index.min.js"></script>
    <script src="https://unpkg.com/d3-graphviz@3.0.5/build/d3-graphviz.js"></script>
    <button id="reset_dot">reset</button>
    <button id="next_dot">next</button>
    <button id="resize_dot">resize</button>

    <div id="graph" style="width: 95%; border: 1px solid red; text-align: center;"></div>

    <script>
        var _gv = '';

        function resize_dot(w,h) {
            w = document.body.style.width;
            h = document.body.style.height;
            var svg = d3.select("#graph").selectWithoutDataPropagation("svg");
            svg
                .transition()
                .duration(700)
                .attr("width", w)
                .attr("height", h);
            var d = svg.datum();
            d.attributes['width'] = w;
            d.attributes['height'] = h;
        };

        function draw_graphs(dots) {
            var dot_index = 0;
            function render_next_dot() {
                var dot = dots[dot_index];
                gv.renderDot(dot).on("end", function() {dot_index = (dot_index + 1) % dots.length})
            }
            function reset_dot() {
                dot_index = 0;
                var dot = dots[dot_index];
                gv.renderDot(dot).on("end", function() {dot_index = (dot_index + 1) % dots.length})
                gv.resetZoom();
            }
            var engine = 'dot';
            //var engine = 'neato';
            var transition_fun = function(){
                return d3.transition('dottrans')
                         .delay(10)
                         .duration(200)
                         .ease(d3.easeLinear)
            };
            var gv = d3.select("#graph").graphviz()
                     .engine(engine)
                     .height(600)
                     .width(800)
                     .fit(1)
                     .zoomScaleExtent([0.01,100])
                     .transition(transition_fun);
            _gv = gv;
            render_next_dot()
            document.getElementById("next_dot").addEventListener('click', render_next_dot, false);
            document.getElementById("reset_dot").addEventListener('click', reset_dot, false);
            document.getElementById("resize_dot").addEventListener('click', resize_dot, false);

        }

        function rand_int(max) {
          return Math.floor(Math.random() * max);
        }

        draw_graphs($__dots__);
    </script>
    </body>
    """)

    h1 = html_tmpl.safe_substitute({'__dots__':json.dumps(dots)})
    return(h1)

dots = [
     "digraph {rankdir=LR;a->b}",
     "digraph {rankdir=LR;a->b; b->c}",
     "digraph {rankdir=LR;a->b; b->c; c->a}",
     "digraph {rankdir=LR;a->b; b->c; a->c; a->b2}",
     "digraph {rankdir=LR;a->b; b->c; c->a; a->b2; b2 -> c}",    
]
h1 = make_dot_viewer(dots)
h2 = f'<iframe style="width:98%; height:800px; border: 1px solid green;" srcdoc="{html.escape(h1)}">'
#IPython.display.HTML(h2)


In [8]:
l1 = get_links(['Mount Emei'])


In [9]:
l2 = get_links(l1[0]['outgoing_links'],max_links=100)

In [13]:
nodes = []
edges = []

for row in l1:
    src = row['title']
    dsts = row['outgoing_links'] or []
    new_edges  = [f'"{src}" -> "{dst}"' for dst in dsts]
    edges.extend(new_edges)
    
for row in l2:
    src = row['title']
    dsts = row['outgoing_links'] or []
    new_edges  = [f'"{src}" -> "{dst}"' for dst in dsts]
    edges.extend(new_edges)

dot = make_dot(nodes,edges[9:1000])

#print(dot)
dots = [dot]
h1 = make_dot_viewer(dots)
h2 = f'<iframe style="width:98%; height:800px; border: 1px solid green;" srcdoc="{html.escape(h1)}">'
#IPython.display.HTML(h2)


In [None]:
def get_links(title):
    escaped_title = title.replace("'","''")
    spark.sql("""select * from denormalized_wikipedia_links where title = '{escaped_title}'""").collect()


In [None]:
df = spark.sql("""
    select * from wikipedia_links where dst = 'Doge of Venice'
    """)
df.show(10)

In [None]:
spark.sql("
    ")

In [7]:
t0 = time.perf_counter()
df = spark.sql("select link_to as title, collect_list(link_from) as incoming_links from links group by link_to")
try:
    df.write.format("delta").saveAsTable("tmp_wiki_incoming_links")
except Exception as e:
    print(e)
print(f"saved incoming link table in {time.perf_counter() - t0} seconds")


Table default.tmp_wiki_incoming_links already exists
saved incoming link table in 0.12487451289780438 seconds


In [14]:
# Category and Help pages have absurdly large numbers of links.
# Probably want to treat them separately.

spark.sql("""
  select title,size(incoming_links),incoming_links
   from tmp_wiki_incoming_links 
   where title not like '%{%' and title not like 'Category%'
   order by size(incoming_links) desc
""").show(10,40)

+---------------------------------------+--------------------+----------------------------------------+
|                                  title|size(incoming_links)|                          incoming_links|
+---------------------------------------+--------------------+----------------------------------------+
|                          Help:Category|                8504|[Category:2009 protests, Category:194...|
|                     Wikipedia:Category|                3766|[Category:Chartjackers, Category:Germ...|
|                          United States|                3579|[Category:North Carolina populated pl...|
|List of adjectival forms of place names|                2782|[Category:Barbados political party sh...|
|                                England|                2590|[Category:Kerrier geography stubs, Ca...|
|                            User:Polbot|                2566|[Category:Mesua, Category:Asterogyne,...|
|                             U.S. state|                2230|[C

In [21]:
spark.sql("""select * from tmp_wiki_incoming_links where title='Little Brown Bat'""").collect()

[]

In [32]:
spark.sql("""select * from tmp_wiki_incoming_links where title like 'Doge of Venice'""").show(100,100)

+--------------+---------------------------+
|         title|             incoming_links|
+--------------+---------------------------+
|Doge of Venice|[Category:Leonardo Loredan]|
+--------------+---------------------------+



In [51]:
r = spark.sql("""select title,wikilinks.title from wikipedia_silver_structured_templates where title = 'Paolo Renier'""")
print(json.dumps(r.collect(),indent=1))

[
 [
  "Paolo Renier",
  [
   "Doge of Venice",
   "Alvise Giovanni Mocenigo",
   "Ludovico Manin",
   "Venice",
   "Republic of Venice",
   "Margherita Delmaz",
   "Venice",
   "Doge of Venice",
   "ambassador",
   "Ottoman Empire",
   "Habsburg Monarchy",
   "Ludovico Manin",
   "Margherita Delmaz",
   "Lodovico Gallina",
   "Sequin (coin)",
   "Doge of Venice",
   "Republic of Venice",
   "Alvise Giovanni Mocenigo",
   "List of Doges of Venice",
   "Ludovico Manin",
   "Category:Ambassadors of the Republic of Venice to Austria",
   "Category:1710 births",
   "Category:1789 deaths",
   "Category:Baili of Constantinople",
   "Category:18th-century diplomats",
   "Category:18th-century Italian people",
   "Category:18th-century Doges of Venice",
   "Category:Ambassadors of the Republic of Venice to the Ottoman Empire"
  ]
 ]
]
