# Process Wikipedia in Spark

## TL/DR:

1. Start with a Wikipedia Dump parsed into spark dataframes from the other notebook.
2. Analyze links.

## Install dependencies

This is using `%pip` rather than `pkg_rsources.resolve()` because on databricks clusters, `%pip` will make sure the libraries are available on the spark worker nodes.

In [21]:
required_packages = {"mwparserfromhell","geopandas","h3","geocoder","pydeck"}

import pkg_resources
for lib in required_packages - {pkg.key for pkg in pkg_resources.working_set}:
    print(f"installing {lib}")
    %pip install -q --upgrade pip
    %pip install -q $lib
    pkg_resources.require(lib)


In [23]:
import json
import mwparserfromhell
import subprocess
import json
import time

## Launch Spark (if running on a standalone environment)

* On databricks clusters the Spark Context will already have existed.

In [32]:
if not "spark" in locals():
    import pyspark
    MAX_MEMORY = "8g"  # 24 gives OOM here. # 6 gives "out of heap space"
    spark = (pyspark.sql.SparkSession.builder.appName("MyApp") 
        .config("spark.jars.packages", "io.delta:delta-core_2.12:0.8.0") 
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") 
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") 
        .config("spark.executor.memory", MAX_MEMORY) 
        .config("spark.driver.memory", MAX_MEMORY) 
        .config("spark.python.worker.reuse",False)
        .config("spark.task.maxFailures",5)
        .enableHiveSupport() 
        .getOrCreate()        
        )
spark

In [35]:
spark.sql('''
  select *
  from wikipedia_silver_structured_templates 
  limit 10
''').printSchema()



root
 |-- title: string (nullable = true)
 |-- body: string (nullable = true)
 |-- infoboxes: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- params: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- body: string (nullable = true)
 |-- templates: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- params: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- body: string (nullable = true)
 |-- extlinks: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- title: string (nullable = true)
 |    |    |-- url: string (nullable = true)
 |-- wikilinks: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- text

In [45]:
df = spark.sql('''
  select 
    title    as link_from,
    wl.title as link_to
  from wikipedia_silver_structured_templates 
  lateral view explode(wikilinks) as wl
  where title like 'Category:%'
''')
df.createOrReplaceTempView('links')
df.show(10,100)


+------------------------------------------------------------+------------------------------------------------+
|                                                   link_from|                                         link_to|
+------------------------------------------------------------+------------------------------------------------+
|                                   Category:Monza Rally Show|            Category:Rally competitions in Italy|
|                                   Category:Monza Rally Show|       Category:World Rally Championship rallies|
|                   Category:Categories by city in Kyrgyzstan|         Category:Categories by country and city|
|     Category:World War II resistance members by nationality|        Category:World War II resistance members|
|     Category:World War II resistance members by nationality|      Category:Resistance members by nationality|
|Category:Ambassadors of the State of Palestine to Costa Rica|  Category:Ambassadors of the State of Pal

In [42]:
t0 = time.perf_counter()
count = df.count()
print(f'found {count} links between categories in {time.perf_counter() - t0} seconds')

found 4803875 links between categories in 32.94304472103249 seconds


In [51]:
t0 = time.perf_counter()
df = spark.sql("select link_to as title, collect_list(link_from) as incoming_links from links group by link_to")
df.write.format("delta").saveAsTable("tmp_wiki_incoming_links")
print(f"saved incoming link table in {time.perf_counter() - t0} seconds")


AnalysisException: Table default.tmp_wiki_incoming_links already exists

In [55]:
spark.sql('select title,size(incoming_links),incoming_links from tmp_wiki_incoming_links order by size(incoming_links) desc').show(100,40)

+----------------------------------------+--------------------+----------------------------------------+
|                                   title|size(incoming_links)|                          incoming_links|
+----------------------------------------+--------------------+----------------------------------------+
|                          {{Title year}}|               11118|[Category:2021 in Philadelphia, Categ...|
|                           Help:Category|                8504|[Category:2009 protests, Category:194...|
|            Category:Songs by songwriter|                8088|[Category:Songs written by Phil Wickh...|
|              Category:Films by director|                8022|[Category:Films directed by Paul Terr...|
|Category:Minor league baseball player...|                4263|[Category:Moline A's players, Categor...|
|      Category:Wikipedia 1.0 assessments|                4192|[Category:Extinction articles by qual...|
|                      Wikipedia:Category|             