In [1]:
# Import main packages
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [2]:
# Create Spark context
sparkConf = SparkConf()
sparkConf.setAppName("wiki")
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()
sc = spark.sparkContext

22/11/25 12:11:38 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
# Read dataframe
df_cat = spark.read \
    .format("csv") \
    .option("header", "true") \
    .option("delimiter", "\t") \
    .option("mode", "DROPMALFORMED") \
    .load("hdfs://localhost:9000/user/bigdata2022/datasets/wiki/categories.tsv")
df_cat.show(5)

+--------------------+--------------------+
|             article|          categories|
+--------------------+--------------------+
|%C3%81ed%C3%A1n_m...|subject.History.B...|
|%C3%81ed%C3%A1n_m...|subject.People.Hi...|
|          %C3%85land|   subject.Countries|
|          %C3%85land|subject.Geography...|
|  %C3%89douard_Manet|subject.People.Ar...|
+--------------------+--------------------+
only showing top 5 rows



In [4]:
from urllib.parse import unquote
rdd_cat = df_cat.rdd
rdd_cat  = rdd_cat.map(lambda x: (unquote(x.article),unquote(x.categories)))
rdd_cat.collect()

[Stage 2:>                                                          (0 + 1) / 1]                                                                                

[('Áedán_mac_Gabráin',
  'subject.History.British_History.British_History_1500_and_before_including_Roman_Britain'),
 ('Áedán_mac_Gabráin', 'subject.People.Historical_figures'),
 ('Åland', 'subject.Countries'),
 ('Åland', 'subject.Geography.European_Geography.European_Countries'),
 ('Édouard_Manet', 'subject.People.Artists'),
 ('Éire', 'subject.Countries'),
 ('Éire', 'subject.Geography.European_Geography.European_Countries'),
 ('Óengus_I_of_the_Picts',
  'subject.History.British_History.British_History_1500_and_before_including_Roman_Britain'),
 ('Óengus_I_of_the_Picts', 'subject.People.Historical_figures'),
 ('€2_commemorative_coins', 'subject.Business_Studies.Currency'),
 ('10th_century', 'subject.History.General_history'),
 ('11th_century', 'subject.History.General_history'),
 ('12th_century', 'subject.History.General_history'),
 ('13th_century', 'subject.History.General_history'),
 ('14th_century', 'subject.History.General_history'),
 ('15th_Marine_Expeditionary_Unit',
  'subject.H

In [5]:
output_cat = rdd_cat.map(lambda x: (x[0],[x[1]])) \
            .reduceByKey(lambda a, b: a+b)
output_cat.collect()

[('Áedán_mac_Gabráin',
  ['subject.History.British_History.British_History_1500_and_before_including_Roman_Britain',
   'subject.People.Historical_figures']),
 ('Åland',
  ['subject.Countries',
   'subject.Geography.European_Geography.European_Countries']),
 ('Édouard_Manet', ['subject.People.Artists']),
 ('Éire',
  ['subject.Countries',
   'subject.Geography.European_Geography.European_Countries']),
 ('Óengus_I_of_the_Picts',
  ['subject.History.British_History.British_History_1500_and_before_including_Roman_Britain',
   'subject.People.Historical_figures']),
 ('€2_commemorative_coins', ['subject.Business_Studies.Currency']),
 ('10th_century', ['subject.History.General_history']),
 ('11th_century', ['subject.History.General_history']),
 ('12th_century', ['subject.History.General_history']),
 ('13th_century', ['subject.History.General_history']),
 ('14th_century', ['subject.History.General_history']),
 ('15th_Marine_Expeditionary_Unit',
  ['subject.History.Military_History_and_War']),


In [6]:
# Read dataframe
df_link = spark.read \
    .format("csv") \
    .option("header", "true") \
    .option("delimiter", "\t") \
    .option("mode", "DROPMALFORMED") \
    .load("hdfs://localhost:9000/user/bigdata2022/datasets/wiki/links.tsv")
df_link.show(5)

+--------------------+--------------+
|              source|   destination|
+--------------------+--------------+
|%C3%81ed%C3%A1n_m...|          Bede|
|%C3%81ed%C3%A1n_m...|       Columba|
|%C3%81ed%C3%A1n_m...|D%C3%A1l_Riata|
|%C3%81ed%C3%A1n_m...| Great_Britain|
|%C3%81ed%C3%A1n_m...|       Ireland|
+--------------------+--------------+
only showing top 5 rows



In [7]:
from urllib.parse import unquote
rdd_link = df_link.rdd
rdd_link  = rdd_link.map(lambda x: (unquote(x.source),unquote(x.destination)))
rdd_link.collect()

                                                                                

[('Áedán_mac_Gabráin', 'Bede'),
 ('Áedán_mac_Gabráin', 'Columba'),
 ('Áedán_mac_Gabráin', 'Dál_Riata'),
 ('Áedán_mac_Gabráin', 'Great_Britain'),
 ('Áedán_mac_Gabráin', 'Ireland'),
 ('Áedán_mac_Gabráin', 'Isle_of_Man'),
 ('Áedán_mac_Gabráin', 'Monarchy'),
 ('Áedán_mac_Gabráin', 'Orkney'),
 ('Áedán_mac_Gabráin', 'Picts'),
 ('Áedán_mac_Gabráin', 'Scotland'),
 ('Áedán_mac_Gabráin', 'Wales'),
 ('Åland', '20th_century'),
 ('Åland', 'Baltic_Sea'),
 ('Åland', 'Crimean_War'),
 ('Åland', 'Currency'),
 ('Åland', 'Euro'),
 ('Åland', 'European_Union'),
 ('Åland', 'Finland'),
 ('Åland', 'League_of_Nations'),
 ('Åland', 'List_of_countries_by_system_of_government'),
 ('Åland', 'Nationality'),
 ('Åland', 'Parliamentary_system'),
 ('Åland', 'Police'),
 ('Åland', 'Russia'),
 ('Åland', 'Stockholm'),
 ('Åland', 'Sweden'),
 ('Åland', 'Time_zone'),
 ('Åland', 'Tourism'),
 ('Åland', 'United_Kingdom'),
 ('Åland', 'World_War_II'),
 ('Édouard_Manet', 'Absinthe'),
 ('Édouard_Manet', 'Beer'),
 ('Édouard_Manet', 'C

In [8]:
output_link = rdd_link.map(lambda x: (x[0],[x[1]])) \
            .reduceByKey(lambda a, b: a+b)
output_link.collect()

                                                                                

[('Áedán_mac_Gabráin',
  ['Bede',
   'Columba',
   'Dál_Riata',
   'Great_Britain',
   'Ireland',
   'Isle_of_Man',
   'Monarchy',
   'Orkney',
   'Picts',
   'Scotland',
   'Wales']),
 ('Åland',
  ['20th_century',
   'Baltic_Sea',
   'Crimean_War',
   'Currency',
   'Euro',
   'European_Union',
   'Finland',
   'League_of_Nations',
   'List_of_countries_by_system_of_government',
   'Nationality',
   'Parliamentary_system',
   'Police',
   'Russia',
   'Stockholm',
   'Sweden',
   'Time_zone',
   'Tourism',
   'United_Kingdom',
   'World_War_II']),
 ('Édouard_Manet',
  ['Absinthe',
   'Beer',
   'Claude_Monet',
   'Diego_Velázquez',
   'Edgar_Allan_Poe',
   'France',
   'Francisco_Goya',
   'Germany',
   'Impressionism',
   'Italy',
   'Landscape',
   'Netherlands',
   'Painting',
   'Paris',
   'Photography',
   'Raphael',
   'Renaissance',
   'Sweden',
   'United_States_dollar',
   'Washington,_D.C.']),
 ('Éire',
  ['Canada',
   'English_language',
   'George_VI_of_the_United_Kingdom

In [9]:
# pip3 install neo4j-driver
# python3 example.py

#from neo4j import GraphDatabase, basic_auth

#def functionx(namex):
#    driver = GraphDatabase.driver(
#      "bolt://44.212.46.93:7687",
#      auth=basic_auth("neo4j", "tours-procurements-deserters"))
#    with driver.session(database="neo4j") as session:
#        session.execute_write(lambda tx: tx.run("CREATE (a:Article {name: $name})", name=namex))
#        driver.close()
        
#rdd_cat.map(lambda x: functionx(x[0]))



In [20]:
from py2neo import Graph
import threading

def functiony(namex):
    graph = Graph("bolt://localhost:7474", auth=("neo4j", ""))
    graph.run("CREATE (a:Article {name: $name})", name=namex)

rdd_cat.map(lambda x: functiony(x[0]))


Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonUtils.getBroadcastThreshold.
: java.lang.NullPointerException
	at org.apache.spark.api.java.JavaSparkContext$.toSparkContext(JavaSparkContext.scala:739)
	at org.apache.spark.api.python.PythonUtils$.getBroadcastThreshold(PythonUtils.scala:86)
	at org.apache.spark.api.python.PythonUtils.getBroadcastThreshold(PythonUtils.scala)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)


In [18]:
from neo4j import GraphDatabase, basic_auth

driver = GraphDatabase.driver(
  "bolt://44.212.46.93:7687",
  auth=basic_auth("neo4j", "tours-procurements-deserters"))

cypher_query = '''
MATCH (n)
RETURN COUNT(n) AS count
LIMIT $limit
'''

with driver.session(database="neo4j") as session:
    results = session.execute_read(
    lambda tx: tx.run(cypher_query,
                      limit=10).data())
    for record in results:
        print(record['count'])

driver.close()


3


In [12]:
# Stop context
spark.stop()