In [22]:
# Import main packages
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [23]:
# Build the Spark session for the "pm10" application and expose its SparkContext
sparkConf = SparkConf().setAppName("pm10")
spark = (
    SparkSession.builder
    .config(conf=sparkConf)
    .getOrCreate()
)
sc = spark.sparkContext

In [36]:
# Load the tab-separated article/categories listing from HDFS.
# The first line is treated as a header; rows that cannot be parsed are
# dropped (DROPMALFORMED) instead of failing the read.
df = (
    spark.read
    .format("csv")
    .options(header="true", delimiter="\t", mode="DROPMALFORMED")
    .load("hdfs://localhost:9000/user/bigdata2022/datasets/wiki/categories.tsv")
)
df.show(5)

+--------------------+--------------------+
|             article|          categories|
+--------------------+--------------------+
|%C3%81ed%C3%A1n_m...|subject.History.B...|
|%C3%81ed%C3%A1n_m...|subject.People.Hi...|
|          %C3%85land|   subject.Countries|
|          %C3%85land|subject.Geography...|
|  %C3%89douard_Manet|subject.People.Ar...|
+--------------------+--------------------+
only showing top 5 rows



In [41]:
from urllib.parse import unquote

# Both columns are percent-encoded in the source file (e.g. "%C3%85land");
# decode them so each record becomes a readable (article, category) tuple.
rdd = df.rdd.map(lambda row: (unquote(row.article), unquote(row.categories)))

# Preview a bounded sample only: collect() would ship every record to the
# driver and flood the notebook output.
rdd.take(20)

[('Áedán_mac_Gabráin',
  'subject.History.British_History.British_History_1500_and_before_including_Roman_Britain'),
 ('Áedán_mac_Gabráin', 'subject.People.Historical_figures'),
 ('Åland', 'subject.Countries'),
 ('Åland', 'subject.Geography.European_Geography.European_Countries'),
 ('Édouard_Manet', 'subject.People.Artists'),
 ('Éire', 'subject.Countries'),
 ('Éire', 'subject.Geography.European_Geography.European_Countries'),
 ('Óengus_I_of_the_Picts',
  'subject.History.British_History.British_History_1500_and_before_including_Roman_Britain'),
 ('Óengus_I_of_the_Picts', 'subject.People.Historical_figures'),
 ('€2_commemorative_coins', 'subject.Business_Studies.Currency'),
 ('10th_century', 'subject.History.General_history'),
 ('11th_century', 'subject.History.General_history'),
 ('12th_century', 'subject.History.General_history'),
 ('13th_century', 'subject.History.General_history'),
 ('14th_century', 'subject.History.General_history'),
 ('15th_Marine_Expeditionary_Unit',
  'subject.H

In [46]:
# Gather all categories of an article into a single list: wrap every category
# in a one-element list, then concatenate the lists that share an article key.
output = rdd.mapValues(lambda category: [category]) \
            .reduceByKey(lambda left, right: left + right)

# Preview a bounded sample only: collect() would pull the whole grouped
# dataset onto the driver and dump it into the notebook.
output.take(20)

                                                                                

[('Áedán_mac_Gabráin',
  ['subject.History.British_History.British_History_1500_and_before_including_Roman_Britain',
   'subject.People.Historical_figures']),
 ('Åland',
  ['subject.Countries',
   'subject.Geography.European_Geography.European_Countries']),
 ('Édouard_Manet', ['subject.People.Artists']),
 ('Éire',
  ['subject.Countries',
   'subject.Geography.European_Geography.European_Countries']),
 ('Óengus_I_of_the_Picts',
  ['subject.History.British_History.British_History_1500_and_before_including_Roman_Britain',
   'subject.People.Historical_figures']),
 ('€2_commemorative_coins', ['subject.Business_Studies.Currency']),
 ('10th_century', ['subject.History.General_history']),
 ('11th_century', ['subject.History.General_history']),
 ('12th_century', ['subject.History.General_history']),
 ('13th_century', ['subject.History.General_history']),
 ('14th_century', ['subject.History.General_history']),
 ('15th_Marine_Expeditionary_Unit',
  ['subject.History.Military_History_and_War']),


In [7]:
# NOTE(review): `critical_sites` is not defined in any cell visible here, and
# this cell's execution count is out of order — confirm where it comes from.
# Presumably it holds (site_code, count) pairs; verify against its producer.
# Swap each pair so the second field becomes the key, then merge the values
# that share a key into one comma-separated string.
top_sites = (
    critical_sites
    .map(lambda record: (record[1], record[0]))
    .reduceByKey(lambda left, right: left + ", " + right)
)
# Show the 50 entries with the largest keys.
top_sites.sortByKey(ascending=False).take(50)

                                                                                

[(14728, '0004'),
 (14134, '0005'),
 (12605, '0500'),
 (8537, '0011'),
 (8374, '1004'),
 (7951, '2002'),
 (7593, '0003'),
 (7248, '0014'),
 (7219, '2010'),
 (6971, '0001'),
 (6730, '1003'),
 (6322, '0232'),
 (5835, '0002'),
 (5718, '0020'),
 (5457, '0017'),
 (5435, '1002'),
 (4999, '0007'),
 (4901, '5001'),
 (4872, '0008'),
 (4492, '4004'),
 (4359, '4003'),
 (3983, '0019'),
 (3978, '0010'),
 (3852, '0018'),
 (3822, '1005'),
 (3659, '4009'),
 (3623, '8005'),
 (3225, '0031'),
 (3190, '0009'),
 (3173, '3015'),
 (3083, '1001'),
 (2911, '0016'),
 (2878, '3001'),
 (2684, '3008'),
 (2651, '0022'),
 (2615, '4011'),
 (2610, '0025'),
 (2585, '0021'),
 (2478, '1999'),
 (2248, '3002'),
 (2221, '9004'),
 (2156, '8001'),
 (2011, '1016'),
 (1992, '7004'),
 (1942, '3011'),
 (1921, '0015'),
 (1904, '0006'),
 (1869, '2005'),
 (1857, '3016'),
 (1798, '8011')]

In [8]:
# Stop the Spark session created at the top of the notebook
spark.stop()