In [1]:
from pyspark.sql import SparkSession
import re
from operator import add

# Cluster size: (8 cores, 16gb per machine) x 5 = 40 cores

# New API
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.87:7077")
    .appName("mei_wu_part-1")
    # Dynamic allocation is switched on together with the shuffle service,
    # and the executor idle timeout is set to 30 seconds.
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.shuffle.service.enabled", True)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.executor.cores", 4)
    .getOrCreate()
)
# Alternative (disabled): .config('spark.executor.cores', 2)


# Old API (RDD)
spark_context = spark_session.sparkContext


def _cached_text_rdd(path):
    """Open ``path`` through the Hadoop TextInputFormat and mark the RDD as cached.

    The records are (key, value) pairs whose key class is LongWritable and
    whose value class is Text.
    """
    text_rdd = spark_context.newAPIHadoopFile(
        path,
        inputFormatClass='org.apache.hadoop.mapreduce.lib.input.TextInputFormat',
        keyClass='org.apache.hadoop.io.LongWritable',
        valueClass='org.apache.hadoop.io.Text',
    )
    return text_rdd.cache()  # Keep this RDD in memory!


rdd_en = _cached_text_rdd('hdfs://192.168.2.87:9000/europarl/europarl-v7.sv-en.en')

rdd_sv = _cached_text_rdd('hdfs://192.168.2.87:9000/europarl/europarl-v7.sv-en.sv')

In [2]:
en_count = rdd_en.count() # A.1.1 Count the lines in the English corpus
# Bare name as the last expression so the notebook displays the count.
en_count

1862234

In [3]:
sv_count = rdd_sv.count() # A.1.2 Count the lines in the Swedish corpus
# Bare name as the last expression so the notebook displays the count.
sv_count

1862234

In [4]:
en_count == sv_count # A.1.3 Verify that both languages have the same number of lines

True

In [5]:
rdd_en.getNumPartitions() # A.1.4 Number of partitions of the English RDD

2

In [6]:
rdd_sv.getNumPartitions() # A.1.4 Number of partitions of the Swedish RDD

3

In [7]:
# A.2.1
def lowerNsplit(w):
    """Lower-case the string ``w`` and split it on single space characters.

    Splitting uses the literal ``' '`` separator, so consecutive spaces
    produce empty-string tokens and an empty input yields ``['']``.
    Punctuation is left attached to the words.

    Returns the list of resulting tokens.
    """
    return w.lower().split(' ')

# A.2.2 Tokenise the English text: lowerNsplit is applied to every value and
# each resulting token is emitted as its own (key, word) pair under the original key.
words_en = rdd_en.flatMapValues(lowerNsplit)
# Show the first ten (key, word) pairs as the cell output.
words_en.take(10)


[(0, 'resumption'),
 (0, 'of'),
 (0, 'the'),
 (0, 'session'),
 (26, 'i'),
 (26, 'declare'),
 (26, 'resumed'),
 (26, 'the'),
 (26, 'session'),
 (26, 'of')]

In [8]:
# A.2.2 Tokenise the Swedish text: lowerNsplit is applied to every value and
# each resulting token is emitted as its own (key, word) pair under the original key.
words_sv = rdd_sv.flatMapValues(lowerNsplit)
# Show the first ten (key, word) pairs as the cell output.
words_sv.take(10)


[(0, 'återupptagande'),
 (0, 'av'),
 (0, 'sessionen'),
 (29, 'jag'),
 (29, 'förklarar'),
 (29, 'europaparlamentets'),
 (29, 'session'),
 (29, 'återupptagen'),
 (29, 'efter'),
 (29, 'avbrottet')]

In [9]:
# A.2.3 Verify that the line counts still match after the pre-processing.
# Every token carries the key of the line it came from, so the number of
# distinct keys is the number of lines still present after tokenisation.
count_en = words_en.keys().distinct().count()
count_sv = words_sv.keys().distinct().count()

count_en == count_sv

True

In [10]:
# A.3.1 Use Spark to compute the 10 most frequently occurring words in the English language corpus. Repeat for the other language.
# Drop the line keys, pair every word with a 1, then sum the ones per word.
result_en = words_en.values().map(lambda word: (word, 1)).reduceByKey(add)  # English
result_sv = words_sv.values().map(lambda word: (word, 1)).reduceByKey(add)  # Swedish

# A.3.2 Verify that your results are reasonable.
# Negating the count makes takeOrdered return the largest counts first.
result_en.takeOrdered(10, key=lambda entry: -entry[1])

[('the', 3498375),
 ('of', 1659758),
 ('to', 1539760),
 ('and', 1288401),
 ('in', 1085993),
 ('that', 797516),
 ('a', 773522),
 ('is', 758050),
 ('for', 534242),
 ('we', 522849)]

In [11]:
# A.3.2 Verify that your results are reasonable.
# Negating the count makes takeOrdered return the largest counts first.
result_sv.takeOrdered(10, key=lambda entry: -entry[1])

[('att', 1706293),
 ('och', 1344830),
 ('i', 1050774),
 ('det', 924866),
 ('som', 913276),
 ('för', 908680),
 ('av', 738068),
 ('är', 694381),
 ('en', 620310),
 ('vi', 539797)]

In [41]:
# A.4.1
def f(x):
    """Return the number of items in ``x`` (delegates to ``len``)."""
    return len(x)

# Wrap each word in a one-element list so that lists can be concatenated per key.
eng = words_en.map(lambda pair: (pair[0], [pair[1]]))

# Per key, accumulate (concatenated word list, number of words merged).
sumCount = eng.combineByKey(
    createCombiner=lambda words: (words, 1),
    mergeValue=lambda acc, words: (acc[0] + words, acc[1] + 1),
    mergeCombiners=lambda left, right: (left[0] + right[0], left[1] + right[1]),
)

sumCount.first()

(117370,
 (['there',
   'were',
   'a',
   'total',
   'of',
   '27',
   'cases',
   'in',
   '1998',
   'and',
   'the',
   'commission',
   'submitted',
   'its',
   'own',
   'report',
   'on',
   'these.'],
  18))

In [16]:
# A.4.1
# Alternative (disabled): se = words_sv.map(lambda x: (x[0], [ x[1] ])).reduceByKey(lambda a, b: a + b)
# Wrap each Swedish word in a one-element list, keeping the key it arrived with.
se = words_sv.map(lambda pair: (pair[0], [pair[1]]))

[(135067650,
  (<pyspark.resultiterable.ResultIterable at 0x7ff85aaa3c50>,
   <pyspark.resultiterable.ResultIterable at 0x7ff85aaa3048>))]

[(1042584, 27),
 (3704422, 29),
 (4680994, 30),
 (4884494, 44),
 (4896220, 32),
 (5451822, 8),
 (5678414, 43),
 (5878066, 26),
 (6018196, 15),
 (6246322, 22)]

In [None]:
# Stop the Spark session now that the notebook is finished with it.
spark_session.stop()