In [1]:
from pyspark.sql import SparkSession

# Cluster capacity: 5 machines x (8 cores, 16 GB each) = 40 cores in total.

# Session for the DataFrame/SQL API. Dynamic allocation is switched on together
# with the shuffle service, idle executors time out after 30 s, and every
# executor is given 4 cores.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.1.153:7077")
    .appName("A2")
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.shuffle.service.enabled", True)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.executor.cores", 4)
    .getOrCreate()
)

# Context of the same session, used below for the RDD API.
spark_context = spark_session.sparkContext

In [2]:

# Load the English and Swedish halves of the Europarl sv-en corpus from HDFS
# as text RDDs.
europarl_en = spark_context.textFile(
    "hdfs://192.168.1.153:9000/europarl/europarl-v7.sv-en.en"
)
europarl_sv = spark_context.textFile(
    "hdfs://192.168.1.153:9000/europarl/europarl-v7.sv-en.sv"
)


In [3]:
#Question A.1 

def split_line(line):
    """Split one line of text into its whitespace-separated words.

    Args:
        line: A single line of text.

    Returns:
        The list of words in ``line``; an empty list for an empty or
        whitespace-only line.
    """
    words = line.split()
    return words

# Tokenise every line of both corpora (one list of words per line).
lines_splitted_en, lines_splitted_sv = (
    corpus.map(split_line) for corpus in (europarl_en, europarl_sv)
)

# Partition counts are read from the raw text RDDs.
partitions_en = europarl_en.getNumPartitions()
partitions_sv = europarl_sv.getNumPartitions()

# Each element of the tokenised RDDs corresponds to one input line, so
# count() gives the number of lines.
number_of_lines_en = lines_splitted_en.count()
number_of_lines_sv = lines_splitted_sv.count()

print("Number of lines in English:",number_of_lines_en)
print("Number of lines in Swedish:",number_of_lines_sv)
print("Partitions English:",partitions_en,"Partitions Swedish:",partitions_sv)

Number of lines in English: 1862234
Number of lines in Swedish: 1862234
Partitions English: 2 Partitions Swedish: 3


In [4]:
#Question A.2

def Func(lines):
    """Lower-case one line of text and split it into word tokens.

    The line is split on the space character. Empty strings produced by
    consecutive, leading or trailing spaces (or by an empty line) are
    discarded so that they are not later counted as words.

    Args:
        lines: A single line of text (despite the plural parameter name).

    Returns:
        A list of lower-cased, non-empty tokens; ``[]`` for an empty line.
    """
    # str.split(' ') yields '' for every extra space and [''] for an empty
    # line; filter those out instead of passing them downstream.
    return [token for token in lines.lower().split(' ') if token]
# Pre-process both corpora with Func: one token list per input line.
rdd_en, rdd_sv = (corpus.map(Func) for corpus in (europarl_en, europarl_sv))

# Keep the first 10 pre-processed entries of each RDD for inspection.
test_lowercase_en = rdd_en.take(10)
test_lowecase_sv = rdd_sv.take(10)

# Pre-processing maps line to line, so the line counts should be unchanged.
test_number_of_lines_en = rdd_en.count()
test_number_of_lines_sv = rdd_sv.count()

print("Number of lines in English:",test_number_of_lines_en)
print("Number of lines in Swedish:",test_number_of_lines_sv)


Number of lines in English: 1862234
Number of lines in Swedish: 1862234


In [5]:
#Question A.3

# Shared word-count pipeline for both languages.
def _ranked_word_counts(tokenised_lines):
    """Return an RDD of (count, word) pairs sorted by descending count.

    Args:
        tokenised_lines: RDD whose elements are lists of word tokens.
    """
    occurrences = tokenised_lines.flatMap(lambda tokens: tokens) \
                                 .map(lambda word: (word, 1))
    totals = occurrences.reduceByKey(lambda left, right: left + right)
    # Swap to (count, word) so that the count becomes the sort key.
    return totals.map(lambda pair: (pair[1], pair[0])).sortByKey(ascending=False)


# 10 most frequent words in English
words_en = _ranked_word_counts(rdd_en)
print ("10 Most frequent Words in English:",words_en.take(10))

# 10 most frequent words in Swedish
words_sv = _ranked_word_counts(rdd_sv)
print ("10 Most frequent Words in Swedish:",words_sv.take(10))



10 Most frequent Words in English: [(3498375, 'the'), (1659758, 'of'), (1539760, 'to'), (1288401, 'and'), (1085993, 'in'), (797516, 'that'), (773522, 'a'), (758050, 'is'), (534242, 'for'), (522849, 'we')]
10 Most frequent Words in Swedish: [(1706293, 'att'), (1344830, 'och'), (1050774, 'i'), (924866, 'det'), (913276, 'som'), (908680, 'för'), (738068, 'av'), (694381, 'är'), (620310, 'en'), (539797, 'vi')]


In [15]:
#Question A.4

# Key every pre-processed line by its line number so that the two corpora can
# be joined line by line. zipWithIndex() yields (line, index); the map swaps
# each pair to (index, line) so the index becomes the key.
# The former identity maps (`lambda x: x`, and `lambda x: (x)`, which is not a
# tuple) and the discarded `take(2);` calls had no effect on the result and
# have been dropped.

# Key-value lines in English
lines_en = rdd_en.zipWithIndex().map(lambda pair: (pair[1], pair[0]))

# Key-value lines in Swedish
lines_sv = rdd_sv.zipWithIndex().map(lambda pair: (pair[1], pair[0]))



In [16]:
lines_sv.take(2)

[(0, ['återupptagande', 'av', 'sessionen']),
 (1,
  ['jag',
   'förklarar',
   'europaparlamentets',
   'session',
   'återupptagen',
   'efter',
   'avbrottet',
   'den',
   '17',
   'december.',
   'jag',
   'vill',
   'på',
   'nytt',
   'önska',
   'er',
   'ett',
   'gott',
   'nytt',
   'år',
   'och',
   'jag',
   'hoppas',
   'att',
   'ni',
   'haft',
   'en',
   'trevlig',
   'semester.'])]

In [7]:
# Count the keyed RDDs to confirm that indexing kept every line.
number_of_lines_en = lines_en.count()
number_of_lines_sv = lines_sv.count()

# Print the counts computed just above; the earlier test_number_of_lines_*
# variables hold the counts of rdd_en / rdd_sv, not of the keyed RDDs.
print("Number of lines in English:", number_of_lines_en)
print("Number of lines in Swedish:", number_of_lines_sv)

Number of lines in English: 1862234
Number of lines in Swedish: 1862234


In [8]:
# Inner-join the English and Swedish keyed RDDs on their shared key (the line
# index). Each result record has the form (key, (english_value, swedish_value)).
inner_join = lines_en.join(lines_sv)


In [19]:
# Order the joined records by their key (element 0 of each record) and
# preview the first two.
rdd_out = inner_join.sortBy(keyfunc=lambda record: record[0], ascending=True)
rdd_out.take(2)

[(0,
  ((['resumption', 'of', 'the', 'session'], 1),
   (['återupptagande', 'av', 'sessionen'], 1))),
 (1,
  ((['i',
     'declare',
     'resumed',
     'the',
     'session',
     'of',
     'the',
     'european',
     'parliament',
     'adjourned',
     'on',
     'friday',
     '17',
     'december',
     '1999,',
     'and',
     'i',
     'would',
     'like',
     'once',
     'again',
     'to',
     'wish',
     'you',
     'a',
     'happy',
     'new',
     'year',
     'in',
     'the',
     'hope',
     'that',
     'you',
     'enjoyed',
     'a',
     'pleasant',
     'festive',
     'period.'],
    1),
   (['jag',
     'förklarar',
     'europaparlamentets',
     'session',
     'återupptagen',
     'efter',
     'avbrottet',
     'den',
     '17',
     'december.',
     'jag',
     'vill',
     'på',
     'nytt',
     'önska',
     'er',
     'ett',
     'gott',
     'nytt',
     'år',
     'och',
     'jag',
     'hoppas',
     'att',
     'ni',
     'haft',
     'e

In [None]:
# Stop the SparkContext once the analysis is finished.
spark_context.stop()