In [1]:
from pyspark.sql import SparkSession

# (8 cores, 16gb per machine) x 5 = 40 cores

# New API
# Build (or reuse) the session; the chain is parenthesised so no line
# continuation characters are needed.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.1.153:7077")
    .appName("A2")
    # Dynamic allocation is switched on together with the shuffle service.
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.shuffle.service.enabled", True)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.executor.cores", 4)
    .getOrCreate()
)

# Old API (RDD): the context used by every RDD operation below.
spark_context = spark_session.sparkContext

In [2]:

# Load the English (.en) and Swedish (.sv) documents from HDFS,
# one RDD element per line of text.
europarl_en = spark_context.textFile(
    "hdfs://192.168.1.153:9000/europarl/europarl-v7.sv-en.en"
)
europarl_sv = spark_context.textFile(
    "hdfs://192.168.1.153:9000/europarl/europarl-v7.sv-en.sv"
)


In [3]:
#Question A.1 

#Function to split a single line into its whitespace-separated words.
def split_line(line):
    """Return the whitespace-separated tokens of ``line`` as a list.

    Runs of whitespace count as one separator, so an empty or blank line
    yields an empty list.
    """
    tokens = line.split()
    return tokens

# Tokenise every line of both texts (one list of words per line).
lines_splitted_en = europarl_en.map(split_line)
lines_splitted_sv = europarl_sv.map(split_line)

# Count the lines of each text, then look up the partition count of
# the underlying text RDDs.
number_of_lines_en = lines_splitted_en.count()
number_of_lines_sv = lines_splitted_sv.count()
partitions_en = europarl_en.getNumPartitions()
partitions_sv = europarl_sv.getNumPartitions()


print("Number of lines in English:", number_of_lines_en)
print("Number of lines in Swedish:", number_of_lines_sv)
print("Partitions English:", partitions_en, "Partitions Swedish:", partitions_sv)

Number of lines in English: 1862234
Number of lines in Swedish: 1862234
Partitions English: 2 Partitions Swedish: 3


In [9]:
#Question A.2

#Function that converts a line to lower case and splits it into words on single spaces.
def Func(lines):
    """Lower-case one line of text and split it on single spaces.

    The separator is exactly ``' '``, so consecutive spaces and empty
    lines produce empty-string tokens in the returned list (an empty line
    becomes ``['']``).
    """
    return lines.lower().split(' ')
    
# Lower-cased, tokenised version of each text.
rdd_en = europarl_en.map(Func)
rdd_sv = europarl_sv.map(Func)

# Count the lines again so they can be compared with the counts from A.1.
test_number_of_lines_en = rdd_en.count()
test_number_of_lines_sv = rdd_sv.count()

# Inspect the first 10 entries from each RDD.
test_lower_case_en = rdd_en.take(10)
test_lower_case_sv = rdd_sv.take(10)

print("Lower case in English:", test_lower_case_en)
print("Lower case in Swedish:", test_lower_case_sv)
print("Number of lines in English:", test_number_of_lines_en)
print("Number of lines in Swedish:", test_number_of_lines_sv)


Lower case in English: [['resumption', 'of', 'the', 'session'], ['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', '17', 'december', '1999,', 'and', 'i', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period.'], ['although,', 'as', 'you', 'will', 'have', 'seen,', 'the', 'dreaded', "'millennium", "bug'", 'failed', 'to', 'materialise,', 'still', 'the', 'people', 'in', 'a', 'number', 'of', 'countries', 'suffered', 'a', 'series', 'of', 'natural', 'disasters', 'that', 'truly', 'were', 'dreadful.'], ['you', 'have', 'requested', 'a', 'debate', 'on', 'this', 'subject', 'in', 'the', 'course', 'of', 'the', 'next', 'few', 'days,', 'during', 'this', 'part-session.'], ['in', 'the', 'meantime,', 'i', 'should', 'like', 'to', 'observe', 'a', "minute'", 's', 'silence,', 'as', 'a', 'number', 'of', 'members', 'have', 'requested,', 

In [10]:
#Question A.3

#3.1 10 most frequent words per language

def count_words_desc(tokenized_lines):
    """Return an RDD of ``(count, word)`` pairs sorted by descending count.

    Parameters
    ----------
    tokenized_lines : RDD whose elements are lists of word strings.

    Empty-string tokens are dropped before counting: the tokeniser splits
    on single spaces, so empty lines and repeated spaces yield ``''``
    entries that are not words.
    """
    return (
        tokenized_lines
        .flatMap(lambda tokens: tokens)
        .filter(lambda word: word != '')
        .map(lambda word: (word, 1))
        .reduceByKey(lambda left, right: left + right)
        # Swap to (count, word) so the count becomes the sort key.
        .map(lambda word_count: (word_count[1], word_count[0]))
        .sortByKey(False)
    )

# English
words_en = count_words_desc(rdd_en)

print ("10 Most frequent Words in English:",words_en.take(10))

# Swedish
words_sv = count_words_desc(rdd_sv)

print ("10 Most frequent Words in Swedish:",words_sv.take(10))



10 Most frequent Words in English: [(3498375, 'the'), (1659758, 'of'), (1539760, 'to'), (1288401, 'and'), (1085993, 'in'), (797516, 'that'), (773522, 'a'), (758050, 'is'), (534242, 'for'), (522849, 'we')]
10 Most frequent Words in Swedish: [(1706293, 'att'), (1344830, 'och'), (1050774, 'i'), (924866, 'det'), (913276, 'som'), (908680, 'för'), (738068, 'av'), (694381, 'är'), (620310, 'en'), (539797, 'vi')]


In [11]:
#Question A.4

def index_lines(tokenized_lines):
    """Key every element of an RDD by its position.

    Returns an RDD of ``(index, element)`` pairs, where ``index`` is the
    value assigned by ``zipWithIndex`` (swapped to the front so it can be
    used as a join key).
    """
    return tokenized_lines.zipWithIndex().map(
        lambda element_and_index: (element_and_index[1], element_and_index[0])
    )

#Keyvalue lines in English
lines_en = index_lines(rdd_en)

#Keyvalue lines in Swedish
lines_sv = index_lines(rdd_sv)



In [12]:
# Check that both indexed RDDs still contain the same number of lines.
number_of_lines_en = lines_en.count()
number_of_lines_sv = lines_sv.count()

print("Number of lines in English:", number_of_lines_en)
print("Number of lines in Swedish:", number_of_lines_sv)

Number of lines in English: 1862234
Number of lines in Swedish: 1862234


In [13]:
#Join the two RDDs together according to the line number key.
# Each joined element is (line_number, (english_tokens, swedish_tokens)).

inner_join = lines_en.join(lines_sv)
rdd_out = inner_join.sortBy(lambda x: x[0])

#Filter: exclude line pairs with an empty sentence, keep only pairs whose
#sentences have fewer than 15 words and the same number of words.

SENTENCE_LENGTH_LIMIT = 15  # exclusive upper bound on words per sentence


def keep_sentence_pair(entry):
    """Decide whether a joined ``(key, (english, swedish))`` entry is kept.

    A pair is kept when both sentences contain at least one non-empty
    token, the English sentence is shorter than ``SENTENCE_LENGTH_LIMIT``
    and both sentences have the same number of tokens.
    """
    english, swedish = entry[1]
    # An empty line tokenises to [''], so test the tokens, not the length.
    if not (any(english) and any(swedish)):
        return False
    return len(english) < SENTENCE_LENGTH_LIMIT and len(english) == len(swedish)


# filter() keeps the order produced by the sort above, so no second
# sortBy (and its extra shuffle) is needed.
rdd_filter = rdd_out.filter(keep_sentence_pair)

pair_words = rdd_filter.map(lambda x: x[1])


In [20]:
#Test filters: show the first (English tokens, Swedish tokens) pair that passed the filters.
pair_words.take(1)

[(['it',
   'says',
   'that',
   'this',
   'should',
   'be',
   'done',
   'despite',
   'the',
   'principle',
   'of',
   'relative',
   'stability.'],
  ['i',
   'betänkandet',
   'står',
   'det',
   'att',
   'detta',
   'bör',
   'göras',
   'trots',
   'principen',
   'om',
   'relativ',
   'stabilitet.'])]

In [16]:
#Pair each Swedish word with the English word at the same position in the sentence pair.

def pair(words):
    """Pair up the words of two sentences position by position.

    Parameters
    ----------
    words : sequence of two token lists; ``words[0]`` supplies the second
        item of each pair and ``words[1]`` the first.

    Returns
    -------
    list of ``(words[1][i], words[0][i])`` tuples, truncated to the shorter
    sentence. Empty sentences give an empty list.
    """
    # A single zip is enough; no loop over the sentence is required.
    return list(zip(words[1], words[0]))

# Flatten every sentence pair into its individual word pairs.
pair_words_en = pair_words.flatMap(pair)


In [21]:
#Test function for pairing words: show the first five word pairs.
pair_words_en.take(5)

[('i', 'it'),
 ('betänkandet', 'says'),
 ('står', 'that'),
 ('det', 'this'),
 ('att', 'should')]

In [18]:
#Reduce pairing words Swedish-English.
# Count how often each distinct word pair occurs ...
pair_counts = pair_words_en.map(lambda word_pair: (word_pair, 1)).reduceByKey(
    lambda left, right: left + right
)
# ... then put the count first and sort by it in descending order.
pair_words_most = pair_counts.map(
    lambda pair_and_count: (pair_and_count[1], pair_and_count[0])
).sortByKey(False)


In [19]:
# Show the ten (count, word pair) entries with the highest counts.
print ("10 Most frequent Words in Swedish-English:",pair_words_most.take(10))

10 Most frequent Words in Swedish-English: [(15364, ('är', 'is')), (10875, ('vi', 'we')), (10416, ('jag', 'i')), (8503, ('och', 'and')), (5960, ('en', 'a')), (5762, ('det', 'it')), (5682, ('detta', 'this')), (5279, ('i', 'in')), (4856, ('inte', 'not')), (4288, ('att', 'to'))]


In [24]:
# Stop the Spark context now that the notebook has finished.
spark_context.stop()