In [11]:
from pyspark.sql import SparkSession
from pyspark import SparkContext

In [9]:
# Create (or reuse) the Spark session for this notebook.
# Settings applied below: dynamic executor allocation with shuffle tracking,
# the external shuffle service, a 30s executor idle timeout, and a cap of
# 4 cores for the application.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.70:7077")
    .appName("Sepehr_3A")
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", True)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.cores.max", 4)
    .getOrCreate()
)

In [10]:
def add(a, b):
    """Return ``a + b``.

    Addition is associative and commutative, so the order in which partial
    results are combined does not affect the final value.
    """
    total = a + b
    return total
# Distribute the integers 0 .. 10**7 - 1, keep the even ones, square them,
# and fold the squares together with `add`.
rdd = spark_session.sparkContext.parallelize(range(10 ** 7))
result = (
    rdd.filter(lambda value: value % 2 == 0)
    .map(lambda value: value ** 2)
    .reduce(add)
)
print(result)

[Stage 0:>                                                          (0 + 2) / 2]

166666616666670000000


                                                                                

In [18]:
# Spark context of the active session
sc = spark_session.sparkContext

# ---- Line counts ---------------------------------------------------------

# English transcript: load the text file as an RDD of lines and count them
rdd_en = sc.textFile("hdfs://192.168.2.70:9000/europarl/europarl-v7.sv-en.en")
linecount_en = rdd_en.count()
print(f"Number of Lines in the English Version: {linecount_en}")

# Swedish transcript: same procedure
rdd_sv = sc.textFile("hdfs://192.168.2.70:9000/europarl/europarl-v7.sv-en.sv")
linecount_sv = rdd_sv.count()
print(f"Number of Lines in the Swedish Version: {linecount_sv}")

# Report whether both files contain the same number of lines
print(
    "The Number of Lines in both Languages are the same"
    if linecount_en == linecount_sv
    else "The Number of Lines in Swedish and English are Different"
)

# ---- Partition counts ----------------------------------------------------

rdd_en_partitions = rdd_en.getNumPartitions()
rdd_sv_partitions = rdd_sv.getNumPartitions()

print(f'Number of partitions in English Version: {rdd_en_partitions}')
print(f'Number of partitions in Swedish Version: {rdd_sv_partitions}')

# Report whether both RDDs were split into the same number of partitions
print(
    "The Number of Partitions in both Languages are the same"
    if rdd_en_partitions == rdd_sv_partitions
    else "The Number of Partitions in Swedish and English are Different"
)





                                                                                

Number of Lines in the English Version: 1862234




Number of Lines in the Swedish Version: 1862234
The Number of Lines in both Languages are the same
Number of partitions in English Version: 2
Number of partitions in Swedish Version: 3
The Number of Partitions in Swedish and English are Different




In [19]:
# function to pre-process the RDDs
def prp(rdd):
    """Pre-process an RDD of text lines into lists of lower-case tokens.

    Args:
        rdd: Object exposing ``map`` whose elements support
            ``.lower().split()`` (i.e. strings).

    Returns:
        The result of ``rdd.map``: one list of whitespace-separated,
        lower-cased tokens per input element.
    """
    def _tokenize(line):
        # split() with no argument splits on any run of whitespace
        return line.lower().split()

    return rdd.map(_tokenize)

In [20]:
# Tokenize both transcripts (English first, then Swedish)
rdd_en_prp, rdd_sv_prp = prp(rdd_en), prp(rdd_sv)

In [23]:
# Show the first 10 tokenized lines of each language version
print(f"10 Entries of the English Version:\n {rdd_en_prp.take(10)}")
print(f"10 Entries of the Swedish Version:\n {rdd_sv_prp.take(10)}")

# ---- Line counts after pre-processing -------------------------------------

linecount_en_prp = rdd_en_prp.count()
print(f"Number of Lines in the English pre-processed Version: {linecount_en_prp}")

linecount_sv_prp = rdd_sv_prp.count()
print(f"Number of Lines in the Swedish pre-processed Version: {linecount_sv_prp}")

# Report whether the two pre-processed RDDs still have equal line counts
print(
    "The Number of the line is still the same after pre processing"
    if linecount_en_prp == linecount_sv_prp
    else "The Number of the line is not the same after pre processing"
)

                                                                                

10 Entries of the English Version:
 [['resumption', 'of', 'the', 'session'], ['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', '17', 'december', '1999,', 'and', 'i', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period.'], ['although,', 'as', 'you', 'will', 'have', 'seen,', 'the', 'dreaded', "'millennium", "bug'", 'failed', 'to', 'materialise,', 'still', 'the', 'people', 'in', 'a', 'number', 'of', 'countries', 'suffered', 'a', 'series', 'of', 'natural', 'disasters', 'that', 'truly', 'were', 'dreadful.'], ['you', 'have', 'requested', 'a', 'debate', 'on', 'this', 'subject', 'in', 'the', 'course', 'of', 'the', 'next', 'few', 'days,', 'during', 'this', 'part-session.'], ['in', 'the', 'meantime,', 'i', 'should', 'like', 'to', 'observe', 'a', "minute'", 's', 'silence,', 'as', 'a', 'number', 'of', 'members', 'have', '

                                                                                

10 Entries of the Swedish Version:
 [['återupptagande', 'av', 'sessionen'], ['jag', 'förklarar', 'europaparlamentets', 'session', 'återupptagen', 'efter', 'avbrottet', 'den', '17', 'december.', 'jag', 'vill', 'på', 'nytt', 'önska', 'er', 'ett', 'gott', 'nytt', 'år', 'och', 'jag', 'hoppas', 'att', 'ni', 'haft', 'en', 'trevlig', 'semester.'], ['som', 'ni', 'kunnat', 'konstatera', 'ägde', '"den', 'stora', 'år', '2000-buggen"', 'aldrig', 'rum.', 'däremot', 'har', 'invånarna', 'i', 'ett', 'antal', 'av', 'våra', 'medlemsländer', 'drabbats', 'av', 'naturkatastrofer', 'som', 'verkligen', 'varit', 'förskräckliga.'], ['ni', 'har', 'begärt', 'en', 'debatt', 'i', 'ämnet', 'under', 'sammanträdesperiodens', 'kommande', 'dagar.'], ['till', 'dess', 'vill', 'jag', 'att', 'vi,', 'som', 'ett', 'antal', 'kolleger', 'begärt,', 'håller', 'en', 'tyst', 'minut', 'för', 'offren', 'för', 'bl.a.', 'stormarna', 'i', 'de', 'länder', 'i', 'europeiska', 'unionen', 'som', 'drabbats.'], ['jag', 'ber', 'er', 'resa', 'e

                                                                                

Number of Lines in the English pre-processed Version: 1862234




Number of Lines in the Swedish pre-processed Version: 1862234
The Number of the line is still the same after pre processing


                                                                                

In [32]:
#mapreduce function for the pre-processed texts, and frequency counter function
def n_freq(language, rdd, n):
    """Print the ``n`` most frequent words of a tokenized RDD.

    Args:
        language: Label interpolated into the printed header line.
        rdd: RDD whose elements are iterables of words (for example the
            output of ``prp``).
        n: Number of ``(word, count)`` pairs to report.

    Returns:
        None. A header line followed by one ``word: count`` line per entry
        is written to stdout, most frequent word first.
    """
    # Word count: flatten the token lists, emit (word, 1) pairs, then sum
    # the ones per word.
    wordcount = (
        rdd.flatMap(lambda line: line)
        .map(lambda word: (word, 1))
        .reduceByKey(lambda a, b: a + b)
    )
    # Only the top n entries are needed, so select them with takeOrdered
    # instead of sorting the entire RDD and reading its head. Negating the
    # count turns the ascending order into "largest count first".
    freq = wordcount.takeOrdered(n, key=lambda pair: -pair[1])
    print(f'{n} most Frequent Words in {language} version:')
    for word, count in freq:
        print(f"{word}: {count}")


In [34]:
# Report the 10 most frequent words per language (English first, then Swedish)
n_freq(language="English", rdd=rdd_en_prp, n=10)
n_freq(language="Swedish", rdd=rdd_sv_prp, n=10)

                                                                                

10 most Frequent Words in English version:
the: 3498574
of: 1659884
to: 1539823
and: 1288620
in: 1086089
that: 797576
a: 773812
is: 758087
for: 534270
we: 522879


                                                                                

10 most Frequent Words in Swedish version:
att: 1706309
och: 1344895
i: 1050989
det: 924878
som: 913302
för: 908703
av: 738102
är: 694389
en: 620347
vi: 539808
