In [1]:
from pyspark.sql import SparkSession 

spark_session = SparkSession.builder.master("spark://192.168.2.122:7077")\
        .appName("sergi_olives_juan-a3-part_a")\
        .config("spark.dynamicAllocation.enabled", True)\
        .config("spark.dynamicAllocation.shuffleTracking.enabled", True)\
        .config("spark.shuffle.service.enabled", True)\
        .config("spark.dynamicAllocation.executorIdleTimeout", "30s")\
        .config("spark.cores.max", 2)\
        .getOrCreate()

spark_context = spark_session.sparkContext
spark_context.setLogLevel("ERROR")

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/02/25 10:37:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/02/25 10:37:57 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
def add(a, b):
    # associative and commutative!
    return a + b

rdd = spark_session.sparkContext.parallelize(range(10**7))

result = rdd.filter(lambda x: x % 2 == 0)\
            .map(lambda x: x ** 2)\
            .reduce(add)

print(result)

[Stage 0:>                                                          (0 + 2) / 2]

166666616666670000000


                                                                                

# Part A - Working with the RDD API

## Question A.1

### A.1.1

In [3]:
en_lines = spark_context.textFile('hdfs://192.168.2.122:9000/europarl/europarl-v7.sv-en.en')
print(f'English transcripts lines count: {en_lines.count()}')

[Stage 1:>                                                          (0 + 2) / 2]

English transcripts lines count: 1862234


                                                                                

### A.1.2

In [4]:
sv_lines = spark_context.textFile('hdfs://192.168.2.122:9000/europarl/europarl-v7.sv-en.sv')
print(f'Swedish transcripts lines count: {sv_lines.count()}')

[Stage 2:>                                                          (0 + 2) / 3]

Swedish transcripts lines count: 1862234




### A.1.3

In [5]:
assert en_lines.count() == sv_lines.count(), "Lines count should be equal"

                                                                                

### A.1.4

In [6]:
print(f'Number of partitions in English transcripts: {en_lines.getNumPartitions()}')
print(f'Number of partitions in Swedish transcripts: {sv_lines.getNumPartitions()}')

Number of partitions in English transcripts: 2
Number of partitions in Swedish transcripts: 3


## Question A.2

### A.2.1

In [7]:
def text_preprocessing(rdd):
    text = rdd.map(lambda x: x.lower())
    text = text.map(lambda x: x.split(' '))
    return text

processed_en_text = text_preprocessing(en_lines)
processed_sv_text = text_preprocessing(sv_lines)

### A.2.2

In [8]:
print(f'10 first entries English text: {processed_en_text.take(10)}')

[Stage 5:>                                                          (0 + 1) / 1]

10 first entries English text: [['resumption', 'of', 'the', 'session'], ['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', '17', 'december', '1999,', 'and', 'i', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period.'], ['although,', 'as', 'you', 'will', 'have', 'seen,', 'the', 'dreaded', "'millennium", "bug'", 'failed', 'to', 'materialise,', 'still', 'the', 'people', 'in', 'a', 'number', 'of', 'countries', 'suffered', 'a', 'series', 'of', 'natural', 'disasters', 'that', 'truly', 'were', 'dreadful.'], ['you', 'have', 'requested', 'a', 'debate', 'on', 'this', 'subject', 'in', 'the', 'course', 'of', 'the', 'next', 'few', 'days,', 'during', 'this', 'part-session.'], ['in', 'the', 'meantime,', 'i', 'should', 'like', 'to', 'observe', 'a', "minute'", 's', 'silence,', 'as', 'a', 'number', 'of', 'members', 'have', 'reque

                                                                                

In [9]:
print(f'10 first entries Swedish text: {processed_sv_text.take(10)}')

[Stage 6:>                                                          (0 + 1) / 1]

10 first entries Swedish text: [['återupptagande', 'av', 'sessionen'], ['jag', 'förklarar', 'europaparlamentets', 'session', 'återupptagen', 'efter', 'avbrottet', 'den', '17', 'december.', 'jag', 'vill', 'på', 'nytt', 'önska', 'er', 'ett', 'gott', 'nytt', 'år', 'och', 'jag', 'hoppas', 'att', 'ni', 'haft', 'en', 'trevlig', 'semester.'], ['som', 'ni', 'kunnat', 'konstatera', 'ägde', '"den', 'stora', 'år', '2000-buggen"', 'aldrig', 'rum.', 'däremot', 'har', 'invånarna', 'i', 'ett', 'antal', 'av', 'våra', 'medlemsländer', 'drabbats', 'av', 'naturkatastrofer', 'som', 'verkligen', 'varit', 'förskräckliga.'], ['ni', 'har', 'begärt', 'en', 'debatt', 'i', 'ämnet', 'under', 'sammanträdesperiodens', 'kommande', 'dagar.'], ['till', 'dess', 'vill', 'jag', 'att', 'vi,', 'som', 'ett', 'antal', 'kolleger', 'begärt,', 'håller', 'en', 'tyst', 'minut', 'för', 'offren', 'för', 'bl.a.', 'stormarna', 'i', 'de', 'länder', 'i', 'europeiska', 'unionen', 'som', 'drabbats.'], ['jag', 'ber', 'er', 'resa', 'er', '

                                                                                

### A.2.3

In [10]:
assert processed_en_text.count() == processed_sv_text.count(), "Lines count should be equal"

                                                                                

## Question A.3

### A.3.1

In [11]:
def top_frequent_words(rdd, num_words=10):
    result = rdd.flatMap(lambda x: [(word, 1) for word in x]) # map
    result = result.reduceByKey(lambda x, y: x + y) # reduce
    result = result.takeOrdered(num_words, key=lambda x: -x[1])
    return result

In [12]:
print(f'Top 10 frequent words for English transcripts : {top_frequent_words(processed_en_text)}')



Top 10 frequent words for English transcripts : [('the', 3498375), ('of', 1659758), ('to', 1539760), ('and', 1288401), ('in', 1085993), ('that', 797516), ('a', 773522), ('is', 758050), ('for', 534242), ('we', 522849)]


                                                                                

In [13]:
print(f'Top 10 frequent words for Swedish transcripts : {top_frequent_words(processed_sv_text)}')



Top 10 frequent words for Swedish transcripts : [('att', 1706293), ('och', 1344830), ('i', 1050774), ('det', 924866), ('som', 913276), ('för', 908680), ('av', 738068), ('är', 694381), ('en', 620310), ('vi', 539797)]


                                                                                

There is an overlap between the most frequent English and Swedish words. The word counts are in descending order. Among the most frequent words there are very common articles, propositions and conjunctions. The results seem reliable.

## Question A.4

## A.4.1

In [14]:
en_1 = processed_en_text.zipWithIndex()
sv_1 = processed_sv_text.zipWithIndex()

                                                                                

In [15]:
en_2 = en_1.map(lambda x: (x[1],x[0]))
sv_2 = sv_1.map(lambda x: (x[1],x[0]))

In [16]:
join_3 = en_2.join(sv_2)

In [17]:
filter_4 = join_3.filter(lambda x: len(x[1][0]) > 0 and len(x[1][1]) > 0)

In [18]:
filter_5 = filter_4.filter(lambda x: len(x[1][0]) <=5 or len(x[1][1]) <=5)

In [19]:
filter_6 = filter_5.filter(lambda x: len(x[1][0]) == len(x[1][1]))

In [20]:
map_7 = filter_6.flatMap(lambda x: zip(x[1][0], x[1][1]))
map_7 = map_7.map(lambda x: (x, 1))

In [21]:
reduce_8 = map_7.reduceByKey(lambda x, y: x + y)

In [22]:
reduce_8.takeOrdered(15, key=lambda x: -x[1])

                                                                                

[(('is', 'är'), 4699),
 (('closed.', 'avslutad.'), 2951),
 (('(applause)', '(applåder)'), 2546),
 (('', '.'), 2223),
 (('.', '.'), 2084),
 (('that', 'det'), 1494),
 (('we', 'vi'), 1443),
 (('the', 'jag'), 1336),
 (('is', 'debatten'), 1327),
 (('debate', 'förklarar'), 1319),
 (('\xa0\xa0', '\xa0\xa0'), 1287),
 (('the', 'debatten'), 1253),
 (('is', 'härmed'), 1240),
 (('debate', 'är'), 1211),
 (('i', 'jag'), 1168)]

Some tranlations are correct, although, the assumption that words stay in the same order when translated, makes the translations unreliable.

In [23]:
spark_session.stop()