In [21]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('rdd').getOrCreate()
sc = spark.sparkContext

In [30]:
def fibonacci(n):
    a, b = 0, 1
    for _ in range(n):
        yield a
        a, b = b, a + b


fib = list(fibonacci(20))

In [31]:
rdd = sc.parallelize(fib)

In [32]:
rdd.collect()

[0,
 1,
 1,
 2,
 3,
 5,
 8,
 13,
 21,
 34,
 55,
 89,
 144,
 233,
 377,
 610,
 987,
 1597,
 2584,
 4181]

In [33]:
squared = rdd.map(lambda x: x ** 2)
squared.collect()

[0,
 1,
 1,
 4,
 9,
 25,
 64,
 169,
 441,
 1156,
 3025,
 7921,
 20736,
 54289,
 142129,
 372100,
 974169,
 2550409,
 6677056,
 17480761]

In [34]:
def be_prime(x):
    if x < 2:
        return False
    for i in range(2, x):
        if x % i == 0:
            return False
    return True


prime_number = rdd.filter(lambda x: be_prime(x))
prime_number.collect()

[2, 3, 5, 13, 89, 233, 1597]

In [35]:
number_of_prime = rdd.filter(lambda x: be_prime(x)).count()
print(number_of_prime)

7


In [41]:
sum_of_prime = rdd.filter(lambda x: be_prime(x)).sum()
print(
    f"Sum of prime number: {sum_of_prime}, average of prime number: {sum_of_prime / number_of_prime}")

Sum of prime number: 1942, average of prime number: 277.42857142857144


In [42]:
ordered = rdd.sortBy(lambda x: x, ascending=False)
ordered.collect()

[4181,
 2584,
 1597,
 987,
 610,
 377,
 233,
 144,
 89,
 55,
 34,
 21,
 13,
 8,
 5,
 3,
 2,
 1,
 1,
 0]

In [46]:
rdd = sc.parallelize([1, 2, 3, 4])
flat = rdd.flatMap(lambda x: [x, x + 1])
flat.collect()

[1, 2, 2, 3, 3, 4, 4, 5]

In [63]:
edges = [(1, 2), (2, 3), (4, 5), (6, 5)]
edgesRDD = sc.parallelize(edges)
print("Initial edges:")
print(edgesRDD.collect())

vertices = edgesRDD.flatMap(lambda edge: [edge[0], edge[1]]).distinct()
print("Flat-mapped vertices:")
print(vertices.collect())
verticesRDD = vertices.map(lambda vertex: (vertex, vertex))
print("After map phase:")
print(verticesRDD.collect())

print("Initial vertices and labels:")
print(verticesRDD.collect())


def emit_edges(edge):
    return [(edge[0], edge[1]), (edge[1], edge[0])]


iteration = 0
while True:
    iteration += 1

    contribs = edgesRDD.flatMap(emit_edges)
    print(f"\nContributions after map phase, iteration {iteration}:")
    print(contribs.collect())

    newVerticesRDD = contribs.reduceByKey(min)
    print(f"Updated vertices and labels after reduce phase, iteration {iteration}:")
    print(newVerticesRDD.collect())

    changes = newVerticesRDD.join(verticesRDD).filter(lambda x: x[1][0] != x[1][1]).count()

    print(f"\nAfter map-reduce phase, iteration {iteration}:")
    print(newVerticesRDD.collect())

    if changes == 0:
        print("\nNo more changes, stopping the iteration.")
        break
    else:
        verticesRDD = newVerticesRDD

print("\nFinal Components:")
print(newVerticesRDD.collect())


Initial edges:
[(1, 2), (2, 3), (4, 5), (6, 5)]
Flat-mapped vertices:
[1, 2, 3, 4, 5, 6]
After map phase:
[(1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6)]
Initial vertices and labels:
[(1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6)]

Contributions after map phase, iteration 1:
[(1, 2), (2, 1), (2, 3), (3, 2), (4, 5), (5, 4), (6, 5), (5, 6)]
Updated vertices and labels after reduce phase, iteration 1:
[(1, 2), (2, 1), (3, 2), (4, 5), (5, 4), (6, 5)]

After map-reduce phase, iteration 1:
[(1, 2), (2, 1), (3, 2), (4, 5), (5, 4), (6, 5)]

Contributions after map phase, iteration 2:
[(1, 2), (2, 1), (2, 3), (3, 2), (4, 5), (5, 4), (6, 5), (5, 6)]
Updated vertices and labels after reduce phase, iteration 2:
[(1, 2), (2, 1), (3, 2), (4, 5), (5, 4), (6, 5)]

After map-reduce phase, iteration 2:
[(1, 2), (2, 1), (3, 2), (4, 5), (5, 4), (6, 5)]

No more changes, stopping the iteration.

Final Components:
[(1, 2), (2, 1), (3, 2), (4, 5), (5, 4), (6, 5)]
