# Chapter 6: Advanced Spark Programming (Python)

In [74]:
from pyspark.sql import SparkSession
# Start (or reuse) a Spark session running in local mode, and keep a handle
# on its SparkContext for the RDD examples that follow.
spark = (
    SparkSession.builder
    .appName("Advanced-Spark-Programming")
    .master("local[*]")
    .getOrCreate()
)
sc = spark.sparkContext

## Accumulators

In [75]:
# Accumulator starting at 0; count_5 adds to it for every 5 it sees.
acc_5 = sc.accumulator(0)

In [76]:
# Small sample dataset; the value 5 appears three times.
num_rdd = sc.parallelize([1, 2, 5, 5, 3, 6, 9, 6, 5, 9])

In [77]:
# glom() turns each partition into a list, so the result shows how the
# elements are distributed across partitions.
num_rdd.glom().collect()

[[1, 2], [5, 5], [3, 6], [9, 6, 5, 9]]

In [78]:
def count_5(num):
    """Bump the global ``acc_5`` accumulator by one when ``num`` equals 5.

    Intended to be applied to each element of an RDD. It always returns
    ``None``; its only effect is the accumulator update.
    """
    global acc_5
    if num != 5:
        return
    acc_5 += 1

In [79]:
# Apply count_5 to every element; collect() is the action that forces the
# lazy map to run. The result is all None because count_5 returns nothing.
# NOTE(review): Spark only guarantees exactly-once accumulator updates inside
# actions (e.g. foreach); updates made in a transformation such as map() may
# be applied again if a task or stage is re-executed.
num_rdd.map(count_5).collect()

[None, None, None, None, None, None, None, None, None, None]

In [80]:
# Inspect the accumulator on the driver after the job has run.
acc_5

Accumulator<id=2, value=3>

## Broadcast Variables

In [81]:
from random import randint

In [82]:
# Source values for the key/value lookup example.
rdd_value = sc.parallelize([1, 2, 3, 4, 5])

In [83]:
# Pair every number with its square: (n, n squared).
rdd_key_value = rdd_value.map(lambda n: (n, n * n))

In [84]:
# lookup() returns a list of every value stored under the given key.
rdd_key_value.lookup(5)

[25]

In [85]:
# Bring the (key, value) pairs back to the driver as a regular Python dict.
dict_map = rdd_key_value.collectAsMap()

In [86]:
# Confirm that collectAsMap() produced a plain dict.
type(dict_map)

dict

In [87]:
# Local dictionary lookup: the square stored under key 5.
dict_map.get(5)

25

In [88]:
# Wrap the dict in a broadcast variable; its contents are read through .value.
dict_broad = sc.broadcast(dict_map)

In [89]:
# .value exposes the wrapped dict, so normal dict methods work on it.
dict_broad.value.get(1)

1

In [90]:
# Twenty random keys drawn from 1..5 inclusive, matching the keys in dict_map.
rdd_key_keys = sc.parallelize([randint(1, 5) for _ in range(20)])

In [91]:
# Look each key up in the broadcast dict, producing (key, looked-up value) pairs.
rdd_big_values = rdd_key_keys.map(
    lambda k: (k, dict_broad.value.get(k))
)

In [92]:
# Materialise the pairs on the driver; the keys are random, so the output
# differs between runs.
rdd_big_values.collect()

[(3, 9),
 (5, 25),
 (1, 1),
 (1, 1),
 (2, 4),
 (1, 1),
 (3, 9),
 (3, 9),
 (4, 16),
 (1, 1),
 (5, 25),
 (3, 9),
 (4, 16),
 (4, 16),
 (5, 25),
 (3, 9),
 (1, 1),
 (2, 4),
 (2, 4),
 (2, 4)]