# Broadcast variables

In [6]:
'''In PySpark RDD and DataFrame, broadcast variables are read-only shared variables that are cached and available on all
nodes in a cluster so that tasks can access and use them. Instead of sending this data along with every task, PySpark
distributes broadcast variables to the workers using efficient broadcast algorithms to reduce communication costs.'''

'In PySpark RDD and DataFrame, broadcast variables are read-only shared variables that are cached and available on all\nnodes in a cluster so that tasks can access and use them. Instead of sending this data along with every task, PySpark\ndistributes broadcast variables to the workers using efficient broadcast algorithms to reduce communication costs.'

In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
# Build (or reuse) a single-threaded local Spark session for the examples below.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName('Pyspark-Examples')
    .getOrCreate()
)

In [8]:

# Sample rows: (first name, last name, country, state code).
data = [
    ("James", "Smith", "USA", "CA"),
    ("Michael", "Rose", "USA", "NY"),
    ("Robert", "Williams", "USA", "CA"),
    ("Maria", "Jones", "USA", "FL"),
]

# Distribute the local list as an RDD.
dataRDD = spark.sparkContext.parallelize(data)

# Printing the RDD object shows its description, not the rows it holds.
print(dataRDD)

ParallelCollectionRDD[5] at readRDDFromFile at PythonRDD.scala:274


In [9]:
# Broadcast the state-code lookup table so it is shared with all nodes in the cluster.

states = dict(NY="New York", CA="California", FL="Florida")
broadcastStates = spark.sparkContext.broadcast(states)

In [10]:
def state_convert(code):
    """Return the full state name for a state code.

    The lookup is done against the dictionary held by the `broadcastStates`
    broadcast variable; a code that is not a key raises `KeyError`.
    """
    state_names = broadcastStates.value
    return state_names[code]

# Keep the first three fields of each row and replace the state code (4th field)
# with its full name.
result = dataRDD.map(lambda row: row[:3] + (state_convert(row[3]),))

print(result.collect())

[('James', 'Smith', 'USA', 'California'), ('Michael', 'Rose', 'USA', 'New York'), ('Robert', 'Williams', 'USA', 'California'), ('Maria', 'Jones', 'USA', 'Florida')]


In [None]:
# Limitations
# 1. A hash algorithm is applied -> HashMap; the cached copy is shared with every node as a read-only structure
# 2. It uses some memory in each executor: 2GB [1GB- times], 1GB
# If the broadcast data size is more than 24 MB, it is better not to use broadcast variables


# Accumulators

In [11]:
# Useful for counting elements across all executors without confusion
'''The PySpark Accumulator is a shared variable that is used with RDD and DataFrame to perform sum and counter operations
similar to Map-reduce counters. These variables are shared by all executors to update and add information through aggregation
or computational operations.'''

# Accumulator starting at 0; each element of the RDD is added to it.
accum = spark.sparkContext.accumulator(0)
rdd = spark.sparkContext.parallelize([1, 2, 3, 4, 5])


def _add_to_accum(value):
    """Add one RDD element to the shared accumulator."""
    accum.add(value)


rdd.foreach(_add_to_accum)
print(accum.value)


def sum123(ele):
    """Return `ele` incremented by one.

    The original body (`return ele = ele +1`) was a SyntaxError: Python does
    not allow an assignment statement inside `return`.
    """
    return ele + 1

# Apply sum123 to every element. The original `r=>sum123(r)` is Scala-style
# arrow syntax and is a SyntaxError in Python; passing the function directly
# is the Python equivalent. `map` is a lazy transformation, so nothing is
# computed until an action such as `res.collect()` is called.
res = rdd.map(sum123)



15
