In [0]:
# https://www.tutorialspoint.com/pyspark/pyspark_rdd.htm
from pyspark import SparkContext
from operator import add
# sc = SparkContext().getOrCreate()
sample_list = ['scala', 'java', 'hadoop', 'spark', 'akka', 'spark vs hadoop', 'pyspark', 'pyspark and spark']
words = sc.parallelize(sample_list)
print(words.count())  #count
print(words.collect()) # collect
def f(x): print(x)
fore = words.foreach(f)  # foreach (not working)
filter1 = words.filter(lambda x: 'spark' in x)  # filter
print(filter1.collect())
map_fun = words.map(lambda x: (x,1)) # map
print(map_fun.collect())


8
['scala', 'java', 'hadoop', 'spark', 'akka', 'spark vs hadoop', 'pyspark', 'pyspark and spark']
['spark', 'spark vs hadoop', 'pyspark', 'pyspark and spark']
[('scala', 1), ('java', 1), ('hadoop', 1), ('spark', 1), ('akka', 1), ('spark vs hadoop', 1), ('pyspark', 1), ('pyspark and spark', 1)]


In [0]:
#foreach
#rdd.foreach(function), where rdd is RDD on which you want to apply the opeartion, and function is the function you want to apply to each element of the RDD

data = [1,2,3,4,5]
rdd = sc.parallelize(data)

def print_square(x):
    print(x*x)

rdd.foreach(print_square)  # they are printing out on the Spark workers stdout, not in the driver/your shell session.
print(rdd.collect())


[1, 2, 3, 4, 5]


In [0]:
#filter 
# filter oparation in spark rdd allowss you to create a new RDD containing only the elements that satisfy a given condition.
#Elemenrs that do not meet the condition are excluded from resulting RDD.
data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
rdd = sc.parallelize(data)

rdd_even = rdd.filter(lambda x: x%2 == 0)
rdd_odd = rdd.filter(lambda x: x%2 != 0)
print(rdd_even.collect())
print(rdd_odd.collect())

[2, 4, 6, 8, 10]
[1, 3, 5, 7, 9]


In [0]:
#map opearion in spark RDD allows you to transform each element in an RDD using a given function. 
#mapped_rdd = rdd.map(lambda x: transformation)

data = [1,2,3,4,5]
rdd = sc.parallelize(data)
rdd_map = rdd.map(lambda x: x * x)
print(rdd_map.collect())

[1, 4, 9, 16, 25]


In [0]:
#reduce opeartion is used to aggregate the elements of an RDD using specified function. The function should take two arguments and produce single value as the result.  
values = [1,2,3,4,5] 
values_sc = sc.parallelize(values) 
print(values_sc.reduce(add)) # reduce

def add1(a,b):
    return a+b

sum_result = values_sc.reduce(add1)
print(sum_result)


15
15


In [0]:
# Join 
x = sc.parallelize([('spark', 1), ('hadoop', 4)])
y = sc.parallelize([('spark', 2), ('hadoop', 5)])
joined = x.join(y) #Inner join
print(joined.collect()) 

[('spark', (1, 2)), ('hadoop', (4, 5))]


In [0]:
employeesRDD = sc.parallelize([(1,'Alice'),(2,'Bob'),(3,'Charlie')])
salariesRDD = sc.parallelize([(1, 50000),(3, 60000),(4, 55500)])

In [0]:
#Inner join
inner_joined = employeesRDD.join(salariesRDD)
print(inner_joined.collect())

[(1, ('Alice', 50000)), (3, ('Charlie', 60000))]


In [0]:
#Left outer join
left_outer_joined = employeesRDD.leftOuterJoin(salariesRDD)
left_outer_joined.collect()

Out[23]: [(1, ('Alice', 50000)), (2, ('Bob', None)), (3, ('Charlie', 60000))]

In [0]:
# Right outer join
right_outer_join = employeesRDD.rightOuterJoin(salariesRDD)
right_outer_join.collect()

Out[24]: [(1, ('Alice', 50000)), (3, ('Charlie', 60000)), (4, (None, 55500))]

In [0]:
#Full outer join
full_outer_joined = employeesRDD.fullOuterJoin(salariesRDD)
full_outer_joined.collect()

Out[25]: [(1, ('Alice', 50000)),
 (2, ('Bob', None)),
 (3, ('Charlie', 60000)),
 (4, (None, 55500))]

In [0]:
# cache oparation is used to persist the RDDs data in memory for faster access in subsequent actions.
#caching an rdd allows spark to store the data in memory and reuse it across multiple operations without recomputing the RDDs tranformation.
# remember that cached data persists across multiple actions until explicitly removed or until the Spark application ends. You can also use the unpersist method to remove the cached data when it's no longer needed.

words.cache()
caching = words.persist().is_cached
print(caching)  #True
words.unpersist()

True
Out[44]: ParallelCollectionRDD[41] at readRDDFromInputStream at PythonRDD.scala:435

### Shared variables

Shared variables are variables that can be used by multiple tasks in a distributed computation.spark provides two types of shared variables

  * **Broadcast variables** : It allows you to efficiently send a read-only variable to worker nodes so that it can be used across tasks without the need to send the data over the network multiple times. It is useful for large datasets that are used in tasks across the cluster
  * **Accumulators** : Accumulators are variables that can be updated in a distributed manner, allowing information to be aggregated across multiple task. These used for counters or sum

In [0]:
#Broadcast 

large_data = [1,2,3,4,5]
broadcast_data = sc.broadcast(large_data)

rdd = sc.parallelize([10,20,30])
result_rdd = rdd.map(lambda x: x* broadcast_data.value[4])
print(result_rdd.collect())

[50, 100, 150]


In [0]:
#Accumulators
rdd = sc.parallelize([1,2,3,4,5])
accumulator = sc.accumulator(1)

def process_element(x):
    global accumulator
    accumulator += x
    print(accumulator)

rdd.foreach(process_element)
print(accumulator.value)

16


In [0]:
sc.stop()

### Sparkconf

Sparkconf is a configuration class in Apache spark that is used to configure various settings and properties for spark application. It allows you to specify runtime configuration parameters such as application name, master url, and other propertoes related to the spark runtime environment

In [0]:
from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("MySparkapplication")

sc = SparkContext(conf=conf)

sc.stop()

In [0]:
from pyspark import SparkContext, SparkConf

conf = SparkConf() \
        .setAppName("MySparkAPP") \
        .set('spark.executor.memory', '2g') \
        .set('spark.executor.cores', '4')

sc = SparkContext(conf=conf)

sc.stop()

[0;31m---------------------------------------------------------------------------[0m
[0;31mValueError[0m                                Traceback (most recent call last)
File [0;32m<command-1628867705520482>:8[0m
[1;32m      1[0m [38;5;28;01mfrom[39;00m [38;5;21;01mpyspark[39;00m [38;5;28;01mimport[39;00m SparkContext, SparkConf
[1;32m      3[0m conf [38;5;241m=[39m SparkConf() \
[1;32m      4[0m         [38;5;241m.[39msetAppName([38;5;124m"[39m[38;5;124mMySparkAPP[39m[38;5;124m"[39m) \
[1;32m      5[0m         [38;5;241m.[39mset([38;5;124m'[39m[38;5;124mspark.executor.memory[39m[38;5;124m'[39m, [38;5;124m'[39m[38;5;124m2g[39m[38;5;124m'[39m) \
[1;32m      6[0m         [38;5;241m.[39mset([38;5;124m'[39m[38;5;124mspark.executor.cores[39m[38;5;124m'[39m, [38;5;124m'[39m[38;5;124m4[39m[38;5;124m'[39m)
[0;32m----> 8[0m sc [38;5;241m=[39m SparkContext(conf[38;5;241m=[39mconf)
[1;32m     10[0m sc[38;5;241m.[39mstop()

File

### SparkFiles

In Apache spark, you can upload your files using sc.addFile() and get the path on a worker using SparkFiles.get. Thus, SparkFiles resolve the paths to files added through SparkContext.addFile()

SparkFiles contain the following classmethods:
  * **get(filename)** : It specifies the path of the file that is added through SparkContext.addFile()
  * **getrootdirectory()** : It specifies the path to the root directory, which contains the file that is added through through the SparkContext.addFile()

### StorageLevel

Storage level decides how RDD should be stored. In Apache Spark, storage level decides whether RDD should be stored in the memory or should it be stored over the disk, or both. It also decides whether to serialize RDD and whether to replicate RDD partitions.