In [7]:
import os

# Export SPARK_HOME and JAVA_HOME for this kernel process.
# Both values are absolute paths to one particular local installation, so they
# need adjusting when the notebook is run on a different machine.
os.environ.update({
    'SPARK_HOME': '/usr/local/Cellar/apache-spark/3.5.1/libexec',
    'JAVA_HOME': '/usr/local/opt/openjdk/libexec/openjdk.jdk/Contents/Home',
})

In [9]:
# RDD (Resilient Distributed Dataset)
#
# Overview
# --------
# An RDD is the fundamental data structure of Apache Spark: an immutable,
# distributed collection of objects that can be processed in parallel. RDDs can
# be created from Hadoop InputFormats (such as HDFS files) or by transforming
# other RDDs; the code below builds one from an in-memory Python list.
#
# Creating RDDs: from a collection
from pyspark import SparkContext

# If an earlier run left a context bound to the global name `sc`, stop it
# before a new one is created.
if 'sc' in globals():
    sc.stop()

sc = SparkContext(master="local", appName="RDD Example")

# Distribute a plain Python list as an RDD, then bring its elements back.
data = [1, 2, 3, 4, 5]
rdd = sc.parallelize(data)
print(rdd.collect())


[1, 2, 3, 4, 5]


In [10]:
from pyspark.sql import SparkSession

# Obtain a SparkSession through the builder.
spark = (
    SparkSession.builder
    .appName("RDD from DataFrame")
    .getOrCreate()
)

# Build a two-column DataFrame from a list of (first name, last name) tuples.
data = [("James", "Smith"), ("Anna", "Rose"), ("Robert", "Williams")]
columns = ["FirstName", "LastName"]
df = spark.createDataFrame(data, columns)

# The DataFrame's `.rdd` attribute exposes its rows as an RDD; collecting it
# yields a list of Row objects.
rdd_from_df = df.rdd
print(rdd_from_df.collect())

[Stage 1:>                                                          (0 + 1) / 1]

[Row(FirstName='James', LastName='Smith'), Row(FirstName='Anna', LastName='Rose'), Row(FirstName='Robert', LastName='Williams')]


                                                                                

Transformations
Transformations create a new RDD from an existing one. Transformations are lazy, meaning they are not 
executed until an action is called.

In [12]:
# map(): build a new RDD by applying a function to every element (here: squaring).
# The collect() call is what brings the results back as a list for printing.
rdd = sc.parallelize([1, 2, 3, 4, 5])
squared_rdd = rdd.map(lambda value: value ** 2)
print(squared_rdd.collect())

[1, 4, 9, 16, 25]


In [13]:
# filter(): keep only the elements for which the predicate is true (here: even numbers).
rdd = sc.parallelize([1, 2, 3, 4, 5])
filtered_rdd = rdd.filter(lambda number: number % 2 == 0)
print(filtered_rdd.collect())

[2, 4]


In [14]:
# flatMap(): the function returns a list per element and the lists are
# flattened into one RDD (here: each sentence is split into its words).
rdd = sc.parallelize(["Hello World", "Apache Spark"])
flatmapped_rdd = rdd.flatMap(lambda sentence: sentence.split(" "))
print(flatmapped_rdd.collect())

['Hello', 'World', 'Apache', 'Spark']


In [15]:
# distinct(): drop duplicate elements so each value appears once.
rdd = sc.parallelize([1, 2, 2, 3, 4, 4, 5])
distinct_rdd = rdd.distinct()
unique_values = distinct_rdd.collect()
print(unique_values)



[1, 2, 3, 4, 5]


In [16]:
# Action: collect() returns all elements of the RDD as a Python list.
rdd = sc.parallelize([1, 2, 3, 4, 5])
all_elements = rdd.collect()
print(all_elements)

[1, 2, 3, 4, 5]


In [17]:
# Action: count() returns the number of elements in the RDD.
rdd = sc.parallelize([1, 2, 3, 4, 5])
element_count = rdd.count()
print(element_count)

5


In [18]:
# Action: first() returns the first element of the RDD.
rdd = sc.parallelize([1, 2, 3, 4, 5])
first_element = rdd.first()
print(first_element)

1


In [19]:
# Action: take(n) returns the first n elements as a list (here n = 3).
rdd = sc.parallelize([1, 2, 3, 4, 5])
first_three = rdd.take(3)
print(first_three)

[1, 2, 3]


In [20]:
# Action: reduce() combines the elements pairwise with the given function
# (here: addition) until a single value remains.
rdd = sc.parallelize([1, 2, 3, 4, 5])
# Bind the result to `total`, not `sum`: assigning to `sum` would replace
# Python's built-in sum() for every cell that runs afterwards in this kernel.
total = rdd.reduce(lambda x, y: x + y)
print(total)

15


RDD Lineage
RDD lineage is the logical execution plan - the sequence of transformations that resulted in an RDD.
It can be accessed using the toDebugString method.

In [21]:
# Chain two transformations (map, then filter) and print the lineage Spark
# recorded for the resulting RDD.
rdd = sc.parallelize([1, 2, 3, 4, 5]).map(lambda x: x * x).filter(lambda x: x > 10)

# toDebugString() returns bytes here, so printing it directly shows a single
# b'...\n...' literal. Decode it first so each lineage step prints on its own line.
lineage = rdd.toDebugString()
print(lineage.decode("utf-8") if isinstance(lineage, bytes) else lineage)

b'(1) PythonRDD[32] at RDD at PythonRDD.scala:53 []\n |  ParallelCollectionRDD[31] at readRDDFromFile at PythonRDD.scala:289 []'


Fault Tolerance
RDDs are designed to handle failures of any worker node in the cluster. 
If a node fails, Spark can recompute the lost partitions using the lineage graph.

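Example: a function that always raises for one particular element. Because the failure is deterministic, recomputing the partition cannot succeed, so Spark aborts the job and the error is reported back to the driver: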

In [22]:
def faulty_function(x):
    """Return ``x`` unchanged, except for the value 2, which always fails.

    Raises:
        Exception: with the message "An error occurred!" when ``x == 2``.
    """
    if x != 2:
        return x
    raise Exception("An error occurred!")

# Map the failing function over a small RDD. The element 2 makes its task
# raise, so collect() fails; the error is caught and printed rather than
# left to stop the notebook.
rdd = sc.parallelize([1, 2, 3, 4, 5])
try:
    result = rdd.map(faulty_function).collect()
except Exception as job_error:
    print(job_error)


An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 12.0 failed 1 times, most recent failure: Lost task 0.0 in stage 12.0 (TID 12) (prasannas-air executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/Cellar/apache-spark/3.5.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 1247, in main
    process()
  File "/usr/local/Cellar/apache-spark/3.5.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 1239, in process
    serializer.dump_stream(out_iter, outfile)
  File "/usr/local/Cellar/apache-spark/3.5.1/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 274, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/usr/local/Cellar/apache-spark/3.5.1/libexec/python/lib/pyspark.zip/pyspark/util.py", line 83, in wrapper
    return f(*args, **kwargs)
  File "/var/folders/g

24/07/31 08:52:09 ERROR Executor: Exception in task 0.0 in stage 12.0 (TID 12)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/Cellar/apache-spark/3.5.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 1247, in main
    process()
  File "/usr/local/Cellar/apache-spark/3.5.1/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 1239, in process
    serializer.dump_stream(out_iter, outfile)
  File "/usr/local/Cellar/apache-spark/3.5.1/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 274, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/usr/local/Cellar/apache-spark/3.5.1/libexec/python/lib/pyspark.zip/pyspark/util.py", line 83, in wrapper
    return f(*args, **kwargs)
  File "/var/folders/g5/4lyrpx411517gbxgrcplfhvh0000gn/T/ipykernel_41805/1625257287.py", line 3, in faulty_function
Exception: An error occurred!

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonExcept