In [2]:
from pyspark import SparkContext
sc = SparkContext("local","RDD_revision")

In [3]:
list1 = [1,2,3,4,5,6,7,8,9,]
print(type(list1))

rdd1 = sc.parallelize(list1)
print(type(rdd1))

# parallelize turns a normal Python list into an RDD so Spark can process it in parallel.

<class 'list'>
<class 'pyspark.rdd.RDD'>


In [None]:
# RDD - (Resilient Distributed Dataset)
# In PySpark, an RDD (Resilient Distributed Dataset) is the fundamental low-level data structure used for distributed processing.
# While most people now use DataFrames for convenience and optimization, RDDs are still important for:

# Low-level transformations

# Fine-grained control

# Working with unstructured data

# Functional-style distributed computing



# ‚úÖ What is an RDD?

# An RDD is an immutable, distributed collection of elements that can be processed in parallel.

# Key characteristics:

# Immutable ‚Äì once created, it cannot change

# Lazy evaluation ‚Äì transformations are executed only when an action is called

# Fault-tolerant ‚Äì can be recomputed via lineage



In [4]:
# create new rdd as >=5 from rdd1

rdd2 = rdd1.filter(lambda x:x>=5)
rdd2.collect()

#In PySpark, the filter() transformation on an RDD (Resilient Distributed Dataset)
#is used to create a new RDD containing only the elements that satisfy a specified condition.
# This condition is provided as a function (often a lambda function) that takes an element of
#the RDD as input and returns True if the element should be included in the new RDD,
#and False otherwise.


#In PySpark, the collect() action on an RDD (Resilient Distributed Dataset)
#is used to retrieve all the elements of the RDD and return them as a list to the driver program.

[5, 6, 7, 8, 9]

In [5]:
# create new rdd by adding 10 to each value in rdd1

rdd2 = rdd1.map(lambda a : a+10)
rdd2.collect()

#In PySpark, the map transformation on an RDD (Resilient Distributed Dataset)
#is a fundamental operation used to apply a function to each element of the RDD,
# resulting in a new RDD

[11, 12, 13, 14, 15, 16, 17, 18, 19]

In [3]:
data =['pyspark session is going on', 'rdd concept started']
rdd1= sc.parallelize(data)
print( ' === flatMap output ====')
rdd3 = rdd1.flatMap( lambda x: x.split())
rdd3.saveAsTextFile("results")


# ‚úÖ Meaning of flatMap in PySpark

# flatMap takes each element in an RDD and maps it to multiple elements, then flattens the results into one RDD.

# In even simpler words:

# üëâ map gives one output per input
# üëâ flatMap gives many outputs per input
# üëâ and then flattens them into a single list

 === flatMap output ====


In [15]:
# list1 = [54,56,6,6,1,2,3,4]
# rdd= sc.parallelize(list1)

# # rdd.count()
# rdd.last()

rdd = sc.parallelize([1, 2, 3, 4, 5])
# last_element = rdd.collect().last
last_line = rdd.zipWithIndex().max()[0]

print(last_line)


# zipWithIndex pairs each element of an RDD with its index (position), starting from 0.
# This line finds and returns the last element of the RDD.
# max() returns the largest element in an RDD.


5


In [16]:
list1 =[1,3,4,5,2,3,4,1,2,5,6]

rdd= sc.parallelize(list1)

rdd2 =rdd.sortBy(lambda x:x, ascending= False)
rdd2.collect()

# sortBy sorts an RDD based on a key you choose.

# You give it a function, and Spark sorts the RDD using the value returned by that function

# lambda creates a small, anonymous function without a name.

[6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1]

In [6]:
list1 = [1,2,3,4,5]

rdd = sc.parallelize(list1)

res= rdd.fold(0, lambda x, y : x+y)

print(res)
print(type(res))

# fold combines all elements of an RDD using the same operation, starting with a ‚Äúzero value.‚Äù

# It is like reduce, but with an initial value added to every partition.

15
<class 'int'>


In [9]:
rdd= sc.textFile("prac.txt")

rdd.collect()

# textFile reads a text file and returns an RDD where each element is one line of the file.

['a b', 'c ', 'd', 'e f', 'g h']

In [10]:
rdd= sc.textFile("travel.csv")

rdd.collect()

['cust_id,flight_id,origin,destination,price',
 '1,f1,delhi,hyderabad,2500',
 '1,f2,hyderabad,kochi,1700',
 '1,f3,kochi,Mangalore,1800',
 '2,f1,Mumbai,Ayodhya,4000',
 '2,f2,Ayodhya,chennai,3000',
 ' ']

In [12]:
list1 = ['hello', 'world', 'students']

rdd= sc.parallelize(list1)

rdd.take(2)

# take(n) returns the first n elements of an RDD.

['hello', 'world']

In [13]:
list1 = ['hello', 'world', 'students', 'hello', 'world', 'hello', 'hello', 'world', 'students']
rdd= sc.parallelize(list1)

rdd1 = rdd.map( lambda x : (x,1))
rdd2 = rdd1.reduceByKey(lambda x, y: x+y)
rdd2.collect()

# reduceByKey groups values by key and then reduces (combines) the values of each key using the function you provide

[('hello', 4), ('world', 3), ('students', 2)]