In [10]:
# Build (or reuse) the SparkSession that the rest of the notebook works with.

from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
    .appName('rdd_practicals')
    .getOrCreate()
)

In [3]:
# Create an RDD from an in-memory Python collection.

data = [1, 2, 3, 4, 5, 6, 10]
rdd = spark.sparkContext.parallelize(
    data
)
# collect() returns every element to the driver as a Python list.
rdd.collect()


                                                                                

[1, 2, 3, 4, 5, 6, 10]

In [6]:
# Create an RDD from an external text file (one RDD element per line).

data_path = "gs://dev-de-training-default/vrai/test/wordcount.txt"

rdd = spark.sparkContext.textFile(
    data_path
)
# rdd.collect()  # uncomment to pull the file's lines back to the driver

In [8]:
# Transformations in Spark

#==================>>>>>>>>>>>>>>>>>>> Filter

# filter() keeps only the elements for which the predicate is true;
# here that is the even numbers in 1..9.
a = spark.sparkContext.parallelize(range(1, 10))

b = a.filter(lambda number: number % 2 == 0)

b.collect()


                                                                                

[2, 4, 6, 8]

In [8]:
#================>>>>>>>>>>>>>>> Map and FlatMap

# map() emits exactly one output element per input element, so each number
# becomes its own two-item list [1, x].
rdd1 = (
    spark.sparkContext
    .parallelize([3, 4, 5])
    .map(lambda value: [1, value])
)
rdd1.collect()


[[1, 3], [1, 4], [1, 5]]

In [24]:
# flatMap() flattens each returned list into the result, so every input x
# contributes two separate elements: 1 and x.
rdd = (
    spark.sparkContext
    .parallelize([3, 4, 5])
    .flatMap(lambda value: [1, value])
)
rdd.collect()

[1, 3, 1, 4, 1, 5]

In [16]:
# map(): one [x, x squared] list per input element (result stays nested).
maprdd = (
    spark.sparkContext
    .parallelize([3, 4, 5])
    .map(lambda value: [value, value * value])
)
maprdd.collect()

[[3, 9], [4, 16], [5, 25]]

In [15]:
# flatMap(): the same [x, x squared] lists, but flattened into one sequence.
flatmaprdd = (
    spark.sparkContext
    .parallelize([3, 4, 5])
    .flatMap(lambda value: [value, value * value])
)
flatmaprdd.collect()

[3, 9, 4, 16, 5, 25]

In [17]:

#====================>>>>>>>>> distinct

# Six animal names (with repeats) spread over 2 partitions.
distinct_rdd = spark.sparkContext.parallelize(
    ["Gnu", "Cat", "Rat", "Dog", "Gnu", "Rat"],
    numSlices=2,
)
# distinct() drops the duplicates; the order of the survivors is not guaranteed.
distinct_rdd.distinct().collect()

                                                                                

['Cat', 'Rat', 'Gnu', 'Dog']

In [19]:
#========================>>>>>>>> cartesian

x = spark.sparkContext.parallelize([1, 2, 3, 4, 5])
y = spark.sparkContext.parallelize([6, 7, 8, 9, 10])

# cartesian() pairs every element of x with every element of y
# (5 x 5 = 25 tuples here).
x.cartesian(y).collect()

In [22]:

#========================>>>>>>>> coalesce

# Start with 10 partitions so the reduction performed by coalesce() is visible.
y = spark.sparkContext.parallelize(range(1,10), 10)
# A bare expression that is not the last line of a cell is silently discarded,
# so the "before" partition count has to be printed explicitly.
print("partitions before coalesce:", y.getNumPartitions())
# coalesce(2) returns a new RDD with the data merged into 2 partitions.
z = y.coalesce(2)
z.getNumPartitions()

2

In [24]:

#========================>>>>>>>> groupByKey

data = [("a", 1), ("b", 2), ("a", 3), ("b", 4)]
rdd = spark.sparkContext.parallelize(data)

# Gather all values that share a key, then turn each group into a plain list
# so that it prints readably.
grouped_pairs = rdd.groupByKey()
grouped_rdd = grouped_pairs.mapValues(list)
# Key order in the collected result is not guaranteed; expect either
# [('a', [1, 3]), ('b', [2, 4])] or [('b', [2, 4]), ('a', [1, 3])].
print(grouped_rdd.collect())


[('b', [2, 4]), ('a', [1, 3])]


In [None]:
#========================>>>>>>>> reduceByKey

data = [("a", 1), ("b", 2), ("a", 3), ("b", 4)]
rdd = spark.sparkContext.parallelize(data)

# Combine the values of each key pairwise with addition (per-key sum).
reduced_rdd = rdd.reduceByKey(lambda left, right: left + right)
print(reduced_rdd.collect())


In [26]:
#========================>>>>>>>> join

# Use the session's SparkContext directly: `sc` is only created in a later
# cell, so referencing it here raises NameError on a fresh Restart & Run All.
rdd1 = spark.sparkContext.parallelize([("a", 1), ("b", 2)])
rdd2 = spark.sparkContext.parallelize([("a", 3), ("b", 4)])

# Inner join on key: each result is (key, (value_from_rdd1, value_from_rdd2)).
# The order of keys in the collected list is not guaranteed.
joined_rdd = rdd1.join(rdd2)
print(joined_rdd.collect())


[('b', (2, 4)), ('a', (1, 3))]


In [None]:
# Six animal names spread over 2 partitions.
a = spark.sparkContext.parallelize(
    ["dog", "tiger", "lion", "cat", "panther", "eagle"],
    2,
)

# Turn each word into a (length, word) pair so the length acts as the key.
b = a.map(lambda word: (len(word), word))

# keys() keeps only the first item of every pair, i.e. the word lengths.
b.keys().collect()

In [None]:
#======================> Actions

from pyspark import SparkConf, SparkContext

# Only one SparkContext may be active at a time. The SparkSession created at
# the top of the notebook already owns one, so calling the SparkContext(...)
# constructor here fails with
# "ValueError: Cannot run multiple SparkContexts at once".
# getOrCreate() returns the active context if there is one and only builds a
# new local context (using this conf) when none exists.
sc = SparkContext.getOrCreate(
    SparkConf().setMaster("local").setAppName("Collect Example")
)
data = [1, 2, 3, 4, 5]
rdd = sc.parallelize(data)

# Collect all elements of the RDD back to the driver as a Python list
collected_data = rdd.collect()
print(collected_data)  # Output: [1, 2, 3, 4, 5]



In [None]:
# count() is an action that returns how many elements the RDD holds.
count = rdd.count()
print(
    count
)  # Output: 5


In [None]:
# first() returns the leading element of the RDD.
first_element = rdd.first()
print(
    first_element
)  # Output: 1


In [None]:
# take(n) returns the first n elements as a Python list (n = 3 here).
taken_elements = rdd.take(num=3)
print(taken_elements)  # Output: [1, 2, 3]


In [None]:
# Draw 3 elements at random, with replacement (the same element may repeat).
# No seed is passed, so the result differs from run to run.
sampled_elements = rdd.takeSample(True, 3)
print(sampled_elements)  # Output: [e.g., 3, 1, 4] (randomly sampled)


In [None]:
# reduce() folds the elements together pairwise; with addition this is a sum.
sum_of_elements = rdd.reduce(lambda left, right: left + right)
print(sum_of_elements)  # Output: 15


In [None]:
# fold() works like reduce() but starts from an explicit zero value (0 here).
sum_with_fold = rdd.fold(
    0,
    lambda accumulated, value: accumulated + value,
)
print(sum_with_fold)  # Output: 15


In [None]:
# Key/value pairs in which the key "a" appears twice and "b" once.
data = [("a", 1), ("b", 2), ("a", 3)]
rdd = sc.parallelize(data)

# countByKey() returns a dict-like mapping of key -> number of occurrences;
# dict() is applied so it prints as a plain dictionary.
count_by_key = rdd.countByKey()
print(
    dict(count_by_key)
)  # Output: {'a': 2, 'b': 1}


In [None]:
# foreach() applies the function to every element on the executors. The
# print() calls therefore run in the worker processes, and their output
# typically ends up in the worker/console logs rather than in this cell.
rdd.foreach(lambda x: print(x))

# To actually see the elements in the notebook, bring them to the driver first.
# collect() is fine for this tiny RDD; prefer take(n) for anything large.
for element in rdd.collect():
    print(element)


In [None]:
# Persist the RDD as text under the given output path.
# (this won't work in a local notebook without a proper file system)
output_dir = "/path/to/output"
rdd.saveAsTextFile(output_dir)
