In [1]:
from pyspark.sql import SparkSession

# Build (or fetch) the session for this exercise set under a fixed app name.
spark = (
    SparkSession.builder
    .appName("RDD-Exercises-Set2")
    .getOrCreate()
)
# The SparkContext is the handle the cells below use to create RDDs.
sc = spark.sparkContext

# Exercise 1: simple filter / map / count over the integers 1..15.
nums = sc.parallelize(range(1, 16))

# Multiples of three: the remainder is zero, i.e. falsy.
div_by_3 = (
    nums
    .filter(lambda n: not n % 3)
    .collect()
)

# Every element multiplied by two.
doubled = (
    nums
    .map(lambda n: 2 * n)
    .collect()
)

# How many elements are strictly greater than ten.
count_gt_10 = (
    nums
    .filter(lambda n: 10 < n)
    .count()
)

print("Numbers divisible by 3:", div_by_3)
print("Doubled numbers:", doubled)
print("Count > 10:", count_gt_10)

Numbers divisible by 3: [3, 6, 9, 12, 15]
Doubled numbers: [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]
Count > 10: 5


In [2]:
# Exercise 2: de-duplication, per-value counting and a "longest string" reduce.
fruits = sc.parallelize(
    ["apple", "banana", "grape", "banana", "apple", "mango"]
)

distinct_fruits = fruits.distinct().collect()

# Classic word-count shape: pair each fruit with 1, then sum the ones per key.
fruit_counts = (
    fruits
    .map(lambda name: (name, 1))
    .reduceByKey(lambda left, right: left + right)
    .collect()
)

# Keep the longer of the two operands; on equal length the right-hand one wins.
longest_fruit = fruits.reduce(
    lambda left, right: right if len(left) <= len(right) else left
)

print("Distinct fruits:", distinct_fruits)
print("Fruit counts:", fruit_counts)
print("Longest fruit:", longest_fruit)

Distinct fruits: ['apple', 'banana', 'grape', 'mango']
Fruit counts: [('apple', 2), ('banana', 2), ('grape', 1), ('mango', 1)]
Longest fruit: banana


In [3]:
# Exercise 3: tokenize sentences and report the distinct (lower-cased) words.
sentences = sc.parallelize(["spark makes big data easy", "rdd is the core of spark", "python with spark"])

# str.split() with no argument splits on any run of whitespace, so repeated
# spaces or tabs never produce empty-string "words" (split(" ") would).
words = sentences.flatMap(lambda line: line.split())
unique_words = words.map(lambda word: word.lower()).distinct()

# Materialize the result once on the driver and derive the count from it,
# instead of running two separate Spark jobs (count + collect) over the
# same uncached lineage.
unique_word_list = unique_words.collect()
unique_count = len(unique_word_list)

print("Unique words:", unique_word_list)
print("Total unique words:", unique_count)

Unique words: ['big', 'easy', 'rdd', 'core', 'of', 'python', 'with', 'spark', 'makes', 'data', 'is', 'the']
Total unique words: 12


In [4]:
# Exercise 4: per-student totals and averages, plus the single best mark.
marks = sc.parallelize(
    [("Rahul", 85), ("Priya", 92), ("Aman", 78), ("Rahul", 90), ("Priya", 88)]
)

total_marks = marks.reduceByKey(lambda left, right: left + right).collect()

# Carry a (running_sum, running_count) pair per student, then divide.
avg_marks = (
    marks
    .combineByKey(
        createCombiner=lambda mark: (mark, 1),
        mergeValue=lambda acc, mark: (acc[0] + mark, acc[1] + 1),
        mergeCombiners=lambda left, right: (left[0] + right[0], left[1] + right[1]),
    )
    .mapValues(lambda sum_and_count: sum_and_count[0] / sum_and_count[1])
    .collect()
)

# Compare records by their mark (index 1); on a tie the right-hand record wins.
highest_mark = marks.reduce(
    lambda left, right: right if left[1] <= right[1] else left
)

print("Total marks per student:", total_marks)
print("Average marks per student:", avg_marks)
print("Highest mark overall:", highest_mark)

Total marks per student: [('Rahul', 175), ('Priya', 180), ('Aman', 78)]
Average marks per student: [('Rahul', 87.5), ('Priya', 90.0), ('Aman', 78.0)]
Highest mark overall: ('Priya', 92)


In [5]:
# Exercise 5: sum, product and average of a small RDD using reduce.
numbers = sc.parallelize([5, 10, 15, 20, 25])

sum_val = numbers.reduce(lambda acc, n: acc + n)
product_val = numbers.reduce(lambda acc, n: acc * n)

# Fold (value, 1) pairs element-wise to obtain the total and the element count
# in a single pass, then divide on the driver.
total_sum, count = (
    numbers
    .map(lambda n: (n, 1))
    .reduce(lambda left, right: (left[0] + right[0], left[1] + right[1]))
)
avg_val = total_sum / count

print("Sum:", sum_val)
print("Product:", product_val)
print("Average:", avg_val)

Sum: 75
Product: 375000
Average: 15.0


In [6]:
# Exercise 6: word lengths, the longest word and the mean word length.
words_rdd = sc.parallelize(
    ["data", "engineering", "spark", "rdd", "pyspark", "analytics"]
)

word_len = words_rdd.map(lambda word: (word, len(word))).collect()

# Keep the longer operand; on equal length the right-hand one wins.
longest_word = words_rdd.reduce(
    lambda left, right: right if len(left) <= len(right) else left
)

# Sum (length, 1) pairs element-wise to get total characters and word count.
total_len, total_count = (
    words_rdd
    .map(lambda word: (len(word), 1))
    .reduce(lambda left, right: (left[0] + right[0], left[1] + right[1]))
)
avg_word_len = total_len / total_count

print("Word lengths:", word_len)
print("Longest word:", longest_word)
print("Average word length:", avg_word_len)

Word lengths: [('data', 4), ('engineering', 11), ('spark', 5), ('rdd', 3), ('pyspark', 7), ('analytics', 9)]
Longest word: engineering
Average word length: 6.5


In [7]:
# Exercise 7: join two pair RDDs on their integer key.
students = sc.parallelize(
    [
        (1, "Rahul"),
        (2, "Priya"),
        (3, "Aman"),
    ]
)
courses = sc.parallelize(
    [
        (1, "Python"),
        (2, "Spark"),
        (4, "Databases"),
    ]
)

# Inner join keeps only keys present on both sides.
inner_join = students.join(courses).collect()
# Left outer join keeps every student; a missing course shows up as None.
left_join = students.leftOuterJoin(courses).collect()
# Right outer join keeps every course; a missing student shows up as None.
right_join = students.rightOuterJoin(courses).collect()

print("Inner Join:", inner_join)
print("Left Outer Join:", left_join)
print("Right Outer Join:", right_join)

Inner Join: [(1, ('Rahul', 'Python')), (2, ('Priya', 'Spark'))]
Left Outer Join: [(1, ('Rahul', 'Python')), (2, ('Priya', 'Spark')), (3, ('Aman', None))]
Right Outer Join: [(4, (None, 'Databases')), (1, ('Rahul', 'Python')), (2, ('Priya', 'Spark'))]


In [8]:
# Exercise 8: spend per customer, the top spender and overall revenue.
orders = sc.parallelize([(1, 200), (2, 500), (3, 300), (1, 150), (2, 250)])

# Sum the order amounts per customer id and bring the totals to the driver.
total_spend = orders.reduceByKey(lambda a, b: a + b).collect()

# Pick the (customer, total) pair with the largest total. max() returns the
# first maximal element, matching the strict ">" comparison of a manual scan,
# and does not leave a stray loop variable in the notebook namespace.
max_spend = max(total_spend, key=lambda customer_total: customer_total[1])

# Revenue is the sum of every order amount, regardless of customer.
total_revenue = orders.map(lambda x: x[1]).sum()

print("Total spend per customer:", total_spend)
print("Customer with max spend:", max_spend)
print("Total revenue:", total_revenue)

Total spend per customer: [(2, 750), (1, 350), (3, 300)]
Customer with max spend: (2, 750)
Total revenue: 1400
