In [2]:
from pyspark.sql import SparkSession

# Spark setup: build (or fetch) the session for this notebook and expose its
# SparkContext as `sc`, which the RDD cells below use.
spark = (
    SparkSession.builder
    .appName("RDD-Exercises-Set2")
    .getOrCreate()
)
sc = spark.sparkContext


In [3]:
# RDD of the integers 1 through 15 (range's upper bound is exclusive).
nums = sc.parallelize(range(1, 16))

# Numbers divisible by 3: keep the values that leave no remainder
multiples_of_three = nums.filter(lambda value: value % 3 == 0)
div_by_3 = multiples_of_three.collect()
print("Numbers divisible by 3:", div_by_3)

# Double each number
doubled_rdd = nums.map(lambda value: value * 2)
doubled = doubled_rdd.collect()
print("Doubled numbers:", doubled)

# Count numbers > 10: count() returns just the tally, not the elements
above_ten = nums.filter(lambda value: value > 10)
greater_than_10 = above_ten.count()
print("Count > 10:", greater_than_10)


Numbers divisible by 3: [3, 6, 9, 12, 15]
Doubled numbers: [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]
Count > 10: 5


In [4]:
# Small RDD of fruit names; "apple" and "banana" each appear twice.
fruits = sc.parallelize(["apple", "banana", "grape", "banana", "apple", "mango"])

# Distinct fruits
distinct_fruits = fruits.distinct().collect()
print("Distinct fruits:", distinct_fruits)

# Count occurrences: emit a (fruit, 1) pair per element, then add the ones per fruit
fruit_counts = fruits.map(lambda f: (f, 1)).reduceByKey(lambda a, b: a + b).collect()
print("Fruit counts:", fruit_counts)

# Longest word
# Rank by (length, word) rather than length alone: several fruits here share a
# length, and a comparison that just "keeps the later one" on ties is not
# commutative, so its result could depend on the order elements are compared in.
# With the word itself as tie-breaker the winner is the same in any order.
longest_fruit = fruits.max(key=lambda f: (len(f), f))
print("Longest fruit:", longest_fruit)

Distinct fruits: ['apple', 'banana', 'grape', 'mango']
Fruit counts: [('apple', 2), ('banana', 2), ('grape', 1), ('mango', 1)]
Longest fruit: banana


In [5]:
# Three short sentences used as the input text.
sentences = sc.parallelize([
    "spark makes big data easy",
    "rdd is the core of spark",
    "python with spark"
])

# Split into words
# split() with no argument splits on any run of whitespace, so repeated,
# leading or trailing spaces (or an empty sentence) never produce an
# empty-string "word" the way split(" ") would.
words = sentences.flatMap(lambda s: s.split())

# Lowercase + distinct: normalise case first so "Spark" and "spark" count once
unique_words = words.map(lambda w: w.lower()).distinct()
print("Unique words:", unique_words.collect())

# Count unique
unique_count = unique_words.count()
print("Unique word count:", unique_count)

Unique words: ['big', 'easy', 'rdd', 'core', 'of', 'python', 'with', 'spark', 'makes', 'data', 'is', 'the']
Unique word count: 12


In [6]:
# (student, mark) pairs; a student may appear more than once.
marks = sc.parallelize([
    ("Rahul", 85),
    ("Priya", 92),
    ("Aman", 78),
    ("Rahul", 90),
    ("Priya", 88)
])

# Total marks
total_marks = marks.reduceByKey(lambda a, b: a + b).collect()
print("Total marks per student:", total_marks)

# Average marks
# Carry a (sum, count) pair per student and combine pairs with reduceByKey,
# instead of groupByKey, which gathers every mark of a student before
# aggregating. The final mapValues divides sum by count.
marks_sum_count = marks.mapValues(lambda m: (m, 1)).reduceByKey(
    lambda a, b: (a[0] + b[0], a[1] + b[1])
)
marks_group = marks_sum_count.mapValues(lambda t: t[0] / t[1]).collect()
print("Average marks per student:", marks_group)

# Highest marks
# Rank by (mark, name) so that two students with the same top mark are ordered
# by name; comparing on the mark alone would make the winner depend on the
# order in which records are compared.
highest = marks.max(key=lambda rec: (rec[1], rec[0]))
print("Highest single mark:", highest)

Total marks per student: [('Rahul', 175), ('Priya', 180), ('Aman', 78)]
Average marks per student: [('Rahul', 87.5), ('Priya', 90.0), ('Aman', 78.0)]
Highest single mark: ('Priya', 92)


In [7]:
# Five multiples of 5 to aggregate.
nums2 = sc.parallelize([5, 10, 15, 20, 25])

# Sum
total_sum = nums2.reduce(lambda acc, value: acc + value)
print("Sum:", total_sum)

# Product
product = nums2.reduce(lambda acc, value: acc * value)
print("Product:", product)

# Average using reduce
# Pair each value with a count of 1, then add the pairs component-wise so a
# single reduce yields (sum of values, number of values).
value_one_pairs = nums2.map(lambda value: (value, 1))
sum_count = value_one_pairs.reduce(
    lambda left, right: (left[0] + right[0], left[1] + right[1])
)
average = sum_count[0] / sum_count[1]
print("Average:", average)

Sum: 75
Product: 375000
Average: 15.0


In [8]:
# Words whose lengths are analysed below.
words2 = sc.parallelize(["data", "engineering", "spark", "rdd", "pyspark", "analytics"])

# Word length
word_lengths = words2.map(lambda w: (w, len(w))).collect()
print("Word lengths:", word_lengths)

# Longest word
# Rank by (length, word): the word itself breaks ties between equal-length
# words, so the result does not depend on the order elements are compared in.
longest_word = words2.max(key=lambda w: (len(w), w))
print("Longest word:", longest_word)

# Average word length
# One pass: pair each length with a count of 1 and add the pairs
# component-wise, giving (total length, number of words) from a single reduce
# instead of a reduce followed by a separate count().
len_count = words2.map(lambda w: (len(w), 1)).reduce(
    lambda a, b: (a[0] + b[0], a[1] + b[1])
)
total_len = len_count[0]
avg_len = total_len / len_count[1]
print("Average word length:", avg_len)

Word lengths: [('data', 4), ('engineering', 11), ('spark', 5), ('rdd', 3), ('pyspark', 7), ('analytics', 9)]
Longest word: engineering
Average word length: 6.5


In [9]:
# Two pair RDDs keyed by id. Ids 1 and 2 appear on both sides, id 3 only in
# `students`, id 4 only in `courses`.
students = sc.parallelize([(1, "Rahul"), (2, "Priya"), (3, "Aman")])
courses = sc.parallelize([(1, "Python"), (2, "Spark"), (4, "Databases")])

# Run the three joins first, then report them in the same order.
# Inner join
inner = students.join(courses).collect()
# Left outer join
left_outer = students.leftOuterJoin(courses).collect()
# Right outer join
right_outer = students.rightOuterJoin(courses).collect()

print("Inner join:", inner)
print("Left outer join:", left_outer)
print("Right outer join:", right_outer)

Inner join: [(1, ('Rahul', 'Python')), (2, ('Priya', 'Spark'))]
Left outer join: [(1, ('Rahul', 'Python')), (2, ('Priya', 'Spark')), (3, ('Aman', None))]
Right outer join: [(4, (None, 'Databases')), (1, ('Rahul', 'Python')), (2, ('Priya', 'Spark'))]


In [10]:
# (customer_id, order_amount) pairs; customers 1 and 2 each placed two orders.
orders = sc.parallelize([
    (1, 200),
    (2, 500),
    (3, 300),
    (1, 150),
    (2, 250)
])

# Total spend per customer
spend_per_customer = orders.reduceByKey(lambda a, b: a + b).collect()
print("Total spend per customer:", spend_per_customer)

# Customer with max spend
# The per-customer totals are already a local list, so the builtin max() with a
# key on the amount replaces the manual scan. Like the strict ">" comparison it
# replaces, max() returns the first entry holding the largest amount.
max_spend = max(spend_per_customer, key=lambda entry: entry[1])
print("Customer with max spend:", max_spend)

# Total revenue: drop the customer ids and add up every order amount
total_revenue = orders.map(lambda x: x[1]).reduce(lambda a, b: a + b)
print("Total revenue:", total_revenue)

Total spend per customer: [(2, 750), (1, 350), (3, 300)]
Customer with max spend: (2, 750)
Total revenue: 1400
