In [2]:
from pyspark.sql import SparkSession

# Obtain a SparkSession named "Spark RDD Examples" (getOrCreate reuses an
# already-running session if there is one) and keep a handle to its
# SparkContext, which is the entry point for the RDD API used below.
spark = (
    SparkSession.builder
    .appName("Spark RDD Examples")
    .getOrCreate()
)

sc = spark.sparkContext

# a) PARALLELIZE
# Turn an in-memory Python list into an RDD, then bring it back to the driver.
print("\n=== a) parallelize() ===")
data = [10, 20, 30, 40, 50]
rdd_parallel = sc.parallelize(data)
parallel_items = rdd_parallel.collect()  # action: returns all elements as a list
print("Parallelized RDD:", parallel_items)

# b) READ TEXT FILE & CSV
print("\n=== b) Read Text File & CSV ===")

# Text file: every line of the file becomes one RDD element.
# (Assumes sample_text.txt exists.)
# Example content: line1\nline2\nline3
rdd_text = sc.textFile("sample_text.txt")
print("Text File RDD:", rdd_text.take(3))  # Show first 3 lines

# CSV file, read as raw text lines. (Assumes sample_csv.csv exists.)
# Example content:
# name,age
# Alice,21
# Bob,22
rdd_csv = sc.textFile("sample_csv.csv")

# take(1) yields an empty list for an empty file, whereas first() would raise;
# header is None in that case.
header_rows = rdd_csv.take(1)
header = header_rows[0] if header_rows else None

# Drop the header by position rather than by value: comparing each row to the
# header text would also discard any data row that happens to equal it.
# zipWithIndex assigns index 0 to the first element of the first partition,
# which for a single text file is its first line.
rdd_csv_data = (
    rdd_csv.zipWithIndex()
    .filter(lambda line_and_index: line_and_index[1] > 0)
    .keys()
)
print("CSV File RDD (without header):", rdd_csv_data.take(2))

# c) CREATE RDD
# Build an RDD of strings from a local list and print its contents.
print("\n=== c) Create RDD ===")
subjects = ["CS", "Maths", "Physics"]
rdd_created = sc.parallelize(subjects)
print("Created RDD:", rdd_created.collect())

# d) ACTIONS
# Each entry pairs the printed label with a zero-argument callable that runs
# one RDD action; they are executed and printed one at a time, in order.
print("\n=== d) RDD Actions ===")
numbers = sc.parallelize([1, 2, 3, 4, 5])
actions = (
    ("Count:", numbers.count),
    ("First:", numbers.first),
    ("Take 3:", lambda: numbers.take(3)),
    ("Sum using reduce:", lambda: numbers.reduce(lambda total, value: total + value)),
)
for label, run_action in actions:
    print(label, run_action())

# e) PAIR FUNCTIONS
print("\n=== e) Pair Functions ===")
subject_scores = [("CS", 10), ("Maths", 15), ("CS", 20), ("Physics", 10)]
pair_rdd = sc.parallelize(subject_scores)

# Transformations are lazy, so the derived RDDs can all be declared first;
# nothing is computed until collect() is called in the loop below.

# reduceByKey: add up the values that share a key.
reduced = pair_rdd.reduceByKey(lambda total, score: total + score)

# groupByKey: gather the values per key; mapValues(list) turns each group
# into a plain list so it prints readably.
grouped = pair_rdd.groupByKey().mapValues(list)

# mapValues: double every value, leaving the keys untouched.
mapped = pair_rdd.mapValues(lambda score: score * 2)

# Collect and print each result in turn (keys()/values() project one side
# of every pair).
for label, rdd in (
    ("reduceByKey:", reduced),
    ("groupByKey:", grouped),
    ("mapValues (values * 2):", mapped),
    ("Keys:", pair_rdd.keys()),
    ("Values:", pair_rdd.values()),
):
    print(label, rdd.collect())

# Stop the Spark session now that all examples have run.
spark.stop()



=== a) parallelize() ===
Parallelized RDD: [10, 20, 30, 40, 50]

=== b) Read Text File & CSV ===
Text File RDD: ['This is line one.', 'This is line two.', 'This is line three.']
CSV File RDD (without header): ['Alice,21', 'Bob,22']

=== c) Create RDD ===
Created RDD: ['CS', 'Maths', 'Physics']

=== d) RDD Actions ===
Count: 5
First: 1
Take 3: [1, 2, 3]
Sum using reduce: 15

=== e) Pair Functions ===
reduceByKey: [('CS', 30), ('Maths', 15), ('Physics', 10)]
groupByKey: [('CS', [10, 20]), ('Maths', [15]), ('Physics', [10])]
mapValues (values * 2): [('CS', 20), ('Maths', 30), ('CS', 40), ('Physics', 20)]
Keys: ['CS', 'Maths', 'CS', 'Physics']
Values: [10, 15, 20, 10]
