In [1]:
from pyspark.sql import SparkSession

# Build (or reuse, via getOrCreate) the session for this lab in local mode.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("M16-Lab02-RDD-vs-DataFrame")
    .getOrCreate()
)

# Handle to the underlying SparkContext; the RDD pipelines read files through it.
sc = spark.sparkContext

# Echo the version and app name so the reader can see which session is active.
print(f"Spark version: {spark.version}")
print(f"App name: {spark.sparkContext.appName}")
print("✅ SparkSession created")

Spark version: 4.0.2
App name: M16-Lab02-RDD-vs-DataFrame
✅ SparkSession created


In [2]:
# Sample play events: a header row followed by one row per play.
plays_data = """play_id,user_id,song_id,artist_id,played_at,duration_seconds
P001,U101,S001,A01,2025-03-01 08:15:00,240
P002,U102,S002,A02,2025-03-01 09:30:00,180
P003,U101,S001,A01,2025-03-01 10:00:00,240
P004,U103,S003,A01,2025-03-01 11:45:00,300
P005,U102,S001,A01,2025-03-01 12:00:00,240
P006,U104,S004,A03,2025-03-01 13:30:00,200
P007,U101,S002,A02,2025-03-01 14:00:00,180
P008,U105,S005,A02,2025-03-01 15:15:00,220
P009,U103,S001,A01,2025-03-01 16:00:00,240
P010,U104,S003,A01,2025-03-01 17:30:00,300
P011,U102,S004,A03,2025-03-02 08:00:00,200
P012,U101,S005,A02,2025-03-02 09:15:00,220
P013,U105,S001,A01,2025-03-02 10:30:00,240
P014,U103,S002,A02,2025-03-02 11:00:00,180
P015,U104,S001,A01,2025-03-02 12:45:00,240"""

# Song catalogue: a header row followed by one row per song_id.
songs_data = """song_id,song_name,genre,release_year
S001,Midnight Drive,Pop,2024
S002,Ocean Waves,Rock,2023
S003,City Lights,Pop,2025
S004,Thunder Road,Rock,2024
S005,Sunset Blvd,Jazz,2023"""

# Write both fixtures into the working directory (plays first, then songs) so
# the pipelines below can load them by relative path.
for csv_name, csv_text in (("plays.csv", plays_data), ("songs.csv", songs_data)):
    with open(csv_name, "w") as f:
        f.write(csv_text)

print("✅ CSV files created")

✅ CSV files created


In [3]:
print("=" * 50)
print("PIPELINE 1: Play Count by Song (RDD)")
print("=" * 50)

# Read the CSV as raw text lines; nothing here knows about headers or columns.
rdd = sc.textFile("plays.csv")

# Drop the header by comparing every line against the first line of the file.
header = rdd.first()
data = rdd.filter(lambda line: line != header)

# Split each line on commas; from here on fields are addressed by position.
parsed = data.map(lambda line: line.split(","))

song_plays = (
    parsed
    .map(lambda row: (row[2], 1))        # row[2] is song_id -> (song_id, 1)
    .reduceByKey(lambda a, b: a + b)     # add up the 1s for each song
    .sortBy(lambda x: -x[1])             # most-played first; no tie-break key is given
)

print("\nRDD Result — Play count by song:")
# The loop variable must not be called `count`: the DataFrame cells use
# pyspark.sql.functions.count under that global name, and rebinding it to an
# int here would break them if this cell were re-run afterwards.
for song_id, n_plays in song_plays.collect():
    print(f"  {song_id}: {n_plays} plays")


PIPELINE 1: Play Count by Song (RDD)

RDD Result — Play count by song:
  S001: 6 plays
  S002: 3 plays
  S004: 2 plays
  S003: 2 plays
  S005: 2 plays


In [4]:
# Pipeline 1 again, this time written with the DataFrame API.
from pyspark.sql.functions import col, count, desc

print("\n" + "=" * 50)
print("PIPELINE 1: Play Count by Song (DataFrame)")
print("=" * 50)

# The reader takes the header row as column names and infers column types.
df = spark.read.csv("plays.csv", inferSchema=True, header=True)

# Count plays per song, then order from most-played to least-played.
plays_per_song = df.groupBy("song_id").agg(count("play_id").alias("play_count"))
song_plays_df = plays_per_song.orderBy(desc("play_count"))

print("\nDataFrame Result — Play count by song:")
song_plays_df.show()



PIPELINE 1: Play Count by Song (DataFrame)

DataFrame Result — Play count by song:
+-------+----------+
|song_id|play_count|
+-------+----------+
|   S001|         6|
|   S002|         3|
|   S004|         2|
|   S005|         2|
|   S003|         2|
+-------+----------+



In [5]:
# Step 2c: compare and verify.
# Both results become {song_id: play_count} dicts, so the comparison does not
# depend on the order in which either pipeline returned its rows.
rdd_result = {song_id: plays for song_id, plays in song_plays.collect()}
df_result = dict(
    (row["song_id"], row["play_count"]) for row in song_plays_df.collect()
)

print("\nVerification:")
print(f"RDD result:       {rdd_result}")
print(f"DataFrame result: {df_result}")
print(f"Results match:    {rdd_result == df_result}")



Verification:
RDD result:       {'S001': 6, 'S002': 3, 'S004': 2, 'S003': 2, 'S005': 2}
DataFrame result: {'S001': 6, 'S002': 3, 'S004': 2, 'S005': 2, 'S003': 2}
Results match:    True


## Pipeline 1 — Play Count by Song

| Metric | RDD Version | DataFrame Version |
|--------|-------------|------------------|
| Lines of code | 14 | 8 |
| Column access | By index (row[2]) | By name ("song_id") |
| CSV parsing | Manual (split(",")) | Automatic (header=True, inferSchema=True) |
| Optimization | None | Catalyst optimizer |
| Readability | Low (row[2] is unclear) | High (explicit column names) |


In [6]:
# Pipeline 2 — total listening time per user
# Step 3a: legacy RDD implementation

print("=" * 50)
print("PIPELINE 2: Listening Time per User (RDD)")
print("=" * 50)

# Same manual load as Pipeline 1: raw lines, minus the header, split on commas.
rdd = sc.textFile("plays.csv")
header = rdd.first()
data = rdd.filter(lambda text: text != header)
parsed = data.map(lambda text: text.split(","))

# fields[1] is user_id and fields[5] is duration_seconds; the duration is still
# text at this point, so it is cast to int before summing.
seconds_by_user = (
    parsed
    .map(lambda fields: (fields[1], int(fields[5])))
    .reduceByKey(lambda left, right: left + right)
)

# Convert the per-user totals from seconds to minutes, rounded to one decimal.
minutes_by_user = seconds_by_user.mapValues(lambda seconds: round(seconds / 60, 1))

# Heaviest listeners first.
user_time = minutes_by_user.sortBy(lambda pair: -pair[1])

print("\nRDD Result — Listening time per user (minutes):")
for user_id, minutes in user_time.collect():
    print(f"  {user_id}: {minutes} min")


PIPELINE 2: Listening Time per User (RDD)

RDD Result — Listening time per user (minutes):
  U101: 14.7 min
  U104: 12.3 min
  U103: 12.0 min
  U102: 10.3 min
  U105: 7.7 min


In [7]:
# Step 3b: DataFrame rewrite of Pipeline 2
# The Spark functions are imported under aliases so Python's built-in
# sum() and round() are not shadowed.
from pyspark.sql.functions import round as spark_round, sum as spark_sum

print("\n" + "=" * 50)
print("PIPELINE 2: Listening Time per User (DataFrame)")
print("=" * 50)

df = spark.read.csv("plays.csv", inferSchema=True, header=True)

# Column expression: summed seconds -> minutes, rounded to one decimal place.
total_minutes = spark_round(spark_sum("duration_seconds") / 60, 1).alias("total_minutes")

# `desc` is the function imported in the Pipeline 1 DataFrame cell.
user_time_df = df.groupBy("user_id").agg(total_minutes).orderBy(desc("total_minutes"))

print("\nDataFrame Result — Listening time per user (minutes):")
user_time_df.show()



PIPELINE 2: Listening Time per User (DataFrame)

DataFrame Result — Listening time per user (minutes):
+-------+-------------+
|user_id|total_minutes|
+-------+-------------+
|   U101|         14.7|
|   U104|         12.3|
|   U103|         12.0|
|   U102|         10.3|
|   U105|          7.7|
+-------+-------------+



In [8]:
# Step 3c: compare and verify.
# Both sides become {user_id: minutes} dicts. The minutes were already rounded
# to one decimal by each pipeline, and the comparison below is exact equality.
rdd_result = {user_id: minutes for user_id, minutes in user_time.collect()}
df_result = dict(
    (row["user_id"], row["total_minutes"]) for row in user_time_df.collect()
)

print("\nVerification:")
print(f"RDD result:       {rdd_result}")
print(f"DataFrame result: {df_result}")
print(f"Results match:    {rdd_result == df_result}")



Verification:
RDD result:       {'U101': 14.7, 'U104': 12.3, 'U103': 12.0, 'U102': 10.3, 'U105': 7.7}
DataFrame result: {'U101': 14.7, 'U104': 12.3, 'U103': 12.0, 'U102': 10.3, 'U105': 7.7}
Results match:    True


## Pipeline 2 — Listening Time per User

| Metric | RDD Version | DataFrame Version |
|--------|-------------|------------------|
| Lines of code | 15 | 9 |
| Column access | By index (row[1], row[5]) | By name ("user_id", "duration_seconds") |
| Type handling | Manual (int(row[5])) | Automatic (inferSchema=True) |
| Aggregation | reduceByKey + mapValues | groupBy + agg |
| Optimization | None | Catalyst optimizer |
| Readability | Medium-Low (index-based access) | High (clear column operations) |

### Analysis

- The DataFrame version reduces code by approximately 40%.
- Manual type conversion (int(row[5])) is eliminated with schema inference.
- Aggregations are more declarative and SQL-like using groupBy and agg.
- Column names make the logic clearer and easier to maintain.
- The Catalyst optimizer enables performance improvements that are not available to RDD code.


In [9]:
# Pipeline 3 — top songs with genre
# Step 4a: legacy RDD implementation


def _flatten_joined(pair):
    """Turn (song_id, (plays, (song_name, genre))) into a flat 4-tuple."""
    song_id, (plays, (song_name, genre)) = pair
    return (song_id, song_name, genre, plays)


print("=" * 50)
print("PIPELINE 3: Top Songs with Genre (RDD)")
print("=" * 50)

# NOTE: plays_data / songs_data are rebound to RDDs here, replacing the CSV
# text strings of the same names created when the fixture files were written.
plays_rdd = sc.textFile("plays.csv")
plays_header = plays_rdd.first()
plays_data = (
    plays_rdd
    .filter(lambda text: text != plays_header)
    .map(lambda text: text.split(","))
)

songs_rdd = sc.textFile("songs.csv")
songs_header = songs_rdd.first()
songs_data = (
    songs_rdd
    .filter(lambda text: text != songs_header)
    .map(lambda text: text.split(","))
)

# Left side of the join: (song_id, play count).
plays_keyed = (
    plays_data
    .map(lambda fields: (fields[2], 1))
    .reduceByKey(lambda left, right: left + right)
)

# Right side of the join: (song_id, (song_name, genre)).
songs_keyed = songs_data.map(lambda fields: (fields[0], (fields[1], fields[2])))

# Join on song_id, flatten the nested value, then sort by play count descending.
joined = (
    plays_keyed
    .join(songs_keyed)
    .map(_flatten_joined)
    .sortBy(lambda record: -record[3])
)

print("\nRDD Result — Top songs with genre:")
for song_id, song_name, genre, plays in joined.collect():
    print(f"  {song_name} ({genre}): {plays} plays")


PIPELINE 3: Top Songs with Genre (RDD)

RDD Result — Top songs with genre:
  Midnight Drive (Pop): 6 plays
  Ocean Waves (Rock): 3 plays
  Thunder Road (Rock): 2 plays
  Sunset Blvd (Jazz): 2 plays
  City Lights (Pop): 2 plays


In [10]:
# Step 4b: DataFrame rewrite of Pipeline 3

print("\n" + "=" * 50)
print("PIPELINE 3: Top Songs with Genre (DataFrame)")
print("=" * 50)

plays_df = spark.read.csv("plays.csv", inferSchema=True, header=True)
songs_df = spark.read.csv("songs.csv", inferSchema=True, header=True)

# Plays per song. `count` and `desc` are the pyspark.sql.functions names
# imported in the Pipeline 1 DataFrame cell.
play_counts_df = plays_df.groupBy("song_id").agg(count("play_id").alias("play_count"))

# Attach song metadata via the shared song_id column, keep the four report
# columns, and order by play count descending.
top_songs_df = (
    play_counts_df
    .join(songs_df, on="song_id")
    .select("song_id", "song_name", "genre", "play_count")
    .orderBy(desc("play_count"))
)

print("\nDataFrame Result — Top songs with genre:")
top_songs_df.show()



PIPELINE 3: Top Songs with Genre (DataFrame)

DataFrame Result — Top songs with genre:
+-------+--------------+-----+----------+
|song_id|     song_name|genre|play_count|
+-------+--------------+-----+----------+
|   S001|Midnight Drive|  Pop|         6|
|   S002|   Ocean Waves| Rock|         3|
|   S004|  Thunder Road| Rock|         2|
|   S005|   Sunset Blvd| Jazz|         2|
|   S003|   City Lights|  Pop|         2|
+-------+--------------+-----+----------+



In [11]:
# Step 4c: Compare and verify
#
# Compare complete (song_id, song_name, genre, play_count) records rather than
# only (song_id, play_count). The joined song_name and genre columns are what
# this pipeline adds, so a check limited to the counts would still pass if the
# join attached the wrong metadata.
rdd_result = [(r[0], r[1], r[2], r[3]) for r in joined.collect()]
df_result = [
    (row["song_id"], row["song_name"], row["genre"], row["play_count"])
    for row in top_songs_df.collect()
]

# Sort both sides before comparing: neither pipeline gives a tie-break key for
# songs with equal play counts, so their row order can legitimately differ.
rdd_sorted = sorted(rdd_result)
df_sorted = sorted(df_result)

print("\nVerification:")
print(f"Results match: {rdd_sorted == df_sorted}")



Verification:
Results match: True


## Pipeline 3 — Top Songs with Genre (Join)

| Metric | RDD Version | DataFrame Version |
|--------|-------------|------------------|
| Lines of code | 22 | 11 |
| Join logic | Manual key-value join | Built-in join() |
| Column access | Nested tuple access (x[1][1][0]) | Column names ("song_name") |
| CSV parsing | Manual split(",") | Automatic (header=True) |
| Aggregation | reduceByKey | groupBy + agg |
| Optimization | None | Catalyst optimizer |
| Readability | Very Low (deep tuple indexing) | Very High (SQL-like syntax) |


### Analysis

- The DataFrame version reduces code by approximately 50%.
- The RDD join requires complex nested tuple access (x[1][1][0]), which is difficult to read and maintain.
- DataFrame joins are declarative and self-documenting.
- Column-based operations reduce indexing errors.
- The Catalyst optimizer can optimize DataFrame joins (for example, by choosing a broadcast join or pushing predicates down); RDD joins receive no such automatic optimization.
- Join-heavy pipelines benefit the MOST from migrating to DataFrames.

## RDD to DataFrame Migration Guide

### Overall Comparison

| Metric | Pipeline 1 | Pipeline 2 | Pipeline 3 |
|--------|------------|------------|------------|
| RDD lines of code | 14 | 15 | 22 |
| DataFrame lines of code | 8 | 9 | 11 |
| Code reduction | 43% | 40% | 50% |
| Results match | ✅ | ✅ | ✅ |

---

### Key Findings

1. DataFrames require approximately **40–50% less code** across all pipelines.
2. Column access by name is **safer and more readable** than index-based access.
3. Join operations are **dramatically more readable** with DataFrames.
4. The **Catalyst optimizer** can optimize DataFrame queries but **cannot optimize RDD transformations**.
5. Schema inference eliminates manual type conversion.
6. DataFrame syntax is closer to SQL, making onboarding easier for new engineers.

---

### Technical Advantages of DataFrames

| Feature | RDD | DataFrame |
|----------|------|------------|
| Schema awareness | ❌ No schema | ✅ Structured schema |
| Catalyst optimizer | ❌ Not supported | ✅ Automatic optimization |
| Tungsten execution engine | ❌ No | ✅ Yes |
| Join optimization | ❌ Manual only | ✅ Broadcast + optimized joins |
| SQL support | ❌ No | ✅ Yes |
| Code maintainability | Low | High |

---

### Performance Considerations

- RDD transformations operate at a lower abstraction level.
- DataFrames allow Spark’s optimizer to:
  - Reorder operations
  - Push filters down
  - Optimize joins
  - Reduce shuffle
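- As a rule of thumb, DataFrames are often cited as roughly **2–5x faster** than equivalent RDD implementations in production systems. This lab did not benchmark the pipelines, so measure on real workloads before relying on that figure.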

---

### Migration Recommendation for StreamPulse

**Priority 1 — Migrate Join-Based Pipelines**
- Highest readability improvement
- Highest performance gain
- Most bug-prone in RDD form

**Priority 2 — Migrate Aggregation Pipelines**
- Significant code reduction
- Performance optimization through Catalyst

**Priority 3 — Keep RDD Only For:**
- Low-level custom transformations
- Complex unstructured data processing
- Situations requiring fine-grained partition control

---

### Final Recommendation

StreamPulse should standardize on the DataFrame API for all new development and gradually migrate legacy RDD pipelines.  

RDDs should be treated as a **low-level API for specialized use cases**, not the default choice.

The migration improves:
- Code readability
- Maintainability
- Performance
- Team onboarding efficiency
