In [1]:
# 1️⃣ Artist Catalog (Messy CSV)
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
import time
import os

spark = SparkSession.builder \
    .appName("StreamPulse-DataIngestion") \
    .master("local[*]") \
    .config("spark.driver.memory", "2g") \
    .config("spark.sql.shuffle.partitions", "8") \
    .getOrCreate()


In [2]:
# 2️⃣ Listening Events (JSON)

import random
random.seed(42)

genres = ["Pop", "Rock", "Hip-Hop", "Jazz", "Classical", "Electronic", "R&B", "Country"]
countries = ["US", "UK", "DE", "JP", "BR", "KR", "FR", "AU", "NG", "IN"]

artist_data = []
for i in range(50000):
    name = f"Artist_{i+1:05d}"
    genre = random.choice(genres)
    country = random.choice(countries)
    followers = random.randint(100, 5000000)
    monthly_listeners = random.randint(50, 2000000)
    verified = random.choice(["true", "false", "TRUE", "False", "yes", ""])
    join_date = f"202{random.randint(0,4)}-{random.randint(1,12):02d}-{random.randint(1,28):02d}"

    if random.random() < 0.02:
        followers = "NULL"
    if random.random() < 0.01:
        name = f'Artist "Special" {i}'

    artist_data.append((name, genre, country, str(followers), str(monthly_listeners), verified, join_date))

artist_columns = ["artist_name", "genre", "country", "followers", "monthly_listeners", "verified", "join_date"]
df_artists = spark.createDataFrame(artist_data, artist_columns)
df_artists.write.csv("data/artists_csv", header=True, mode="overwrite", quote='"', escape='"')
print(f"✅ Artist catalog saved: {df_artists.count()} rows")


✅ Artist catalog saved: 50000 rows


In [3]:
# 3️⃣ User Profiles (Parquet — Clean)
event_data = []
for i in range(100000):
    event_data.append({
        "event_id": f"EVT-{i+1:07d}",
        "user_id": f"USR-{random.randint(1, 20000):06d}",
        "artist_id": f"ART-{random.randint(1, 50000):05d}",
        "track_id": f"TRK-{random.randint(1, 200000):06d}",
        "duration_seconds": random.randint(10, 400),
        "completed": random.choice([True, False]),
        "timestamp": f"2025-{random.randint(1,6):02d}-{random.randint(1,28):02d}T{random.randint(0,23):02d}:{random.randint(0,59):02d}:00Z",
        "device": random.choice(["mobile", "desktop", "smart_speaker", "tablet"]),
        "quality": random.choice(["low", "standard", "high", "lossless"]),
    })

df_events = spark.createDataFrame(event_data)
df_events.write.json("data/events_json", mode="overwrite")
print(f"✅ Listening events saved: {df_events.count()} rows")


✅ Listening events saved: 100000 rows


In [4]:
user_data = []
for i in range(20000):
    user_data.append((
        f"USR-{i+1:06d}",
        f"user_{i+1}@email.com",
        random.choice(countries),
        random.choice(["free", "premium", "family", "student"]),
        random.randint(16, 75),
        f"202{random.randint(0,4)}-{random.randint(1,12):02d}-{random.randint(1,28):02d}",
    ))

user_columns = ["user_id", "email", "country", "subscription_tier", "age", "signup_date"]

df_users = spark.createDataFrame(user_data, user_columns)

df_users.write.parquet("data/users_parquet", mode="overwrite")

print(f"✅ User profiles saved: {df_users.count()} rows")

✅ User profiles saved: 20000 rows


In [5]:
df_bad = spark.read.csv("data/artists_csv")
df_bad.show(5)
df_bad.printSchema()


+------------+----------+-------+---------+-----------------+--------+----------+
|         _c0|       _c1|    _c2|      _c3|              _c4|     _c5|       _c6|
+------------+----------+-------+---------+-----------------+--------+----------+
| artist_name|     genre|country|followers|monthly_listeners|verified| join_date|
|Artist_24577|       R&B|     IN|   790768|          1080587|   False|2024-09-14|
|Artist_24578|Electronic|     AU|  4653247|           465108|    true|2023-02-14|
|Artist_24579|      Rock|     IN|  2324908|            55481|    TRUE|2023-04-02|
|Artist_24580| Classical|     FR|  4691325|           373026|     yes|2021-06-25|
+------------+----------+-------+---------+-----------------+--------+----------+
only showing top 5 rows
root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nulla

In [6]:
df_artists = spark.read.csv(
    "data/artists_csv",
    header=True,
    inferSchema=True,
    nullValue="NULL",
    quote='"',
    escape='"'
)
df_artists.show(5)
df_artists.printSchema()


+------------+----------+-------+---------+-----------------+--------+----------+
| artist_name|     genre|country|followers|monthly_listeners|verified| join_date|
+------------+----------+-------+---------+-----------------+--------+----------+
|Artist_24577|       R&B|     IN|   790768|          1080587|   False|2024-09-14|
|Artist_24578|Electronic|     AU|  4653247|           465108|    true|2023-02-14|
|Artist_24579|      Rock|     IN|  2324908|            55481|    TRUE|2023-04-02|
|Artist_24580| Classical|     FR|  4691325|           373026|     yes|2021-06-25|
|Artist_24581|      Rock|     US|  4859855|          1827868|    TRUE|2023-01-05|
+------------+----------+-------+---------+-----------------+--------+----------+
only showing top 5 rows
root
 |-- artist_name: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- country: string (nullable = true)
 |-- followers: integer (nullable = true)
 |-- monthly_listeners: integer (nullable = true)
 |-- verified: string 

In [8]:
artist_schema = StructType([
    StructField("artist_name", StringType(), True),
    StructField("genre", StringType(), True),
    StructField("country", StringType(), True),
    StructField("followers", IntegerType(), True),
    StructField("monthly_listeners", IntegerType(), True),
    StructField("verified", StringType(), True),
    StructField("join_date", DateType(), True),
])


In [9]:
start = time.time()
df_infer = spark.read.csv("data/artists_csv", header=True, inferSchema=True, nullValue="NULL")
time_infer = time.time() - start

start = time.time()
df_explicit = spark.read.csv("data/artists_csv", header=True, schema=artist_schema,
                              nullValue="NULL", dateFormat="yyyy-MM-dd")
time_explicit = time.time() - start

print(f"inferSchema: {time_infer:.3f}s")
print(f"Explicit:    {time_explicit:.3f}s")


inferSchema: 0.905s
Explicit:    0.063s


In [10]:
df_artists_clean = df_explicit.withColumn(
    "verified",
    when(lower(col("verified")).isin("true", "yes"), True)
    .when(lower(col("verified")).isin("false", "no", ""), False)
    .otherwise(None)
    .cast("boolean")
)
df_artists_clean.groupBy("verified").count().show()


+--------+-----+
|verified|count|
+--------+-----+
|    true|24711|
|   false|25289|
+--------+-----+



In [11]:
start = time.time()
df_events = spark.read.json("data/events_json")
time_json = time.time() - start

df_events.show(5, truncate=False)
df_events.printSchema()
print(f"JSON read: {time_json:.3f}s, {df_events.count()} rows")


+---------+---------+-------------+----------------+-----------+--------+--------------------+----------+----------+
|artist_id|completed|device       |duration_seconds|event_id   |quality |timestamp           |track_id  |user_id   |
+---------+---------+-------------+----------------+-----------+--------+--------------------+----------+----------+
|ART-49157|false    |mobile       |373             |EVT-0000001|lossless|2025-05-01T23:26:00Z|TRK-116991|USR-013081|
|ART-46374|true     |tablet       |67              |EVT-0000002|high    |2025-01-02T12:16:00Z|TRK-135446|USR-009729|
|ART-27500|false    |smart_speaker|289             |EVT-0000003|low     |2025-02-20T08:10:00Z|TRK-113525|USR-016830|
|ART-23751|true     |tablet       |100             |EVT-0000004|lossless|2025-05-17T13:36:00Z|TRK-146585|USR-017005|
|ART-06543|false    |mobile       |144             |EVT-0000005|high    |2025-04-21T17:19:00Z|TRK-198640|USR-019184|
+---------+---------+-------------+----------------+-----------+

In [12]:
df_events_clean = df_events.withColumn(
    "event_timestamp",
    to_timestamp(col("timestamp"), "yyyy-MM-dd'T'HH:mm:ss'Z'")
).drop("timestamp")

df_events_clean.select("event_id", "event_timestamp").show(5, truncate=False)


+-----------+-------------------+
|event_id   |event_timestamp    |
+-----------+-------------------+
|EVT-0000001|2025-05-01 23:26:00|
|EVT-0000002|2025-01-02 12:16:00|
|EVT-0000003|2025-02-20 08:10:00|
|EVT-0000004|2025-05-17 13:36:00|
|EVT-0000005|2025-04-21 17:19:00|
+-----------+-------------------+
only showing top 5 rows


In [13]:
df_events_enriched = df_events_clean \
    .withColumn("event_date", to_date(col("event_timestamp"))) \
    .withColumn("event_hour", hour(col("event_timestamp"))) \
    .withColumn("event_month", month(col("event_timestamp")))

df_events_enriched.select("event_id", "event_date", "event_hour", "event_month").show(5)


+-----------+----------+----------+-----------+
|   event_id|event_date|event_hour|event_month|
+-----------+----------+----------+-----------+
|EVT-0000001|2025-05-01|        23|          5|
|EVT-0000002|2025-01-02|        12|          1|
|EVT-0000003|2025-02-20|         8|          2|
|EVT-0000004|2025-05-17|        13|          5|
|EVT-0000005|2025-04-21|        17|          4|
+-----------+----------+----------+-----------+
only showing top 5 rows


In [14]:
start = time.time()
df_users = spark.read.parquet("data/users_parquet")
time_parquet = time.time() - start

df_users.show(5)
df_users.printSchema()
print(f"Parquet read: {time_parquet:.3f}s, {df_users.count()} rows")


+----------+----------------+-------+-----------------+---+-----------+
|   user_id|           email|country|subscription_tier|age|signup_date|
+----------+----------------+-------+-----------------+---+-----------+
|USR-000001|user_1@email.com|     DE|           family| 33| 2024-02-09|
|USR-000002|user_2@email.com|     IN|          premium| 49| 2020-02-19|
|USR-000003|user_3@email.com|     IN|           family| 74| 2022-08-08|
|USR-000004|user_4@email.com|     BR|          student| 40| 2024-10-01|
|USR-000005|user_5@email.com|     FR|          premium| 47| 2023-10-13|
+----------+----------------+-------+-----------------+---+-----------+
only showing top 5 rows
root
 |-- user_id: string (nullable = true)
 |-- email: string (nullable = true)
 |-- country: string (nullable = true)
 |-- subscription_tier: string (nullable = true)
 |-- age: long (nullable = true)
 |-- signup_date: string (nullable = true)

Parquet read: 0.609s, 20000 rows


In [15]:
start = time.time()
df_all = spark.read.parquet("data/users_parquet")
_ = df_all.count()
time_all = time.time() - start

start = time.time()
df_two = spark.read.parquet("data/users_parquet").select("user_id", "subscription_tier")
_ = df_two.count()
time_two = time.time() - start

print(f"All columns:  {time_all:.3f}s")
print(f"Two columns:  {time_two:.3f}s")


All columns:  0.701s
Two columns:  0.442s


| Aspect                | CSV               | JSON         | Parquet |
| --------------------- | ----------------- | ------------ | ------- |
| Read time             | Slowest           | Medium       | Fastest |
| Schema auto-detected? | Needs inferSchema | Yes          | Yes     |
| Types accurate?       | Often wrong       | Mostly right | Perfect |
| Options needed        | Many              | Few          | None    |
| Cleaning required     | High              | Medium       | Minimal |


In [17]:
df_enriched_listens = df_events_enriched \
    .join(df_users.select("user_id", "country", "subscription_tier", "age"),
          on="user_id", how="left") \
    .join(df_artists_clean.select(
              col("artist_name"),
              col("genre"),
              col("country").alias("artist_country"),
              col("followers")
          ).withColumn("artist_id",
              concat(lit("ART-"), lpad(monotonically_increasing_id().cast("string"), 5, "0"))
          ),
          on="artist_id", how="left")


In [18]:
df_enriched_listens.coalesce(4) \
    .write.parquet(
        "data/unified_listening_analytics",
        mode="overwrite",
        partitionBy=["event_month"],
        compression="snappy"
    )

result = spark.read.parquet("data/unified_listening_analytics")
print(f"✅ Unified dataset: {result.count()} rows, {len(result.columns)} columns")
result.show(5)


✅ Unified dataset: 100000 rows, 19 columns
+---------+----------+---------+-------------+----------------+-----------+--------+----------+-------------------+----------+----------+-------+-----------------+---+------------+---------+--------------+---------+-----------+
|artist_id|   user_id|completed|       device|duration_seconds|   event_id| quality|  track_id|    event_timestamp|event_date|event_hour|country|subscription_tier|age| artist_name|    genre|artist_country|followers|event_month|
+---------+----------+---------+-------------+----------------+-----------+--------+----------+-------------------+----------+----------+-------+-----------------+---+------------+---------+--------------+---------+-----------+
|ART-17153|USR-012843|    false|       mobile|              74|EVT-0000020|    high|TRK-108464|2025-03-16 04:35:00|2025-03-16|         4|     IN|          student| 22|Artist_41730|Classical|            DE|  1954312|          3|
|ART-03721|USR-008636|     true|smart_speaker