In [1]:
# Import necessary libraries
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from delta import *

In [2]:
# ==> THE SETTINGS BELOW ARE WHAT MAKE THE SESSION DELTA-AWARE <==
# Two session-level options are set before the session is created:
#   * spark.sql.extensions            -> the Delta SQL session extension
#   * spark.sql.catalog.spark_catalog -> the Delta implementation of the catalog
builder = SparkSession.builder.appName("AwesomeDeltaLake")
builder = builder.config(
    "spark.sql.extensions",
    "io.delta.sql.DeltaSparkSessionExtension",
)
builder = builder.config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
)

# Hand the builder to the delta helper, then create (or reuse) the session.
# The Ivy log printed by this cell shows the delta-spark jars being resolved.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

print("✨ Spark and Delta Lake are ready to go! ✨")

:: loading settings :: url = jar:file:/Users/jesses_fables/Desktop/.venv/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /Users/jesses_fables/.ivy2.5.2/cache
The jars for the packages stored in: /Users/jesses_fables/.ivy2.5.2/jars
io.delta#delta-spark_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-6e8bc227-8d38-43f6-9b6b-467668749f87;1.0
	confs: [default]
	found io.delta#delta-spark_2.13;4.0.0 in central
	found io.delta#delta-storage;4.0.0 in central
	found org.antlr#antlr4-runtime;4.13.1 in central
:: resolution report :: resolve 498ms :: artifacts dl 40ms
	:: modules in use:
	io.delta#delta-spark_2.13;4.0.0 from central in [default]
	io.delta#delta-storage;4.0.0 from central in [default]
	org.antlr#antlr4-runtime;4.13.1 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules     

✨ Spark and Delta Lake are ready to go! ✨


In [3]:
# Where the demo Delta table lives (a relative path).
delta_table_path = "fire/tmp/delta-table"

# Three starter rows; the column names are supplied as the schema.
data = spark.createDataFrame(
    data=[
        (1, "Alice", 25),
        (2, "Bob", 30),
        (3, "Charlie", 35),
    ],
    schema=["id", "name", "age"],
)

# Persist the rows in Delta format. "overwrite" replaces whatever table
# content is already stored at the path.
print(f"Writing data to Delta table at: {delta_table_path}")
data.write.save(delta_table_path, format="delta", mode="overwrite")

print("✅ Write complete!")

Writing data to Delta table at: fire/tmp/delta-table


25/09/07 11:44:07 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

✅ Write complete!


In [4]:
# Sanity check: load the table that was just written and display its rows.
df = spark.read.load(delta_table_path, format="delta")

print("Reading data from Delta table:")
df.show()

Reading data from Delta table:


                                                                                

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  3|Charlie| 35|
|  1|  Alice| 25|
|  2|    Bob| 30|
+---+-------+---+



In [5]:
# NOTE(review): this cell repeats the read performed by the previous cell
# verbatim (same path, same `df` name); it rebinds `df` to an equivalent
# DataFrame and prints the same rows.
df = spark.read.format("delta").load(path=delta_table_path)

print("Reading data from Delta table:")
df.show()

Reading data from Delta table:
+---+-------+---+
| id|   name|age|
+---+-------+---+
|  3|Charlie| 35|
|  1|  Alice| 25|
|  2|    Bob| 30|
+---+-------+---+



In [6]:
# Rows to upsert into the table: one id that already exists and one that does not.
newData = spark.createDataFrame(
    data=[
        (1, "Alicia", 26),  # id 1 is already in the table -> new name and age
        (4, "David", 40),   # id 4 is not in the table yet -> new person
    ],
    schema=["id", "name", "age"],
)

print("New data to merge:")
newData.show()


New data to merge:
+---+------+---+
| id|  name|age|
+---+------+---+
|  1|Alicia| 26|
|  4| David| 40|
+---+------+---+



In [7]:
# Import only the class this cell needs. A wildcard import would copy every
# public name of delta.tables into the notebook namespace and hide where
# DeltaTable comes from.
from delta.tables import DeltaTable

# Open the existing table at delta_table_path as a DeltaTable object, which
# exposes the merge builder used below.
deltaTable = DeltaTable.forPath(spark, delta_table_path)

# Upsert newData into the table, matching rows on id:
#   * id present in both    -> overwrite name and age with the incoming values
#   * id only in newData    -> insert the incoming row
# Rows that exist only in the table are not referenced by either clause.
print("Performing merge...")
(
    deltaTable.alias("oldData")
    .merge(
        newData.alias("newData"),
        "oldData.id = newData.id",
    )
    .whenMatchedUpdate(
        set={
            "name": col("newData.name"),
            "age": col("newData.age"),
        }
    )
    .whenNotMatchedInsert(
        values={
            "id": col("newData.id"),
            "name": col("newData.name"),
            "age": col("newData.age"),
        }
    )
    .execute()
)

print("✅ Merge complete!")


Performing merge...


                                                                                

✅ Merge complete!


25/09/07 11:44:29 WARN MapPartitionsRDD: RDD 57 was locally checkpointed, its lineage has been truncated and cannot be recomputed after unpersisting


In [8]:
# Expected after the merge: ids 2 (Bob) and 3 (Charlie) unchanged,
# id 1 renamed from Alice to Alicia, and id 4 (David) newly added.
print("Table content after merge:")
(
    spark.read.format("delta")
    .load(delta_table_path)
    .orderBy("id")
    .show()
)

Table content after merge:


                                                                                

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1| Alicia| 26|
|  2|    Bob| 30|
|  3|Charlie| 35|
|  4|  David| 40|
+---+-------+---+



In [9]:
# Time travel: versionAsOf=0 selects the table as of its first commit,
# i.e. the initial write, before the merge was applied.
print("Reading table at version 0 (before merge):")
df_v0 = (
    spark.read.format("delta")
    .option("versionAsOf", 0)
    .load(delta_table_path)
)
df_v0.orderBy("id").show()

Reading table at version 0 (before merge):


                                                                                

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 25|
|  2|    Bob| 30|
|  3|Charlie| 35|
+---+-------+---+



In [10]:
# Time travel again: version 1 is the commit produced by the merge, which is
# the most recent version of the table at this point in the notebook.
print("Reading table at version 1 (after merge):")
df_v1 = (
    spark.read.format("delta")
    .option("versionAsOf", 1)
    .load(delta_table_path)
)
df_v1.orderBy("id").show()

Reading table at version 1 (after merge):
+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1| Alicia| 26|
|  2|    Bob| 30|
|  3|Charlie| 35|
|  4|  David| 40|
+---+-------+---+



In [11]:
# Imported here so the cell is self-contained; AnalysisException is the error
# type PySpark raises for analysis-time failures such as a schema mismatch.
from pyspark.errors import AnalysisException

# One row whose schema has an extra column ('city') that the table does not have.
badData = spark.createDataFrame(
    [(5, "Eve", 28, "Miami")],
    ["id", "name", "age", "city"],
)

# Try a plain append. Only the analysis error is treated as the "expected"
# failure; any other exception (I/O problem, stopped session, ...) propagates
# instead of being reported as a schema mismatch.
print("Attempting to write data with a different schema...")
try:
    badData.write.format("delta").mode("append").save(delta_table_path)
except AnalysisException as e:
    print("🚨 ERROR! As expected, the write failed.")
    print(e)
else:
    # Reached when the append is accepted, e.g. when this cell is re-run after
    # the table schema has already been evolved to include 'city'.
    print("⚠️ The write succeeded: the table schema already accepts this data, so the row was appended.")

Attempting to write data with a different schema...
🚨 ERROR! As expected, the write failed.
[_LEGACY_ERROR_TEMP_DELTA_0007] A schema mismatch detected when writing to the Delta table (Table ID: bcf4e37d-3820-4e1a-b849-58198574cb2a).
To enable schema migration using DataFrameWriter or DataStreamWriter, please set:
'.option("mergeSchema", "true")'.
For other operations, set the session configuration
spark.databricks.delta.schema.autoMerge.enabled to "true". See the documentation
specific to the operation for details.

Table schema:
root
-- id: long (nullable = true)
-- name: string (nullable = true)
-- age: long (nullable = true)


Data schema:
root
-- id: long (nullable = true)
-- name: string (nullable = true)
-- age: long (nullable = true)
-- city: string (nullable = true)

         


In [12]:
# Same append as before, now with the mergeSchema writer option set to "true"
# (the option the schema-mismatch error message recommends).
print("Writing again with schema evolution enabled...")

badData.write.save(
    delta_table_path,
    format="delta",
    mode="append",
    mergeSchema="true",
)

print("✅ Write successful!")

Writing again with schema evolution enabled...


                                                                                

✅ Write successful!


In [13]:
# After the mergeSchema append the table carries a fourth column, 'city':
# populated for the newly appended row and null for the rows written earlier.
print("Table content after schema evolution:")
(
    spark.read.format("delta")
    .load(delta_table_path)
    .orderBy("id")
    .show()
)

print("New schema:")
(
    spark.read.format("delta")
    .load(delta_table_path)
    .printSchema()
)

Table content after schema evolution:


                                                                                

+---+-------+---+-----+
| id|   name|age| city|
+---+-------+---+-----+
|  1| Alicia| 26| NULL|
|  2|    Bob| 30| NULL|
|  3|Charlie| 35| NULL|
|  4|  David| 40| NULL|
|  5|    Eve| 28|Miami|
+---+-------+---+-----+

New schema:
root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- city: string (nullable = true)



In [14]:
# Stop the Spark Session.
# The `spark` name stays bound afterwards; the following cell rebuilds a
# session with the same Delta configuration and rebinds it.
spark.stop()

In [16]:
# ==> SAME DELTA CONFIGURATION AS THE FIRST SESSION <==
# Rebuild the session after spark.stop(): the Delta SQL extension and the
# Delta catalog have to be configured again on the new builder.
builder = SparkSession.builder.appName("AwesomeDeltaLake")
builder = builder.config(
    "spark.sql.extensions",
    "io.delta.sql.DeltaSparkSessionExtension",
)
builder = builder.config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
)

# Create the new SparkSession and rebind `spark` to it.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

print("✨ Spark and Delta Lake are ready to go! ✨")

25/09/07 11:45:49 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


✨ Spark and Delta Lake are ready to go! ✨


In [20]:
# Read the table again through the re-created session. delta_table_path is
# still bound from the earlier cell; the rows come from the files on disk.
df = spark.read.load(delta_table_path, format="delta")
df.orderBy("id").show()

                                                                                

+---+-------+---+-----+
| id|   name|age| city|
+---+-------+---+-----+
|  1| Alicia| 26| NULL|
|  2|    Bob| 30| NULL|
|  3|Charlie| 35| NULL|
|  4|  David| 40| NULL|
|  5|    Eve| 28|Miami|
+---+-------+---+-----+



In [26]:
# getOrCreate() hands back the session that is already running (see the
# "Using an existing Spark session" warning this cell logs).
spark = (
    SparkSession.builder
    .appName("DataFramePractice")
    .getOrCreate()
)

# Sample employees. Each tuple follows the order of `columns` below;
# manager_id is None for employees without a manager.
employee_data = [
    (1, "Alice", "Engineering", 120000, "2021-03-15", None),
    (2, "Bob", "Engineering", 95000, "2022-01-20", 1),
    (3, "Charlie", "Sales", 80000, "2021-05-10", 4),
    (4, "David", "Sales", 110000, "2020-11-01", None),
    (5, "Eve", "HR", 75000, "2023-08-30", 6),
    (6, "Frank", "HR", 92000, "2019-07-22", None),
    (7, "Grace", "Engineering", 105000, "2023-02-12", 1),
]
columns = [
    "id",
    "name",
    "department",
    "salary",
    "hire_date",
    "manager_id",
]

# Build the DataFrame; hire_date is passed in as plain strings.
employee_data_df = spark.createDataFrame(data=employee_data, schema=columns)

25/09/07 11:50:25 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [27]:
# Print the employee DataFrame to check the rows and column names.
employee_data_df.show()

[Stage 18:>                                                         (0 + 3) / 3]

+---+-------+-----------+------+----------+----------+
| id|   name| department|salary| hire_date|manager_id|
+---+-------+-----------+------+----------+----------+
|  1|  Alice|Engineering|120000|2021-03-15|      NULL|
|  2|    Bob|Engineering| 95000|2022-01-20|         1|
|  3|Charlie|      Sales| 80000|2021-05-10|         4|
|  4|  David|      Sales|110000|2020-11-01|      NULL|
|  5|    Eve|         HR| 75000|2023-08-30|         6|
|  6|  Frank|         HR| 92000|2019-07-22|      NULL|
|  7|  Grace|Engineering|105000|2023-02-12|         1|
+---+-------+-----------+------+----------+----------+



                                                                                

In [29]:
# Second Delta table location (relative path), used for the employee data.
delta_table_path2 = "files/emp"

# Store the employee DataFrame in Delta format, replacing any table content
# already present at that path.
employee_data_df.write.save(delta_table_path2, format="delta", mode="overwrite")


                                                                                

In [31]:
# Projection: keep only the name and salary columns.
(
    employee_data_df
    .select("name", "salary")
    .show()
)

+-------+------+
|   name|salary|
+-------+------+
|  Alice|120000|
|    Bob| 95000|
|Charlie| 80000|
|  David|110000|
|    Eve| 75000|
|  Frank| 92000|
|  Grace|105000|
+-------+------+



                                                                                

In [49]:
# Engineering employees only, highest id first (descending order requested
# through orderBy's ascending=False keyword).
(
    employee_data_df
    .filter(col("department") == "Engineering")
    .orderBy("id", ascending=False)
    .show()
)

+---+-----+-----------+------+----------+----------+
| id| name| department|salary| hire_date|manager_id|
+---+-----+-----------+------+----------+----------+
|  7|Grace|Engineering|105000|2023-02-12|         1|
|  2|  Bob|Engineering| 95000|2022-01-20|         1|
|  1|Alice|Engineering|120000|2021-03-15|      NULL|
+---+-----+-----------+------+----------+----------+



                                                                                

In [52]:
# Same query as the ascending=False variant, with the descending order
# expressed on the column itself via col("id").desc().
(
    employee_data_df
    .filter(col("department") == "Engineering")
    .orderBy(col("id").desc())
    .show()
)

+---+-----+-----------+------+----------+----------+
| id| name| department|salary| hire_date|manager_id|
+---+-----+-----------+------+----------+----------+
|  7|Grace|Engineering|105000|2023-02-12|         1|
|  2|  Bob|Engineering| 95000|2022-01-20|         1|
|  1|Alice|Engineering|120000|2021-03-15|      NULL|
+---+-----+-----------+------+----------+----------+



                                                                                

In [40]:
# filter() also accepts a SQL expression string: keep salaries above 100000.
(
    employee_data_df
    .filter("salary > 100000")
    .show()
)

+---+-----+-----------+------+----------+----------+
| id| name| department|salary| hire_date|manager_id|
+---+-----+-----------+------+----------+----------+
|  1|Alice|Engineering|120000|2021-03-15|      NULL|
|  4|David|      Sales|110000|2020-11-01|      NULL|
|  7|Grace|Engineering|105000|2023-02-12|         1|
+---+-----+-----------+------+----------+----------+



In [43]:
# Derive a "bonus" column worth 10% of each employee's salary. withColumn
# returns a new DataFrame, so employee_data_df itself is left as it was.
employee_data_df_with_bonus = employee_data_df.withColumn(
    "bonus",
    col("salary") * 0.10,
)
employee_data_df_with_bonus.show()

# The same pattern could derive a "hire_year" column from hire_date using
# year(col("hire_date")); `year` would first have to be imported from
# pyspark.sql.functions.

+---+-------+-----------+------+----------+----------+-------+
| id|   name| department|salary| hire_date|manager_id|  bonus|
+---+-------+-----------+------+----------+----------+-------+
|  1|  Alice|Engineering|120000|2021-03-15|      NULL|12000.0|
|  2|    Bob|Engineering| 95000|2022-01-20|         1| 9500.0|
|  3|Charlie|      Sales| 80000|2021-05-10|         4| 8000.0|
|  4|  David|      Sales|110000|2020-11-01|      NULL|11000.0|
|  5|    Eve|         HR| 75000|2023-08-30|         6| 7500.0|
|  6|  Frank|         HR| 92000|2019-07-22|      NULL| 9200.0|
|  7|  Grace|Engineering|105000|2023-02-12|         1|10500.0|
+---+-------+-----------+------+----------+----------+-------+



                                                                                

In [45]:
# Reach sum/round through the `F` alias instead of importing them by name:
# `from pyspark.sql.functions import sum, max, round` rebinds Python's
# built-in sum(), max() and round() for every cell executed afterwards.
from pyspark.sql import functions as F
# Names that do not collide with built-ins stay importable by name, as before.
from pyspark.sql.functions import col, year, avg, count

# Per-department summary: headcount, average salary rounded to 2 decimals,
# and total payroll.
(
    employee_data_df_with_bonus
    .groupBy("department")
    .agg(
        F.count("*").alias("num_employees"),
        F.round(F.avg("salary"), 2).alias("avg_salary"),
        F.sum("salary").alias("total_payroll"),
    )
    .show()
)

[Stage 47:>                                                         (0 + 4) / 4]

+-----------+-------------+----------+-------------+
| department|num_employees|avg_salary|total_payroll|
+-----------+-------------+----------+-------------+
|Engineering|            3| 106666.67|       320000|
|      Sales|            2|   95000.0|       190000|
|         HR|            2|   83500.0|       167000|
+-----------+-------------+----------+-------------+



                                                                                

In [47]:
# Highest-paid employees first: order by salary, descending.
(
    employee_data_df_with_bonus
    .orderBy(col("salary").desc())
    .show()
)

[Stage 50:>                                                         (0 + 4) / 4]

+---+-------+-----------+------+----------+----------+-------+
| id|   name| department|salary| hire_date|manager_id|  bonus|
+---+-------+-----------+------+----------+----------+-------+
|  1|  Alice|Engineering|120000|2021-03-15|      NULL|12000.0|
|  4|  David|      Sales|110000|2020-11-01|      NULL|11000.0|
|  7|  Grace|Engineering|105000|2023-02-12|         1|10500.0|
|  2|    Bob|Engineering| 95000|2022-01-20|         1| 9500.0|
|  6|  Frank|         HR| 92000|2019-07-22|      NULL| 9200.0|
|  3|Charlie|      Sales| 80000|2021-05-10|         4| 8000.0|
|  5|    Eve|         HR| 75000|2023-08-30|         6| 7500.0|
+---+-------+-----------+------+----------+----------+-------+



                                                                                

In [57]:
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col, count, weekofyear, lag, when, lit, desc, rank

# --- 1. Setup Spark Session and Create Dummy Data ---
spark = (
    SparkSession.builder
    .appName("RedditSurgeAnalysis")
    .getOrCreate()
)

# Posts as (post_id, subreddit_id, created_utc): two posts per subreddit.
posts_data = [
    ("p1", "sub1", "2025-08-20 10:00:00"),
    ("p2", "sub1", "2025-08-28 11:00:00"),
    ("p3", "sub2", "2025-08-21 12:00:00"),
    ("p4", "sub2", "2025-08-29 13:00:00"),
    ("p5", "sub3", "2025-08-22 14:00:00"),
    ("p6", "sub3", "2025-08-30 15:00:00"),
]
posts = spark.createDataFrame(posts_data, ["post_id", "subreddit_id", "created_utc"])
# created_utc arrives as a string; cast it to a timestamp column.
posts = posts.withColumn("created_utc", col("created_utc").cast("timestamp"))

# Comments as (comment_id, post_id, created_utc), grouped by the subreddit
# of the post they belong to.
comments_data = [
    # sub1
    ("c1", "p1", "2025-08-30 15:01:00"),
    ("c2", "p1", "2025-08-30 15:02:00"),
    ("c3", "p2", "2025-08-30 15:03:00"),
    # sub2
    ("c4", "p3", "2025-08-30 15:04:00"),
    ("c5", "p4", "2025-08-30 15:05:00"),
    ("c6", "p4", "2025-08-30 15:06:00"),
    ("c7", "p4", "2025-08-30 15:07:00"),
    # sub3
    ("c8", "p5", "2025-08-30 15:08:00"),
    ("c9", "p6", "2025-08-30 15:09:00"),
    ("c10", "p6", "2025-08-30 15:11:00"),
]
comments = spark.createDataFrame(comments_data, ["comment_id", "post_id", "created_utc"])
comments = comments.withColumn("created_utc", col("created_utc").cast("timestamp"))

In [59]:
# Number of comments attached to each post.
comments_per_post = (
    comments
    .groupBy("post_id")
    .agg(count("comment_id").alias("comment_count"))
)

# Attach the counts to the post rows via the shared post_id column.
# NOTE(review): no join type is given, so Spark's default applies (inner,
# per the PySpark docs) — a post with no comments would not appear; confirm
# that is intended.
post_comments = posts.join(comments_per_post, on="post_id")
post_comments.show()

                                                                                

+-------+------------+-------------------+-------------+
|post_id|subreddit_id|        created_utc|comment_count|
+-------+------------+-------------------+-------------+
|     p1|        sub1|2025-08-20 10:00:00|            2|
|     p2|        sub1|2025-08-28 11:00:00|            1|
|     p3|        sub2|2025-08-21 12:00:00|            1|
|     p4|        sub2|2025-08-29 13:00:00|            3|
|     p5|        sub3|2025-08-22 14:00:00|            1|
|     p6|        sub3|2025-08-30 15:00:00|            2|
+-------+------------+-------------------+-------------+

