# In [None]:

# 1. Import and Create SparkSession
from pyspark.sql import SparkSession
import json
import time

spark = SparkSession.builder.appName("CachingExample").getOrCreate()

# Everything after session creation runs under try/finally so the session is
# always stopped, even when one of the demo steps raises.
try:
    # 2. Generate a small data.json for demo purposes (JSON Lines: one object per line)
    fake_data = [
        {"name": "Alice", "age": 25},
        {"name": "Bob",   "age": 30},
        {"name": "Cara",  "age": 35},
        {"name": "Dana",  "age": None},   # Missing age example (serialized as null)
        {"name": "Eli"},                  # Another missing age example (key absent)
    ]

    # Explicit UTF-8 so the file's encoding does not depend on the platform default.
    with open("data.json", "w", encoding="utf-8") as f:
        f.writelines(json.dumps(row) + "\n" for row in fake_data)

    # 3. Read data.json
    # NOTE(review): the relative path is resolved by Spark, not by Python; this
    # presumably assumes a local-mode session started from the same directory.
    df = spark.read.json("data.json")

    # 4. Show the DataFrame (should not error)
    df.show()
    df.printSchema()

    # 5. Demonstrate caching a DataFrame
    # perf_counter() is a monotonic, high-resolution clock intended for measuring
    # elapsed time; time.time() can jump if the system clock is adjusted.
    start_time = time.perf_counter()

    # De-duplicate and mark for caching (nothing is computed until an action runs)
    df_cached = df.distinct().cache()

    # First count (loads + caches the data)
    count_1 = df_cached.count()
    duration_1 = time.perf_counter() - start_time
    print(f"First count:  {count_1} rows; took {duration_1:.4f} seconds")

    # Second count (cached)
    start_time = time.perf_counter()
    count_2 = df_cached.count()
    duration_2 = time.perf_counter() - start_time
    print(f"Second count: {count_2} rows; took {duration_2:.4f} seconds")

    # 6. Unpersist the cache
    print(f"\nIs df_cached cached? {df_cached.is_cached}")
    df_cached.unpersist()
    print(f"Is df_cached cached after unpersist()? {df_cached.is_cached}")
finally:
    # 7. Stop the SparkSession (runs on success and on failure alike)
    spark.stop()
