In [None]:

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Demo pipeline: build a tiny DataFrame, round-trip it through JSON and
# Parquet, then query the Parquet data with Spark SQL.

# 1. Create (or reuse) the SparkSession
spark = SparkSession.builder.appName("ParquetExample").getOrCreate()

# Everything after session creation runs inside try/finally so the session is
# stopped even if one of the read/write/query steps raises; otherwise a failure
# midway would leave the session running.
try:
    # 2. Generate fake people data (name, age) for demo purposes
    fake_data = [
        ("John", 15),
        ("Jane", 22),
        ("Mia", 17),
        ("Jake", 40),
    ]
    peopleDF_fake = spark.createDataFrame(fake_data, ["name", "age"])

    # Write the fake data to JSON; "overwrite" replaces output left behind by
    # a previous run so the script can be re-executed.
    peopleDF_fake.write.mode("overwrite").json("people.json")

    # 3. Read people.json into a new DataFrame (no explicit schema is passed)
    peopleDF = spark.read.json("people.json")
    peopleDF.show()

    # 4. Write the DataFrame to Parquet
    peopleDF.write.mode("overwrite").parquet("people.parquet")

    # 5. Read in the Parquet data and register it as a temporary view so it
    #    can be referenced by name from SQL
    parquetFile = spark.read.parquet("people.parquet")
    parquetFile.createOrReplaceTempView("parquetFile")

    # 6. Query with SQL (get teenagers, ages 13 through 19 inclusive)
    teenagers = spark.sql("SELECT name, age FROM parquetFile WHERE age BETWEEN 13 AND 19")
    teenagers.show()
finally:
    # 7. Stop the SparkSession when done (runs on success and on failure)
    spark.stop()
