- **Name:** 001_Introduction
- **Author:** Shamas Imran
- **Description:** Read/Write as Delta Table in Unity Catalog

In [0]:
from pyspark.sql import SparkSession
# Obtain the Spark session for this notebook: getOrCreate() hands back the
# session that already exists, or builds one named "DatapurProgram".
spark = (
    SparkSession.builder
    .appName("DatapurProgram")
    .getOrCreate()
)

In [0]:
# Base directory inside the Unity Catalog volume (note the trailing slash).
rootPath = "/Volumes/datapurcatalog/default/datapurvolume/"

# Folder directly under the volume root where the Delta files are written.
deltaPath = f"{rootPath}score_delta"

In [0]:
from pyspark.sql import Row
import random

# ----------------------------------------------------------------------
# 1. Generate score data
#    20 enrollments, each with scores for 2 to 4 distinct semesters,
#    i.e. between 40 and 80 rows in total. No seed is set in this cell,
#    so the generated values differ from run to run.
# ----------------------------------------------------------------------
semesters = [
    "2023-Spring",
    "2023-Fall",
    "2024-Spring",
    "2024-Fall",
    "2025-Spring",
]

score_data = []
score_id = 1  # running primary key, incremented once per generated row

for enrollment_id in range(1, 21):  # enrollment IDs 1..20
    # Decide how many semesters this enrollment gets, then draw that many
    # distinct semesters (sample() never repeats an element).
    semester_count = random.randint(2, 4)
    chosen_semesters = random.sample(semesters, k=semester_count)

    for sem in chosen_semesters:
        row = Row(
            ScoreID=score_id,
            EnrollmentID_FK=enrollment_id,
            Semester=sem,
            Score=random.randint(60, 100),  # inclusive range 60..100
        )
        score_data.append(row)
        score_id += 1

# Turn the list of Row objects into a Spark DataFrame and preview it.
df_score = spark.createDataFrame(score_data)
df_score.show()

In [0]:
# Print up to 20 rows with full (untruncated) column values, then report how
# many rows the DataFrame holds.
df_score.show(n=20, truncate=False)
total_rows = df_score.count()
print(f"Total rows generated: {total_rows}")

In [0]:
# Write the scores in Delta format to the volume folder, replacing whatever
# is already stored at deltaPath.
(
    df_score.write
    .format("delta")
    .mode("overwrite")
    .save(deltaPath)
)

In [0]:
"""
The commented-out code below will not work with Unity Catalog:
an absolute path is needed to create a table from a Delta folder.
Instead, create a managed table directly in the next cell.
"""

# Recursively delete the folder at deltaPath, i.e. the Delta files written by
# the earlier save(deltaPath) call.
dbutils.fs.rm(deltaPath, recurse=True)

# Kept for reference only (see the note at the top of this cell): registering
# a table on top of the Delta folder via LOCATION.
# spark.sql(f"""
# CREATE TABLE IF NOT EXISTS datapurcatalog.default.score_table
# USING DELTA
# LOCATION '{deltaPath}'
# """)

In [0]:
# Save the scores as the Delta table datapurcatalog.default.score_table,
# replacing any existing contents. No LOCATION is given, so the table is
# created as a managed table.
(
    df_score.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("datapurcatalog.default.score_table")
)

In [0]:
"""
query = "SELECT Semester, COUNT(*) AS Exams, AVG(Score) AS AvgScore " \
        "FROM datapurcatalog.default.score_table " \
        "GROUP BY Semester " \
        "ORDER BY Semester"
"""
# A triple-quoted string can span several lines exactly as written, so the
# query needs neither explicit \n characters nor backslash continuations
# (compare with the adjacent-literal version shown above).

query = """
SELECT Semester, 
       COUNT(*) AS Exams, 
       AVG(Score) AS AvgScore
FROM datapurcatalog.default.score_table
GROUP BY Semester
ORDER BY Semester
"""

# Run the aggregation (row count and average Score per Semester, sorted by
# Semester) through Spark SQL and print the result.
df_result = spark.sql(query)
df_result.show()

In [0]:
%sql
-- Exam count and average score per semester, sorted by semester.
SELECT
  Semester,
  COUNT(*) AS Exams,
  AVG(Score) AS AvgScore
FROM datapurcatalog.default.score_table
GROUP BY Semester
ORDER BY Semester

In [0]:
%sql
-- Show the Delta history of the table (the versions recorded so far and the
-- operation that produced each one).
DESCRIBE HISTORY datapurcatalog.default.score_table;

In [0]:
%sql
-- Add 5 points to every score recorded for the 2024 spring semester.
-- Semester values are stored as '<year>-<term>' (e.g. '2024-Spring'), so the
-- filter has to use that format; the previous literal 'Spring2024' matched
-- no rows and left the table data unchanged.
UPDATE datapurcatalog.default.score_table
SET Score = Score + 5
WHERE Semester = '2024-Spring';

In [0]:
%sql
-- Time travel: read the table as it was at version 0.
SELECT * FROM datapurcatalog.default.score_table VERSION AS OF 0;

-- Alternative (disabled): pick the snapshot by timestamp instead of by version number.
-- SELECT * FROM datapurcatalog.default.score_table TIMESTAMP AS OF '2025-08-16T10:00:00';

In [0]:
%sql

-- Optional (disabled): roll the table back to version 0 instead of dropping it.
-- RESTORE TABLE datapurcatalog.default.score_table TO VERSION AS OF 0;

-- Cleanup: drop the table; IF EXISTS avoids an error when it is already gone.
DROP TABLE IF EXISTS datapurcatalog.default.score_table;