In [0]:
# Configuration
SILVER_TABLE = "silver_spooky_authors"
GOLD_TABLE = "gold_spooky_authors"

In [0]:
 # Author name mapping
AUTHOR_NAMES = {
    'EAP': 'Edgar Allan Poe',
    'HPL': 'H.P. Lovecraft',
    'MWS': 'Mary Shelley'
}

In [0]:
from pyspark.sql.functions import (
    col, when, lit, current_timestamp,
    size, split, trim, lower, regexp_replace
)

In [0]:
# Read Silver table
df_silver = spark.table(SILVER_TABLE)

In [0]:
print(f"Records in Silver layer: {df_silver.count():,}")
df_silver.show(5, truncate=False)

Records in Silver layer: 18,047
+-------+------------------------------------------------------------------------------------------------------------------------------------------------------+------+-----------+--------------------------+--------------------------+------+
|id     |text                                                                                                                                                  |author|text_length|_ingestion_timestamp      |_silver_timestamp         |_layer|
+-------+------------------------------------------------------------------------------------------------------------------------------------------------------+------+-----------+--------------------------+--------------------------+------+
|id11321|As the Comte and his associates turned away from the lowly abode of the alchemists, the form of Charles Le Sorcier appeared through the trees.        |HPL   |142        |2026-01-03 23:09:39.969833|2026-01-03 23:14:55.303888|silver|
|id2

In [0]:
# Gold Layer - Business-Ready Clean Data
df_gold = (
    df_silver
    # Add full author name
    .withColumn("author_name",
        when(col("author") == "EAP", "Edgar Allan Poe")
        .when(col("author") == "HPL", "H.P. Lovecraft")
        .when(col("author") == "MWS", "Mary Shelley")
        .otherwise("Unknown")
    )
    # Calculate word count
    .withColumn("word_count", size(split(trim(col("text")), "\\s+")))
    # Categorize text by length
    .withColumn("text_category",
        when(col("text_length") < 100, "Short")
        .when(col("text_length") < 250, "Medium")
        .when(col("text_length") < 500, "Long")
        .otherwise("Very Long")
    )
    # Add metadata
    .withColumn("_gold_timestamp", current_timestamp())
    .withColumn("_layer", lit("gold"))
    # Select and reorder columns for business use
    .select(
        "id",
        "author",
        "author_name",
        "text",
        "text_length",
        "word_count",
        "text_category",
        "_ingestion_timestamp",
        "_gold_timestamp",
        "_layer"
    )
)

In [0]:
print(f"Records in Gold layer: {df_gold.count():,}")
df_gold.show(5, truncate=False)

Records in Gold layer: 18,047
+-------+------+---------------+------------------------------------------------------------------------------------------------------------------------------------------------------+-----------+----------+-------------+--------------------------+--------------------------+------+
|id     |author|author_name    |text                                                                                                                                                  |text_length|word_count|text_category|_ingestion_timestamp      |_gold_timestamp           |_layer|
+-------+------+---------------+------------------------------------------------------------------------------------------------------------------------------------------------------+-----------+----------+-------------+--------------------------+--------------------------+------+
|id11321|HPL   |H.P. Lovecraft |As the Comte and his associates turned away from the lowly abode of the alchemists, the form

In [0]:
# Write to Delta table
(
    df_gold
    .write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(GOLD_TABLE)
)


In [0]:
print(f"✓ Gold table '{GOLD_TABLE}' created successfully!")

✓ Gold table 'gold_spooky_authors' created successfully!


In [0]:
# Show distribution by author
print("Records by Author:")
df_gold.groupBy("author", "author_name").count().orderBy("author").show()

Records by Author:
+------+---------------+-----+
|author|    author_name|count|
+------+---------------+-----+
|   EAP|Edgar Allan Poe| 7044|
|   HPL| H.P. Lovecraft| 5451|
|   MWS|   Mary Shelley| 5552|
+------+---------------+-----+



In [0]:
# Show distribution by text category
print("\nRecords by Text Category:")
df_gold.groupBy("text_category").count().orderBy("text_category").show()


Records by Text Category:
+-------------+-----+
|text_category|count|
+-------------+-----+
|         Long| 1963|
|       Medium| 9606|
|        Short| 6372|
|    Very Long|  106|
+-------------+-----+



In [0]:
# Get counts from all layers
bronze_count = spark.table("bronze_spooky_authors").count()
silver_count = spark.table("silver_spooky_authors").count()
gold_count = spark.table(GOLD_TABLE).count()

In [0]:
print("=" * 70)
print("MEDALLION PIPELINE - SUMMARY")
print("=" * 70)
print(f"""
┌─────────────────────────────────────────────────────────────────────┐
│  BRONZE LAYER                                                       │
│  Table: bronze_spooky_authors                                       │
│  Records: {bronze_count:,}                                               │
│  Description: Raw data as ingested from CSV files                   │
└───────────────────────────────┬─────────────────────────────────────┘
                                │
                                ▼
┌─────────────────────────────────────────────────────────────────────┐
│  SILVER LAYER                                                       │
│  Table: silver_spooky_authors                                       │
│  Records: {silver_count:,}                                               │
│  Description: Cleaned, validated, deduplicated                      │
└───────────────────────────────┬─────────────────────────────────────┘
                                │
                                ▼
┌─────────────────────────────────────────────────────────────────────┐
│  GOLD LAYER                                                         │
│  Table: {GOLD_TABLE}                                       │
│  Records: {gold_count:,}                                               │
│  Description: Business-ready, enriched data                         │
└─────────────────────────────────────────────────────────────────────┘
""")
print("✓ Medallion Pipeline completed successfully!")

MEDALLION PIPELINE - SUMMARY

┌─────────────────────────────────────────────────────────────────────┐
│  BRONZE LAYER                                                       │
│  Table: bronze_spooky_authors                                       │
│  Records: 19,579                                               │
│  Description: Raw data as ingested from CSV files                   │
└───────────────────────────────┬─────────────────────────────────────┘
                                │
                                ▼
┌─────────────────────────────────────────────────────────────────────┐
│  SILVER LAYER                                                       │
│  Table: silver_spooky_authors                                       │
│  Records: 18,047                                               │
│  Description: Cleaned, validated, deduplicated                      │
└───────────────────────────────┬─────────────────────────────────────┘
                                │
              

In [0]:
print(f"""
┌─────────────────────────────────────────────────────────────────────┐
│  BRONZE LAYER                                                       │
│  Table: bronze_spooky_authors                                       │
│  Records: {bronze_count:,}                                               │
│  Description: Raw data as ingested from CSV files                   │
└───────────────────────────────┬─────────────────────────────────────┘
                                │
                                ▼
┌─────────────────────────────────────────────────────────────────────┐
│  SILVER LAYER                                                       │
│  Table: silver_spooky_authors                                       │
│  Records: {silver_count:,}                                               │
│  Description: Cleaned, validated, deduplicated                      │
└───────────────────────────────┬─────────────────────────────────────┘
                                │
                                ▼
┌─────────────────────────────────────────────────────────────────────┐
│  GOLD LAYER                                                         │
│  Table: {GOLD_TABLE}                                       │
│  Records: {gold_count:,}                                               │
│  Description: Business-ready, enriched data                         │
└─────────────────────────────────────────────────────────────────────┘
""")



┌─────────────────────────────────────────────────────────────────────┐
│  BRONZE LAYER                                                       │
│  Table: bronze_spooky_authors                                       │
│  Records: 19,579                                               │
│  Description: Raw data as ingested from CSV files                   │
└───────────────────────────────┬─────────────────────────────────────┘
                                │
                                ▼
┌─────────────────────────────────────────────────────────────────────┐
│  SILVER LAYER                                                       │
│  Table: silver_spooky_authors                                       │
│  Records: 18,047                                               │
│  Description: Cleaned, validated, deduplicated                      │
└───────────────────────────────┬─────────────────────────────────────┘
                                │
                                ▼
┌────────

In [0]:
# Return success for job orchestration
dbutils.notebook.exit(f"SUCCESS: Gold layer created with {gold_count} records")