In [0]:
SOURCE_PATH = "/Volumes/workspace/default/spooky_authors_dataset/train_part*.csv"  # Path to your CSV files
BRONZE_TABLE = "bronze_spooky_authors"

In [0]:
# import libraries
from pyspark.sql.functions import current_timestamp, col, lit
from pyspark.sql.types import StringType

In [0]:
# Read all CSV files from source
# Using _metadata.file_path for Unity Catalog compatibility
df_bronze = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(SOURCE_PATH)
)


In [0]:
# Add metadata columns
df_bronze = (
    df_bronze
    .withColumn("_ingestion_timestamp", current_timestamp())
    .withColumn("_layer", lit("bronze"))
)

In [0]:
# Display sample data
print("Sample raw data:")
df_bronze.show(5, truncate=False)

Sample raw data:
+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+--------------------------+------+
|id     |text                                                                                                                                                                                                                                                                         |author|_ingestion_timestamp      |_layer|
+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+--------------------------+------+
|id12369|This spot, 

In [0]:
# Show schema
print("Schema:")
df_bronze.printSchema()

Schema:
root
 |-- id: string (nullable = true)
 |-- text: string (nullable = true)
 |-- author: string (nullable = true)
 |-- _ingestion_timestamp: timestamp (nullable = false)
 |-- _layer: string (nullable = false)



In [0]:
# Record count
record_count = df_bronze.count()
print(f"\nTotal records to ingest: {record_count:,}")


Total records to ingest: 19,579


In [0]:
# Check for nulls in key columns
print("\nNull counts per column:")
for col_name in df_bronze.columns:
    null_count = df_bronze.filter(col(col_name).isNull()).count()
    print(f"  {col_name}: {null_count:,} nulls")


Null counts per column:
  id: 0 nulls
  text: 0 nulls
  author: 0 nulls
  _ingestion_timestamp: 0 nulls
  _layer: 0 nulls


In [0]:
# Write to Delta table
(
    df_bronze
    .write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(BRONZE_TABLE)
)

print(f"✓ Bronze table '{BRONZE_TABLE}' created successfully!")

✓ Bronze table 'bronze_spooky_authors' created successfully!


In [0]:
# Verify the table was created
bronze_count = spark.table(BRONZE_TABLE).count()
print(f"✓ Bronze table contains {bronze_count:,} records")

✓ Bronze table contains 19,579 records


In [0]:
# Show sample from table
print("\nSample from Bronze table:")
spark.sql(f"SELECT * FROM {BRONZE_TABLE} LIMIT 5").show(truncate=False)


Sample from Bronze table:
+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+--------------------------+------+
|id     |text                                                                                                                                                                                                                                   |author|_ingestion_timestamp      |_layer|
+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+--------------------------+------+
|id26305|This process, however, afforded me no means of ascertaining the dimensions of my dungeon; as I might make its circu

In [0]:
# Return success for job orchestration
dbutils.notebook.exit(f"SUCCESS: Bronze layer created with {bronze_count} records")