# ```Part 01: INGESTION```

## Step 0: Set up the environment using ```Widgets```
--
- Input widgets allow you to add parameters to your notebooks and dashboards

In [None]:
# Declare two text widgets: (widget name, default value, label shown in the UI).
dbutils.widgets.text(
    "usernameWidget",
    "",
    "Enter Your Name Here: ",
)
dbutils.widgets.text(
    "environmentWidget",
    "",
    "Enter Environment: ",
)

# Read the current widget values into notebook variables and echo them,
# one per line, so the values in effect are visible in the cell output.
usernameWidget, environmentWidget = (
    dbutils.widgets.get(widget_name)
    for widget_name in ("usernameWidget", "environmentWidget")
)
print(usernameWidget, environmentWidget, sep="\n")

## Step 1: Examine the raw data

In [None]:
# List the contents of the raw-layer directory.
# It holds the 4 years' worth of sales data that the next step ingests.
display(
    dbutils.fs.ls(
        "/mnt/00-mchan-demo/db-workshop-vn-2022/t0-raw-layer/"
    )
)

## Step 2: Ingest the data using PySpark

In [None]:
# import relevant libraries 
from pyspark.sql.functions import *
from pyspark.sql.types import * 

# Path to the t0-raw-layer in S3 (accessed through the /mnt mount point)
rawPath = "/mnt/00-mchan-demo/db-workshop-vn-2022/t0-raw-layer/"

# Ingest 4 years' worth of historical sales data from an e-commerce retailer.
# Every file found under rawPath is read as comma-separated text with a header
# row, and column types are inferred from the data.
df = (
  spark.read
       .format("csv")
       .option("header", "true")       # first line of each file holds the column names
       .option("sep", ",")
       .option("inferSchema", "true")  # let Spark infer column types
       # In Spark datetime patterns "MM" is month-of-year; lowercase "mm" is
       # minute-of-hour, so "yyyy-mm-dd" would not read the month field of a
       # date as a month.
       .option("dateFormat", "yyyy-MM-dd")
       .load(rawPath)
)

In [None]:
# Examine the schema of the ingested data: prints each column of df
# together with its data type.
df.printSchema()

In [None]:
# Render the ingested DataFrame with the notebook's display helper.
display(df)

## Step 3: Add helper metadata columns

In [None]:
# Tag each row with lineage metadata:
#   file_path   - value of input_file_name() for the row
#   ingest_time - value of current_timestamp()
df = df.withColumn("file_path", input_file_name())
df = df.withColumn("ingest_time", current_timestamp())

In [None]:
# Show the key columns next to the new metadata columns, ordered by rowID.
display(
    df
    .select("rowID", "orderID", "file_path", "ingest_time")
    .orderBy("rowID")
)

## Step 4: Write the ingested data to the ```BRONZE``` layer table

In [None]:
import re

# The widget values are spliced directly into SQL identifiers below, so they
# are checked first. Only letters, digits and underscores are accepted: an
# empty value would silently produce the shared name "__db", and spaces or
# punctuation (e.g. a full name typed into the username widget) would break
# the statement or allow arbitrary SQL to be injected.
for _widget_label, _widget_value in (
    ("environmentWidget", environmentWidget),
    ("usernameWidget", usernameWidget),
):
    if not re.fullmatch(r"[A-Za-z0-9_]+", _widget_value):
        raise ValueError(
            f"{_widget_label} must be non-empty and contain only letters, "
            f"digits or underscores, got {_widget_value!r}. "
            "Update the widget value and re-run the notebook from the top."
        )

# Create the per-environment, per-user database if it does not exist yet,
# then make it the current database for subsequent SQL statements.
spark.sql(f"CREATE DATABASE IF NOT EXISTS {environmentWidget}_{usernameWidget}_db")
spark.sql(f"USE {environmentWidget}_{usernameWidget}_db")

In [None]:
# General recipe: df.write.mode("overwrite").saveAsTable("database.tableName")
# Here the writer is configured to overwrite and to partition the output by
# the "state" column, and is then saved as the t1_bronze_orders table in the
# database named after the environment and username widgets.
bronze_writer = df.write.mode("overwrite").partitionBy("state")
bronze_writer.saveAsTable(f"{environmentWidget}_{usernameWidget}_db.t1_bronze_orders")

# -- END OF TASK 01 --