In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
%sql
-- Make adventureworks.bronze the current catalog and schema for this session.
USE CATALOG adventureworks;
USE SCHEMA bronze;

### Reading Product.csv

In [0]:
# DDL-style schema string for Product.csv, passed to .schema(...) on the CSV
# reader. Backticks quote the headers that contain spaces. Only ProductKey is
# typed INT; every other column, including `Standard Cost`, is read as STRING.
product_schema = """
ProductKey INT,
Product STRING,
`Standard Cost` STRING,
Color STRING,
Subcategory STRING,
Category STRING,
`Background Color Format` STRING,
`Font Color Format` STRING
"""

In [0]:
# Read Product.csv with the explicit product_schema. Despite the .csv
# extension the file is parsed as tab-delimited, with the first row as header.
product_csv_path = "abfss://bronze@adventureworksdls001.dfs.core.windows.net/Product.csv"
df_product = (
    spark.read.format("csv")
    .option("header", "true")
    .schema(product_schema)
    .option("delimiter", "\t")
    .load(product_csv_path)
)


In [0]:
# Source header -> snake_case column name. The renames are applied one at a
# time, in the order listed.
product_column_names = {
    "ProductKey": "product_key",
    "Product": "product",
    "Standard Cost": "standard_cost",
    "Color": "color",
    "Subcategory": "subcategory",
    "Category": "category",
    "Background Color Format": "background_color_format",
    "Font Color Format": "font_color_format",
}
for source_name, target_name in product_column_names.items():
    df_product = df_product.withColumnRenamed(source_name, target_name)

### Reading Region.csv

In [0]:
# DDL-style schema string for Region.csv, passed to .schema(...) on the CSV
# reader. `Group` is backtick-quoted because GROUP is a SQL keyword: unquoted,
# it only parses while Spark's reserved-keyword enforcement is off, whereas the
# quoted form parses under either setting and yields the same column name.
region_schema = """
SalesTerritoryKey INT,
Region STRING,
Country STRING,
`Group` STRING
"""

In [0]:
# Read Region.csv (tab-delimited, first row is the header) with the explicit
# region_schema.
region_csv_path = "abfss://bronze@adventureworksdls001.dfs.core.windows.net/Region.csv"
df_region = (
    spark.read.format("csv")
    .option("header", "true")
    .schema(region_schema)
    .option("delimiter", "\t")
    .load(region_csv_path)
)

In [0]:
# Source header -> snake_case column name, applied in the order listed.
# Note that the "Group" column is stored as "continent".
region_column_names = {
    "SalesTerritoryKey": "sales_territory_key",
    "Region": "region",
    "Country": "country",
    "Group": "continent",
}
for source_name, target_name in region_column_names.items():
    df_region = df_region.withColumnRenamed(source_name, target_name)

### Reading Reseller.csv

In [0]:
# DDL-style schema string for Reseller.csv, passed to .schema(...) on the CSV
# reader. Backticks quote the headers that contain a space or a hyphen.
# Only ResellerKey is typed INT; the remaining columns are read as STRING.
reseller_schema = """
ResellerKey INT,
`Business Type` STRING,
Reseller STRING,
City STRING,
`State-Province` STRING,
`Country-Region` STRING
"""

In [0]:
# Read Reseller.csv (tab-delimited, first row is the header) with the explicit
# reseller_schema.
reseller_csv_path = "abfss://bronze@adventureworksdls001.dfs.core.windows.net/Reseller.csv"
df_reseller = (
    spark.read.format("csv")
    .option("header", "true")
    .schema(reseller_schema)
    .option("delimiter", "\t")
    .load(reseller_csv_path)
)

In [0]:
# Source header -> snake_case column name, applied in the order listed.
# Note that "Country-Region" is stored simply as "country".
reseller_column_names = {
    "ResellerKey": "reseller_key",
    "Business Type": "business_type",
    "Reseller": "reseller",
    "City": "city",
    "State-Province": "state_province",
    "Country-Region": "country",
}
for source_name, target_name in reseller_column_names.items():
    df_reseller = df_reseller.withColumnRenamed(source_name, target_name)

### Reading Sales.csv

In [0]:
# Read Sales.csv (tab-delimited, first row is the header). Unlike the reads
# that pass an explicit schema, column types here are inferred by Spark.
sales_csv_path = "abfss://bronze@adventureworksdls001.dfs.core.windows.net/Sales.csv"
df_sales = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", "\t")
    .load(sales_csv_path)
)

In [0]:
# Source header -> snake_case column name. The renames are applied one at a
# time, in the order listed.
sales_column_names = {
    "SalesOrderNumber": "sales_order_number",
    "OrderDate": "order_date",
    "ProductKey": "product_key",
    "ResellerKey": "reseller_key",
    "EmployeeKey": "employee_key",
    "SalesTerritoryKey": "sales_territory_key",
    "Quantity": "quantity",
    "Unit Price": "unit_price",
    "Sales": "sales",
    "Cost": "cost",
}
for source_name, target_name in sales_column_names.items():
    df_sales = df_sales.withColumnRenamed(source_name, target_name)

In [0]:
# Cast the key columns and quantity to INT explicitly, whatever type
# inferSchema picked for them at read time.
for int_column in (
    "product_key",
    "reseller_key",
    "employee_key",
    "sales_territory_key",
    "quantity",
):
    df_sales = df_sales.withColumn(int_column, col(int_column).cast("int"))

### Reading Salesperson.csv

In [0]:
# DDL-style schema string for Salesperson.csv, passed to .schema(...) on the
# CSV reader. EmployeeKey and EmployeeID are typed INT; the remaining columns
# are read as STRING.
salesperson_schema = """
EmployeeKey INT,
EmployeeID INT,
Salesperson STRING,
Title STRING,
UPN STRING
"""

In [0]:
# Read Salesperson.csv (tab-delimited, first row is the header) with the
# explicit salesperson_schema.
salesperson_csv_path = "abfss://bronze@adventureworksdls001.dfs.core.windows.net/Salesperson.csv"
df_salesperson = (
    spark.read.format("csv")
    .option("header", "true")
    .schema(salesperson_schema)
    .option("delimiter", "\t")
    .load(salesperson_csv_path)
)

In [0]:
# Source header -> snake_case column name, applied in the order listed.
# Note that the "UPN" column is stored as "email".
salesperson_column_names = {
    "EmployeeKey": "employee_key",
    "EmployeeID": "employee_id",
    "Salesperson": "salesperson",
    "Title": "title",
    "UPN": "email",
}
for source_name, target_name in salesperson_column_names.items():
    df_salesperson = df_salesperson.withColumnRenamed(source_name, target_name)

### Reading SalespersonRegion.csv

In [0]:
# DDL-style schema string for SalespersonRegion.csv, passed to .schema(...) on
# the CSV reader. The file has two columns, both typed INT.
salespersonregion_schema = """
EmployeeKey INT,
SalesTerritoryKey INT
"""

In [0]:
# Read SalespersonRegion.csv (tab-delimited, first row is the header) with the
# explicit salespersonregion_schema.
salespersonregion_csv_path = "abfss://bronze@adventureworksdls001.dfs.core.windows.net/SalespersonRegion.csv"
df_salespersonregion = (
    spark.read.format("csv")
    .option("header", "true")
    .schema(salespersonregion_schema)
    .option("delimiter", "\t")
    .load(salespersonregion_csv_path)
)

In [0]:
# Source header -> snake_case column name, applied in the order listed.
salespersonregion_column_names = {
    "EmployeeKey": "employee_key",
    "SalesTerritoryKey": "sales_territory_key",
}
for source_name, target_name in salespersonregion_column_names.items():
    df_salespersonregion = df_salespersonregion.withColumnRenamed(source_name, target_name)

### Reading Targets.csv

In [0]:

# Read Targets.csv (tab-delimited, first row is the header). No explicit
# schema is supplied, so column types are inferred by Spark.
targets_csv_path = "abfss://bronze@adventureworksdls001.dfs.core.windows.net/Targets.csv"
df_targets = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", "\t")
    .load(targets_csv_path)
)

In [0]:
# Source header -> snake_case column name, applied in the order listed.
targets_column_names = {
    "EmployeeID": "employee_id",
    "TargetMonth": "target_month",
    "Target": "target",
}
for source_name, target_name in targets_column_names.items():
    df_targets = df_targets.withColumnRenamed(source_name, target_name)

### Writing all DataFrames to the Bronze layer as Delta tables

In [0]:
def write_to_bronze_tables(df, table_name, catalog="adventureworks", schema="bronze"):
    """Overwrite the Delta table ``<catalog>.<schema>.<table_name>`` with ``df``.

    Args:
        df: DataFrame to persist; only its ``write`` attribute is used.
        table_name: Unqualified table name, e.g. ``'bronze_product'``.
        catalog: Catalog that holds the table. Defaults to ``adventureworks``,
            the value that was previously hard-coded.
        schema: Schema that holds the table. Defaults to ``bronze``, the value
            that was previously hard-coded.

    Raises:
        ValueError: If ``table_name`` is empty or blank, which would otherwise
            produce a malformed identifier such as ``adventureworks.bronze.``.
    """
    if not table_name or not str(table_name).strip():
        raise ValueError("table_name must be a non-empty string")

    # mode('overwrite') replaces the existing table contents; overwriteSchema
    # additionally lets the stored schema be replaced by the DataFrame's.
    (
        df.write.format('delta')
        .mode('overwrite')
        .option('overwriteSchema', 'true')
        .saveAsTable(f"{catalog}.{schema}.{table_name}")
    )

In [0]:
# Persist each DataFrame to its bronze table, in the order listed.
bronze_outputs = [
    (df_product, 'bronze_product'),
    (df_region, 'bronze_region'),
    (df_reseller, 'bronze_reseller'),
    (df_sales, 'bronze_sales'),
    (df_salesperson, 'bronze_salesperson'),
    (df_salespersonregion, 'bronze_salespersonregion'),
    (df_targets, 'bronze_targets'),
]
for bronze_df, bronze_table in bronze_outputs:
    write_to_bronze_tables(bronze_df, bronze_table)