In [0]:
%sql
-- Set up the Unity Catalog objects used by this notebook: the catalog, one
-- schema per data layer, and a volume for the downloaded source files.
CREATE CATALOG IF NOT EXISTS ifood_cat;
USE CATALOG ifood_cat;

-- Create the data layers (one schema each).
CREATE SCHEMA IF NOT EXISTS bronze;
CREATE SCHEMA IF NOT EXISTS silver;
CREATE SCHEMA IF NOT EXISTS gold;

-- Create a volume named 'raw_files' to store the source files.
CREATE VOLUME IF NOT EXISTS ifood_cat.bronze.raw_files;

In [0]:
import pyspark.sql.functions as f
import pyspark.sql.types as t

from utils import cast_to_timestamp
from schemas import (
    ORDER_SCHEMA,
    RESTAURANT_SCHEMA,
    CONSUMER_SCHEMA,
    AB_TEST_REF_SCHEMA
)
from constants import (
    BRONZE_VOLUME_PATH,
    BRONZE_LAYER_PATH,
    URLS, 
    ORDER_COLUMN_COMMENTS,
    CONSUMER_COLUMN_COMMENTS,
    RESTAURANT_COLUMN_COMMENTS,
    AB_TEST_REF_COLUMN_COMMENTS,
)

import urllib.request
import tarfile
import os

In [0]:
def add_column_comments(table_name, comments_dict):
    """Attach a comment to each listed column of a table.

    Runs one ``ALTER TABLE ... ALTER COLUMN ... COMMENT`` statement per entry
    of ``comments_dict`` through the notebook-global ``spark`` session.

    Args:
        table_name: Name of the table to alter. Interpolated into the SQL
            text unchanged.
        comments_dict: Mapping of column name to comment text. Column names
            are interpolated unchanged; comment texts are escaped and emitted
            as single-quoted SQL string literals.

    Returns:
        None. An empty mapping issues no statements.
    """
    for col_name, comment_text in comments_dict.items():
        # Escape backslashes first so the backslashes introduced for quotes
        # are not doubled afterwards. Without this, a backslash in the comment
        # would be read as an escape sequence (or, at the end of the text,
        # would escape the closing quote and break the statement).
        escaped_comment = comment_text.replace("\\", "\\\\").replace("'", "\\'")

        # A single-quoted literal is used because double quotes can denote
        # identifiers instead of strings depending on the SQL configuration.
        spark.sql(
            f"ALTER TABLE {table_name} ALTER COLUMN {col_name} "
            f"COMMENT '{escaped_comment}'"
        )

In [0]:
# Download every source file into the bronze Volume and unpack the A/B test
# archive. Failures raise instead of being silently ignored, so the success
# message below is only printed when every step actually worked.
print("\nIniciando download e extração para o Volume...")
for filename, url in URLS.items():
    destination_path = os.path.join(BRONZE_VOLUME_PATH, filename)

    # No-clobber: keep a file that is already present. An empty file is
    # treated as missing, since it can only be the leftover of a failed attempt.
    if os.path.exists(destination_path) and os.path.getsize(destination_path) > 0:
        continue

    try:
        urllib.request.urlretrieve(url, destination_path)
    except BaseException:
        # Remove the partial download so the next run retries it, then
        # re-raise the original error (including interrupts) untouched.
        if os.path.exists(destination_path):
            os.remove(destination_path)
        raise

tar_path = os.path.join(BRONZE_VOLUME_PATH, "ab_test_ref.tar.gz")
with tarfile.open(tar_path, mode="r:gz") as archive:
    # Use the safe "data" extraction filter where this Python version has it;
    # older versions do not accept the `filter` argument.
    if hasattr(tarfile, "data_filter"):
        archive.extractall(path=BRONZE_VOLUME_PATH, filter="data")
    else:
        archive.extractall(path=BRONZE_VOLUME_PATH)
print("Download e extração concluídos.")

# List the files in the Volume as a visual confirmation.
print("\nListando arquivos no Volume para confirmação:")
display(dbutils.fs.ls(BRONZE_VOLUME_PATH))

### orders

In [0]:
# Source file inside the bronze Volume and the name of the target table.
src_order_file_path = f"{BRONZE_VOLUME_PATH}/order.json.gz"
bronze_order_table_name = f"{BRONZE_LAYER_PATH}.order"

# Drop any existing table before it is recreated below.
spark.sql(f"DROP TABLE IF EXISTS {bronze_order_table_name}")

# Read the JSON with the explicit schema and derive the partition column
# (the date part of order_created_at) in the same expression.
orders_df = (
    spark.read
    .schema(ORDER_SCHEMA)
    .json(src_order_file_path)
    .withColumn("partition_date", f.to_date("order_created_at"))
)

# Quick look at the derived partition column next to its source column.
display(orders_df.select("order_id", "order_created_at", "partition_date").limit(5))

# Persist as a Delta table partitioned by the derived date.
orders_df.write.saveAsTable(
    bronze_order_table_name,
    format="delta",
    mode="overwrite",
    partitionBy="partition_date",
)

# Attach the column descriptions to the new table.
add_column_comments(bronze_order_table_name, ORDER_COLUMN_COMMENTS)

print("Tabela 'order' da camada Bronze criada com sucesso!")

In [0]:
# Re-read the orders table from the catalog to inspect what was persisted.
orders_df = spark.read.table(f"{BRONZE_LAYER_PATH}.order")

# Show the stored schema.
print("Schema da tabela 'orders':")
orders_df.printSchema()

# Show a five-row sample of the stored data.
print("\nAmostra dos dados da tabela 'orders':")
display(orders_df.limit(5))

### consumers

In [0]:
# Source file inside the bronze Volume and the name of the target table.
src_consumer_file_path = f"{BRONZE_VOLUME_PATH}/consumer.csv.gz"
bronze_consumer_table_name = f"{BRONZE_LAYER_PATH}.consumer"

# Drop any existing table before it is recreated below.
spark.sql(f"DROP TABLE IF EXISTS {bronze_consumer_table_name}")

# Read the CSV (first line is the header) with the explicit schema.
consumers_df = (
    spark.read
    .schema(CONSUMER_SCHEMA)
    .options(header="true")
    .csv(src_consumer_file_path)
)

# Persist as a Delta table, replacing any previous contents.
consumers_df.write.saveAsTable(
    bronze_consumer_table_name,
    format="delta",
    mode="overwrite",
)

# Attach the column descriptions to the new table.
add_column_comments(bronze_consumer_table_name, CONSUMER_COLUMN_COMMENTS)

print("Tabela 'consumer' da camada Bronze criada com sucesso!")

### restaurants

In [0]:
# Source file inside the bronze Volume and the name of the target table.
src_restaurant_file_path = f"{BRONZE_VOLUME_PATH}/restaurant.csv.gz"
bronze_restaurant_table_name = f"{BRONZE_LAYER_PATH}.restaurant"

# Drop any existing table before it is recreated below.
spark.sql(f"DROP TABLE IF EXISTS {bronze_restaurant_table_name}")

# Read the CSV (first line is the header) with the explicit schema.
restaurants_df = (
    spark.read
    .schema(RESTAURANT_SCHEMA)
    .options(header="true")
    .csv(src_restaurant_file_path)
)

# Persist as a Delta table, replacing any previous contents.
restaurants_df.write.saveAsTable(
    bronze_restaurant_table_name,
    format="delta",
    mode="overwrite",
)

# Attach the column descriptions to the new table.
add_column_comments(bronze_restaurant_table_name, RESTAURANT_COLUMN_COMMENTS)

print("Tabela 'restaurant' da camada Bronze criada com sucesso!")

### ab_test_ref

In [0]:
# Source file inside the bronze Volume and the name of the target table.
src_ab_test_ref_file_path = f"{BRONZE_VOLUME_PATH}/ab_test_ref.csv"
bronze_ab_test_ref_table_name = f"{BRONZE_LAYER_PATH}.ab_test_ref"

# Drop any existing table before it is recreated below.
spark.sql(f"DROP TABLE IF EXISTS {bronze_ab_test_ref_table_name}")

# Read the CSV (first line is the header) with the explicit schema.
ab_test_refs_df = (
    spark.read
    .schema(AB_TEST_REF_SCHEMA)
    .options(header="true")
    .csv(src_ab_test_ref_file_path)
)

# Persist as a Delta table, replacing any previous contents.
ab_test_refs_df.write.saveAsTable(
    bronze_ab_test_ref_table_name,
    format="delta",
    mode="overwrite",
)

# Attach the column descriptions to the new table.
add_column_comments(bronze_ab_test_ref_table_name, AB_TEST_REF_COLUMN_COMMENTS)

print("Tabela 'ab_test_ref' da camada Bronze criada com sucesso!")