<h1>Configuração</h1>
<hr>

In [23]:
import os

from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql import Window
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import upper
from pyspark.sql.functions import col
from pyspark.sql.functions import date_format
from pyspark.sql.functions import year, month, dayofmonth
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.functions import row_number
from pyspark.sql.functions import col
from pyspark.sql.functions import when, lit

# Configure the Spark session and register the PostgreSQL JDBC driver jar
# (the jar path is relative).
spark = (
    SparkSession.builder
    .appName("ETL")
    .config("spark.jars", "postgresql-8.2-506.jdbc3.jar")
    .getOrCreate()
)

# SQLContext expects a SparkContext as its first argument, not a SparkSession;
# pass the context and hand over the session explicitly so both share state.
# The name `sqlContext` is kept for cells that still reference it.
sqlContext = SQLContext(spark.sparkContext, spark)




<h1>Extração</h1>
<hr>

In [24]:
# Load every source table into a DataFrame.

# Source database connection settings. The defaults are the original
# local-development values; override them through the environment instead of
# editing credentials into the notebook.
SOURCE_JDBC_URL = os.environ.get("FATORV_SOURCE_JDBC_URL", "jdbc:postgresql://localhost/fatorv")
SOURCE_DB_USER = os.environ.get("FATORV_DB_USER", "fatorv")
SOURCE_DB_PASSWORD = os.environ.get("FATORV_DB_PASSWORD", "123456")

query_products = "select * from products"
query_categories = "select * from categories"
query_suppliers = "select * from suppliers"
query_sales_items = "select * from sales_items"
query_sales = "select * from sales"
query_sellers = "select * from sellers"
query_customers = "select * from customers"


def read_source_table(query, alias):
    """Run `query` against the source database over JDBC and return a DataFrame.

    Args:
        query: SQL text; it is wrapped in parentheses and used as a subquery.
        alias: Alias given to the subquery in the `dbtable` option.

    Returns:
        The DataFrame produced by the JDBC reader.
    """
    return (
        spark.read.format('jdbc')
        .options(
            url=SOURCE_JDBC_URL,
            dbtable='({}) as {}'.format(query, alias),
            user=SOURCE_DB_USER,
            password=SOURCE_DB_PASSWORD,
            driver='org.postgresql.Driver',
        )
        .load()
    )


df_products = read_source_table(query_products, 'products')
df_suppliers = read_source_table(query_suppliers, 'suppliers')
df_categories = read_source_table(query_categories, 'categories')
df_sales_items = read_source_table(query_sales_items, 'sales_items')
df_sales = read_source_table(query_sales, 'sales')
df_sellers = read_source_table(query_sellers, 'sellers')
df_customers = read_source_table(query_customers, 'customers')

# Explicit schemas for the two JSON reference files.
states_schema = StructType([
    StructField("id_uf", IntegerType(), False),
    StructField("sigla_uf", StringType(), False),
    StructField("state_code", StringType(), False),
    StructField("nome_uf", StringType(), False),
    StructField("id_regiao", IntegerType(), False)
])

regions_schema = StructType([
    StructField("id_regiao", IntegerType(), False),
    StructField("sigla_regiao", StringType(), False),
    StructField("nome_regiao", StringType(), False)
])

# Regions table: read the multi-line JSON file and rename the columns.
df_regions = spark.read.option("multiline", "true").schema(regions_schema).json("regioes.json")
df_regions = df_regions.withColumnRenamed("id_regiao", "region_id") \
                     .withColumnRenamed("sigla_regiao", "region_acronym") \
                     .withColumnRenamed("nome_regiao", "region_name")

# States table: same treatment; `state_code` keeps its original name.
df_states = spark.read.option("multiline", "true").schema(states_schema).json("estados.json")
df_states = df_states.withColumnRenamed("id_uf", "state_id") \
                     .withColumnRenamed("sigla_uf", "state_acronym") \
                     .withColumnRenamed("nome_uf", "state_name") \
                     .withColumnRenamed("id_regiao", "region_id")



<h1>Transformação</h1>
<hr>

In [25]:
# Transform the extracted tables into dimensions and build the fact table.

# Join states with regions (this frame is later written as the states dimension).
df_joined_st_re = df_states.join(df_regions, "region_id")

# Upper-case supplier_name and email in suppliers.
df_suppliers = df_suppliers.withColumn("email", upper(col("email"))) \
                           .withColumn("supplier_name", upper(col("supplier_name")))

# Split the sale date into year, month and day.
df_sales = df_sales.withColumn("year", year("date")) \
                   .withColumn("month", month("date")) \
                   .withColumn("day", dayofmonth("date"))

# Re-encode the date as an integer yyyyMMdd key. Casting here (instead of only
# on the date dimension) keeps both sides of the later join on the same type.
df_sales = df_sales.withColumn("date", date_format("date", "yyyyMMdd").cast("integer"))

# Build the date dimension: one row per distinct sale date.
# date_id is numbered with row_number() over a date-ordered window, which gives
# consecutive ids starting at 1. monotonically_increasing_id() is not suitable
# here: it stores the partition index in the upper bits, so its values are not
# consecutive and do not fit in an integer once more than one partition exists.
# The un-partitioned window pulls all rows into one partition, which is
# acceptable for a frame with one row per distinct date.
df_date = df_sales.select("date", "year", "month", "day").distinct() \
                  .withColumn("date_id", row_number().over(Window.orderBy("date")))
df_date = df_date.withColumn("date_id", col("date_id").cast("integer"))

# Add the quarter column.
quarter = when(col("month").between(1, 3), 1) \
          .when(col("month").between(4, 6), 2) \
          .when(col("month").between(7, 9), 3) \
          .otherwise(4)
df_date = df_date.withColumn("quarter", quarter)

# Add the month name; a month outside 1-12 (or null) yields null.
month_names = ["January", "February", "March", "April", "May", "June",
               "July", "August", "September", "October", "November", "December"]
month_name = when(col("month") == 1, lit(month_names[0]))
for month_number, name in enumerate(month_names[1:], start=2):
    month_name = month_name.when(col("month") == month_number, lit(name))
df_date = df_date.withColumn("month_name", month_name)

# Replace the sale date with its date_id. Only the key columns of the date
# dimension are joined, so year/month/day are not duplicated in the result.
df_sales = df_sales.join(df_date.select("date", "date_id"), "date")
df_sales = df_sales.select("sales_id", "customer_id", "seller_id", "date_id", "total_price")

# Join suppliers with states, replacing the state acronym with state_id.
# The inner join drops suppliers whose state has no match in df_states.
df_joined_sup_st = df_suppliers.join(df_states, df_suppliers["state"] == df_states["state_acronym"], "inner")
df_suppliers = df_joined_sup_st.select("supplier_id", "supplier_name", "email", "state_id")

# Upper-case seller_name and email in sellers.
df_sellers = df_sellers.withColumn("email", upper(col("email"))) \
                       .withColumn("seller_name", upper(col("seller_name")))

# Join sellers with states, replacing the state acronym with state_id.
# The inner join drops sellers whose state has no match in df_states.
df_joined_se_st = df_sellers.join(df_states, df_sellers["state"] == df_states["state_acronym"], "inner")
df_sellers = df_joined_se_st.select("seller_id", "seller_name", "email", "tx_commission", "state_id")

# Upper-case customer_name and email in customers.
df_customers = df_customers.withColumn("email", upper(col("email"))) \
                           .withColumn("customer_name", upper(col("customer_name")))

# Join customers with states, replacing the state acronym with state_id.
# The inner join drops customers whose state has no match in df_states.
df_joined_cu_st = df_customers.join(df_states, df_customers["state"] == df_states["state_acronym"], "inner")
df_customers = df_joined_cu_st.select("customer_id", "customer_name", "email", "state_id")

# Upper-case product_name in products.
df_products = df_products.withColumn("product_name", upper(col("product_name")))

# Drop the product price (the fact table takes its price from sales_items),
# then join products with suppliers.
df_products = df_products.drop("price")
df_joined_products_suppliers = df_products.join(df_suppliers, "supplier_id")

# Upper-case category_name in categories.
df_categories = df_categories.withColumn("category_name", upper(col("category_name")))

# Join with categories.
df_joined_products_suppliers_categories = df_joined_products_suppliers.join(df_categories, "category_id")

# Join with sales_items; the item price is exposed as sell_price.
df_sales_items = df_sales_items.withColumn("sell_price", col("price"))
df_joined_prod_sup_cat_si = df_joined_products_suppliers_categories.join(df_sales_items, "product_id")

# Join with sales.
df_joined_prod_sup_cat_si_sa = df_joined_prod_sup_cat_si.join(df_sales, "sales_id")

# sub_total = sell_price * quantity
df_joined_prod_sup_cat_si_sa = df_joined_prod_sup_cat_si_sa.withColumn("sub_total", col("sell_price") * col("quantity"))

# total_price = sum(sub_total) per sale, computed through a temporary view.
df_joined_prod_sup_cat_si_sa.createOrReplaceTempView("sales_data")
result = spark.sql("""
    select 
        sales_id,
        sum(sub_total) as total_price_2
    from
        sales_data
    group by
        sales_id
""")

# Overwrite the total_price carried over from sales with the recomputed sum.
df_joined_prod_sup_cat_si_sa = df_joined_prod_sup_cat_si_sa.join(result, "sales_id", "inner") \
                                                           .withColumn("total_price", col("total_price_2"))

# Keep only the fact-table columns and fix their types.
df_joined_prod_sup_cat_si_sa = df_joined_prod_sup_cat_si_sa.select("sales_id", "product_id", "date_id", "customer_id", "seller_id",
                                                                   "total_price", "supplier_id", "state_id", "category_id", "quantity",
                                                                   "sell_price", "sub_total")
df_joined_prod_sup_cat_si_sa = df_joined_prod_sup_cat_si_sa.withColumn("total_price", col("total_price").cast("decimal(10,2)"))
df_joined_prod_sup_cat_si_sa = df_joined_prod_sup_cat_si_sa.withColumn("sell_price", col("sell_price").cast("decimal(10,2)"))
df_joined_prod_sup_cat_si_sa = df_joined_prod_sup_cat_si_sa.withColumn("sub_total", col("sub_total").cast("decimal(10,2)"))
df_joined_prod_sup_cat_si_sa = df_joined_prod_sup_cat_si_sa.withColumn("date_id", col("date_id").cast("integer"))


+--------+----+-----+---+-------+-------+----------+
|    date|year|month|day|date_id|quarter|month_name|
+--------+----+-----+---+-------+-------+----------+
|20220820|2022|    8| 20|      1|      3|    August|
|20230515|2023|    5| 15|      2|      2|       May|
+--------+----+-----+---+-------+-------+----------+



<h1>Carga</h1>
<hr>

In [26]:
# Target database connection settings. The defaults are the original
# local-development values; override them through the environment instead of
# editing credentials into the notebook.
postgres_url = os.environ.get("FATORV_TARGET_JDBC_URL", "jdbc:postgresql://localhost:5432/fatorvgestao2")
properties = {
    "user": os.environ.get("FATORV_DB_USER", "fatorv"),
    "password": os.environ.get("FATORV_DB_PASSWORD", "123456"),
    "driver": "org.postgresql.Driver",
    "url" : postgres_url
}

# (DataFrame, target table) pairs, written in this order: the dimension tables
# first and the fact table last. Each table is overwritten on every run.
load_plan = [
    (df_categories, "dim_categories"),
    (df_customers, "dim_customers"),
    (df_sellers, "dim_sellers"),
    (df_suppliers, "dim_suppliers"),
    (df_date, "dim_date"),
    (df_joined_st_re, "dim_states"),
    (df_products, "dim_products"),
    (df_joined_prod_sup_cat_si_sa, "fato_sales_items"),
]
for frame, table_name in load_plan:
    frame.write.jdbc(postgres_url, table_name, mode="overwrite", properties=properties)

# Stop the Spark session.
spark.stop()