<h1>Configuração<h1>
<hr>

In [11]:
import psycopg2
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark import SparkConf
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import upper
from pyspark.sql.functions import col
from pyspark.sql.functions import date_format
from pyspark.sql.functions import year, month, dayofmonth
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number
from pyspark.sql.functions import sum
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.functions import col, format_number
from pyspark.sql.functions import when

# Configurar a sessão do Spark
spark = SparkSession.builder \
    .appName("ETL") \
    .config("spark.jars", "postgresql-8.2-506.jdbc3.jar") \
    .getOrCreate()
sqlContext = SQLContext(spark)


<h1>Extração<h1>
<hr>

In [12]:
# Obter os data frames das tabelas
query_products = "select * from products"
query_categories = "select * from categories"
query_suppliers = "select * from suppliers"
query_sales_items = "select * from sales_items"
query_sales = "select * from sales"
query_sellers = "select * from sellers"
query_customers = "select * from customers"


df_products = sqlContext.read.format('jdbc').options(
    url='jdbc:postgresql://localhost/fatorv',
    dbtable='({}) as products'.format(query_products),
    user='fatorv',
    password='123456',
    driver='org.postgresql.Driver').load()

df_suppliers = sqlContext.read.format('jdbc').options(
    url='jdbc:postgresql://localhost/fatorv',
    dbtable='({}) as suppliers'.format(query_suppliers),
    user='fatorv',
    password='123456',
    driver='org.postgresql.Driver').load()

df_categories = sqlContext.read.format('jdbc').options(
    url='jdbc:postgresql://localhost/fatorv',
    dbtable='({}) as categories'.format(query_categories),
    user='fatorv',
    password='123456',
    driver='org.postgresql.Driver').load()

df_sales_items = sqlContext.read.format('jdbc').options(
    url='jdbc:postgresql://localhost/fatorv',
    dbtable='({}) as sales_items'.format(query_sales_items),
    user='fatorv',
    password='123456',
    driver='org.postgresql.Driver').load()

df_sales = sqlContext.read.format('jdbc').options(
    url='jdbc:postgresql://localhost/fatorv',
    dbtable='({}) as sales'.format(query_sales),
    user='fatorv',
    password='123456',
    driver='org.postgresql.Driver').load()

df_sellers = sqlContext.read.format('jdbc').options(
    url='jdbc:postgresql://localhost/fatorv',
    dbtable='({}) as sellers'.format(query_sellers),
    user='fatorv',
    password='123456',
    driver='org.postgresql.Driver').load()

df_customers = sqlContext.read.format('jdbc').options(
    url='jdbc:postgresql://localhost/fatorv',
    dbtable='({}) as customers'.format(query_customers),
    user='fatorv',
    password='123456',
    driver='org.postgresql.Driver').load()

states_schema = StructType([
    StructField("id_uf", IntegerType(), False),
    StructField("sigla_uf", StringType(), False),
    StructField("state_code", StringType(), False),
    StructField("nome_uf", StringType(), False),
    StructField("id_regiao", IntegerType(), False)
])

regions_schema = StructType([
    StructField("id_regiao", IntegerType(), False),
    StructField("sigla_regiao", StringType(), False),
    StructField("nome_regiao", StringType(), False)
])

# Tabela regions
df_regions = spark.read.option("multiline", "true").schema(regions_schema).json("regioes.json")
df_regions = df_regions.withColumnRenamed("id_regiao", "region_id") \
                     .withColumnRenamed("sigla_regiao", "region_acronym") \
                     .withColumnRenamed("nome_regiao", "region_name")

# Tabela states
df_states = spark.read.option("multiline", "true").schema(states_schema).json("estados.json")
df_states = df_states.withColumnRenamed("id_uf", "state_id") \
                     .withColumnRenamed("sigla_uf", "state_acronym") \
                     .withColumnRenamed("nome_uf", "state_name") \
                     .withColumnRenamed("id_regiao", "region_id")



<h1>Transformação<h1>
<hr>

In [13]:
# Alterando os dados das tabelas, construindo a tabela fatos

# Join states com regions
df_joined_st_re = df_states.join(df_regions, "region_id")
#df_joined_st_re.show()

# Fazendo upper em supplier_name e email em suppliers
df_suppliers = df_suppliers.withColumn("email", upper(df_suppliers["email"])) \
                           .withColumn("supplier_name", upper(df_suppliers["supplier_name"]))

# Divide a date em year, month, day
df_sales = df_sales.withColumn("year", year("date")) \
                   .withColumn("month", month("date")) \
                   .withColumn("day", dayofmonth("date"))

# Muda o formato da data
df_sales = df_sales.withColumn("date", date_format("date", "yyyyMMdd"))

# Criar df_date
df_date = df_sales.select("date", "year", "month", "day").distinct()
df_date = df_sales.select("date", "year", "month", "day").distinct() \
                  .withColumn("date_id", monotonically_increasing_id() + 1)
df_date = df_date.withColumn("date", col("date").cast("integer")) \
                 .withColumn("date_id", col("date_id").cast("integer"))


# Criar coluna quarter(trimestre)
quarter = when(df_date["month"].between(1,3), 1) \
          .when(df_date["month"].between(4,6), 2) \
          .when(df_date["month"].between(7,9), 3) \
          .otherwise(4)
df_date = df_date.withColumn("quarter", quarter)
#df_date.show()

# Join df_sales com df_date
df_sales = df_sales.join(df_date, "date")
df_sales = df_sales.select("sales_id", "customer_id", "seller_id", "date_id", "total_price")
#df_sales.show()

# Join suppliers com states, troca a coluna states por states_id
df_joined_sup_st = df_suppliers.join(df_states, df_suppliers["state"] == df_states["state_acronym"], "inner")
df_suppliers = df_joined_sup_st.select("supplier_id", "supplier_name", "email","state_id")
#df_suppliers.show()

# Fazendo upper em supplier_name e email em sellers
df_sellers = df_sellers.withColumn("email", upper(df_sellers["email"])) \
                       .withColumn("seller_name", upper(df_sellers["seller_name"]))

# Join sellers com states, troca a coluna states por states_id
df_joined_se_st = df_sellers.join(df_states, df_sellers["state"] == df_states["state_acronym"], "inner")
df_sellers = df_joined_se_st.select("seller_id", "seller_name", "email", "tx_commission","state_id")
#df_sellers.show()

# Fazendo upper em supplier_name e email em customers
df_customers = df_customers.withColumn("email", upper(df_customers["email"])) \
                           .withColumn("customer_name", upper(df_customers["customer_name"]))

# Join customers com states, troca a coluna states por states_id
df_joined_cu_st = df_customers.join(df_states, df_customers["state"] == df_states["state_acronym"], "inner")
df_customers = df_joined_cu_st.select("customer_id", "customer_name", "email","state_id")
#df_customers.show()

# Fazendo upper em product_name em products
df_products = df_products.withColumn("product_name", upper(df_products["product_name"]))

# Join entre products e suppliers
df_products = df_products.drop("price")
df_joined_products_suppliers = df_products.join(df_suppliers, "supplier_id")
#df_joined_products_suppliers.show()

# Fazendo upper em category_name em categories
df_categories = df_categories.withColumn("category_name", upper(df_categories["category_name"]))

# Join com categories
df_joined_products_suppliers_categories = df_joined_products_suppliers.join(df_categories, "category_id")
#df_joined_products_suppliers_categories.show()

#Join com sales_items
df_sales_items = df_sales_items.withColumn("sell_price", df_sales_items["price"])
df_joined_prod_sup_cat_si = df_joined_products_suppliers_categories.join(df_sales_items, "product_id")
#df_joined_prod_sup_cat_si.show()

#join com sales
df_joined_prod_sup_cat_si_sa = df_joined_prod_sup_cat_si.join(df_sales, "sales_id")
#df_joined_prod_sup_cat_si_sa.show()

# Calculando sub_total = sell_price * quantity
df_joined_prod_sup_cat_si_sa = df_joined_prod_sup_cat_si_sa.withColumn("sub_total", col("sell_price") * col("quantity"))

# Calculando total_price = sum(sub_total)
df_joined_prod_sup_cat_si_sa.createOrReplaceTempView("sales_data")
result = spark.sql("""
    select 
        sales_id,
        sum(sub_total) as total_price_2
    from
        sales_data
    group by
        sales_id
""")
df_joined_prod_sup_cat_si_sa = df_joined_prod_sup_cat_si_sa.join(result, "sales_id", "left_outer") \
.withColumn("total_price", result["total_price_2"])

df_joined_prod_sup_cat_si_sa = df_joined_prod_sup_cat_si_sa.select("sales_id", "product_id", "date_id", "customer_id", "seller_id", 
                                                                   "total_price", "supplier_id", "state_id", "category_id", "quantity", 
                                                                   "sell_price", "sub_total")
df_joined_prod_sup_cat_si_sa = df_joined_prod_sup_cat_si_sa.withColumn("total_price", col("total_price").cast("decimal(10,2)"))
df_joined_prod_sup_cat_si_sa = df_joined_prod_sup_cat_si_sa.withColumn("sell_price", col("sell_price").cast("decimal(10,2)"))
df_joined_prod_sup_cat_si_sa = df_joined_prod_sup_cat_si_sa.withColumn("sub_total", col("sub_total").cast("decimal(10,2)"))
df_joined_prod_sup_cat_si_sa = df_joined_prod_sup_cat_si_sa.withColumn("date_id", col("date_id").cast("integer"))
#df_joined_prod_sup_cat_si_sa.show()
#df_joined_prod_sup_cat_si_sa.printSchema()


root
 |-- sales_id: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- date_id: integer (nullable = false)
 |-- customer_id: integer (nullable = true)
 |-- seller_id: integer (nullable = true)
 |-- total_price: decimal(10,2) (nullable = true)
 |-- supplier_id: integer (nullable = true)
 |-- state_id: integer (nullable = true)
 |-- category_id: integer (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- sell_price: decimal(10,2) (nullable = true)
 |-- sub_total: decimal(10,2) (nullable = true)



<h1>Carga<h1>
<hr>

In [14]:
postgres_url = "jdbc:postgresql://localhost:5432/fatorvgestao2"
properties = {
    "user": "fatorv",
    "password": "123456",
    "driver": "org.postgresql.Driver",
    "url" : postgres_url
}

df_categories.write.jdbc(postgres_url, "dim_categories", "overwrite", properties)
df_customers.write.jdbc(postgres_url, "dim_customers", "overwrite", properties)
df_sellers.write.jdbc(postgres_url, "dim_sellers", "overwrite", properties)
df_suppliers.write.jdbc(postgres_url, "dim_suppliers", "overwrite", properties)
df_date.write.jdbc(postgres_url, "dim_date", "overwrite", properties)
df_states.write.jdbc(postgres_url, "dim_states", "overwrite", properties)
df_products.write.jdbc(postgres_url, "dim_products", "overwrite", properties)
df_joined_prod_sup_cat_si_sa.write.jdbc(postgres_url, "fato_sales_items", "overwrite", properties)

# Fecha a sessão do Spark
spark.stop()

Py4JJavaError: An error occurred while calling o932.jdbc.
: org.postgresql.util.PSQLException: ERROR: cannot drop table dim_categories because other objects depend on it
  Detalhe: constraint dim_categories_fato_sales_items_fk on table fato_sales_items depends on table dim_categories
  Dica: Use DROP ... CASCADE to drop the dependent objects too.
	at org.postgresql.core.v3.QueryExecutorImpl.receiveErrorResponse(QueryExecutorImpl.java:2713)
	at org.postgresql.core.v3.QueryExecutorImpl.processResults(QueryExecutorImpl.java:2401)
	at org.postgresql.core.v3.QueryExecutorImpl.execute(QueryExecutorImpl.java:368)
	at org.postgresql.jdbc.PgStatement.executeInternal(PgStatement.java:498)
	at org.postgresql.jdbc.PgStatement.execute(PgStatement.java:415)
	at org.postgresql.jdbc.PgStatement.executeWithFlags(PgStatement.java:335)
	at org.postgresql.jdbc.PgStatement.executeCachedSql(PgStatement.java:321)
	at org.postgresql.jdbc.PgStatement.executeWithFlags(PgStatement.java:297)
	at org.postgresql.jdbc.PgStatement.executeUpdate(PgStatement.java:270)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.executeStatement(JdbcUtils.scala:1094)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.dropTable(JdbcUtils.scala:81)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:63)
	at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:48)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:75)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:73)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.executeCollect(commands.scala:84)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:437)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:85)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:83)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:142)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:859)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:388)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:361)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:248)
	at org.apache.spark.sql.DataFrameWriter.jdbc(DataFrameWriter.scala:756)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:840)
