<h1>Configuração</h1>
<hr>

In [98]:
import psycopg2
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark import SparkConf
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import upper
from pyspark.sql.functions import col
from pyspark.sql.functions import date_format
from pyspark.sql.functions import year, month, dayofmonth

# Configure the Spark session; the PostgreSQL JDBC jar is registered through
# the "spark.jars" setting.
spark = (
    SparkSession.builder
    .appName("ETL")
    .config("spark.jars", "postgresql-8.2-506.jdbc3.jar")
    .getOrCreate()
)

# SQLContext's first positional parameter is a SparkContext, not a
# SparkSession. Pass the session's own context and hand the session over
# explicitly so the wrapper is bound to this exact session.
sqlContext = SQLContext(spark.sparkContext, spark)

# Connect to the database using psycopg2.
# NOTE(review): the credentials are hard-coded — consider loading them from
# the environment or a secrets store.
conn = psycopg2.connect(
    dbname="fatorv",
    user="fatorv",
    password="123456",
    host="localhost",
)

<h1>Extração</h1>
<hr>

In [99]:
# Source queries, one per table extracted from PostgreSQL.
query_products = "select * from products"
query_categories = "select * from categories"
query_suppliers = "select * from suppliers"
query_sales_items = "select * from sales_items"
query_sales = "select * from sales"
query_sellers = "select * from sellers"
query_customers = "select * from customers"

# JDBC settings shared by every extraction below, kept in one place so the
# URL, credentials and driver cannot drift apart between reads.
_JDBC_OPTIONS = {
    "url": "jdbc:postgresql://localhost/fatorv",
    "user": "fatorv",
    "password": "123456",
    "driver": "org.postgresql.Driver",
}


def _read_query(query, alias):
    """Load the result of *query* through Spark's JDBC source.

    The query is passed as the ``dbtable`` option in the form
    ``(<query>) as <alias>``, i.e. as an aliased subquery.

    Args:
        query: SQL text to run against the database.
        alias: Name given to the wrapping subquery.

    Returns:
        The DataFrame produced by the JDBC reader.
    """
    return (
        sqlContext.read.format("jdbc")
        .options(dbtable="({}) as {}".format(query, alias), **_JDBC_OPTIONS)
        .load()
    )


# Get the data frames of the tables.
df_products = _read_query(query_products, "products")
df_suppliers = _read_query(query_suppliers, "suppliers")
df_categories = _read_query(query_categories, "categories")
df_sales_items = _read_query(query_sales_items, "sales_items")
df_sales = _read_query(query_sales, "sales")
df_sellers = _read_query(query_sellers, "sellers")
df_customers = _read_query(query_customers, "customers")

# Close the psycopg2 connection; none of the reads above go through it
# (they all use Spark's JDBC source).
conn.close()

# Explicit schema for regioes.json; every field is declared non-nullable.
regions_schema = StructType([
    StructField(field_name, field_type, False)
    for field_name, field_type in (
        ("id_regiao", IntegerType()),
        ("sigla_regiao", StringType()),
        ("nome_regiao", StringType()),
    )
])

# Explicit schema for estados.json; every field is declared non-nullable.
states_schema = StructType([
    StructField(field_name, field_type, False)
    for field_name, field_type in (
        ("id_uf", IntegerType()),
        ("sigla_uf", StringType()),
        ("state_code", StringType()),
        ("nome_uf", StringType()),
        ("id_regiao", IntegerType()),
    )
])

# Regions table: read the multi-line JSON file and switch the column names
# to their English equivalents.
df_regions = (
    spark.read
    .option("multiline", "true")
    .schema(regions_schema)
    .json("regioes.json")
    .withColumnRenamed("id_regiao", "region_id")
    .withColumnRenamed("sigla_regiao", "region_acronym")
    .withColumnRenamed("nome_regiao", "region_name")
)

# States table: same treatment; "state_code" keeps its original name.
df_states = (
    spark.read
    .option("multiline", "true")
    .schema(states_schema)
    .json("estados.json")
    .withColumnRenamed("id_uf", "state_id")
    .withColumnRenamed("sigla_uf", "state_acronym")
    .withColumnRenamed("nome_uf", "state_name")
    .withColumnRenamed("id_regiao", "region_id")
)



<h1>Transformação</h1>
<hr>

In [100]:
# Transform the table data and build the fact table.


def _upper_columns(df, *names):
    """Return *df* with each column listed in *names* converted to upper case."""
    for name in names:
        df = df.withColumn(name, upper(df[name]))
    return df


def _join_states(df):
    """Inner-join *df* with df_states on ``df.state == df_states.state_acronym``."""
    return df.join(df_states, df["state"] == df_states["state_acronym"], "inner")


# Join states with regions.
df_joined_st_re = df_states.join(df_regions, "region_id")

# Sales: split the date into year, month and day, then rewrite the date
# itself as "yyyyMMdd". The parts are extracted first, while "date" still
# holds the original value.
df_sales = (
    df_sales
    .withColumn("year", year("date"))
    .withColumn("month", month("date"))
    .withColumn("day", dayofmonth("date"))
    .withColumn("date", date_format("date", "yyyyMMdd"))
)

# Suppliers: upper-case email and supplier_name, then replace the state
# column with the matching state_id.
df_suppliers = _upper_columns(df_suppliers, "email", "supplier_name")
df_joined_sup_st = _join_states(df_suppliers)
df_suppliers = df_joined_sup_st.select("supplier_id", "supplier_name", "email", "state_id")

# Sellers: upper-case email and seller_name, then replace the state column
# with the matching state_id.
df_sellers = _upper_columns(df_sellers, "email", "seller_name")
df_joined_se_st = _join_states(df_sellers)
df_sellers = df_joined_se_st.select("seller_id", "seller_name", "email", "state_id")

# Customers: upper-case email and customer_name, then replace the state
# column with the matching state_id.
df_customers = _upper_columns(df_customers, "email", "customer_name")
df_joined_cu_st = _join_states(df_customers)
df_customers = df_joined_cu_st.select("customer_id", "customer_name", "email", "state_id")

# Products: upper-case product_name and drop the products' own price column
# before joining with suppliers.
df_products = df_products.withColumn(
    "product_name", upper(col("product_name"))
).drop("price")
df_joined_products_suppliers = df_products.join(df_suppliers, on="supplier_id")

# Categories: upper-case category_name, then attach to the product rows.
df_categories = df_categories.withColumn(
    "category_name", upper(col("category_name"))
)
df_joined_products_suppliers_categories = df_joined_products_suppliers.join(
    df_categories, on="category_id"
)

# Attach the sales items by product.
df_joined_prod_sup_cat_si = df_joined_products_suppliers_categories.join(
    df_sales_items, on="product_id"
)

# Attach the sales by sale id.
df_joined_prod_sup_cat_si_sa = df_joined_prod_sup_cat_si.join(
    df_sales, on="sales_id"
)

# Compute total_price as price * quantity.
# NOTE(review): products.price was dropped above, so "price" here presumably
# comes from sales_items — confirm against the table schema.
df_joined_prod_sup_cat_si_sa = df_joined_prod_sup_cat_si_sa.withColumn(
    "total_price",
    col("price") * col("quantity"),
)

# Shut down the Spark session.
spark.stop()