<h1>Configuração</h1>
<hr>

In [4]:
import psycopg2
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark import SparkConf
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Configure the Spark session. "spark.jars" points at the PostgreSQL JDBC
# driver jar on the local filesystem so the JDBC reads can load the driver.
spark = (
    SparkSession.builder
    .appName("Exemplo de Join")
    .config("spark.jars", "/home/rene/Downloads/architect-1.0.9/jdbc/postgresql-8.2-506.jdbc3.jar")
    .getOrCreate()
)

# SQLContext expects a SparkContext as its first argument; the session is
# passed explicitly as the second so the wrapper is bound to `spark` rather
# than relying on the session object happening to look like a context.
sqlContext = SQLContext(spark.sparkContext, spark)

# Connect to the database using psycopg2.
# NOTE(review): credentials are hard-coded here; consider loading them from
# the environment or a secrets store.
# NOTE(review): the table reads further down go through Spark's JDBC source,
# not through this connection; it is only closed after extraction. Confirm
# whether it is still needed.
conn = psycopg2.connect(
    dbname="fatorv",
    user="fatorv",
    password="123456",
    host="localhost",
)

<h1>Extração</h1>
<hr>

In [5]:
# Build one DataFrame per source table.
query_products = "select * from products"
query_categories = "select * from categories"
query_suppliers = "select * from suppliers"
query_sales_items = "select * from sales_items"
query_sales = "select * from sales"
query_sellers = "select * from sellers"
query_customers = "select * from customers"

# Connection settings shared by every JDBC read below, kept in one place so
# they cannot drift apart between tables.
# NOTE(review): credentials are hard-coded; consider loading them from the
# environment or a secrets store.
_JDBC_OPTIONS = {
    "url": "jdbc:postgresql://localhost/fatorv",
    "user": "fatorv",
    "password": "123456",
    "driver": "org.postgresql.Driver",
}


def _read_query(query, alias):
    """Load the result of *query* through Spark's JDBC source.

    The query is wrapped as ``(<query>) as <alias>`` and passed as the
    ``dbtable`` option, i.e. it is read as an aliased subquery.

    Args:
        query: SQL SELECT statement to run on the database.
        alias: Name given to the wrapping subquery.

    Returns:
        The DataFrame produced by the JDBC reader.
    """
    return (
        spark.read.format("jdbc")
        .options(dbtable="({}) as {}".format(query, alias), **_JDBC_OPTIONS)
        .load()
    )


df_products = _read_query(query_products, "products")
df_suppliers = _read_query(query_suppliers, "suppliers")
df_categories = _read_query(query_categories, "categories")
df_sales_items = _read_query(query_sales_items, "sales_items")
df_sales = _read_query(query_sales, "sales")
df_sellers = _read_query(query_sellers, "sellers")
df_customers = _read_query(query_customers, "customers")

# Release the psycopg2 connection opened during configuration.
conn.close()

# Explicit schemas for the two JSON reference files; every field is declared
# with nullable=False.
states_schema = StructType([
    StructField(field_name, field_type, False)
    for field_name, field_type in (
        ("id_uf", IntegerType()),
        ("sigla_uf", StringType()),
        ("state_code", StringType()),
        ("nome_uf", StringType()),
        ("id_regiao", IntegerType()),
    )
])

regions_schema = StructType([
    StructField(field_name, field_type, False)
    for field_name, field_type in (
        ("id_regiao", IntegerType()),
        ("sigla_regiao", StringType()),
        ("nome_regiao", StringType()),
    )
])

# Regions table: read the multi-line JSON file, then rename the source
# columns to the English names used by the joins.
regions_reader = spark.read.option("multiline", "true").schema(regions_schema)
df_regions = regions_reader.json("regioes.json")
for old_name, new_name in (
    ("id_regiao", "region_id"),
    ("sigla_regiao", "region_acronym"),
    ("nome_regiao", "region_name"),
):
    df_regions = df_regions.withColumnRenamed(old_name, new_name)

# States table: same treatment; "state_code" already has its final name and
# is left untouched.
states_reader = spark.read.option("multiline", "true").schema(states_schema)
df_states = states_reader.json("estados.json")
for old_name, new_name in (
    ("id_uf", "state_id"),
    ("sigla_uf", "state_acronym"),
    ("nome_uf", "state_name"),
    ("id_regiao", "region_id"),
):
    df_states = df_states.withColumnRenamed(old_name, new_name)



<h1>Transformação</h1>
<hr>

In [6]:
# Transformations
# (Call .show() on any intermediate DataFrame below to inspect it.)

# States joined with regions on the shared region_id column.
df_joined_st_re = df_states.join(df_regions, on="region_id")

# Suppliers: match the "state" column against the state acronym and keep
# state_id alongside the supplier columns.
df_joined_sup_st = df_suppliers.join(
    df_states,
    on=df_suppliers["state"] == df_states["state_acronym"],
    how="inner",
)
df_suppliers = df_joined_sup_st.select(
    "supplier_id",
    "supplier_name",
    "email",
    "state_id",
)
df_suppliers.show()

# Sellers: same state lookup.
df_joined_se_st = df_sellers.join(
    df_states,
    on=df_sellers["state"] == df_states["state_acronym"],
    how="inner",
)
df_sellers = df_joined_se_st.select(
    "seller_id",
    "seller_name",
    "state_id",
)

# Customers: same state lookup.
df_joined_cu_st = df_customers.join(
    df_states,
    on=df_customers["state"] == df_states["state_acronym"],
    how="inner",
)
df_customers = df_joined_cu_st.select(
    "customer_id",
    "customer_name",
    "email",
    "state_id",
)

# Products joined with the (state-resolved) suppliers.
df_joined_products_suppliers = df_products.join(
    df_suppliers,
    on="supplier_id",
)

# ... then with categories.
df_joined_products_suppliers_categories = df_joined_products_suppliers.join(
    df_categories,
    on="category_id",
)

# ... then with sales_items.
df_joined_prod_sup_cat_si = df_joined_products_suppliers_categories.join(
    df_sales_items,
    on="product_id",
)

# ... and finally with sales.
df_joined_prod_sup_cat_si_sa = df_joined_prod_sup_cat_si.join(
    df_sales,
    on="sales_id",
)

# Shut down the Spark session.
spark.stop()

+-----------+-------------------+--------------------+--------+
|supplier_id|      supplier_name|               email|state_id|
+-----------+-------------------+--------------------+--------+
|          1| Tech Supplies Inc.|    tech@example.com|       6|
|          2|Fashion Trends Ltd.|fashiontrends@exa...|      36|
|          3|         Book Haven|bookhaven@example...|      48|
|          4|    Appliance World|appliances@exampl...|      12|
|          5|        Toy Kingdom|    toys@example.com|       6|
|          6|Furniture Warehouse|furniture@example...|      53|
|          7|   Sports Suppliers|  sports@example.com|      48|
|          8|    Beauty Supplies|  beauty@example.com|      36|
|          9|  Food Distributors|    food@example.com|      12|
+-----------+-------------------+--------------------+--------+

