# Instanciar Spark

In [35]:
import os
from urllib.parse import quote_plus

import pandas as pd
import psycopg2
import pyspark.sql.functions as f
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json , explode, expr, lit
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, BooleanType
from sqlalchemy import create_engine
from sqlalchemy.dialects.postgresql import insert


# Build (or reuse, via getOrCreate) the Spark session used by the JDBC reads
# and the final JDBC write in this notebook.
spark_session = (
    SparkSession.builder
    .appName("Ibge_silver")
    .config("spark.master", "local[*]")
    # Maven coordinates of the PostgreSQL JDBC driver to put on the classpath.
    .config("spark.jars.packages", "org.postgresql:postgresql:42.7.1")
    # NOTE(review): presumably has no effect with a local[*] master - confirm.
    .config("spark.executor.instances", "3")
    .getOrCreate()
)

# PANDAS 


In [36]:
def call_upsert(table, conn, keys, data_iter):
    """Insertion callable for ``DataFrame.to_sql(method=...)`` that upserts rows.

    Builds a single PostgreSQL ``INSERT ... ON CONFLICT ... DO UPDATE``
    statement for the whole chunk and executes it on ``conn``.

    Parameters:
        table: object whose ``.table`` attribute is the target table.
        conn: connection on which the statement is executed.
        keys: column names, in the same order as the values of each row.
        data_iter: iterable of row value sequences.

    The conflict target is the constraint named ``<table name>_pkey``, and on
    conflict every column is overwritten with the incoming (excluded) value.
    """
    rows = []
    for values in data_iter:
        rows.append(dict(zip(keys, values)))

    target = table.table
    stmt = insert(target).values(rows)

    # Map each column name to its EXCLUDED counterpart (the value that was
    # proposed for insertion).
    overwrite_with_incoming = {column.key: column for column in stmt.excluded}

    conn.execute(
        stmt.on_conflict_do_update(
            constraint=f"{target.name}_pkey",
            set_=overwrite_with_incoming,
        )
    )

# SQL CONNECTION

In [37]:
# PostgreSQL connection settings.
# The defaults match the docker-compose setup used by this project; every
# value can be overridden through an environment variable so that real
# credentials never have to be written into the notebook.

host = os.environ.get("IBGE_DB_HOST", "localhost")
port = os.environ.get("IBGE_DB_PORT", "8085")

# JDBC URLs (used by Spark) for the bronze and silver databases.
bronze_url = f"jdbc:postgresql://{host}:{port}/bronze_data"
silver_url = f"jdbc:postgresql://{host}:{port}/silver_data"
properties = {
    "user": os.environ.get("IBGE_DB_USER", "ibge"),
    "password": os.environ.get("IBGE_DB_PASSWORD", "ibge"),
    "driver": "org.postgresql.Driver",
}

print(properties['user'])

# SQLAlchemy engines (used by pandas). User and password are URL-escaped so
# that characters such as '@', ':' or '/' in a credential cannot corrupt the
# connection URL.
_url_user = quote_plus(properties["user"])
_url_password = quote_plus(properties["password"])
bronze_engine = create_engine(f"postgresql://{_url_user}:{_url_password}@{host}:{port}/bronze_data")
silver_engine = create_engine(f"postgresql://{_url_user}:{_url_password}@{host}:{port}/silver_data")



ibge


# REGIÃO

In [38]:
# Upsert the REGIAO dimension from bronze into silver using pandas.

query = 'SELECT id_regiao,  regiao FROM public.regiao'

# Read in chunks of 10,000 rows; each chunk is upserted independently, and a
# failing chunk is reported without stopping the remaining ones.
for data_frame in pd.read_sql(query, bronze_engine, chunksize=10000):
    try:
        data_frame.to_sql(
            name='regiao',
            con=silver_engine,
            schema='public',
            if_exists="append",
            index=False,
            method=call_upsert,
        )
        print("UpInsert executado")
    except Exception as exc:
        print(f'Exceção {exc}')

UpInsert executado


In [39]:
# Upsert the ESTADO dimension from bronze into silver using pandas.

# NOTE(review): latitude and longitude are selected as the constant 1 -
# presumably placeholders until real coordinates are available; confirm.
query = """
select
	id_estado,
	uf,
	estado,
	1 as latitude,
	1 as longitude,
	id_regiao as id_regiao
from
	public.estado"""

# Read in chunks of 10,000 rows; each chunk is upserted independently, and a
# failing chunk is reported without stopping the remaining ones.
for data_frame in pd.read_sql(query, bronze_engine, chunksize=10000):
    try:
        data_frame.to_sql(
            name='estado',
            con=silver_engine,
            schema='public',
            if_exists="append",
            index=False,
            method=call_upsert,
        )
        print("UpInsert executado")
    except Exception as exc:
        print(f'Exceção {exc}')

UpInsert executado


In [40]:
# Upsert the MUNICIPIO dimension from bronze into silver using pandas.

# Every municipality is kept (right join); flag_capital is true only when a
# row in `capitais` matches on both uf and municipality name.
sql_municipio = """
    select
	m.*,
	case when c.capital is not null then true else false end as flag_capital
from
	public.capitais c
right join
	municipios m 
	on
	c.uf = m.uf
	and c.capital = m.municipio
"""

# Read in chunks of 10,000 rows; each chunk is upserted independently, and a
# failing chunk is reported without stopping the remaining ones.
for data_frame in pd.read_sql(sql_municipio, bronze_engine, chunksize=10000):
    try:
        data_frame.to_sql(
            name='municipio',
            con=silver_engine,
            schema='public',
            if_exists="append",
            index=False,
            method=call_upsert,
        )
        print("UpInsert executado")
    except Exception as exc:
        print(f'Exceção {exc}')


UpInsert executado


# PESQUISAS (leitura)

In [41]:
# IDH survey

# Raw IDH rows from the bronze database.
df_pesquisas_idh = (
    spark_session.read.format("jdbc")
    .option("url", bronze_url)
    .option("driver", properties["driver"])
    .option("dbtable", "pesquisas_idh")
    .option("user", properties["user"])
    .option("password", properties["password"])
    .load()
)

# Lookup query: one row per state with the municipality id of its capital.
sql_query = """
    select
	e.id_estado ,
	m.id_municipio as id_capital
from
	public.capitais c
inner join
	municipios m 
on
	c.uf = m.uf
	and c.capital = m.municipio
inner join
	public.estado e 
on
	c.uf  = e.uf 
"""

# The query is wrapped as an aliased subquery so it can be passed where the
# JDBC reader expects a table name.
df_estado_municipio = spark_session.read.jdbc(
    url=bronze_url,
    table=f"({sql_query}) as subquery",
    properties=properties,
)

# Left join keeps every IDH row even when its state has no capital match.
estado_match = df_pesquisas_idh["cd_estado"] == df_estado_municipio["id_estado"]

# Output layout shared with the other survey DataFrames; the capital's id is
# exposed under the `municipio` column.
idh_output_columns = [
    col('cd_pesquisa').cast(StringType()),
    col('id_capital').alias('municipio').cast(IntegerType()),
    col('cd_estado').cast(IntegerType()),
    col('ano').cast(IntegerType()),
    col('valor').cast(StringType()),
]

df_idh_final = (
    df_pesquisas_idh
    .join(df_estado_municipio, estado_match, how="left")
    .select(*idh_output_columns)
)

df_idh_final.show()


+-----------+---------+---------+----+-----+
|cd_pesquisa|municipio|cd_estado| ano|valor|
+-----------+---------+---------+----+-----+
|      30255|  3205309|       32|1991|0.505|
|      30255|  3106200|       31|1991|0.478|
|      30255|  3106200|       31|2000|0.624|
|      30255|  4106902|       41|1991|0.507|
|      30255|  3304557|       33|1991|0.573|
|      30255|  3304557|       33|2000|0.664|
|      30255|  3304557|       33|2010|0.761|
|      30255|  3304557|       33|2012|0.762|
|      30255|  3304557|       33|2013|0.768|
|      30255|  3304557|       33|2014| 0.78|
|      30255|  3304557|       33|2015|0.785|
|      30255|  3304557|       33|2016|0.789|
|      30255|  3304557|       33|2017|0.791|
|      30255|  3304557|       33|2018|0.805|
|      30255|  3304557|       33|2019|0.809|
|      30255|  3304557|       33|2020|0.785|
|      30255|  3304557|       33|2021|0.762|
|      30255|  2800308|       28|1991|0.408|
|      30255|  3550308|       35|1991|0.578|
|      302

In [42]:
# POPULACAO survey

# Raw population rows from the bronze database.
populacao_raw = (
    spark_session.read.format("jdbc")
    .option("url", bronze_url)
    .option("driver", properties["driver"])
    .option("dbtable", "pesquisas_populacao")
    .option("user", properties["user"])
    .option("password", properties["password"])
    .load()
)

# Project onto the common survey layout; cd_estado is filled with NULL here.
populacao_columns = [
    col('cd_pesquisa').cast(StringType()),
    col('municipio').cast(IntegerType()),
    lit(None).alias('cd_estado').cast(IntegerType()),
    col('ano').cast(IntegerType()),
    col('valor').cast(StringType()),
]
df_pesquisas_populacao = populacao_raw.select(*populacao_columns)

df_pesquisas_populacao.show()

+-----------+---------+---------+----+-----+
|cd_pesquisa|municipio|cd_estado| ano|valor|
+-----------+---------+---------+----+-----+
|  9324_6579|  3100104|     NULL|2020| 7006|
|  9324_6579|  3100203|     NULL|2020|23250|
|  9324_6579|  3100302|     NULL|2020|13444|
|  9324_6579|  3100401|     NULL|2020| 3994|
|  9324_6579|  3100500|     NULL|2020| 9368|
|  9324_6579|  3100609|     NULL|2020|13523|
|  9324_6579|  3100708|     NULL|2020| 1992|
|  9324_6579|  3100807|     NULL|2020| 4522|
|  9324_6579|  3100906|     NULL|2020|19247|
|  9324_6579|  3101003|     NULL|2020|13599|
|  9324_6579|  3101102|     NULL|2020|25141|
|  9324_6579|  3101201|     NULL|2020| 5976|
|  9324_6579|  3101300|     NULL|2020| 2665|
|  9324_6579|  3101409|     NULL|2020| 3011|
|  9324_6579|  3101508|     NULL|2020|35401|
|  9324_6579|  3101607|     NULL|2020|80494|
|  9324_6579|  3101631|     NULL|2020| 6981|
|  9324_6579|  3101706|     NULL|2020|42143|
|  9324_6579|  3101805|     NULL|2020| 7436|
|  9324_65

In [43]:
# PIB survey

# Raw PIB rows from the bronze database.
pib_raw = (
    spark_session.read.format("jdbc")
    .option("url", bronze_url)
    .option("driver", properties["driver"])
    .option("dbtable", "pesquisas_pib")
    .option("user", properties["user"])
    .option("password", properties["password"])
    .load()
)

# Project onto the common survey layout; cd_estado is filled with NULL here.
pib_columns = [
    col('cd_pesquisa').cast(StringType()),
    col('municipio').cast(IntegerType()),
    lit(None).alias('cd_estado').cast(IntegerType()),
    col('ano').cast(IntegerType()),
    col('valor').cast(StringType()),
]
df_pesquisas_pib = pib_raw.select(*pib_columns)

df_pesquisas_pib.show()

+-----------+---------+---------+----+---------+
|cd_pesquisa|municipio|cd_estado| ano|    valor|
+-----------+---------+---------+----+---------+
|    37_5938|  5300108|     NULL|2019|273613711|
|    37_5938|  5300108|     NULL|2020|265847334|
|    37_5938|  5300108|     NULL|2021|286943782|
|    37_5938|  2900108|     NULL|2019|    62610|
|    37_5938|  2900207|     NULL|2019|   151220|
|    37_5938|  2900306|     NULL|2019|   126737|
|    37_5938|  2900355|     NULL|2019|   173041|
|    37_5938|  2900405|     NULL|2019|   129713|
|    37_5938|  2900504|     NULL|2019|    65747|
|    37_5938|  2900603|     NULL|2019|    45792|
|    37_5938|  2900702|     NULL|2019|  4349226|
|    37_5938|  2900801|     NULL|2019|   290438|
|    37_5938|  2900900|     NULL|2019|    53031|
|    37_5938|  2901007|     NULL|2019|   398310|
|    37_5938|  2901106|     NULL|2019|   340719|
|    37_5938|  2901155|     NULL|2019|   133272|
|    37_5938|  2901205|     NULL|2019|   184992|
|    37_5938|  29013

# Pesquisas (Tratamento)

In [44]:
# Stack the three surveys into a single DataFrame. `union` matches columns by
# position, so all three inputs must keep the same column order
# (cd_pesquisa, municipio, cd_estado, ano, valor).
df_pesquisas = df_pesquisas_populacao.union(df_idh_final).union(df_pesquisas_pib)

# Replace the contents of `public.pesquisas` in the silver database.
# With `truncate` enabled, overwrite mode empties the existing table instead
# of dropping and recreating it. Credentials and driver come from the shared
# `properties` dict so this write always targets the same connection settings
# as the reads above.
(
    df_pesquisas.write.format("jdbc")
    .option("url", silver_url)
    .option("driver", properties["driver"])
    .option("user", properties["user"])
    .option("password", properties["password"])
    .option("dbtable", "public.pesquisas")
    .option("truncate", True)
    .mode('overwrite')
    .save()
)



