In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [None]:
# Build (or reuse) the Spark session used by the whole notebook, wired to
# Cassandra through the DataStax Spark Cassandra Connector.
#
# Every connector setting must live under the `spark.cassandra.` prefix: keys
# without it are not picked up by the connector. The write-throughput cap and
# the connection keep-alive were previously declared without that prefix (and
# with pre-3.x names), so they never took effect; they now use the names
# documented for connector 3.x.
spark = (
    SparkSession.builder
    .appName("Spark Exploration App")
    # Connector artifact (Scala 2.12 build) and its Catalyst extensions.
    .config("spark.jars.packages", "com.datastax.spark:spark-cassandra-connector_2.12:3.1.0")
    .config("spark.sql.extensions", "com.datastax.spark.connector.CassandraSparkExtensions")
    # Cassandra contact point.
    .config("spark.cassandra.connection.host", "10.128.0.9")
    .config("spark.cassandra.connection.port", "9042")
    # Write tuning.
    .config("spark.cassandra.output.batch.grouping.buffer.size", "3000")
    .config("spark.cassandra.output.concurrent.writes", "1500")
    .config("spark.cassandra.output.throughputMBPerSec", "128")
    .config("spark.cassandra.output.batch.size.bytes", "2056")
    # Keep-alive for the Cassandra connection, in milliseconds.
    .config("spark.cassandra.connection.keepAliveMS", "30000")
    .getOrCreate()
)
spark

In [None]:
# Cassandra keyspace that loadData() and saveData() read from and write to.
keyspace = "telecomunicacao"

In [None]:
def loadData(table: str):
    """Read one Cassandra table of the notebook's keyspace.

    Args:
        table: Name of the table inside the module-level ``keyspace``.

    Returns:
        The result of ``load()`` on a reader configured for the
        ``org.apache.spark.sql.cassandra`` data source.
    """
    reader = spark.read.format("org.apache.spark.sql.cassandra")
    reader = reader.option("keyspace", keyspace)
    reader = reader.option("table", table)
    return reader.load()

In [None]:
def saveData(df, table: str):
    """Append the rows of ``df`` to a Cassandra table of the notebook's keyspace.

    Args:
        df: Object exposing the DataFrame ``write`` API.
        table: Name of the target table inside the module-level ``keyspace``.

    Returns:
        None. The write is performed in ``append`` mode.
    """
    writer = df.write.format("org.apache.spark.sql.cassandra")
    writer = writer.option("keyspace", keyspace).option("table", table)
    writer.mode('append').save()

#### 1. Carregamos os dados anteriormente tratados do nosso **bucket**

In [6]:
# Load the pre-processed "Cobertura" Parquet dataset from the bucket and
# print its schema as a quick sanity check.
dfCobertura = spark.read.format("parquet").load("gs://parquetcassandranatalsoul/Cobertura")
dfCobertura.printSchema()

                                                                                

root
 |-- ano: integer (nullable = true)
 |-- operadora: string (nullable = true)
 |-- tecnologia: string (nullable = true)
 |-- setor: string (nullable = true)
 |-- cod_uf: integer (nullable = true)
 |-- cod_municipio: integer (nullable = true)
 |-- uf: string (nullable = true)
 |-- regiao: string (nullable = true)
 |-- domicilios: integer (nullable = true)
 |-- moradores: integer (nullable = true)
 |-- percentual_cobertura: string (nullable = true)



In [7]:
# Load the pre-processed "Reclamacao" Parquet dataset from the bucket and
# print its schema as a quick sanity check.
dfReclamacao = spark.read.format("parquet").load("gs://parquetcassandranatalsoul/Reclamacao")
dfReclamacao.printSchema()

[Stage 1:>                                                          (0 + 1) / 1]

root
 |-- ano: integer (nullable = true)
 |-- mes: integer (nullable = true)
 |-- uf: string (nullable = true)
 |-- cidade: string (nullable = true)
 |-- cod_municipio: integer (nullable = true)
 |-- canal: string (nullable = true)
 |-- marca: string (nullable = true)
 |-- assunto: string (nullable = true)



                                                                                

In [8]:
# Load the pre-processed "Qualidade" Parquet dataset from the bucket and
# print its schema as a quick sanity check.
dfQualidade = spark.read.format("parquet").load("gs://parquetcassandranatalsoul/Qualidade")
dfQualidade.printSchema()

root
 |-- servico: string (nullable = true)
 |-- empresa: string (nullable = true)
 |-- ano: integer (nullable = true)
 |-- uf: string (nullable = true)
 |-- meta_indicador: string (nullable = true)
 |-- cumpriu: integer (nullable = true)
 |-- descumpriu: integer (nullable = true)



In [None]:
# Load the pre-processed "Dados_ibge" Parquet dataset from the bucket and
# print its schema as a quick sanity check.
dfDadosibge = spark.read.format("parquet").load("gs://parquetcassandranatalsoul/Dados_ibge")
dfDadosibge.printSchema()

#### 2. Acrescentamos um campo ***uuid*** e geramos o código para adicionar os dados ao Cassandra

In [None]:
# Add a random UUID column; it fills the `id_cobertura` column of the
# `cobertura_operadoras` table.
# NOTE(review): uuid() is non-deterministic, so the value previewed here is
# presumably regenerated when the DataFrame is evaluated again at write time.
dfCobertura = dfCobertura.withColumn("id_cobertura", expr("uuid()"))
# truncate=False so the 36-character UUID is displayed in full (the default
# cuts cell values at 20 characters), matching the other preview cells.
dfCobertura.show(1, truncate=False)


In [11]:
# Add a random UUID column; it fills the `id_reclamacao` column of the
# `reclamacoes` table.
# NOTE(review): uuid() is non-deterministic, so the value previewed here is
# presumably regenerated when the DataFrame is evaluated again at write time.
dfReclamacao = dfReclamacao.withColumn(colName="id_reclamacao", col=expr("uuid()"))
dfReclamacao.show(n=1, truncate=False)

[Stage 5:>                                                          (0 + 1) / 1]

+----+---+---+---------------+-------------+-----------+-----+--------+------------------------------------+
|ano |mes|uf |cidade         |cod_municipio|canal      |marca|assunto |id_reclamacao                       |
+----+---+---+---------------+-------------+-----------+-----+--------+------------------------------------+
|2017|6  |MG |Santa Margarida|57906        |Call Center|CLARO|Cobrança|2614e029-281c-4e58-9e18-01e1935d5abe|
+----+---+---+---------------+-------------+-----------+-----+--------+------------------------------------+
only showing top 1 row



                                                                                

In [12]:
# Add a random UUID column; it fills the `id_qualidade` column of the
# `qualidade` table.
# NOTE(review): uuid() is non-deterministic, so the value previewed here is
# presumably regenerated when the DataFrame is evaluated again at write time.
dfQualidade = dfQualidade.withColumn(colName="id_qualidade", col=expr("uuid()"))
dfQualidade.show(n=1, truncate=False)

[Stage 6:>                                                          (0 + 1) / 1]

+---------------+-------+----+---+--------------+-------+----------+------------------------------------+
|servico        |empresa|ano |uf |meta_indicador|cumpriu|descumpriu|id_qualidade                        |
+---------------+-------+----+---+--------------+-------+----------+------------------------------------+
|Telefonia Móvel|TIM    |2018|SP |>= 90%        |0      |1         |9cf16987-526a-4b9b-8b7a-118dd2c21d12|
+---------------+-------+----+---+--------------+-------+----------+------------------------------------+
only showing top 1 row



                                                                                

In [13]:
# Add a random UUID column; it fills the `id` column of the `dados_ibge` table.
# NOTE(review): uuid() is non-deterministic, so the value previewed here is
# presumably regenerated when the DataFrame is evaluated again at write time.
dfDadosibge = dfDadosibge.withColumn(colName="id", col=expr("uuid()"))
dfDadosibge.show(n=1, truncate=False)

[Stage 7:>                                                          (0 + 1) / 1]

+--------+---+--------------+---------+----+-------------+------+-------------+------------------------------------+
|cod_ibge|uf |nome_municipio|populacao|ano |cod_municipio|cod_uf|pib_municipio|id                                  |
+--------+---+--------------+---------+----+-------------+------+-------------+------------------------------------+
|3522158 |SP |Itaoca        |3332     |2017|22158        |35    |39835016     |72bd7f0e-9a4f-4c66-a2e5-b4b1df49ead3|
+--------+---+--------------+---------+----+-------------+------+-------------+------------------------------------+
only showing top 1 row



                                                                                

In [14]:
# Append the coverage rows to the `cobertura_operadoras` Cassandra table.
saveData(dfCobertura, table="cobertura_operadoras")

                                                                                

In [None]:
# Append the complaint rows to the `reclamacoes` Cassandra table.
saveData(dfReclamacao, table="reclamacoes")



In [None]:
# Append the quality-indicator rows to the `qualidade` Cassandra table.
saveData(dfQualidade, table="qualidade")

[Stage 9:>                                                          (0 + 1) / 1]

In [14]:
# Append the IBGE municipality rows to the `dados_ibge` Cassandra table.
saveData(dfDadosibge, table="dados_ibge")

                                                                                