# **Rodrigo_Mesquita_DR4_TP3**

# 📂 PARTE 1 -  Análise Estática (Spark SQL + PostgreSQL + S3)

### 1. Realize a leitura da tabela apostas do PostgreSQL e transforme a coluna timestamp corretamente.

In [None]:
from pyspark.sql import SparkSession

try:
    spark.stop()
except:
    pass


properties = {"user": "admin","password": "admin","driver": "org.postgresql.Driver"}
spark = (
    SparkSession
        .builder
        .appName("TP4")
        .master("spark://spark:7077")
        .config("spark.jars.packages", "org.postgresql:postgresql:42.2.24,com.amazonaws:aws-java-sdk-bundle:1.12.262,org.apache.hadoop:hadoop-aws:3.3.4")
        .config('spark.hadoop.fs.s3a.endpoint', 'minio:9000')
        .config('spark.hadoop.fs.s3a.access.key', 'admin')
        .config('spark.hadoop.fs.s3a.secret.key', 'admin123')
        .config('spark.hadoop.fs.s3a.path.style.access', 'true')
        .config('spark.hadoop.fs.s3a.connection.ssl.enabled', 'false')
        .config('spark.hadoop.fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem')
        .getOrCreate()
)

In [159]:
df_apostas = spark.read.jdbc("jdbc:postgresql://postgres:5432/betalert"  , "apostas", properties=properties)
df_apostas.createOrReplaceTempView("apostas")

df_apostas = spark.sql("""
    SELECT
    aposta_id,
    apostador_id,
    jogo_id,
    ROUND(valor, 2) AS valor_aposta,
    ROUND(odd, 2) AS odd,
    timestamp AS hora_aposta,
    concat(date_format(timestamp, 'yyyy-MM-dd'), '_', apostador_id) AS chave
    FROM apostas
""")
df_apostas.createOrReplaceTempView("apostas")

df_apostas.printSchema()

root
 |-- aposta_id: string (nullable = true)
 |-- apostador_id: string (nullable = true)
 |-- jogo_id: string (nullable = true)
 |-- valor_aposta: decimal(23,2) (nullable = true)
 |-- odd: decimal(23,2) (nullable = true)
 |-- hora_aposta: timestamp (nullable = true)
 |-- chave: string (nullable = true)



### 2. Realize a leitura da tabela transacoes_financeiras e normalize o nome da coluna de valor.


In [157]:
df_transacoes = spark.read.jdbc("jdbc:postgresql://postgres:5432/betalert"  , "transacoes_financeiras", properties=properties)
df_transacoes.createOrReplaceTempView("transacoes")


df_transacoes = spark.sql("""
    SELECT
    id,
    apostador_id,
    ROUND(valor, 2) AS valor_transacao,
    tipo,
    data AS hora_deposito,
    concat(date_format(data, 'yyyy-MM-dd'), '_', apostador_id) AS chave
    FROM transacoes
""")

df_transacoes.createOrReplaceTempView("transacoes")
df_transacoes.show()

+---+------------+---------------+--------+-------------------+--------------+
| id|apostador_id|valor_transacao|    tipo|      hora_deposito|         chave|
+---+------------+---------------+--------+-------------------+--------------+
|  1|         u69|       14890.81|deposito|2025-01-01 10:00:00|2025-01-01_u69|
|  2|         u94|        5616.48|deposito|2025-01-01 10:01:00|2025-01-01_u94|
|  3|         u95|       16376.42|   saque|2025-01-01 10:02:00|2025-01-01_u95|
|  4|         u11|       15335.80|deposito|2025-01-01 10:03:00|2025-01-01_u11|
|  5|         u88|       19559.30|   saque|2025-01-01 10:04:00|2025-01-01_u88|
|  6|         u32|        4797.95|deposito|2025-01-01 10:05:00|2025-01-01_u32|
|  7|         u14|        1494.86|   saque|2025-01-01 10:06:00|2025-01-01_u14|
|  8|         u23|       15609.39|deposito|2025-01-01 10:07:00|2025-01-01_u23|
|  9|          u8|       14916.72|deposito|2025-01-01 10:08:00| 2025-01-01_u8|
| 10|         u92|       12104.17|   saque|2025-01-0

### 3. Faça o join entre apostas e o arquivo apostadores.csv do S3 para incluir o país e dados extras.

In [160]:
df_apostadores = spark.read.csv('s3a://betalogs/apostadores.csv', header=True)
df_apostadores.createOrReplaceTempView("apostadores")

apostas_full = spark.sql("""
    SELECT *
    FROM apostas a
    LEFT JOIN apostadores b 
    ON a.apostador_id = b.id
""")

apostas_full.createOrReplaceTempView("apostas_full")

apostas_full.show()

+---------+------------+-------+------------+----+-------------------+---------------+----+-----------+--------------------+----+
|aposta_id|apostador_id|jogo_id|valor_aposta| odd|        hora_aposta|          chave|  id|       nome|               email|pais|
+---------+------------+-------+------------+----+-------------------+---------------+----+-----------+--------------------+----+
| b61f4f08|         u79| jogo31|     1813.57|4.49|2025-01-05 15:42:00| 2025-01-05_u79| u79| Jogador 79|jogador79@exemplo...|  AR|
| 88c44f52|         u23| jogo33|      830.60|2.87|2025-01-05 09:54:00| 2025-01-05_u23| u23| Jogador 23|jogador23@exemplo...|  FR|
| 4c32051c|         u72| jogo72|     1829.92|2.09|2025-01-03 23:34:00| 2025-01-03_u72| u72| Jogador 72|jogador72@exemplo...|  ES|
| 1baa492c|         u70| jogo67|     1614.20|4.21|2025-01-04 16:50:00| 2025-01-04_u70| u70| Jogador 70|jogador70@exemplo...|  ES|
| 2820b7bb|         u40| jogo50|      989.73|2.98|2025-01-02 20:37:00| 2025-01-02_u40| u40

# 🔍 PARTE 2 - Detecção de Padrões

### 1. Identifique apostas-relâmpago, ou seja, apostas feitas até 10 segundos após depósitos.


In [161]:
from pyspark.sql import functions as F

depositos = spark.sql("""
    SELECT *
    FROM transacoes
    WHERE tipo = "deposito"
""")
depositos.createOrReplaceTempView("depositos")


apostas_relampago = spark.sql("""
    SELECT 
        a.*, 
        b.*,
        (unix_timestamp(b.hora_aposta) - unix_timestamp(a.hora_deposito)) AS diferenca_em_segundos
    FROM depositos a
    LEFT JOIN apostas_full b
    ON a.chave = b.chave
    WHERE (unix_timestamp(b.hora_aposta) - unix_timestamp(a.hora_deposito)) < 10 AND (unix_timestamp(b.hora_aposta) - unix_timestamp(a.hora_deposito)) >= 0
""")

apostas_relampago.createOrReplaceTempView("apostas_relampago")

apostas_relampago.show()

+---+------------+---------------+--------+-------------------+--------------+---------+------------+-------+------------+-----+-------------------+--------------+---+----------+--------------------+----+---------------------+
| id|apostador_id|valor_transacao|    tipo|      hora_deposito|         chave|aposta_id|apostador_id|jogo_id|valor_aposta|  odd|        hora_aposta|         chave| id|      nome|               email|pais|diferenca_em_segundos|
+---+------------+---------------+--------+-------------------+--------------+---------+------------+-------+------------+-----+-------------------+--------------+---+----------+--------------------+----+---------------------+
|503|         u41|       15000.00|deposito|2025-01-11 20:02:00|2025-01-11_u41| 066f9055|         u41| jogo71|    15000.00|16.46|2025-01-11 20:02:09|2025-01-11_u41|u41|Jogador 41|jogador41@exemplo...|  ES|                    9|
|502|         u33|       15000.00|deposito|2025-01-11 20:01:00|2025-01-11_u33| d6002cae|    

### 2. Exiba apostas-relâmpago com valor acima de R$500.

In [162]:
apostas_relampago_500 = spark.sql("""
    SELECT * FROM apostas_relampago
    WHERE valor_aposta > 500
""")

apostas_relampago_500.show()

+---+------------+---------------+--------+-------------------+--------------+---------+------------+-------+------------+-----+-------------------+--------------+---+----------+--------------------+----+---------------------+
| id|apostador_id|valor_transacao|    tipo|      hora_deposito|         chave|aposta_id|apostador_id|jogo_id|valor_aposta|  odd|        hora_aposta|         chave| id|      nome|               email|pais|diferenca_em_segundos|
+---+------------+---------------+--------+-------------------+--------------+---------+------------+-------+------------+-----+-------------------+--------------+---+----------+--------------------+----+---------------------+
|173|         u19|        8595.74|deposito|2025-01-01 12:52:00|2025-01-01_u19| df00489c|         u19| jogo95|      574.71| 2.60|2025-01-01 12:52:00|2025-01-01_u19|u19|Jogador 19|jogador19@exemplo...|  BR|                    0|
|501|         u17|       15000.00|deposito|2025-01-11 20:00:00|2025-01-11_u17| 5b0e994f|    

### 3. Exiba apostas-relâmpago com valor acima de R$10.000.

In [163]:
apostas_relampago_10k = spark.sql("""
    SELECT * FROM apostas_relampago
    WHERE valor_aposta > 10000
""")

apostas_relampago_10k.show()

+---+------------+---------------+--------+-------------------+--------------+---------+------------+-------+------------+-----+-------------------+--------------+---+----------+--------------------+----+---------------------+
| id|apostador_id|valor_transacao|    tipo|      hora_deposito|         chave|aposta_id|apostador_id|jogo_id|valor_aposta|  odd|        hora_aposta|         chave| id|      nome|               email|pais|diferenca_em_segundos|
+---+------------+---------------+--------+-------------------+--------------+---------+------------+-------+------------+-----+-------------------+--------------+---+----------+--------------------+----+---------------------+
|501|         u17|       15000.00|deposito|2025-01-11 20:00:00|2025-01-11_u17| 5b0e994f|         u17|  jogo8|    15000.00|15.32|2025-01-11 20:00:02|2025-01-11_u17|u17|Jogador 17|jogador17@exemplo...|  FR|                    2|
|502|         u33|       15000.00|deposito|2025-01-11 20:01:00|2025-01-11_u33| d6002cae|    

### 4. Detecte jogadores que realizaram 10 ou mais apostas em um mesmo jogo.

In [164]:

multiplas_apostas = spark.sql("""
    SELECT jogo_id, apostador_id, COUNT(apostador_id) as qtd_apostas
    FROM apostas
    GROUP BY jogo_id, apostador_id
    HAVING COUNT(apostador_id) >= 10
""")

multiplas_apostas.show()

+-------+------------+-----------+
|jogo_id|apostador_id|qtd_apostas|
+-------+------------+-----------+
| jogo77|         u88|         13|
+-------+------------+-----------+



### 5. Exiba o total e a média de valores apostados por país.

In [166]:
media_apostas = spark.sql("""
    SELECT pais,
    ROUND(SUM(valor),2) AS valor_total_apostas,
    ROUND(AVG(valor),2) AS valor_medio_apostas
    FROM apostasXapostadores
    GROUP BY pais
""")

media_apostas.show()

+----+-------------------+-------------------+
|pais|valor_total_apostas|valor_medio_apostas|
+----+-------------------+-------------------+
|  PT|          627445.73|            1012.01|
|  BR|         1129415.56|            1070.54|
|  ES|         1323581.35|            1075.21|
|  FR|         1265400.11|            1038.06|
|  AR|          945906.52|            1066.41|
+----+-------------------+-------------------+



# 📡 PARTE 3 - Streaming em Tempo Real (Kafka + Spark Structured Streaming)

### 1. Consuma o tópico Kafka stream_apostas e transforme os dados conforme o schema.

In [14]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, expr
from pyspark.sql.types import *
import time 

    
try:
    spark.stop()
except:
    pass
    
    
spark = (
    SparkSession.builder
        .appName("TP4")
        .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0,org.postgresql:postgresql:42.2.24,com.amazonaws:aws-java-sdk-bundle:1.12.262,org.apache.hadoop:hadoop-aws:3.3.4")
        .config('spark.hadoop.fs.s3a.endpoint', 'minio:9000')
        .config('spark.hadoop.fs.s3a.access.key', 'admin')
        .config('spark.hadoop.fs.s3a.secret.key', 'admin123')
        .config('spark.hadoop.fs.s3a.path.style.access', 'true')
        .config('spark.hadoop.fs.s3a.connection.ssl.enabled', 'false')
        .config('spark.hadoop.fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem')
        .getOrCreate()
)


schema = StructType([
    StructField("aposta_id", StringType()),
    StructField("apostador_id", StringType()),
    StructField("jogo_id", StringType()),
    StructField("valor", DoubleType()),    
    StructField("odd", FloatType()),        
    StructField("timestamp", TimestampType())
])

df_raw = (
    spark.readStream
        .format("kafka")
        .option("kafka.bootstrap.servers", "kafka1:9092")
        .option("subscribe", "stream_apostas")
        .load()
)

In [18]:
df_json = df_raw.selectExpr("CAST (value as STRING) as json") \
        .select(from_json(col("json"), schema).alias("data")) \
        .select("data.*")

In [5]:
query = df_json.writeStream \
    .outputMode("append") \
    .format("console") \
    .start()

time.sleep(3)

query.stop()
query.awaitTermination() 

25/06/04 03:09:03 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-dcdac508-28a9-43ee-a7e1-b80e63230afc. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/06/04 03:09:03 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/06/04 03:09:03 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.


-------------------------------------------
Batch: 0
-------------------------------------------
+---------+------------+-------+-----+---+---------+
|aposta_id|apostador_id|jogo_id|valor|odd|timestamp|
+---------+------------+-------+-----+---+---------+
+---------+------------+-------+-----+---+---------+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+---------+------------+-------+-------+----+--------------------+
|aposta_id|apostador_id|jogo_id|  valor| odd|           timestamp|
+---------+------------+-------+-------+----+--------------------+
| c534f654|          u4| jogo67|3690.78|3.38|2025-06-04 03:09:...|
| ea4d904d|         u89| jogo89| 3430.4|6.28|2025-06-04 03:09:...|
| 29075068|         u38| jogo55|4669.07|6.87|2025-06-04 03:09:...|
| df3a5fc5|         u49| jogo82|3897.22|3.57|2025-06-04 03:09:...|
| b4837ba2|         u82| jogo98|4115.12|6.55|2025-06-04 03:09:...|
| d43a9488|         u27| jogo94|4195.53|3.48|2025-06-04 03:09:...|
| dfa666a1|         u97| jogo10|3776.34|5.83|2025-06-04 03:09:...|
| 7eb0ccbb|         u80| jogo45|4089.89|8.07|2025-06-04 03:09:...|
| 5579fafe|         u34| jogo87|4065.66|4.48|2025-06-04 03:09:...|
+---------+------------+-------+-------+----+--------------------+

-------------------------------

### 2. Enriqueça o stream com os dados de apostadores do S3.

In [12]:
df_apostadores = spark.read.csv('s3a://betalogs/apostadores.csv', header=True)
df_join = df_json.join(df_apostadores, df_json.apostador_id == df_apostadores.id, "inner")

query2 = df_join.writeStream \
    .outputMode("append") \
    .format("console") \
    .start()

time.sleep(3)

query2.stop()
query2.awaitTermination() 

25/06/04 03:27:26 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-306ca8c9-d277-434d-9589-4cad9108e820. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/06/04 03:27:26 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/06/04 03:27:26 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.


-------------------------------------------
Batch: 0
-------------------------------------------
+---------+------------+-------+-----+---+---------+---+----+-----+----+
|aposta_id|apostador_id|jogo_id|valor|odd|timestamp| id|nome|email|pais|
+---------+------------+-------+-----+---+---------+---+----+-----+----+
+---------+------------+-------+-----+---+---------+---+----+-----+----+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+---------+------------+-------+-------+----+--------------------+---+----------+--------------------+----+
|aposta_id|apostador_id|jogo_id|  valor| odd|           timestamp| id|      nome|               email|pais|
+---------+------------+-------+-------+----+--------------------+---+----------+--------------------+----+
| 04185d59|         u58| jogo39|3899.83|3.62|2025-06-04 03:27:...|u58|Jogador 58|jogador58@exemplo...|  BR|
| eb51fc23|         u56| jogo90|3440.01|8.93|2025-06-04 03:27:...|u56|Jogador 56|jogador56@exemplo...|  ES|
| 2f43b8d8|         u41| jogo55|2196.31|5.54|2025-06-04 03:27:...|u41|Jogador 41|jogador41@exemplo...|  ES|
| 60310221|         u66| jogo71|3513.35|1.88|2025-06-04 03:27:...|u66|Jogador 66|jogador66@exemplo...|  AR|
| 97622c3d|         u96| jogo62|3024.61|1.94|2025-06-04 03:27:...|u96|Jogador 96|jogador96@exemplo...|  PT|
| 57a2483d|         u58|  jogo5|3223.13

25/06/04 03:27:29 ERROR WriteToDataSourceV2Exec: Data source write support MicroBatchWrite[epoch: 2, writer: ConsoleWriter[numRows=20, truncate=true]] is aborting.
25/06/04 03:27:29 ERROR WriteToDataSourceV2Exec: Data source write support MicroBatchWrite[epoch: 2, writer: ConsoleWriter[numRows=20, truncate=true]] aborted.
25/06/04 03:27:29 ERROR Utils: Aborting task
org.apache.spark.TaskKilledException
	at org.apache.spark.TaskContextImpl.killTaskIfInterrupted(TaskContextImpl.scala:267)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:36)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:513)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.a

### 3. Crie uma coluna booleana suspeita para apostas com valor > R$12.000 e odd > 15.

In [6]:
apostas_suspeitas = df_json.withColumn("suspeita", expr("valor > 12000 AND odd > 15"))

query = apostas_suspeitas.writeStream \
    .outputMode("append") \
    .format("console") \
    .start()

time.sleep(3)

query.stop()
query.awaitTermination() 

25/06/04 03:09:25 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-1f2d281b-e394-4cca-8444-c79269f4a82e. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/06/04 03:09:25 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/06/04 03:09:25 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.


-------------------------------------------
Batch: 0
-------------------------------------------
+---------+------------+-------+-----+---+---------+--------+
|aposta_id|apostador_id|jogo_id|valor|odd|timestamp|suspeita|
+---------+------------+-------+-----+---+---------+--------+
+---------+------------+-------+-----+---+---------+--------+

-------------------------------------------
Batch: 1
-------------------------------------------
+---------+------------+-------+-------+----+--------------------+--------+
|aposta_id|apostador_id|jogo_id|  valor| odd|           timestamp|suspeita|
+---------+------------+-------+-------+----+--------------------+--------+
| 05298021|         u67| jogo73| 947.27|7.04|2025-06-04 03:09:...|   false|
| 2a4d098d|         u52| jogo50| 249.58|3.66|2025-06-04 03:09:...|   false|
| fbe83e84|         u83| jogo93|1060.97|5.65|2025-06-04 03:09:...|   false|
| 290c51b9|         u90| jogo46|2313.95| 3.7|2025-06-04 03:09:...|   false|
| 3a479b2d|         u94| 

25/06/04 03:09:28 ERROR WriteToDataSourceV2Exec: Data source write support MicroBatchWrite[epoch: 4, writer: ConsoleWriter[numRows=20, truncate=true]] is aborting.
25/06/04 03:09:28 ERROR WriteToDataSourceV2Exec: Data source write support MicroBatchWrite[epoch: 4, writer: ConsoleWriter[numRows=20, truncate=true]] aborted.
25/06/04 03:09:29 ERROR Utils: Aborting task
org.apache.spark.TaskKilledException
	at org.apache.spark.TaskContextImpl.killTaskIfInterrupted(TaskContextImpl.scala:267)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:36)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeSt

### 4. A cada batch de 1 minuto, acumule:
- ####  Média de valor por país.
- ####  Contagem de apostas suspeitas por país.
- ####  Volume total de apostas.
- ####  Contagem de apostas suspeitas por apostador.

In [25]:
from pyspark.sql.functions import window, avg

query = (
    df_json
        .withWatermark("timestamp", "2 minutes")
        .groupBy(window("timestamp", "1 minute"),"jogo_id","pais")
        .agg(
            avg("valor").alias("media_valor"),
            .count(when(col("suspeita") == "sim", True)).alias("contagem_suspeitas"),
            sum("valor").alias("volume_total")
            )
        .writeStream
        .outputMode("complete")
        .format("console")
        .start()
    
)


time.sleep(8)

query.stop()
query.awaitTermination() 

25/06/04 22:57:13 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-87bc1b59-2384-489b-8118-7e302ab394bc. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/06/04 22:57:13 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/06/04 22:57:13 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+------+-------+-----------+
|window|jogo_id|media_valor|
+------+-------+-----------+
+------+-------+-----------+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+--------------------+-------+------------------+
|              window|jogo_id|       media_valor|
+--------------------+-------+------------------+
|{2025-06-04 22:57...| jogo82|           1301.44|
|{2025-06-04 22:57...| jogo43|            4022.9|
|{2025-06-04 22:57...| jogo56|           1841.84|
|{2025-06-04 22:57...| jogo46|           4385.14|
|{2025-06-04 22:57...| jogo80|           2890.51|
|{2025-06-04 22:57...| jogo39|           1275.68|
|{2025-06-04 22:57...| jogo95|           3269.94|
|{2025-06-04 22:57...| jogo13|           2508.33|
|{2025-06-04 22:57...| jogo52|            3288.9|
|{2025-06-04 22:57...| jogo51|            694.62|
|{2025-06-04 22:57...| jogo34|           4297.94|
|{2025-06-04 22:57...| jogo63|           1557.71|
|{2025-06-04 22:57...| jogo77|           3167.69|
|{2025-06-04 22:57...| jogo12|2918.7700000000004|
|{2025-06-04 22:57...| jogo16|            1974.8|
|{2

25/06/04 22:57:21 ERROR WriteToDataSourceV2Exec: Data source write support MicroBatchWrite[epoch: 2, writer: ConsoleWriter[numRows=20, truncate=true]] is aborting.
25/06/04 22:57:21 ERROR WriteToDataSourceV2Exec: Data source write support MicroBatchWrite[epoch: 2, writer: ConsoleWriter[numRows=20, truncate=true]] aborted.
25/06/04 22:57:21 WARN TaskSetManager: Lost task 14.0 in stage 159.0 (TID 15471) (8e63186473ca executor driver): TaskKilled (Stage cancelled: Job 79 cancelled part of cancelled job group d92daf3d-52b0-4845-ae92-acc2eb0a0ac0)
25/06/04 22:57:21 WARN TaskSetManager: Lost task 18.0 in stage 159.0 (TID 15475) (8e63186473ca executor driver): TaskKilled (Stage cancelled: Job 79 cancelled part of cancelled job group d92daf3d-52b0-4845-ae92-acc2eb0a0ac0)
25/06/04 22:57:21 WARN Shell: Interrupted while joining on: Thread[Thread-163288,5,main]
java.lang.InterruptedException
	at java.base/java.lang.Object.wait(Native Method)
	at java.base/java.lang.Thread.join(Thread.java:1313)
	

### 5. Após += 15 minutos, exiba o consolidado de todos os dados acumulados.

In [None]:
from pyspark.sql.functions import window, avg

query = (
    df_json
        .withWatermark("timestamp", "15 minutes")
        .groupBy(window("timestamp", "15 minutes"),"jogo_id")
        .agg(avg("valor").alias("media_valor"))
        .writeStream
        .outputMode("complete")
        .format("console")
        .start()
    
)

query.awaitTermination() 