In [1]:
import socket

def check_port(host, port):
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.settimeout(5)  # Timeout de 5 secondes
    result = sock.connect_ex((host, port))
    if result == 0:
        print(f"Connexion réussie à {host}:{port}")
    else:
        print(f"Impossible de se connecter à {host}:{port}")
    sock.close()

check_port("kafka1", 9092)


Connexion réussie à kafka1:9092


In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext

conf = SparkConf() \
    .setAppName('SparkApp') \
    .setMaster('spark://spark:7077') \
    .set("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.4,org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.3") \
    .set("spark.sql.shuffle.partitions", "10")
 

sc = SparkContext.getOrCreate(conf=conf)


# Créer un SQLContext pour les opérations SQL
sql_context = SQLContext(sc)

:: loading settings :: url = jar:file:/opt/conda/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-1ac147ce-b9d2-4a70-92dd-b132009b63dc;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.3 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.3 in central
	found org.apache.kafka#kafka-clients;3.4.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.5 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	

## 1. Requête Batch sans fenêtre

Comptage des arrêts par ligne à partir du topic « arrets »

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, count
from pyspark.sql.types import StructType, StructField, StringType, FloatType

kafka_broker = "kafka1:9092"
kafka_topic = "arrets"

# Définition du schéma pour le topic "arrets"
schema_arrets = StructType([
    StructField("codeLieu", StringType(), True),
    StructField("libelle", StringType(), True),
    StructField("distance", FloatType(), True),
    StructField("ligne", StringType(), True)
])

# Lecture brute des messages Kafka
df_kafka = sql_context.read \
    .format("kafka") \
    .option("kafka.bootstrap.servers", kafka_broker) \
    .option("subscribe", kafka_topic) \
    .option("startingOffsets", "earliest") \
    .load()

# Conversion du champ "value" (encodé en JSON) en colonnes structurées
df_arrets = df_kafka.selectExpr("CAST(value AS STRING) AS message") \
    .select(from_json(col("message"), schema_arrets).alias("data")) \
    .select("data.*")

# Agrégation : compter le nombre d'arrêts par "ligne"
result = df_arrets.groupBy("ligne").agg(count("*").alias("nb_arrets"))

# Affichage du résultat
result.show()
result.describe().show()

25/02/16 16:37:33 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
25/02/16 16:37:34 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.


+--------------------+---------+
|               ligne|nb_arrets|
+--------------------+---------+
| [{"numLigne":"79"}]|        6|
|[{"numLigne":"127...|        2|
|[{"numLigne":"119...|        7|
| [{"numLigne":"80"}]|       10|
|[{"numLigne":"172...|        2|
|[{"numLigne":"75"...|        6|
|[{"numLigne":"30"...|        7|
| [{"numLigne":"91"}]|       14|
|[{"numLigne":"11"...|        4|
| [{"numLigne":"C3"}]|       12|
|[{"numLigne":"101...|        2|
| [{"numLigne":"30"}]|       12|
|[{"numLigne":"112...|        1|
|[{"numLigne":"1"}...|        1|
|[{"numLigne":"179...|        2|
|[{"numLigne":"107...|        3|
|[{"numLigne":"1"}...|        1|
|[{"numLigne":"127...|        2|
|[{"numLigne":"50"...|        4|
|[{"numLigne":"105...|        4|
+--------------------+---------+
only showing top 20 rows





+-------+--------------------+-----------------+
|summary|               ligne|        nb_arrets|
+-------+--------------------+-----------------+
|  count|                 434|              434|
|   mean|                NULL|2.686635944700461|
| stddev|                NULL|3.224968441931372|
|    min|                  []|                1|
|    max|[{"numLigne":"NBI"}]|               28|
+-------+--------------------+-----------------+



                                                                                

## Requête Batch avec fenêtre

Vélos disponibles par contrat sur des fenêtres temporelles à partir du topic « velos ».
Malheuresement on a qu'une donnée par station donc ca permet pas de faire de l'aggrégation comme la moyenne de vélo sur la période..... faut essayer de trouver des données plus pertinentes.

In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, window, avg, to_timestamp, count, min, max, last
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

kafka_broker = "kafka1:9092"
kafka_topic = "velos"

# Définition du schéma pour le champ "position"
position_schema = StructType([
    StructField("lon", DoubleType(), True),
    StructField("lat", DoubleType(), True)
])

# Définition du schéma pour le topic "velos"
schema_velos = StructType([
    StructField("number", StringType(), True),
    StructField("name", StringType(), True),
    StructField("address", StringType(), True),
    StructField("position", position_schema, True),
    StructField("banking", StringType(), True),
    StructField("bonus", StringType(), True),
    StructField("status", StringType(), True),
    StructField("contract_name", StringType(), True),
    StructField("bike_stands", IntegerType(), True),
    StructField("available_bike_stands", IntegerType(), True),
    StructField("available_bikes", IntegerType(), True),
    StructField("last_update", StringType(), True)
])

# Lecture brute des messages Kafka
df_kafka = sql_context.read \
    .format("kafka") \
    .option("kafka.bootstrap.servers", kafka_broker) \
    .option("subscribe", kafka_topic) \
    .option("startingOffsets", "earliest") \
    .load()

# Conversion du champ "value" en JSON
df_velos = df_kafka.selectExpr("CAST(value AS STRING) AS message") \
    .select(from_json(col("message"), schema_velos).alias("data")) \
    .select("data.*")

# Conversion de "last_update" en timestamp (format ISO 8601)
df_velos = df_velos.withColumn("last_update_ts", 
                               to_timestamp(col("last_update"), "yyyy-MM-dd'T'HH:mm:ssXXX"))

# Agrégation par fenêtre (ici, on allonge la fenêtre à 30 minutes pour accumuler plusieurs mises à jour) et par station
result_window = df_velos.groupBy(
    window(col("last_update_ts"), "30 minutes"), 
    col("number"),
    col("contract_name")
).agg(
    count("*").alias("nb_updates"),
    last("available_bikes", ignorenulls=True).alias("latest_available_bikes")
)

result_window.show()

25/02/16 17:02:28 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.

+--------------------+------+-------------+----------+----------------------+
|              window|number|contract_name|nb_updates|latest_available_bikes|
+--------------------+------+-------------+----------+----------------------+
|{2025-02-16 15:30...|    15|       nantes|         1|                    14|
|{2025-02-16 15:30...|    20|       nantes|         1|                     9|
|{2025-02-16 15:30...|    50|       nantes|         1|                     7|
|{2025-02-16 15:30...|     1|       nantes|         1|                    23|
|{2025-02-16 15:30...|    44|       nantes|         1|                     9|
|{2025-02-16 15:30...|    72|       nantes|         1|                     6|
|{2025-02-16 15:30...|    52|       nantes|         1|                     1|
|{2025-02-16 15:30...|    96|       nantes|         1|                     6|
|{2025-02-16 15:30...|   124|       nantes|         1|                     1|
|{2025-02-16 15:30...|    17|       nantes|         1|          

                                                                                

## Requêtes Streaming

In [None]:
# Lecture en streaming des messages Kafka
df_kafka_stream = sql_context.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", kafka_broker) \
    .option("subscribe", kafka_topic) \
    .option("startingOffsets", "earliest") \
    .load()

# Conversion du champ "value" en chaîne et parsing du JSON selon le schéma
df_velos_stream = df_kafka_stream.selectExpr("CAST(value AS STRING) AS message") \
    .select(from_json(col("message"), schema_velos).alias("data")) \
    .select("data.*")

# Conversion du champ "last_update" en timestamp avec le format ISO 8601
df_velos_stream = df_velos_stream.withColumn("last_update_ts", 
                                               to_timestamp(col("last_update"), "yyyy-MM-dd'T'HH:mm:ssXXX"))

# Agrégation en mode streaming : moyenne des vélos disponibles par contrat sur des fenêtres de 5 minutes
streaming_result = df_velos_stream.groupBy(
    window(col("last_update_ts"), "10 seconds"), 
    col("number"),
    col("contract_name")
).agg(
    count("*").alias("nb_updates"),
    last("available_bikes", ignorenulls=True).alias("latest_available_bikes")
)

# Ecriture du résultat en console (mode streaming)
query = streaming_result.writeStream \
    .outputMode("complete") \
    .format("console") \
    .option("truncate", "false") \
    .start()

query.awaitTermination()

25/02/16 17:16:22 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-ac2ee7fe-255a-4280-996c-8f089c4b77ae. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/02/16 17:16:22 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/02/16 17:16:22 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+------------------------------------------+------+-------------+----------+----------------------+
|window                                    |number|contract_name|nb_updates|latest_available_bikes|
+------------------------------------------+------+-------------+----------+----------------------+
|{2025-02-16 15:45:30, 2025-02-16 15:45:40}|73    |nantes       |1         |1                     |
|{2025-02-16 15:45:40, 2025-02-16 15:45:50}|114   |nantes       |1         |9                     |
|{2025-02-16 15:43:50, 2025-02-16 15:44:00}|97    |nantes       |1         |0                     |
|{2025-02-16 15:45:40, 2025-02-16 15:45:50}|6     |nantes       |1         |4                     |
|{2025-02-16 15:38:00, 2025-02-16 15:38:10}|40    |nantes       |1         |6                     |
|{2025-02-16 15:39:30, 2025-02-16 15:39:40}|80    |nantes       |1         |7                     |
|{2

In [16]:
query.stop()