Apprendre spark

In [1]:
from pyspark.sql import SparkSession
import logging

# Reduce logging level
logging.getLogger("pyspark").setLevel(logging.ERROR)
logging.getLogger("py4j").setLevel(logging.ERROR)

# Create Spark session with quieter configuration
spark = SparkSession.builder \
    .appName("MyApp") \
    .config("spark.sql.adaptive.enabled", "true") \
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
    .getOrCreate()

# Set Spark's log level to ERROR (shows only errors)
spark.sparkContext.setLogLevel("ERROR")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/22 04:38:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Create DataFrame

In [3]:
# From a list of dictionaries
data = [("Alice", 25), ("Bob", 30), ("Charlie", 35)]
columns = ["name", "age"]
df_manuel = spark.createDataFrame(data, columns)

# From a CSV file
df_csv = spark.read.csv("input/file.csv", header=True, inferSchema=True)

# From JSON
df_json = spark.read.json("input/file.json")

In [4]:
df_manuel

DataFrame[name: string, age: bigint]

In [5]:
df_csv

DataFrame[id: int, name: string, age: int, department: string, salary: int, hire_date: date]

In [6]:
df = df_csv

Import de base

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, count, max, min, year, current_date, datediff
from pyspark.sql.types import *

In [8]:

# Initialiser Spark

print("Données originales :")
df.show()

print("Schéma :")
df.printSchema()

# 2. OPÉRATIONS DE BASE
print("\n=== OPÉRATIONS DE BASE ===")

print("Nombre total d'employés :")
print(df.count())

print("\nPremières 5 lignes :")
df.show(5)

print("\nColonnes spécifiques :")
df.select("name", "department", "salary").show()

Données originales :
+---+--------------+---+-----------+------+----------+
| id|          name|age| department|salary| hire_date|
+---+--------------+---+-----------+------+----------+
|  1| Alice Johnson| 28|Engineering| 75000|2020-03-15|
|  2|     Bob Smith| 34|  Marketing| 65000|2019-07-22|
|  3| Charlie Brown| 41|Engineering| 95000|2018-01-10|
|  4|  Diana Prince| 29|         HR| 58000|2021-05-03|
|  5| Edward Wilson| 52|    Finance| 82000|2017-11-28|
|  6|   Fiona Davis| 26|  Marketing| 52000|2022-02-14|
|  7| George Miller| 38|Engineering| 88000|2019-09-05|
|  8|  Helen Garcia| 45|    Finance| 76000|2018-06-18|
|  9|Ivan Rodriguez| 31|         HR| 61000|2020-12-01|
| 10|Julia Anderson| 27|Engineering| 72000|2021-08-20|
+---+--------------+---+-----------+------+----------+

Schéma :
root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- h

In [7]:
# 3. FILTRAGE
print("\n=== FILTRAGE ===")

print("Employés avec salaire > 70000 :")
df.filter(col("salary") > 70000).show()

print("Employés du département Engineering :")
df.filter(col("department") == "Engineering").show()

print("Employés de moins de 30 ans :")
df.filter(col("age") < 30).show()


=== FILTRAGE ===
Employés avec salaire > 70000 :
+---+--------------+---+-----------+------+----------+
| id|          name|age| department|salary| hire_date|
+---+--------------+---+-----------+------+----------+
|  1| Alice Johnson| 28|Engineering| 75000|2020-03-15|
|  3| Charlie Brown| 41|Engineering| 95000|2018-01-10|
|  5| Edward Wilson| 52|    Finance| 82000|2017-11-28|
|  7| George Miller| 38|Engineering| 88000|2019-09-05|
|  8|  Helen Garcia| 45|    Finance| 76000|2018-06-18|
| 10|Julia Anderson| 27|Engineering| 72000|2021-08-20|
+---+--------------+---+-----------+------+----------+

Employés du département Engineering :
+---+--------------+---+-----------+------+----------+
| id|          name|age| department|salary| hire_date|
+---+--------------+---+-----------+------+----------+
|  1| Alice Johnson| 28|Engineering| 75000|2020-03-15|
|  3| Charlie Brown| 41|Engineering| 95000|2018-01-10|
|  7| George Miller| 38|Engineering| 88000|2019-09-05|
| 10|Julia Anderson| 27|Enginee

In [8]:
# 4. AGRÉGATIONS
print("\n=== AGRÉGATIONS ===")

print("Statistiques générales des salaires :")
df.agg(
    min("salary").alias("salaire_min"),
    max("salary").alias("salaire_max"),
    avg("salary").alias("salaire_moyen")
).show()

print("Nombre d'employés par département :")
df.groupBy("department").count().orderBy("count", ascending=False).show()

print("Salaire moyen par département :")
df.groupBy("department") \
  .agg(avg("salary").alias("salaire_moyen")) \
  .orderBy("salaire_moyen", ascending=False) \
  .show()

print("Age moyen par département :")
df.groupBy("department") \
  .agg(avg("age").alias("age_moyen")) \
  .show()



=== AGRÉGATIONS ===
Statistiques générales des salaires :
+-----------+-----------+-------------+
|salaire_min|salaire_max|salaire_moyen|
+-----------+-----------+-------------+
|      52000|      95000|      72400.0|
+-----------+-----------+-------------+

Nombre d'employés par département :
+-----------+-----+
| department|count|
+-----------+-----+
|Engineering|    4|
|         HR|    2|
|    Finance|    2|
|  Marketing|    2|
+-----------+-----+

Salaire moyen par département :
+-----------+-------------+
| department|salaire_moyen|
+-----------+-------------+
|Engineering|      82500.0|
|    Finance|      79000.0|
|         HR|      59500.0|
|  Marketing|      58500.0|
+-----------+-------------+

Age moyen par département :
+-----------+---------+
| department|age_moyen|
+-----------+---------+
|Engineering|     33.5|
|         HR|     30.0|
|    Finance|     48.5|
|  Marketing|     30.0|
+-----------+---------+



In [10]:
# 5. TRANSFORMATIONS AVANCÉES
print("\n=== TRANSFORMATIONS AVANCÉES ===")

# Ajouter une colonne catégorie d'âge
from pyspark.sql.functions import when

df_with_category = df.withColumn(
    "age_category",
    when(col("age") < 30, "Jeune")
    .when(col("age") < 40, "Moyen")
    .otherwise("Senior")
)

print("Avec catégorie d'âge :")
df_with_category.select("name", "age", "age_category").show()

# Calculer l'ancienneté - VERSION SIMPLIFIÉE
df_with_tenure = df.withColumn(
    "tenure_years",
    (datediff(current_date(), col("hire_date")) / 365).cast("int")
)

print("Avec ancienneté :")
df_with_tenure.select("name", "hire_date", "tenure_years") \
               .orderBy("tenure_years", ascending=False) \
               .show()  # Pas de point ici !



=== TRANSFORMATIONS AVANCÉES ===
Avec catégorie d'âge :
+--------------+---+------------+
|          name|age|age_category|
+--------------+---+------------+
| Alice Johnson| 28|       Jeune|
|     Bob Smith| 34|       Moyen|
| Charlie Brown| 41|      Senior|
|  Diana Prince| 29|       Jeune|
| Edward Wilson| 52|      Senior|
|   Fiona Davis| 26|       Jeune|
| George Miller| 38|       Moyen|
|  Helen Garcia| 45|      Senior|
|Ivan Rodriguez| 31|       Moyen|
|Julia Anderson| 27|       Jeune|
+--------------+---+------------+

Avec ancienneté :
+--------------+----------+------------+
|          name| hire_date|tenure_years|
+--------------+----------+------------+
| Charlie Brown|2018-01-10|           7|
| Edward Wilson|2017-11-28|           7|
|  Helen Garcia|2018-06-18|           7|
|     Bob Smith|2019-07-22|           6|
| George Miller|2019-09-05|           6|
| Alice Johnson|2020-03-15|           5|
|  Diana Prince|2021-05-03|           4|
|Ivan Rodriguez|2020-12-01|           

In [11]:
# 6. ANALYSES SPÉCIFIQUES
print("\n=== ANALYSES SPÉCIFIQUES ===")

print("Top 3 des salaires les plus élevés :")
df.select("name", "department", "salary") \
  .orderBy("salary", ascending=False) \
  .limit(3) \
  .show()

print("Employés embauchés en 2020 ou après :")
df.filter(year("hire_date") >= 2020) \
  .select("name", "department", "hire_date") \
  .orderBy("hire_date") \
  .show()


=== ANALYSES SPÉCIFIQUES ===
Top 3 des salaires les plus élevés :
+-------------+-----------+------+
|         name| department|salary|
+-------------+-----------+------+
|Charlie Brown|Engineering| 95000|
|George Miller|Engineering| 88000|
|Edward Wilson|    Finance| 82000|
+-------------+-----------+------+

Employés embauchés en 2020 ou après :
+--------------+-----------+----------+
|          name| department| hire_date|
+--------------+-----------+----------+
| Alice Johnson|Engineering|2020-03-15|
|Ivan Rodriguez|         HR|2020-12-01|
|  Diana Prince|         HR|2021-05-03|
|Julia Anderson|Engineering|2021-08-20|
|   Fiona Davis|  Marketing|2022-02-14|
+--------------+-----------+----------+



In [12]:
# 7. RENOMMER LES COLONNES
print("\n=== RENOMMAGE DES COLONNES ===")

df_french = df.withColumnRenamed("name", "nom") \
              .withColumnRenamed("age", "age") \
              .withColumnRenamed("department", "departement") \
              .withColumnRenamed("salary", "salaire") \
              .withColumnRenamed("hire_date", "date_embauche")

print("DataFrame avec noms français :")
df_french.show()


=== RENOMMAGE DES COLONNES ===
DataFrame avec noms français :
+---+--------------+---+-----------+-------+-------------+
| id|           nom|age|departement|salaire|date_embauche|
+---+--------------+---+-----------+-------+-------------+
|  1| Alice Johnson| 28|Engineering|  75000|   2020-03-15|
|  2|     Bob Smith| 34|  Marketing|  65000|   2019-07-22|
|  3| Charlie Brown| 41|Engineering|  95000|   2018-01-10|
|  4|  Diana Prince| 29|         HR|  58000|   2021-05-03|
|  5| Edward Wilson| 52|    Finance|  82000|   2017-11-28|
|  6|   Fiona Davis| 26|  Marketing|  52000|   2022-02-14|
|  7| George Miller| 38|Engineering|  88000|   2019-09-05|
|  8|  Helen Garcia| 45|    Finance|  76000|   2018-06-18|
|  9|Ivan Rodriguez| 31|         HR|  61000|   2020-12-01|
| 10|Julia Anderson| 27|Engineering|  72000|   2021-08-20|
+---+--------------+---+-----------+-------+-------------+



In [None]:











# 8. STATISTIQUES PAR DÉPARTEMENT
print("\n=== STATISTIQUES DÉTAILLÉES PAR DÉPARTEMENT ===")

dept_stats = df.groupBy("department").agg(
    count("*").alias("nombre_employes"),
    avg("salary").alias("salaire_moyen"),
    min("salary").alias("salaire_min"),
    max("salary").alias("salaire_max"),
    avg("age").alias("age_moyen")
)

dept_stats.show()

# Fermer Spark
# spark.stop()

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/workspaces/Daily/.venv/lib/python3.11/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/Daily/.venv/lib/python3.11/site-packages/py4j/clientserver.py", line 535, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/socket.py", line 718, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


Données originales :
+---+--------------+---+-----------+------+----------+
| id|          name|age| department|salary| hire_date|
+---+--------------+---+-----------+------+----------+
|  1| Alice Johnson| 28|Engineering| 75000|2020-03-15|
|  2|     Bob Smith| 34|  Marketing| 65000|2019-07-22|
|  3| Charlie Brown| 41|Engineering| 95000|2018-01-10|
|  4|  Diana Prince| 29|         HR| 58000|2021-05-03|
|  5| Edward Wilson| 52|    Finance| 82000|2017-11-28|
|  6|   Fiona Davis| 26|  Marketing| 52000|2022-02-14|
|  7| George Miller| 38|Engineering| 88000|2019-09-05|
|  8|  Helen Garcia| 45|    Finance| 76000|2018-06-18|
|  9|Ivan Rodriguez| 31|         HR| 61000|2020-12-01|
| 10|Julia Anderson| 27|Engineering| 72000|2021-08-20|
+---+--------------+---+-----------+------+----------+

Schéma :
root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- h

KeyboardInterrupt: 

In [11]:
# Show the data
df.show()

# Display schema
df.printSchema()

# Select columns
df.select("name", "age").show()

# Filter rows
df.filter(df.age > 25).show()



+---+--------------+---+-----------+------+----------+
| id|          name|age| department|salary| hire_date|
+---+--------------+---+-----------+------+----------+
|  1| Alice Johnson| 28|Engineering| 75000|2020-03-15|
|  2|     Bob Smith| 34|  Marketing| 65000|2019-07-22|
|  3| Charlie Brown| 41|Engineering| 95000|2018-01-10|
|  4|  Diana Prince| 29|         HR| 58000|2021-05-03|
|  5| Edward Wilson| 52|    Finance| 82000|2017-11-28|
|  6|   Fiona Davis| 26|  Marketing| 52000|2022-02-14|
|  7| George Miller| 38|Engineering| 88000|2019-09-05|
|  8|  Helen Garcia| 45|    Finance| 76000|2018-06-18|
|  9|Ivan Rodriguez| 31|         HR| 61000|2020-12-01|
| 10|Julia Anderson| 27|Engineering| 72000|2021-08-20|
+---+--------------+---+-----------+------+----------+

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- hire_date: date (nullable = tru

In [13]:
# Add a new column
from pyspark.sql.functions import col, when
df_with_category = df.withColumn("category", 
    when(col("age") < 30, "young").otherwise("adult"))

# Group by and aggregate
df.groupBy("category").count().show()

{"ts": "2025-09-22 04:30:55.498", "level": "ERROR", "logger": "DataFrameQueryContextLogger", "msg": "[UNRESOLVED_COLUMN.WITH_SUGGESTION] A column, variable, or function parameter with name `category` cannot be resolved. Did you mean one of the following? [`age`, `name`, `salary`, `id`, `department`]. SQLSTATE: 42703", "context": {"file": "java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)", "line": "", "fragment": "col", "errorClass": "UNRESOLVED_COLUMN.WITH_SUGGESTION"}, "exception": {"class": "Py4JJavaError", "msg": "An error occurred while calling o140.count.\n: org.apache.spark.sql.AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column, variable, or function parameter with name `category` cannot be resolved. Did you mean one of the following? [`age`, `name`, `salary`, `id`, `department`]. SQLSTATE: 42703;\n'Aggregate ['category], ['category, count(1) AS count#194L]\n+- Relation [id#56,name#57,age#58,department#59,salary#60,hire_date#61] csv\n\n\

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column, variable, or function parameter with name `category` cannot be resolved. Did you mean one of the following? [`age`, `name`, `salary`, `id`, `department`]. SQLSTATE: 42703;
'Aggregate ['category], ['category, count(1) AS count#194L]
+- Relation [id#56,name#57,age#58,department#59,salary#60,hire_date#61] csv
