In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, when, col, round, abs
spark = SparkSession.builder \
    .appName("OpenFoodFactsDeepCleaning") \
    .getOrCreate()

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
df = spark.read.csv("/content/drive/My Drive/Colab Notebooks/dataset_nutrition.csv", sep=',', header=True, inferSchema=True)
print(df.count())

3809945


In [None]:
print("le nombre des valeures nulls dans chaque colonne d'un dataset: ")
null_count= df.select([count(when(col(c).isNull(),c).cast("int")).alias(c) for c in df.columns])
null_count.show()

le nombre des valeures nulls dans chaque colonne d'un dataset: 
+----+------------+------+-------------+---------+------+----------------+--------+---------+------+-----------+----------+--------+------------------+--------------+----------------+------------------+-----------+-----------------+------------+------------+-------------+--------------+------------+------------+----------+---------+---------------+-----------+------------+
|code|product_name|brands|main_category|countries|stores|ingredients_text|quantity|allergens|traces|energy_100g|categories|fat_100g|saturated-fat_100g|trans-fat_100g|cholesterol_100g|carbohydrates_100g|sugars_100g|added-sugars_100g|sucrose_100g|glucose_100g|fructose_100g|galactose_100g|lactose_100g|maltose_100g|fiber_100g|salt_100g|added-salt_100g|sodium_100g|alcohol_100g|
+----+------------+------+-------------+---------+------+----------------+--------+---------+------+-----------+----------+--------+------------------+--------------+----------------+-

In [None]:
df.select("energy_100g", "fat_100g", "sugars_100g", "salt_100g").summary().show()


+-------+--------------------+------------------+--------------------+--------------------+
|summary|         energy_100g|          fat_100g|         sugars_100g|           salt_100g|
+-------+--------------------+------------------+--------------------+--------------------+
|  count|             2824295|           2793474|             2689944|             2437954|
|   mean|2.106721791878621...|2067.0619647265266|4.367070147887408E26|3.2838519614562046E7|
| stddev|3.540478302048167E13|3400668.9511933764|7.162453037416465E29|5.123625161617243E10|
|    min|                 0.0|               0.0|               -1.45|                 0.0|
|    25%|               435.0|              0.97|                 0.6|                0.09|
|    50%|              1059.0|               7.0|                3.57|                0.58|
|    75%|              1661.0|              21.0|                16.0|                 1.4|
|    max|             5.95E16|  5.683552353652E9| 1.17471741414444E33|          

In [None]:
print(df.columns)

['code', 'product_name', 'brands', 'main_category', 'countries', 'stores', 'ingredients_text', 'quantity', 'allergens', 'traces', 'energy_100g', 'categories', 'fat_100g', 'saturated-fat_100g', 'trans-fat_100g', 'cholesterol_100g', 'carbohydrates_100g', 'sugars_100g', 'added-sugars_100g', 'sucrose_100g', 'glucose_100g', 'fructose_100g', 'galactose_100g', 'lactose_100g', 'maltose_100g', 'fiber_100g', 'salt_100g', 'added-salt_100g', 'sodium_100g', 'alcohol_100g']


**En cherchant les valeurs aberantes par exemple on peut pas avoir un ingeredient par exemple 1 000g de Fats sur 100g de produit alimentaire**

In [None]:
print(df.filter((df.fat_100g > 0) & (df.fat_100g <= 100)).select("fat_100g").orderBy("fat_100g").count())

2380159




*   fat_100g	Quantité totale de matières grasses dans 100 g de produit	C'est la somme des graisses : saturées, insaturées, trans, etc.
*  saturated-fat_100g	Graisses saturées, généralement mauvaises pour la santé	Doit être ≤ fat_100g
*  trans-fat_100g	Graisses trans, très nocives, souvent issues de procédés industriels	Doit être ≤ fat_100g, souvent très faible (< 1 g)







In [None]:
df_incoherences_fat = df.filter(
    (col("saturated-fat_100g") > col("fat_100g")) |
    (col("trans-fat_100g") > col("fat_100g"))
)

print(df_incoherences_fat.count())

1770


In [None]:
df = df.filter(
    (col("saturated-fat_100g") < col("fat_100g")) |
    (col("trans-fat_100g") < col("fat_100g"))
)

In [None]:
print(df.filter((df.fat_100g > 0) & (df.fat_100g <= 100)).count()) #2219114

2219114


In [None]:
df= df.filter((df.fat_100g > 0) & (df.fat_100g <= 100))
print(df.count())

2219114


In [None]:
df.filter((df.fat_100g > 0) & (df.fat_100g <= 100)).show(10)
# ["categories", "traces","allergens", "ingredients_text", "stores", "countries", " main_category", "brands", "product_name"]

+-----+--------------------+--------------------+--------------------+--------------------+------+--------------------+---------+---------+--------------------+-----------+--------------------+--------+------------------+--------------+----------------+------------------+-----------+-----------------+------------+------------+-------------+--------------+------------+------------+----------+---------+---------------+-----------+------------+--------------+-------------+--------------+---------------+----------------+
| code|        product_name|              brands|       main_category|           countries|stores|    ingredients_text| quantity|allergens|              traces|energy_100g|          categories|fat_100g|saturated-fat_100g|trans-fat_100g|cholesterol_100g|carbohydrates_100g|sugars_100g|added-sugars_100g|sucrose_100g|glucose_100g|fructose_100g|galactose_100g|lactose_100g|maltose_100g|fiber_100g|salt_100g|added-salt_100g|sodium_100g|alcohol_100g|   sugar_level|   salt_level|   

In [None]:
print(df.filter(col("trans-fat_100g") >= 1).count())

4444


In [None]:
print(df.filter(col("trans-fat_100g") + col("saturated-fat_100g") > col("fat_100g") ).count())

316


In [None]:
df_coherence_fat = df.filter(
    when(
        col("trans-fat_100g").isNull() |
        col("saturated-fat_100g").isNull(),
        True
    ).otherwise(
        (col("trans-fat_100g") + col("saturated-fat_100g")) <= col("fat_100g")
    )
)
print(df_coherence_fat.count())

2218798


In [None]:
df= df_coherence_fat

In [None]:
print(df.count())

2218798


**Vérifier que : added-sugars_100g ≤ sugars_100g**

In [None]:
print(df.filter(col("sugars_100g").isNotNull()).count())


2168198


In [None]:
print(df.filter(col("added-sugars_100g") > col("sugars_100g")).count())

75


In [None]:
df = df.filter(col("sugars_100g").isNotNull())

In [None]:
df_incoherences_sugar = df.filter(
    col("sugars_100g") < (
        col("sucrose_100g") + col("glucose_100g") +
        col("fructose_100g") + col("lactose_100g") +
        col("maltose_100g")))

In [None]:
print(df_incoherences_sugar.count())

24


In [None]:
print(df.filter(
    col("sugars_100g") >= (
        col("sucrose_100g") + col("glucose_100g") +
        col("fructose_100g") + col("lactose_100g") +
        col("maltose_100g"))).count())

80


In [None]:
print(df.count())

2168198


### Référence européenne (Règlement INCO – UE 1169/2011) pour les added-sugars

* Riche en sucres: 	> 15 g / 100 g (produits solides)
* Faible en sucres	≤ 5 g / 100 g
* Sans sucres	≤ 0.5 g / 100 g






In [None]:
df = df.withColumn(
    "sugar_level",
    when(col("sugars_100g") > 15, "High sugar")
    .when((col("sugars_100g") >= 5) & (col("sugars_100g") <= 15), "Moderate sugar")
    .when(col("sugars_100g") < 5, "Low sugar")
    .otherwise("Unknown")
)

df.select("product_name", "sugars_100g", "sugar_level").show(10, truncate=False)



+----------------------------------------+-----------+--------------+
|product_name                            |sugars_100g|sugar_level   |
+----------------------------------------+-----------+--------------+
|Mozzarella Schnittfest Gerieben 45% Fett|1.0        |Low sugar     |
|Chocolate n 3                           |27.0       |High sugar    |
|Filets de poulet blanc x2               |6.2        |Moderate sugar|
|Light mayonaise                         |1.3        |Low sugar     |
|NULL                                    |0.98       |Low sugar     |
|NULL                                    |1.7        |Low sugar     |
|Carb balance                            |0.24       |Low sugar     |
|Powdered peanut butter                  |3.6        |Low sugar     |
|Madeleines ChocoLait                    |31.0       |High sugar    |
|Collagen For Her                        |5.5        |Moderate sugar|
+----------------------------------------+-----------+--------------+
only showing top 10 

Le sel (chlorure de sodium) est composé d'environ 40% de sodium. Donc, la relation approximative est : salt_100g ≈ sodium_100g × 2.5

In [None]:
# ~~0.1 marge d'erreur
incoherence_sodium_salt = df.filter(abs(col("salt_100g") - col("sodium_100g") * 2.5) > 0.1) \
  .select("product_name", "salt_100g", "sodium_100g")
incoherence_sodium_salt.show(10, truncate=False)

+--------------------------------+---------+-----------+
|product_name                    |salt_100g|sodium_100g|
+--------------------------------+---------+-----------+
|Frosted Mini Wheats Little Bites|44.6     |17.9       |
|NULL                            |101.0    |40.5       |
|Chunk Chicken Breast            |1010.0   |406.0      |
|Rising Crust Three Meat Pizza   |62.9     |25.1       |
|NULL                            |417.0    |167.0      |
|Shredded Mozzarella 32oz        |1880.0   |750.0      |
|Blackened Turkey Breast         |49.1     |19.6       |
|Ultimate Chocolate Chunk Cookies|893.0    |357.0      |
|Garlic Bread (frozen loaf)      |1270.0   |509.0      |
|Graham Crackers                 |1090.0   |435.0      |
+--------------------------------+---------+-----------+
only showing top 10 rows



In [None]:
print(incoherence_sodium_salt.count())


413


In [None]:
print(df.filter(abs(col("salt_100g") - col("sodium_100g") * 2.5) < 0.1).count())

1943253


In [None]:
df=df.filter(abs(col("salt_100g") - col("sodium_100g") * 2.5) < 0.1)

### **Selon l’organisation Mondiale de la Santé (OMS)**

In [None]:
df = df.withColumn(
    "salt_level",
    when(col("salt_100g") > 1.5, "High salt")
    .when((col("salt_100g") > 0.3) & (col("salt_100g") <= 1.5), "Moderate salt")
    .when(col("salt_100g") <= 0.3, "Low salt")
    .otherwise("Unknown")
)

df.select("product_name", "salt_100g", "salt_level").show(10, truncate=False)

+----------------------------------------+---------+-------------+
|product_name                            |salt_100g|salt_level   |
+----------------------------------------+---------+-------------+
|Mozzarella Schnittfest Gerieben 45% Fett|1.2      |Moderate salt|
|Chocolate n 3                           |0.025    |Low salt     |
|Filets de poulet blanc x2               |0.4      |Moderate salt|
|Light mayonaise                         |1.7      |High salt    |
|NULL                                    |0.95     |Moderate salt|
|NULL                                    |1.5      |Moderate salt|
|Carb balance                            |0.275    |Low salt     |
|Powdered peanut butter                  |0.0625   |Low salt     |
|Madeleines ChocoLait                    |0.48     |Moderate salt|
|Collagen For Her                        |1.25     |Moderate salt|
+----------------------------------------+---------+-------------+
only showing top 10 rows



In [None]:
print(df.count())

1943253


In [None]:
df.select("fiber_100g").show()

+----------+
|fiber_100g|
+----------+
|       2.2|
|       9.0|
|      20.0|
|      18.5|
|       0.0|
|       0.0|
|      3.45|
| 1.1299435|
|      3.33|
|      18.0|
|       8.0|
|      11.6|
|       0.0|
|     0.707|
|      16.1|
|      9.68|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
+----------+
only showing top 20 rows



In [None]:
df = df.withColumn(
    "fiber_level",
    when(col("fiber_100g") > 3, "High fiber")
    .when((col("fiber_100g") > 1) & (col("fiber_100g") <= 3), "Moderate fiber")
    .when((col("fiber_100g") <= 1) & (col("fiber_100g").isNotNull()), "Low fiber")
    .otherwise("Unknown")
)

df.select("product_name", "fiber_100g", "fiber_level").show(10, truncate=False)

+----------------------------------------+----------+--------------+
|product_name                            |fiber_100g|fiber_level   |
+----------------------------------------+----------+--------------+
|Mozzarella Schnittfest Gerieben 45% Fett|NULL      |Unknown       |
|Chocolate n 3                           |NULL      |Unknown       |
|Filets de poulet blanc x2               |2.2       |Moderate fiber|
|Light mayonaise                         |2.9       |Moderate fiber|
|NULL                                    |9.0       |High fiber    |
|NULL                                    |10.714286 |High fiber    |
|Carb balance                            |88.0      |High fiber    |
|Powdered peanut butter                  |NULL      |Unknown       |
|Madeleines ChocoLait                    |1.4       |Moderate fiber|
|Collagen For Her                        |NULL      |Unknown       |
+----------------------------------------+----------+--------------+
only showing top 10 rows



In [None]:
print(df.select("fiber_100g").count()) #just to make sure

1943253


In [None]:
df.selectExpr(
    "min(energy_100g) as min_energy_kj",
    "max(energy_100g) as max_energy_kj",
    "avg(energy_100g) as avg_energy_kj"
).show()

+-------------+-------------+-----------------+
|min_energy_kj|max_energy_kj|    avg_energy_kj|
+-------------+-------------+-----------------+
|          0.0|    2481598.0|1283.384471036146|
+-------------+-------------+-----------------+



In [None]:
df = df.withColumn(
    "energy_level",
    when(col("energy_100g") > 2100, "High energy")
    .when((col("energy_100g") >= 800) & (col("energy_100g") <= 2100), "Moderate energy")
    .when(col("energy_100g") < 800, "Low energy")
    .otherwise("Unknown")
)

df.select("product_name", "energy_100g", "energy_level").show(10, truncate=False)

+----------------------------------------+-----------+---------------+
|product_name                            |energy_100g|energy_level   |
+----------------------------------------+-----------+---------------+
|Mozzarella Schnittfest Gerieben 45% Fett|1389.0     |Moderate energy|
|Chocolate n 3                           |2415.0     |High energy    |
|Filets de poulet blanc x2               |392.0      |Low energy     |
|Light mayonaise                         |1620.0     |Moderate energy|
|NULL                                    |962.0      |Moderate energy|
|NULL                                    |1510.0     |Moderate energy|
|Carb balance                            |293.0      |Low energy     |
|Powdered peanut butter                  |188.0      |Low energy     |
|Madeleines ChocoLait                    |1926.0     |Moderate energy|
|Collagen For Her                        |1611.0     |Moderate energy|
+----------------------------------------+-----------+---------------+
only s

In [None]:
df = df.drop("alco_presence")


In [None]:
df = df.withColumn(
    "alcohol_presence",
    when(col("alcohol_100g") > 0, "Exists")
    .when(col("alcohol_100g") == 0.0, "halal")
    .otherwise("Unknown")
)

# Affichage de quelques exemples
df.select("product_name", "alcohol_100g", "alcohol_presence").show(10, truncate=False)

+----------------------------------------+------------+----------------+
|product_name                            |alcohol_100g|alcohol_presence|
+----------------------------------------+------------+----------------+
|Mozzarella Schnittfest Gerieben 45% Fett|NULL        |Unknown         |
|Chocolate n 3                           |NULL        |Unknown         |
|Filets de poulet blanc x2               |NULL        |Unknown         |
|Light mayonaise                         |NULL        |Unknown         |
|NULL                                    |0.0         |halal           |
|NULL                                    |NULL        |Unknown         |
|Carb balance                            |NULL        |Unknown         |
|Powdered peanut butter                  |NULL        |Unknown         |
|Madeleines ChocoLait                    |0.0         |halal           |
|Collagen For Her                        |NULL        |Unknown         |
+----------------------------------------+---------

In [None]:
df.groupBy("code").agg(count("*").alias("count")) \
  .filter(col("count") > 1) \
  .orderBy("count", ascending=False) \
  .show(10, truncate=False)

+---------------------+-----+
|code                 |count|
+---------------------+-----+
|2.3327665001535216E22|11   |
|2.3303976000010017E23|9    |
|1.954001423037061E22 |8    |
|2.3327665001305117E22|6    |
|2.3327665001335815E22|6    |
|2.330262200001003E23 |6    |
|2.3308122000010015E23|5    |
|2.3304354000010026E23|5    |
|2.3303561000010027E23|5    |
|2.338296100001004E23 |5    |
+---------------------+-----+
only showing top 10 rows



In [None]:
df.groupBy("code", "product_name", "main_category",  "sugar_level",   "salt_level",   "fiber_level", "energy_level", "alcohol_presence" ) \
  .agg(count("*").alias("count")) \
  .filter(col("count") > 1) \
  .orderBy("count", ascending=False) \
  .show(20, truncate=False)

+---------------------+---------------------------------+-------------------------------+--------------+-------------+--------------+---------------+----------------+-----+
|code                 |product_name                     |main_category                  |sugar_level   |salt_level   |fiber_level   |energy_level   |alcohol_presence|count|
+---------------------+---------------------------------+-------------------------------+--------------+-------------+--------------+---------------+----------------+-----+
|2.3327665001535216E22|Riz long basmati                 |en:white-basmati-rices         |Low sugar     |Low salt     |Unknown       |Moderate energy|Unknown         |7    |
|5.8943090051900122E17|Finísimo pavo menos sal          |NULL                           |Low sugar     |Moderate salt|Unknown       |Low energy     |Unknown         |3    |
|6.1320320052000115E17|Burger vegana                    |NULL                           |Low sugar     |High salt    |Unknown       |Lo

In [None]:
df = df.dropDuplicates([
    "code",
    "product_name",
    "main_category",
    "sugar_level",
    "salt_level",
    "fiber_level",
    "energy_level",
    "alcohol_presence"
])
print(df.count())

1941363


In [None]:
df.select("product_name","main_category","categories").show(20,truncate=False)

+--------------------------------------+---------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|product_name                          |main_category                          |categories                                                                                                                                                                                        |
+--------------------------------------+---------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Bramwells American Style Peanut Butter|en:peanut-butters                      |Plant-based foods and beverages, Plant-based foods, Legumes and their products, Spreads, Nut

In [None]:
print(df.filter(col("main_category").isNotNull() | col("categories").isNotNull()).count())


1058042


In [None]:
df = df.filter(col("main_category").isNotNull() | col("categories").isNotNull())

In [None]:
df.select("product_name","main_category","categories").show(20,truncate=False)

+-----------------------------------------+------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|product_name                             |main_category                       |categories                                                                                                                                                                                        |categories_list                                                                                                                                                                                     |
+-----------------------------------------+-----------

In [None]:
from pyspark.sql.functions import regexp_replace, trim, lower, split

df = df.withColumn("main_category", regexp_replace(col("main_category"), "^[a-z]{2}:", ""))

In [None]:
df = df.withColumn("categories", regexp_replace(col("categories"), "^[a-z]{2}:", ""))

In [None]:
df = df.withColumn("categories", trim(col("categories")))
df = df.withColumn("categories", lower(col("categories")))
df = df.filter((col("main_category") != "null") & (col("categories") != "null"))

In [None]:
# Extraire le premier élément de la colonne 'categories' (qui est une chaîne séparée par des virgules)
df = df.withColumn("first_category", split(col("categories"), ",")[0])

# Remplacer les nulls dans 'main_category' par 'first_category'
df = df.withColumn(
    "main_category",
    when(col("main_category").isNull(), col("first_category")).otherwise(col("main_category"))
).drop("first_category")

In [None]:
print(df.filter((col("main_category").isNull()) & (col("categories").isNotNull())).count())
print(df.filter((col("main_category").isNull()) & (col("categories").isNull())).count())

0
0


In [None]:
print(df.count())

1046318


In [None]:
print(df.filter(col("brands").isNotNull()).count())

820453


#### ["categories", "traces","allergens", "ingredients_text", "stores", "countries", " main_category", "brands", "product_name"]

In [None]:
df=df.filter(col("brands").isNotNull())

In [None]:
df.select("traces", "allergens", "ingredients_text", "stores", "countries").show(20, truncate=False)

+---------------------------------------------------------------+------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
colonnes_a_nettoyer = ["traces", "allergens", "ingredients_text", "stores", "countries", "main_category", "countries"]

for c in colonnes_a_nettoyer:
    # Supprime toutes les occurrences de deux lettres suivies de ':' partout dans la chaîne
    df = df.withColumn(c, regexp_replace(col(c), r"[a-z]{2}:", ""))

In [None]:
print(df.count())

820453


In [None]:
traces_values = [row['traces'] for row in df.select("traces").distinct().collect()]
allergens_values = [row['allergens'] for row in df.select("allergens").distinct().collect()]

print("Traces uniques :", traces_values)
print("Allergènes uniques :", allergens_values)

Traces uniques : ['eggs,gluten,milk,nuts,sesame-seeds', '  gluten    peanuts', ' eggs    peanuts', 'Lait', 'nuts,oeufs-ble-gluten', 'puede contener trazas de leche', '《Traces d’arachides et de sésame》', 'celery,crustaceans,eggs,peanuts,arachides œuf crustacés', 'celery,crustaceans,eggs,fish,molluscs,mustard,nuts,peanuts,sesame-seeds,soybeans', 'celery,mustard,nuts,peanuts', 'crustaceans,eggs,gluten,molluscs,mustard', 'celery,crustaceans,eggs,fish,milk,molluscs,mustard,nuts,peanuts,sesame-seeds', 'eggs,gluten,nuts,CONTENE: SOYA. PUEDE CONTENER: LECHE', 'celery,crustaceans,eggs,gluten,milk,mustard,nuts,sesame-seeds,soybeans,sulphur-dioxide-and-sulphites', 'celery,crustaceans,eggs,fish,gluten,lupin,milk,mustard,nuts,sesame-seeds,soybeans,sulphur-dioxide-and-sulphites', 'Produkt kann Spuren von Sellerie enthalten.', 'eggs,fish,gluten,nuts,peanuts,soybeans', ' milk    nuts', "Traces éventuelles d'arachides ou d'autres fruits à coque.", 'milk,soybeans,Autres fruits à coques et petits morceau

In [None]:
from pyspark.sql.functions import udf, split, col, array_union, array_distinct, concat_ws, when, lit
from pyspark.sql.types import ArrayType, StringType

def clean_array(arr):
    if arr is None:
        return []
    return list(set([x.strip().lower() for x in arr if x and x.strip() != ""]))

clean_array_udf = udf(clean_array, ArrayType(StringType()))

df = df.withColumn("traces_array", split(col("traces"), ",")) \
       .withColumn("allergens_array", split(col("allergens"), ",")) \
       .withColumn("traces_array_clean", clean_array_udf(col("traces_array"))) \
       .withColumn("allergens_array_clean", clean_array_udf(col("allergens_array")))

df = df.withColumn("combined_array", array_distinct(array_union(col("traces_array_clean"), col("allergens_array_clean"))))

df = df.withColumn(
    "combined_values",
    when(
        ( (col("traces").isNull() | (col("traces") == "")) & (col("allergens").isNull() | (col("allergens") == "")) ),
        lit("unknown")
    ).otherwise(
        when(
            (col("combined_array").isNull()) | (col("combined_array").getItem(0).isNull()),
            lit(None)
        ).otherwise(concat_ws(",", col("combined_array")))
    )
)

df.select("traces", "allergens", "combined_values").show(truncate=False)

df = df.withColumnRenamed("combined_values", "allergens and traces")

df = df.withColumn(
    "allergens and traces",
    when(col("allergens and traces") == "unknown", lit(None)).otherwise(col("allergens and traces"))
)

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/socket.py", line 718, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
df.select("traces", "allergens", "allergens and traces").show(truncate=False)


+------------------------------+--------------------+-------------------------------------------+
|traces                        |allergens           |allergens and traces                       |
+------------------------------+--------------------+-------------------------------------------+
|NULL                          |NULL                |NULL                                       |
|eggs,gluten,milk,nuts,soybeans|sesame-seeds        |gluten,eggs,nuts,soybeans,milk,sesame-seeds|
|milk,nuts,soybeans            |NULL                |nuts,soybeans,milk                         |
|eggs,gluten,milk,nuts,peanuts |NULL                |gluten,eggs,nuts,peanuts,milk              |
| Può contenere latte          |NULL                |può contenere latte                        |
|NULL                          |NULL                |NULL                                       |
|NULL                          |NULL                |NULL                                       |
|NULL               

In [None]:
df=df.drop("traces", "allergens")

In [None]:
from pyspark.sql.functions import mean, min, max, stddev

df.select(
    mean("cholesterol_100g").alias("mean_cholesterol"),
    min("cholesterol_100g").alias("min_cholesterol"),
    max("cholesterol_100g").alias("max_cholesterol"),
    stddev("cholesterol_100g").alias("stddev_cholesterol"),
).show()

+--------------------+---------------+---------------+------------------+
|    mean_cholesterol|min_cholesterol|max_cholesterol|stddev_cholesterol|
+--------------------+---------------+---------------+------------------+
|0.052808608333606184|            0.0|          260.0|1.4749144058753205|
+--------------------+---------------+---------------+------------------+



Cholestérol (en mg/100g probablement):

Moyenne ~ 0.05 : Très faible. Ça suggère que la plupart des produits ont quasiment pas de cholestérol, ce qui est cohérent pour beaucoup d’aliments végétaux ou transformés.

Min = 0 : OK, beaucoup de produits sans cholestérol.

Max = 260 : C’est plausible — certains produits d’origine animale (jaune d’œuf, beurre, charcuterie) peuvent avoir un cholestérol élevé autour de cette valeur.
explication : Un œuf entier de taille moyenne (~50g) contient environ 186 mg de cholestérol (dans le jaune).

Écart-type ~ 1.47 : Assez faible, car la majorité des produits ont peu ou pas de cholestérol, seuls quelques-uns ont des valeurs élevées, ce qui n’influence pas trop la moyenne.

Conclusion : Les valeurs ont l’air logiques pour un dataset alimentaire très large qui inclut beaucoup de produits faibles en cholestérol.



In [None]:
print(df.filter(col("cholesterol_100g").isNotNull()).count())

119547


###Références usuelles pour le cholestérol alimentaire (en mg/100g) :

*   Low : < 20 mg/100g
*   Moderate : 20 à 80 mg/100g
*   High : > 80 mg/100g

In [None]:
df = df.withColumn(
    "cholesterol_level",
    when(col("cholesterol_100g").isNull(), "Unknown")
    .when(col("cholesterol_100g") < 20, "Low")
    .when((col("cholesterol_100g") >= 20) & (col("cholesterol_100g") <= 80), "Moderate")
    .otherwise("High")
)

In [None]:
df=df.drop("traces_array","allergens_array","traces_array_clean","allergens_array_clean","combined_array","combined_values")

In [None]:
df.show(truncate=False)

+---------+-------------------------------------------+--------------------------------+------------------------------------+---------------------+--------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
print(df.count())

820453


In [None]:
df.printSchema()

root
 |-- code: double (nullable = true)
 |-- product_name: string (nullable = true)
 |-- brands: string (nullable = true)
 |-- main_category: string (nullable = true)
 |-- countries: string (nullable = true)
 |-- stores: string (nullable = true)
 |-- ingredients_text: string (nullable = true)
 |-- quantity: string (nullable = true)
 |-- energy_100g: double (nullable = true)
 |-- categories: string (nullable = true)
 |-- fat_100g: double (nullable = true)
 |-- saturated-fat_100g: double (nullable = true)
 |-- trans-fat_100g: double (nullable = true)
 |-- cholesterol_100g: double (nullable = true)
 |-- carbohydrates_100g: double (nullable = true)
 |-- sugars_100g: double (nullable = true)
 |-- added-sugars_100g: double (nullable = true)
 |-- sucrose_100g: double (nullable = true)
 |-- glucose_100g: double (nullable = true)
 |-- fructose_100g: double (nullable = true)
 |-- galactose_100g: double (nullable = true)
 |-- lactose_100g: double (nullable = true)
 |-- maltose_100g: double (null

In [None]:
# # 1. Écrire le DataFrame dans un dossier temporaire avec une seule partition
# df.coalesce(1).write.mode("overwrite").option("header", True).csv("/content/temp_nutrition")

# # 2. Déplacer le fichier CSV généré dans un emplacement final avec un nom simple
# import os
# import shutil

# # Dossier Spark temporaire
# temp_folder = "/content/temp_nutrition"

# # Chemin de destination (à adapter selon ton Google Drive)
# final_path = "/content/drive/My Drive/Colab Notebooks/result_of_work.csv"

# # Cherche le fichier CSV généré et le renomme
# for file_name in os.listdir(temp_folder):
#     if file_name.startswith("part-") and file_name.endswith(".csv"):
#         temp_file_path = os.path.join(temp_folder, file_name)
#         shutil.move(temp_file_path, final_path)
#         print(f"✅ Fichier enregistré avec succès à : {final_path}")
#         break
# else:
#     print("❌ Fichier CSV non trouvé dans le dossier temporaire.")


✅ Fichier enregistré avec succès à : /content/drive/My Drive/Colab Notebooks/result_of_work.csv


In [None]:
# df = spark.read.csv("/content/drive/My Drive/Colab Notebooks/result_of_work.csv", sep=',', header=True, inferSchema=True)
# print(df.count())

820453


In [None]:
# print(df.select("ingredients_text").count())

820453


In [None]:
# Étape 1 : récupérer les 100 000 premières lignes
df_100k = df.limit(100000)

# Étape 2 : compter les occurrences de chaque valeur de main_category
counts = df_100k.groupBy("main_category").count()

# Étape 3 : afficher les résultats
counts.orderBy("count", ascending=False).show(truncate=False)


+--------------------------+-----+
|main_category             |count|
+--------------------------+-----+
|groceries                 |3391 |
|undefined                 |2343 |
|snacks                    |2253 |
|biscuits                  |2140 |
|yogurts                   |1010 |
|dark-chocolates           |807  |
|beverages                 |739  |
|frozen-desserts           |737  |
|virgin-olive-oils         |708  |
|confectioneries           |683  |
|cakes                     |677  |
|chocolate-candies         |605  |
|white-hams                |599  |
|sandwiches                |536  |
|ice-creams                |526  |
|toppings-ingredients      |514  |
|milks                     |509  |
|cereals-and-their-products|508  |
|frozen-foods              |503  |
|milk-chocolates           |463  |
+--------------------------+-----+
only showing top 20 rows



In [None]:
# df_100k = df.limit(100000)
# # 1. Écrire le DataFrame dans un dossier temporaire avec une seule partition
# df_100k.coalesce(1).write.mode("overwrite").option("header", True).csv("/content/temp_nutrition")

# # 2. Déplacer le fichier CSV généré dans un emplacement final avec un nom simple
# import os
# import shutil

# # Dossier Spark temporaire
# temp_folder = "/content/temp_nutrition"

# # Chemin de destination (à adapter selon ton Google Drive)
# final_path = "/content/drive/My Drive/Colab Notebooks/100K_df.csv"

# # Cherche le fichier CSV généré et le renomme
# for file_name in os.listdir(temp_folder):
#     if file_name.startswith("part-") and file_name.endswith(".csv"):
#         temp_file_path = os.path.join(temp_folder, file_name)
#         shutil.move(temp_file_path, final_path)
#         print(f"✅ Fichier enregistré avec succès à : {final_path}")
#         break
# else:
#     print("❌ Fichier CSV non trouvé dans le dossier temporaire.")

✅ Fichier enregistré avec succès à : /content/drive/My Drive/Colab Notebooks/100K_df.csv


In [None]:
# print(df_100k.count())

100000
