In [1]:
import polars as pl

# Load the products parquet file; the path is relative to the notebook's
# working directory (one level up).
df = pl.read_parquet(source="../openfoodfacts_products.parquet")

In [None]:
import polars as pl

def detect_empty_columns(df: pl.DataFrame, threshold: float = 0.95):
    """Find the columns whose share of missing values reaches ``threshold``.

    A value counts as missing when it is null or, for string (``pl.Utf8``)
    columns only, when it is the empty string ``""``.

    Args:
        df: Frame to inspect.
        threshold: Minimum missing ratio (between 0.0 and 1.0) for a column
            to be reported. The comparison is inclusive (``ratio >= threshold``).

    Returns:
        A list of ``(column_name, missing_ratio)`` tuples sorted by ratio in
        descending order. An empty list is returned when ``df`` has no rows,
        because the ratio is undefined in that case.
    """
    total_rows = df.height
    if total_rows == 0:
        # No rows means the ratio would be a division by zero; report nothing
        # rather than raising ZeroDivisionError.
        return []

    # Null counts for every column in one pass instead of one query per column.
    null_counts = dict(zip(df.columns, df.null_count().row(0)))

    results = []
    for col, dtype in df.schema.items():
        nulls = null_counts[col]

        # Empty strings are only counted for string columns.
        empties = 0
        if dtype == pl.Utf8:
            # `or 0` guards against the sum coming back as None (which some
            # polars versions may do when the column has no non-null value).
            empties = df.select((pl.col(col) == "").sum()).item() or 0

        ratio = (nulls + empties) / total_rows
        if ratio >= threshold:
            results.append((col, ratio))

    # reverse=True keeps the sort stable, so ties stay in column order.
    return sorted(results, key=lambda pair: pair[1], reverse=True)



In [9]:

# (column name, missing ratio) pairs for columns that are at least 95% empty.
# Stored under its own name: `columns_to_drop` holds plain column names in the
# cell that actually drops them, and sharing one name for both shapes makes the
# notebook fragile when cells are run out of order.
empty_column_ratios = detect_empty_columns(df, threshold=0.95)

for col, ratio in empty_column_ratios:
    print(f"{col}: {ratio:.2%} de valeurs manquantes")


cities: 100.00% de valeurs manquantes
allergens_en: 100.00% de valeurs manquantes
nutrition-score-uk_100g: 100.00% de valeurs manquantes
additives: 100.00% de valeurs manquantes
elaidic-acid_100g: 100.00% de valeurs manquantes
glycemic-index_100g: 100.00% de valeurs manquantes
chlorophyl_100g: 100.00% de valeurs manquantes
erucic-acid_100g: 100.00% de valeurs manquantes
water-hardness_100g: 100.00% de valeurs manquantes
carbohydrates-total_100g: 100.00% de valeurs manquantes
caproic-acid_100g: 100.00% de valeurs manquantes
galactose_100g: 100.00% de valeurs manquantes
gamma-linolenic-acid_100g: 100.00% de valeurs manquantes
nervonic-acid_100g: 100.00% de valeurs manquantes
lignoceric-acid_100g: 100.00% de valeurs manquantes
dihomo-gamma-linolenic-acid_100g: 100.00% de valeurs manquantes
caprylic-acid_100g: 100.00% de valeurs manquantes
cerotic-acid_100g: 100.00% de valeurs manquantes
capric-acid_100g: 100.00% de valeurs manquantes
myristic-acid_100g: 100.00% de valeurs manquantes
mead-

In [4]:
# Names of the columns that are at least 95% empty (the ratios are not needed here)
columns_to_drop = [name for name, _ in detect_empty_columns(df, threshold=0.95)]

# Drop those columns; the result is bound to a new name, `df_cleaned`
df_cleaned = df.drop(columns_to_drop)

In [10]:
# Number of columns remaining after the drop
df_cleaned.width

93

In [11]:
# Preview the first rows of the cleaned frame (head() defaults to 5 rows)
df_cleaned.head()

code,url,creator,created_t,created_datetime,last_modified_t,last_modified_datetime,last_modified_by,last_updated_t,last_updated_datetime,product_name,quantity,packaging,packaging_tags,packaging_en,brands,brands_tags,brands_en,categories,categories_tags,categories_en,manufacturing_places,manufacturing_places_tags,labels,labels_tags,labels_en,emb_codes,emb_codes_tags,purchase_places,stores,countries,countries_tags,countries_en,ingredients_text,ingredients_tags,ingredients_analysis_tags,allergens,…,environmental_score_score,environmental_score_grade,nutrient_levels_tags,product_quantity,unique_scans_n,popularity_tags,completeness,last_image_t,last_image_datetime,main_category,main_category_en,image_url,image_small_url,image_ingredients_url,image_ingredients_small_url,image_nutrition_url,image_nutrition_small_url,energy-kj_100g,energy-kcal_100g,energy_100g,fat_100g,saturated-fat_100g,trans-fat_100g,cholesterol_100g,carbohydrates_100g,sugars_100g,fiber_100g,proteins_100g,salt_100g,sodium_100g,vitamin-a_100g,vitamin-c_100g,potassium_100g,calcium_100g,iron_100g,fruits-vegetables-nuts-estimate-from-ingredients_100g,nutrition-score-fr_100g
i64,str,str,i64,str,i64,str,str,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,…,i64,str,str,f64,i64,str,f64,i64,str,str,str,str,str,str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64
54,"""http://world-en.openfoodfacts.…","""kiliweb""",1582569031,"""2020-02-24T18:30:31Z""",1733085204,"""2024-12-01T20:33:24Z""",,1740205422,"""2025-02-22T06:23:42Z""","""Limonade artisanale a la rose""",,,,,,,,,,,,,,,,,,,,"""en:fr""","""en:france""","""France""",,,,,…,,"""unknown""",,,,,0.1625,1733085204,"""2024-12-01T20:33:24Z""",,,"""https://images.openfoodfacts.o…","""https://images.openfoodfacts.o…",,,,,,,,,,,,,,,,,,,,,,,,
63,"""http://world-en.openfoodfacts.…","""kiliweb""",1673620307,"""2023-01-13T14:31:47Z""",1746258398,"""2025-05-03T07:46:38Z""","""roboto-app""",1746258398,"""2025-05-03T07:46:38Z""","""CIABATTA OLIVE""",,,,,"""EDEKA""","""xx:edeka""","""edeka""",,,,,,,,,,,,,"""en:fr""","""en:france""","""France""","""Weizenmehl, Rapsöl, Speisesalz…","""en:weizenmehl,en:rapsol,en:spe…","""en:palm-oil-content-unknown,en…",,…,,"""unknown""",,,1.0,"""top-75-percent-scans-2024,top-…",0.5625,1746257766,"""2025-05-03T07:36:06Z""",,,"""https://images.openfoodfacts.o…","""https://images.openfoodfacts.o…",,,,,,332.0,1389.0,25.0,17.3,,,3.0,1.0,,23.0,1.2,0.48,,,,,,0.0,
114,"""http://world-en.openfoodfacts.…","""kiliweb""",1580066482,"""2020-01-26T19:21:22Z""",1737247862,"""2025-01-19T00:51:02Z""","""smoothie-app""",1743312145,"""2025-03-30T05:22:25Z""","""Chocolate n 3""","""80 g""",,,,"""Jeff de Bruges""","""xx:jeff-de-bruges""","""jeff-de-bruges""",,,,,,"""Point Vert, Fabriqué en France""","""en:green-dot,en:made-in-france""","""Green Dot,Made in France""",,,,,"""France""","""en:france""","""France""",,,,,…,,"""unknown""",,80.0,1.0,"""bottom-25-percent-scans-2022,b…",0.475,1737247860,"""2025-01-19T00:51:00Z""",,,"""https://images.openfoodfacts.o…","""https://images.openfoodfacts.o…",,,"""https://images.openfoodfacts.o…","""https://images.openfoodfacts.o…",2415.0,,2415.0,44.0,28.0,,,30.0,27.0,,7.1,0.025,0.01,,,,,,,
1,"""http://world-en.openfoodfacts.…","""inf""",1634745456,"""2021-10-20T15:57:36Z""",1746721833,"""2025-05-08T16:30:33Z""","""foodless""",1746721833,"""2025-05-08T16:30:33Z""","""KOJI MISO PASTE""","""280gr. 320 Kapseln""",,,,"""Brandt""","""xx:brandt""","""brandt""","""Nahrungsergänzungsmittel""","""en:dietary-supplements""","""Dietary supplements""",,,"""No gluten, Organic, Vegetarian…","""en:no-gluten,en:organic,en:veg…","""No gluten,Organic,Vegetarian,E…",,,,,"""Allemagne, États-Unis, en:fr""","""en:france,en:germany,en:united…","""France,Germany,United States""","""Mandeln blanchiert""","""en:blanches-almonds,en:nut,en:…","""en:palm-oil-free,en:vegan,en:v…",,…,,"""unknown""",,280.0,1.0,"""top-75-percent-scans-2024,top-…",0.85,1746387820,"""2025-05-04T19:43:40Z""","""en:dietary-supplements""","""Dietary supplements""","""https://images.openfoodfacts.o…","""https://images.openfoodfacts.o…","""https://images.openfoodfacts.o…","""https://images.openfoodfacts.o…","""https://images.openfoodfacts.o…","""https://images.openfoodfacts.o…",,,,,,,,,,,,,,,,,,,100.0,
105,"""http://world-en.openfoodfacts.…","""kiliweb""",1572117743,"""2019-10-26T19:22:23Z""",1738073570,"""2025-01-28T14:12:50Z""",,1743653496,"""2025-04-03T04:11:36Z""","""Paleta gran reserva - Sierra n…","""750ml""",,,,"""AdvoCare""","""xx:advocare""","""advocare""","""Bebidas y preparaciones de beb…","""en:beverages-and-beverages-pre…","""Beverages and beverages prepar…",,,,,,,,,,"""Spanien, Germany""","""en:germany,en:spain""","""Germany,Spain""","""Thiamin, Biotin, Chromium, Gar…","""en:thiamin,en:biotin,en:vitami…","""en:may-contain-palm-oil,en:veg…",,…,,"""unknown""",,750.0,1.0,"""top-75-percent-scans-2024,top-…",0.675,1738073557,"""2025-01-28T14:12:37Z""","""en:beverages""","""Beverages""","""https://images.openfoodfacts.o…","""https://images.openfoodfacts.o…",,,"""https://images.openfoodfacts.o…","""https://images.openfoodfacts.o…",,,,,,,,,,,,,,,,,,,0.011335,


In [None]:
# Export the cleaned frame as CSV, one directory above the working directory
df_cleaned.write_csv(file="../openfoodfacts_no_empty_columns.csv")

In [13]:
# Export the cleaned frame as newline-delimited JSON (one record per line)
df_cleaned.write_ndjson(file="../openfoodfacts_no_empty_columns.jsonl")

In [None]:
from pymongo import MongoClient

# Requires the PyMongo package.
# https://api.mongodb.com/python/current

# Connect to the MongoDB server on localhost, default port.
client = MongoClient('mongodb://localhost:27017/')

# Empty filter: match every document. Named `query_filter` so the builtin
# `filter` is not shadowed for the rest of the notebook.
query_filter = {}

# Projection: request only the `url` field.
project = {
    'url': True
}

# Query the `products` collection of the `nutriwizer` database.
result = client['nutriwizer']['products'].find(
    filter=query_filter,
    projection=project
)