In [1]:
import pandas as pd

In [11]:
pd.options.display.max_rows = 100

In [40]:
pd.options.display.max_columns = 150

In [2]:
# OFF ~ open food facts
OFF_DUMP_PATH = '/Users/rgareev/data/openfoodfacts/20220831/en.openfoodfacts.org.products.csv'

In [3]:
df_head = pd.read_csv(OFF_DUMP_PATH, sep='\t', nrows=1000)

In [4]:
df_head.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Columns: 186 entries, code to carnitine_100g
dtypes: float64(122), int64(2), object(61), uint64(1)
memory usage: 1.4+ MB


In [64]:
print(*df_head.columns, sep='\n')

code
url
creator
created_t
created_datetime
last_modified_t
last_modified_datetime
product_name
abbreviated_product_name
generic_name
quantity
packaging
packaging_tags
packaging_en
packaging_text
brands
brands_tags
categories
categories_tags
categories_en
origins
origins_tags
origins_en
manufacturing_places
manufacturing_places_tags
labels
labels_tags
labels_en
emb_codes
emb_codes_tags
first_packaging_code_geo
cities
cities_tags
purchase_places
stores
countries
countries_tags
countries_en
ingredients_text
ingredients_tags
allergens
allergens_en
traces
traces_tags
traces_en
serving_size
serving_quantity
no_nutriments
additives_n
additives
additives_tags
additives_en
nutriscore_score
nutriscore_grade
nova_group
pnns_groups_1
pnns_groups_2
food_groups
food_groups_tags
food_groups_en
states
states_tags
states_en
brand_owner
ecoscore_score
ecoscore_grade
main_category
main_category_en
image_url
image_small_url
image_ingredients_url
image_ingredients_small_url
image_nutrition_url
image_nutri

In [36]:
# nutrition fact columns
nf_columns = [col for col in df_head.columns if col.endswith('_100g')]
print(len(nf_columns))
nf_columns

112


['energy-kj_100g',
 'energy-kcal_100g',
 'energy_100g',
 'energy-from-fat_100g',
 'fat_100g',
 'saturated-fat_100g',
 '-butyric-acid_100g',
 '-caproic-acid_100g',
 '-caprylic-acid_100g',
 '-capric-acid_100g',
 '-lauric-acid_100g',
 '-myristic-acid_100g',
 '-palmitic-acid_100g',
 '-stearic-acid_100g',
 '-arachidic-acid_100g',
 '-behenic-acid_100g',
 '-lignoceric-acid_100g',
 '-cerotic-acid_100g',
 '-montanic-acid_100g',
 '-melissic-acid_100g',
 'monounsaturated-fat_100g',
 'polyunsaturated-fat_100g',
 'omega-3-fat_100g',
 '-alpha-linolenic-acid_100g',
 '-eicosapentaenoic-acid_100g',
 '-docosahexaenoic-acid_100g',
 'omega-6-fat_100g',
 '-linoleic-acid_100g',
 '-arachidonic-acid_100g',
 '-gamma-linolenic-acid_100g',
 '-dihomo-gamma-linolenic-acid_100g',
 'omega-9-fat_100g',
 '-oleic-acid_100g',
 '-elaidic-acid_100g',
 '-gondoic-acid_100g',
 '-mead-acid_100g',
 '-erucic-acid_100g',
 '-nervonic-acid_100g',
 'trans-fat_100g',
 'cholesterol_100g',
 'carbohydrates_100g',
 'sugars_100g',
 '-suc

What to use as ID?

`code`, but there are few duplicates (< 0.1%)

Useful columns:
* `product_name` - 4% is empty
* `categories` and `categories_tags` - 60% is empty
* `energy-kcal_100g`
* `fat_100g`
* `saturated-fat_100g`
* `carbohydrates_100g`
* `sugars_100g`
* `fiber_100g`, only 800k
* `proteins_100g`
* `salt_100g`, ~1.7M
* `sodium_100g`, ~1.7M
* `fruits-vegetables-nuts-estimate-from-ingredients_100g`, ~800K
* `nutrition-score-fr_100g`, ~800K

Do not use:
* `generic_name` - 95% is empty
* `traces` and `traces_tags` - more than 95% is empty

What can be predicted:
* `[multiclass]` categories (*choose level*) given nutrition facts per 100g
* `[multiclass]` NOVA groups (1-4) given ingredients list
  

In [72]:
df = pd.read_csv(OFF_DUMP_PATH, sep='\t', usecols=['countries_tags', 'nova_group', 'ingredients_tags'])

In [73]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2508100 entries, 0 to 2508099
Data columns (total 3 columns):
 #   Column            Dtype  
---  ------            -----  
 0   countries_tags    object 
 1   ingredients_tags  object 
 2   nova_group        float64
dtypes: float64(1), object(2)
memory usage: 552.0 MB


In [76]:
df.nova_group.notnull().mean()

0.2809054662892229

In [77]:
df.ingredients_tags.notnull().mean()

0.31111638291934135

In [81]:
(df.ingredients_tags.notnull() & df.nova_group.notnull()).sum()

676995

In [21]:
df.code.nunique() / len(df)

0.9998863681671385

In [24]:
df.product_name.isnull().mean()

0.03574897332642239

In [45]:
df.ingredients_text.isnull().mean()

0.6881962441688928

In [50]:
df.countries_tags.isnull().mean()

0.0028184681631513893

In [54]:
df.countries_tags.value_counts(normalize=True).iloc[:50]

en:france                      0.337668
en:united-states               0.203379
en:spain                       0.104787
en:italy                       0.069177
en:germany                     0.047558
en:united-kingdom              0.029793
en:canada                      0.025207
en:switzerland                 0.025152
en:belgium                     0.025092
en:australia                   0.009919
en:ireland                     0.009263
en:france,en:germany           0.005367
en:france,en:spain             0.004307
en:netherlands                 0.004056
en:russia                      0.003876
en:belgium,en:france           0.003586
en:france,en:switzerland       0.003361
en:germany,en:ireland          0.003258
en:poland                      0.002923
en:mexico                      0.002807
en:morocco                     0.002427
en:sweden                      0.002133
en:portugal                    0.002098
en:luxembourg                  0.002096
en:austria                     0.002015


In [78]:
df.sample(n=100)

Unnamed: 0,countries_tags,ingredients_tags,nova_group
908942,en:france,,
604628,en:united-states,"en:tomato,en:vegetable,en:onion,en:root-vegeta...",3.0
1740814,en:france,,
2277476,en:france,,
796706,en:france,,
167760,en:united-states,"en:corn,en:cereal,en:black-beans,en:legume,en:...",1.0
1586957,en:france,,
1723830,"en:germany,en:spain",es:hazelnuts,
463304,en:united-states,,
2442354,en:democratic-republic-of-the-congo,,


In [71]:
df.nova_group.value_counts()

4.0    454561
3.0    137070
1.0     76892
2.0     36016
Name: nova_group, dtype: int64

In [63]:
df.main_category.value_counts().iloc[:50]

en:snacks                               34531
en:sauces                               17408
en:biscuits                             16887
en:cheeses                              16621
en:confectioneries                      16510
en:beverages                            13158
en:breads                               10669
en:frozen-desserts                       9505
en:yogurts                               8622
en:cereals-and-their-products            8566
en:sweetened-beverages                   7881
en:salted-snacks                         7676
en:frozen-foods                          7636
en:condiments                            7623
en:cakes                                 7459
en:extra-virgin-olive-oils               6895
en:pastas                                6801
en:plant-based-beverages                 6713
en:prepared-meats                        6197
en:chocolate-candies                     6101
en:sodas                                 5887
en:dips                           