In [81]:
import pandas as pd
import numpy as np


## Load Dataset

In [83]:
# Read the processed additives table and preview its first five rows
df_additives = pd.read_csv(filepath_or_buffer='../data/addtitives_processed.csv')
df_additives.head(n=5)

Unnamed: 0,ingredient,purpose,health_concern,alternative_names,bad_ingredients,bad_ingredients_preprocessed,processed_bad_ingredients
0,Acesulfame potassium,Sweetener,"Cancer, Hormone Disruption, Risks to Pregnant ...","Ace-K, Acesulfame K, E950,Sunett","Acesulfame potassium,Ace-K, Acesulfame K, E950...","acesulfame potassium, ace-k, acesulfame k, e95...","['acesulfame potassium', 'ace-k', 'acesulfame ..."
1,Allura Red AC,Coloring,May cause hyperactivity in children.,"E129,Red 40","Allura Red AC,E129,Red 40","allura red ac, e129, red 40","['allura red ac', 'e129', 'red 40']"
2,Aloe vera,Flavoring,Cancer,"Aloe barbadensis, Aloe leaf extract","Aloe vera,Aloe barbadensis, Aloe leaf extract","aloe vera, aloe barbadensis, aloe leaf extract","['aloe vera', 'aloe barbadensis', 'aloe leaf e..."
3,Amaranth,Coloring,carcinogenic effects,"E123,Red No. 2","Amaranth,E123,Red No. 2","amaranth, e123, red no. 2","['amaranth', 'e123', 'red no. 2']"
4,Aspartame,Sweetener,"Cancer,Linked to headaches, dizziness","Equal, NutraSweet, E951, AminoSweet","Aspartame ,Equal, NutraSweet, E951, AminoSweet","aspartame, equal, nutrasweet, e951, aminosweet","['aspartame', 'equal', 'nutrasweet', 'e951', '..."


In [84]:
# Read the processed food-products table and preview its first five rows
df_food = pd.read_csv(filepath_or_buffer='../data/food_processed.csv')
df_food.head(n=5)

Unnamed: 0,brands,product_name,categories,countries,food_groups_tags,food_groups,ingredients_text,category_name,country,processed_ingredients
0,"Mutti,POLPA",pulpe de tomates,"Pflanzliche Lebensmittel und Getränke,Pflanzli...","Australien,Österreich,Belgien,Kanada,Frankreic...","['en:fruits-and-vegetables', 'en:vegetables']",en:vegetables,"tomatoes 99.8%, salt",Plant-based foods and beverages,canada,"tomatoes 99.8%, salt"
1,"Maïzena, Unilever",Maizena Fleur de Maïs Sans Gluten 400g,"Plant-based foods and beverages, Plant-based f...","Belgique, Canada, France, Martinique, La Réuni...","['en:cereals-and-potatoes', 'en:cereals']",en:cereals,Amidon de maïs.,Plant-based foods and beverages,canada,amidon de maïs.
2,Barilla,Lasagne all'uovo,"Cibi e bevande a base vegetale, Cibi a base ve...","Belgium,Canada,Croatia,France,Germany,Greece,H...","['en:cereals-and-potatoes', 'en:cereals']",en:cereals,"Semola di grano duro, uova fresche di categori...",Plant-based foods and beverages,canada,"semola di grano duro, uova fresche di categori..."
3,Tipiak,Fine chapelure de pain,"Aliments et boissons à base de végétaux,Alimen...","Canada,France","['en:cereals-and-potatoes', 'en:bread']",en:bread,"Farine de blé (gluten), sel, levure, Traces po...",Plant-based foods and beverages,canada,"farine de blé (gluten), sel, levure, traces po..."
4,"Zespri,Sungold,Catania,ALDI Zespri",Kiwi Sungold,"Aliments et boissons à base de végétaux,Alimen...","Belgique,Canada,France,Allemagne,Pologne,Espag...","['en:fruits-and-vegetables', 'en:fruits']",en:fruits,Kiwifruit,Plant-based foods and beverages,canada,kiwifruit


### Labelling food products as healthy or not healthy based on their ingredients


In [86]:
# Collect every name listed in the `bad_ingredients_preprocessed` column into one set.
# A set comprehension replaces the side-effecting `.apply(set.update)`. Rows with a
# missing value are dropped first, because splitting NaN yields NaN and iterating
# over it would raise TypeError.
bad_ingredients_set = {
    name
    for names in df_additives['bad_ingredients_preprocessed'].dropna().str.split(', ')
    for name in names
    if name  # skip empty strings left over from the split
}
bad_ingredients_set

{'ace-k',
 'acesulfame k',
 'acesulfame potassium',
 'ada',
 'allura red ac',
 'aloe barbadensis',
 'aloe leaf extract',
 'aloe vera',
 'amaranth',
 'aminosweet',
 'antioxidant 319',
 'aspartame',
 'azodicarbonamide',
 'azorubine',
 'benzoic acid salt',
 'bha',
 'bht',
 'black pn',
 'blue 1',
 'brilliant black bn',
 'brilliant blue fcf.',
 'bromated flour',
 'bromic acid potassium salt',
 'brominated palm oil',
 'brominated soybean oil',
 'brominated vegetable fat',
 'brominated vegetable oil',
 'butylated hydroxyanisole',
 'butylated hydroxytoluene',
 'bvo',
 'calcium bisulfite',
 'calcium propionate',
 'calcium sulfite',
 'cap',
 'caramel coloring',
 'carboxymethylcellulose',
 'carmoisine',
 'carrageenan',
 'chile saltpeter',
 'chloramphenicol',
 'cochineal red a',
 'cyclamate',
 'cyclamates',
 'cyclamic acid',
 'dough conditioner',
 'e102',
 'e110',
 'e122',
 'e123',
 'e124',
 'e127',
 'e129',
 'e133',
 'e150c',
 'e150d',
 'e151',
 'e171',
 'e202',
 'e211',
 'e220',
 'e221',
 'e222'

In [87]:
#function to assign health_label based on the presence of bad ingredients in food database ingredient_list

def health_label(ingredients_text, bad_ingredients=None):
    """Label a product by whether any of its ingredients is a known bad ingredient.

    Parameters
    ----------
    ingredients_text : str
        Ingredient list separated by ', '. Each piece is compared verbatim
        against the bad-ingredient names (exact match, no stripping or
        case folding is done here).
    bad_ingredients : iterable of str, optional
        Names to flag. When omitted, the notebook-level
        ``bad_ingredients_set`` is used.

    Returns
    -------
    str
        'not healthy' if at least one ingredient is in ``bad_ingredients``,
        otherwise 'healthy'. A missing or non-string value is treated like an
        empty ingredient list and therefore labelled 'healthy'.
    """
    # Missing values reach .apply() as NaN (a float) or None, which have no
    # .split(); treat them the same way an empty string is already treated.
    # NOTE(review): confirm 'healthy' is the desired label for unknown ingredients.
    if not isinstance(ingredients_text, str):
        return 'healthy'

    if bad_ingredients is None:
        bad_ingredients = bad_ingredients_set

    # split the preprocessed ingredient text and drop any empty piece
    product_ingredients = set(ingredients_text.split(', '))
    product_ingredients.discard('')

    # no overlap with the bad-ingredient names means nothing was flagged
    if product_ingredients.isdisjoint(bad_ingredients):
        return 'healthy'
    return 'not healthy'


In [88]:

# Label every product with the `health_label` function defined in the previous cell.
# The earlier call used `assign_health_label`, a name that is not defined in the
# visible cells and raises NameError on a fresh kernel (Restart & Run All).
df_food['health_label'] = df_food['processed_ingredients'].apply(health_label)
df_food.head()

Unnamed: 0,brands,product_name,categories,countries,food_groups_tags,food_groups,ingredients_text,category_name,country,processed_ingredients,health_label
0,"Mutti,POLPA",pulpe de tomates,"Pflanzliche Lebensmittel und Getränke,Pflanzli...","Australien,Österreich,Belgien,Kanada,Frankreic...","['en:fruits-and-vegetables', 'en:vegetables']",en:vegetables,"tomatoes 99.8%, salt",Plant-based foods and beverages,canada,"tomatoes 99.8%, salt",healthy
1,"Maïzena, Unilever",Maizena Fleur de Maïs Sans Gluten 400g,"Plant-based foods and beverages, Plant-based f...","Belgique, Canada, France, Martinique, La Réuni...","['en:cereals-and-potatoes', 'en:cereals']",en:cereals,Amidon de maïs.,Plant-based foods and beverages,canada,amidon de maïs.,healthy
2,Barilla,Lasagne all'uovo,"Cibi e bevande a base vegetale, Cibi a base ve...","Belgium,Canada,Croatia,France,Germany,Greece,H...","['en:cereals-and-potatoes', 'en:cereals']",en:cereals,"Semola di grano duro, uova fresche di categori...",Plant-based foods and beverages,canada,"semola di grano duro, uova fresche di categori...",healthy
3,Tipiak,Fine chapelure de pain,"Aliments et boissons à base de végétaux,Alimen...","Canada,France","['en:cereals-and-potatoes', 'en:bread']",en:bread,"Farine de blé (gluten), sel, levure, Traces po...",Plant-based foods and beverages,canada,"farine de blé (gluten), sel, levure, traces po...",healthy
4,"Zespri,Sungold,Catania,ALDI Zespri",Kiwi Sungold,"Aliments et boissons à base de végétaux,Alimen...","Belgique,Canada,France,Allemagne,Pologne,Espag...","['en:fruits-and-vegetables', 'en:fruits']",en:fruits,Kiwifruit,Plant-based foods and beverages,canada,kiwifruit,healthy


In [89]:
# Keep only the products that were labelled 'not healthy' and display them
unhealthy_entries = df_food.loc[df_food['health_label'].eq('not healthy')]
unhealthy_entries

Unnamed: 0,brands,product_name,categories,countries,food_groups_tags,food_groups,ingredients_text,category_name,country,processed_ingredients,health_label
18,"Post, Shredded Wheat",Spoon Size Shredded Wheat & Bran - Canada,"Plant-based foods and beverages, Plant-based f...","Canada, World","['en:cereals-and-potatoes', 'en:breakfast-cere...",en:breakfast-cereals,"Whole grain wheat, Wheat bran, BHT",Plant-based foods and beverages,canada,"whole grain wheat, wheat bran, bht",not healthy
52,Prime,Ice Pop Hydration Drink,"Plant-based foods and beverages, Beverages, Pl...",Canada,"['en:beverages', 'en:sweetened-beverages']",en:sweetened-beverages,"Filtered water, Coconut water from concentrate...",Plant-based foods and beverages,canada,"filtered water, coconut water from concentrate...",not healthy
93,Prime,Blue Raspberry Hydration Drink,"Plant-based foods and beverages, Beverages, Pl...",Canada,"['en:beverages', 'en:artificially-sweetened-be...",en:artificially-sweetened-beverages,"Filtered water, Coconut water from concentrate...",Plant-based foods and beverages,canada,"filtered water, coconut water from concentrate...",not healthy
94,D'Italiano,Original Thick Sliced White Bread,"Plant-based foods and beverages, Plant-based f...","France,Canada","['en:cereals-and-potatoes', 'en:bread']",en:bread,"Enriched wheat flour, Water, Sugar, Yeast, Veg...",Plant-based foods and beverages,canada,"enriched wheat flour, water, sugar, yeast, veg...",not healthy
196,Kara,UHT Coconut Cream 24%,"Plantaardige levensmiddelen en dranken,Dranken...","Wereld,en:Franciaország,en:Magyarország","['en:beverages', 'en:plant-based-milk-substitu...",en:plant-based-milk-substitutes,"natural coconut cream (99,9%), xanthan gum, gu...",Plant-based foods and beverages,world,"natural coconut cream (99, 9%), xanthan gum, g...",not healthy
...,...,...,...,...,...,...,...,...,...,...,...
25634,Auchan,Festonate Fromages,en:instant-pasta,"France, Monde","['en:composite-foods', 'en:one-dish-meals']",en:one-dish-meals,"Pâtes aux œufs cuites 45,7% (eau, semoule de B...",Pasta dishes,world,"pâtes aux œufs cuites 45, 7% (eau, semoule de ...",not healthy
25678,Auchan,Festonate carbonara,"Plats préparés, Plats à base de pâtes, Pâtes i...","France, Monde","['en:composite-foods', 'en:one-dish-meals']",en:one-dish-meals,"Pâtes aux œufs cuites 42% (semoule de BLÉ dur,...",Pasta dishes,world,"pâtes aux œufs cuites 42% (semoule de blé dur,...",not healthy
25693,Auchan,Tagliatelles carbonara aux lardons fumés,"Plats préparés, Plats à base de pâtes, Plats p...","France, Monde","['en:composite-foods', 'en:one-dish-meals']",en:one-dish-meals,"Pâtesaux ŒUFS cuites 46% (eau, semoule deBLÉ d...",Pasta dishes,world,"pâtesaux œufs cuites 46% (eau, semoule deblé d...",not healthy
25696,Daiya,Dairy-Free Cheddar Mac & Cheese,"Plant-based foods and beverages, Plant-based f...","France,United States, World","['en:composite-foods', 'en:one-dish-meals']",en:one-dish-meals,"Brown rice pasta (whole grain brown rice, rice...",Pasta dishes,world,"brown rice pasta (whole grain brown rice, rice...",not healthy


In [99]:
# Save the labelled products. index=False keeps the row index out of the file;
# the frame already carries an 'Unnamed: 0' column from an earlier save, and
# writing the index again would add a second unnamed column on the next read.
df_food.to_csv('../data/final_data.csv', index=False)