In [897]:
import pandas as pd
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS


### Load Additives Data

In [899]:
# Load the banned-ingredients list from the project's data folder
df_additives = pd.read_csv('../data/Ingredients_list_banned.csv')
# Preview the first rows to confirm the file parsed as expected
df_additives.head()

Unnamed: 0,Ingredient,Purpose,Health_concern,Alternative_names
0,Acesulfame potassium,Sweetener,"Cancer, Hormone Disruption, Risks to Pregnant ...","Ace-K, Acesulfame K, E950,Sunett"
1,Allura Red AC,Coloring,May cause hyperactivity in children.,"E129,Red 40"
2,Aloe vera,Flavoring,Cancer,"Aloe barbadensis, Aloe leaf extract"
3,Amaranth,Coloring,carcinogenic effects,"E123,Red No. 2"
4,Aspartame,Sweetener,"Cancer,Linked to headaches, dizziness","Equal, NutraSweet, E951, AminoSweet"


#### Data Cleaning 

In [901]:
# Normalise the headers to lower case so the columns can be addressed as
# 'ingredient', 'alternative_names', ... in the cells that follow.
df_additives.columns = df_additives.columns.map(str.lower)

In [902]:
# Combine 'ingredient' and 'alternative_names' into one comma-separated list of
# unhealthy ingredient names.
# The ',' separator is attached to the alternative names *before* filling the
# missing values, so a row without alternative names yields just the ingredient
# instead of ending in a dangling ','.
df_additives['bad_ingredients'] = (
    df_additives['ingredient'] + (',' + df_additives['alternative_names']).fillna('')
)
df_additives.head(40)



Unnamed: 0,ingredient,purpose,health_concern,alternative_names,bad_ingredients
0,Acesulfame potassium,Sweetener,"Cancer, Hormone Disruption, Risks to Pregnant ...","Ace-K, Acesulfame K, E950,Sunett","Acesulfame potassium,Ace-K, Acesulfame K, E950..."
1,Allura Red AC,Coloring,May cause hyperactivity in children.,"E129,Red 40","Allura Red AC,E129,Red 40"
2,Aloe vera,Flavoring,Cancer,"Aloe barbadensis, Aloe leaf extract","Aloe vera,Aloe barbadensis, Aloe leaf extract"
3,Amaranth,Coloring,carcinogenic effects,"E123,Red No. 2","Amaranth,E123,Red No. 2"
4,Aspartame,Sweetener,"Cancer,Linked to headaches, dizziness","Equal, NutraSweet, E951, AminoSweet","Aspartame ,Equal, NutraSweet, E951, AminoSweet"
5,Azodicarbonamide,Coloring,Linked to respiratory issues and classified a...,"ADA,Dough Conditioner,Flour Treatment Agent","Azodicarbonamide,ADA,Dough Conditioner,Flour T..."
6,Brilliant Black BN,other,Linked to hyperactivity and allergic reactions.,"E151 ,Black PN","Brilliant Black BN,E151 ,Black PN"
7,Brominated vegetable oil,Emulsifier,Neurological & Behavioral,"BVO,Brominated Soybean Oil, Brominated Palm Oi...","Brominated vegetable oil ,BVO,Brominated Soybe..."
8,Butylated hydroxyanisole,Preservative,Cancer,"BHA, E320,Butylated Hydroxyanisole,E321,BHT,Bu...","Butylated hydroxyanisole ,BHA, E320,Butylated ..."
9,Caramel coloring,Coloring,Cancer,"E150d, Sulfite ammonia,E150c","Caramel coloring,E150d, Sulfite ammonia,E150c"


In [903]:
# Count missing values in each column of the additives table
df_additives.isna().sum()

ingredient           0
purpose              0
health_concern       1
alternative_names    2
bad_ingredients      0
dtype: int64

In [904]:
# Show column dtypes and non-null counts for the additives table
df_additives.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38 entries, 0 to 37
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   ingredient         38 non-null     object
 1   purpose            38 non-null     object
 2   health_concern     37 non-null     object
 3   alternative_names  36 non-null     object
 4   bad_ingredients    38 non-null     object
dtypes: object(5)
memory usage: 1.6+ KB


In [905]:
# Summary statistics for every column (count / unique / top / freq)
df_additives.describe(include='all')

Unnamed: 0,ingredient,purpose,health_concern,alternative_names,bad_ingredients
count,38,38,37,36,38
unique,38,11,29,36,38
top,Acesulfame potassium,Coloring,Cancer,"Ace-K, Acesulfame K, E950,Sunett","Acesulfame potassium,Ace-K, Acesulfame K, E950..."
freq,1,12,9,1,1


### Load Food Products Dataset

In [907]:
# Load the food products export; the earlier '../data/ingredients v1.csv' input is no longer read
df_food = pd.read_csv('../data/openfoodfacts_products_world.csv')
# Preview the first 20 products
df_food.head(20)

Unnamed: 0,brands,product_name,categories,countries,food_groups_tags,food_groups,ingredients_text,category_name,country
0,"Mutti,POLPA",pulpe de tomates,"Pflanzliche Lebensmittel und Getränke,Pflanzli...","Australien,Österreich,Belgien,Kanada,Frankreic...","['en:fruits-and-vegetables', 'en:vegetables']",en:vegetables,"tomatoes 99.8%, salt",Plant-based foods and beverages,canada
1,"Maïzena, Unilever",Maizena Fleur de Maïs Sans Gluten 400g,"Plant-based foods and beverages, Plant-based f...","Belgique, Canada, France, Martinique, La Réuni...","['en:cereals-and-potatoes', 'en:cereals']",en:cereals,Amidon de maïs.,Plant-based foods and beverages,canada
2,Barilla,Lasagne all'uovo,"Cibi e bevande a base vegetale, Cibi a base ve...","Belgium,Canada,Croatia,France,Germany,Greece,H...","['en:cereals-and-potatoes', 'en:cereals']",en:cereals,"Semola di grano duro, uova fresche di categori...",Plant-based foods and beverages,canada
3,Tipiak,Fine chapelure de pain,"Aliments et boissons à base de végétaux,Alimen...","Canada,France","['en:cereals-and-potatoes', 'en:bread']",en:bread,"Farine de blé (gluten), sel, levure, Traces po...",Plant-based foods and beverages,canada
4,"Zespri,Sungold,Catania,ALDI Zespri",Kiwi Sungold,"Aliments et boissons à base de végétaux,Alimen...","Belgique,Canada,France,Allemagne,Pologne,Espag...","['en:fruits-and-vegetables', 'en:fruits']",en:fruits,Kiwifruit,Plant-based foods and beverages,canada
5,Kraft,Creamy Peanut Butter,"Plant-based foods and beverages, Plant-based f...","Canada,France","['en:fats-and-sauces', 'en:fats']",en:fats,"_Select roasted peanuts_, Soybean oil, Sugars ...",Plant-based foods and beverages,canada
6,Quaker,So Simple Apple And Blueberry imp,"Plant-based foods and beverages, Plant-based f...",Canada,"['en:cereals-and-potatoes', 'en:breakfast-cere...",en:breakfast-cereals,"Quaker Wholegrain Rolled Oats (77%), Sugar, Dr...",Plant-based foods and beverages,canada
7,Kraft,Smooth Peanut Butter,"Plant-based foods and beverages, Plant-based f...","Canada,France","['en:cereals-and-potatoes', 'en:legumes']",en:legumes,"Select roasted peanuts, Soybean oil, Corn malt...",Plant-based foods and beverages,canada
8,La Molisana,Penne rigate nº 20,"Cibi e bevande a base vegetale, Cibi a base ve...","Canada,Italy","['en:cereals-and-potatoes', 'en:cereals']",en:cereals,pasta di semola di grano duro,Plant-based foods and beverages,canada
9,Vita Coco,Coconut Water,"Beverages and beverages preparations,Plant-bas...","Canada,France,United Kingdom,United States, World","['en:beverages', 'en:sweetened-beverages']",en:sweetened-beverages,"Coconut Water (99%), Fructose (&lt;1%), Vitami...",Plant-based foods and beverages,canada


#### Data Cleaning

In [909]:
# List the column names available in the food products table
df_food.columns

Index(['brands', 'product_name', 'categories', 'countries', 'food_groups_tags',
       'food_groups', 'ingredients_text', 'category_name', 'country'],
      dtype='object')

In [910]:
# Number of (rows, columns) in the food products table
df_food.shape

(29414, 9)

In [911]:
# Count missing values in each column of the food products table
df_food.isna().sum()


brands              1712
product_name         540
categories             0
countries              0
food_groups_tags       0
food_groups         1053
ingredients_text    3686
category_name          0
country                0
dtype: int64

In [912]:
# Drop rows whose 'ingredients_text' is missing; that column is the input to the
# text preprocessing applied to this table further down. Rows with a missing
# product name are intentionally kept.
# Reassign rather than using inplace=True so the cell stays safe to re-run.
df_food = df_food.dropna(subset=['ingredients_text'])
df_food.shape

(25728, 9)

In [913]:
# Re-count missing values to confirm no row is left without ingredients_text
df_food.isna().sum()

brands              625
product_name        271
categories            0
countries             0
food_groups_tags      0
food_groups         757
ingredients_text      0
category_name         0
country               0
dtype: int64

In [914]:
# Show column dtypes and non-null counts for the food products table
df_food.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25728 entries, 0 to 29413
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   brands            25103 non-null  object
 1   product_name      25457 non-null  object
 2   categories        25728 non-null  object
 3   countries         25728 non-null  object
 4   food_groups_tags  25728 non-null  object
 5   food_groups       24971 non-null  object
 6   ingredients_text  25728 non-null  object
 7   category_name     25728 non-null  object
 8   country           25728 non-null  object
dtypes: object(9)
memory usage: 2.0+ MB


In [915]:
# Summary statistics for every column of the food products table
df_food.describe(include= 'all')

Unnamed: 0,brands,product_name,categories,countries,food_groups_tags,food_groups,ingredients_text,category_name,country
count,25103,25457,25728,25728,25728,24971,25728,25728,25728
unique,4130,9831,6667,1132,44,43,10364,100,3
top,Auchan,Extra Virgin Olive Oil,"Breakfasts, Spreads, Sweet spreads, Bee produc...",Canada,"['en:sugary-snacks', 'en:sweets']",en:sweets,Extra virgin olive oil,Cereals and potatoes,world
freq,3603,66,342,3669,2366,2366,58,297,9290


-----

### Text Preprocessing

In [918]:
def preprocess_and_split_ingredients(text):
    """Split a comma-separated string into a list of cleaned names.

    Each component is stripped of surrounding whitespace and lower-cased.
    Components that are empty after stripping (caused by trailing or doubled
    commas) are dropped.

    Parameters
    ----------
    text : str or missing value
        Raw comma-separated text, e.g. "Allura Red AC,E129,Red 40".

    Returns
    -------
    list of str
        Cleaned components in their original order; an empty list when
        `text` is missing (NaN / None).
    """
    # Missing cells arrive as float NaN (or None); they carry no names.
    if pd.isna(text):
        return []
    # str() guards against a cell that was parsed as a number rather than text.
    cleaned = (component.strip().lower() for component in str(text).split(','))
    return [component for component in cleaned if component]


def preprocess_text(text):
    """Normalise a comma-separated string to lower-case, ', '-joined form.

    Parameters
    ----------
    text : str or missing value
        Raw comma-separated text (for example an ingredient list).

    Returns
    -------
    str
        The components lower-cased, stripped and re-joined with ', '.
        Empty components are dropped so the result never contains a dangling
        separator. Missing input (NaN / None) returns ''.
    """
    # Reuse the splitter so the string form and the list form always agree.
    return ', '.join(preprocess_and_split_ingredients(text))

            

In [919]:

# Lower-case and trim every name in 'bad_ingredients', keeping the result as a
# single ', '-joined string per row.
df_additives['bad_ingredients_preprocessed'] = df_additives['bad_ingredients'].map(preprocess_text)
df_additives

Unnamed: 0,ingredient,purpose,health_concern,alternative_names,bad_ingredients,bad_ingredients_preprocessed
0,Acesulfame potassium,Sweetener,"Cancer, Hormone Disruption, Risks to Pregnant ...","Ace-K, Acesulfame K, E950,Sunett","Acesulfame potassium,Ace-K, Acesulfame K, E950...","acesulfame potassium, ace-k, acesulfame k, e95..."
1,Allura Red AC,Coloring,May cause hyperactivity in children.,"E129,Red 40","Allura Red AC,E129,Red 40","allura red ac, e129, red 40"
2,Aloe vera,Flavoring,Cancer,"Aloe barbadensis, Aloe leaf extract","Aloe vera,Aloe barbadensis, Aloe leaf extract","aloe vera, aloe barbadensis, aloe leaf extract"
3,Amaranth,Coloring,carcinogenic effects,"E123,Red No. 2","Amaranth,E123,Red No. 2","amaranth, e123, red no. 2"
4,Aspartame,Sweetener,"Cancer,Linked to headaches, dizziness","Equal, NutraSweet, E951, AminoSweet","Aspartame ,Equal, NutraSweet, E951, AminoSweet","aspartame, equal, nutrasweet, e951, aminosweet"
5,Azodicarbonamide,Coloring,Linked to respiratory issues and classified a...,"ADA,Dough Conditioner,Flour Treatment Agent","Azodicarbonamide,ADA,Dough Conditioner,Flour T...","azodicarbonamide, ada, dough conditioner, flou..."
6,Brilliant Black BN,other,Linked to hyperactivity and allergic reactions.,"E151 ,Black PN","Brilliant Black BN,E151 ,Black PN","brilliant black bn, e151, black pn"
7,Brominated vegetable oil,Emulsifier,Neurological & Behavioral,"BVO,Brominated Soybean Oil, Brominated Palm Oi...","Brominated vegetable oil ,BVO,Brominated Soybe...","brominated vegetable oil, bvo, brominated soyb..."
8,Butylated hydroxyanisole,Preservative,Cancer,"BHA, E320,Butylated Hydroxyanisole,E321,BHT,Bu...","Butylated hydroxyanisole ,BHA, E320,Butylated ...","butylated hydroxyanisole, bha, e320, butylated..."
9,Caramel coloring,Coloring,Cancer,"E150d, Sulfite ammonia,E150c","Caramel coloring,E150d, Sulfite ammonia,E150c","caramel coloring, e150d, sulfite ammonia, e150c"


In [920]:
# Normalise each product's raw ingredient text with preprocess_text
# (lower-cased, trimmed, ', '-joined components).
df_food['processed_ingredients'] = df_food['ingredients_text'].map(preprocess_text)
df_food.head()

Unnamed: 0,brands,product_name,categories,countries,food_groups_tags,food_groups,ingredients_text,category_name,country,processed_ingredients
0,"Mutti,POLPA",pulpe de tomates,"Pflanzliche Lebensmittel und Getränke,Pflanzli...","Australien,Österreich,Belgien,Kanada,Frankreic...","['en:fruits-and-vegetables', 'en:vegetables']",en:vegetables,"tomatoes 99.8%, salt",Plant-based foods and beverages,canada,"tomatoes 99.8%, salt"
1,"Maïzena, Unilever",Maizena Fleur de Maïs Sans Gluten 400g,"Plant-based foods and beverages, Plant-based f...","Belgique, Canada, France, Martinique, La Réuni...","['en:cereals-and-potatoes', 'en:cereals']",en:cereals,Amidon de maïs.,Plant-based foods and beverages,canada,amidon de maïs.
2,Barilla,Lasagne all'uovo,"Cibi e bevande a base vegetale, Cibi a base ve...","Belgium,Canada,Croatia,France,Germany,Greece,H...","['en:cereals-and-potatoes', 'en:cereals']",en:cereals,"Semola di grano duro, uova fresche di categori...",Plant-based foods and beverages,canada,"semola di grano duro, uova fresche di categori..."
3,Tipiak,Fine chapelure de pain,"Aliments et boissons à base de végétaux,Alimen...","Canada,France","['en:cereals-and-potatoes', 'en:bread']",en:bread,"Farine de blé (gluten), sel, levure, Traces po...",Plant-based foods and beverages,canada,"farine de blé (gluten), sel, levure, traces po..."
4,"Zespri,Sungold,Catania,ALDI Zespri",Kiwi Sungold,"Aliments et boissons à base de végétaux,Alimen...","Belgique,Canada,France,Allemagne,Pologne,Espag...","['en:fruits-and-vegetables', 'en:fruits']",en:fruits,Kiwifruit,Plant-based foods and beverages,canada,kiwifruit


In [921]:
# Build one list of cleaned names per additive row.
# preprocess_and_split_ingredients must already be defined in the kernel
# when this cell runs.
df_additives['processed_bad_ingredients'] = df_additives['bad_ingredients'].map(
    preprocess_and_split_ingredients
)
df_additives.head(30)

Unnamed: 0,ingredient,purpose,health_concern,alternative_names,bad_ingredients,bad_ingredients_preprocessed,processed_bad_ingredients
0,Acesulfame potassium,Sweetener,"Cancer, Hormone Disruption, Risks to Pregnant ...","Ace-K, Acesulfame K, E950,Sunett","Acesulfame potassium,Ace-K, Acesulfame K, E950...","acesulfame potassium, ace-k, acesulfame k, e95...","[acesulfame potassium, ace-k, acesulfame k, e9..."
1,Allura Red AC,Coloring,May cause hyperactivity in children.,"E129,Red 40","Allura Red AC,E129,Red 40","allura red ac, e129, red 40","[allura red ac, e129, red 40]"
2,Aloe vera,Flavoring,Cancer,"Aloe barbadensis, Aloe leaf extract","Aloe vera,Aloe barbadensis, Aloe leaf extract","aloe vera, aloe barbadensis, aloe leaf extract","[aloe vera, aloe barbadensis, aloe leaf extract]"
3,Amaranth,Coloring,carcinogenic effects,"E123,Red No. 2","Amaranth,E123,Red No. 2","amaranth, e123, red no. 2","[amaranth, e123, red no. 2]"
4,Aspartame,Sweetener,"Cancer,Linked to headaches, dizziness","Equal, NutraSweet, E951, AminoSweet","Aspartame ,Equal, NutraSweet, E951, AminoSweet","aspartame, equal, nutrasweet, e951, aminosweet","[aspartame, equal, nutrasweet, e951, aminosweet]"
5,Azodicarbonamide,Coloring,Linked to respiratory issues and classified a...,"ADA,Dough Conditioner,Flour Treatment Agent","Azodicarbonamide,ADA,Dough Conditioner,Flour T...","azodicarbonamide, ada, dough conditioner, flou...","[azodicarbonamide, ada, dough conditioner, flo..."
6,Brilliant Black BN,other,Linked to hyperactivity and allergic reactions.,"E151 ,Black PN","Brilliant Black BN,E151 ,Black PN","brilliant black bn, e151, black pn","[brilliant black bn, e151, black pn]"
7,Brominated vegetable oil,Emulsifier,Neurological & Behavioral,"BVO,Brominated Soybean Oil, Brominated Palm Oi...","Brominated vegetable oil ,BVO,Brominated Soybe...","brominated vegetable oil, bvo, brominated soyb...","[brominated vegetable oil, bvo, brominated soy..."
8,Butylated hydroxyanisole,Preservative,Cancer,"BHA, E320,Butylated Hydroxyanisole,E321,BHT,Bu...","Butylated hydroxyanisole ,BHA, E320,Butylated ...","butylated hydroxyanisole, bha, e320, butylated...","[butylated hydroxyanisole, bha, e320, butylate..."
9,Caramel coloring,Coloring,Cancer,"E150d, Sulfite ammonia,E150c","Caramel coloring,E150d, Sulfite ammonia,E150c","caramel coloring, e150d, sulfite ammonia, e150c","[caramel coloring, e150d, sulfite ammonia, e150c]"


In [922]:
# Save both text-processed datasets as CSV, omitting the row index.
# NOTE(review): the first file name is spelled 'addtitives' (likely a typo of
# 'additives'); it is left unchanged because other code may read this exact path.
# NOTE(review): the list-valued 'processed_bad_ingredients' column is written as
# its string representation in CSV; parse it back when reloading.

df_additives.to_csv('../data/addtitives_processed.csv',index=False)
df_food.to_csv('../data/food_processed.csv',index = False)