## Dataset from Open Food Facts

In [30]:
import pandas as pd
from datasets import load_dataset

Load a small sample of the dataset (the first 1,000 records of the `food` split, streamed rather than downloaded in full).

In [31]:
# Open the "food" split in streaming mode (streaming=True) so the sample
# below can be taken without first materialising the whole split.
train_iter = load_dataset(
    "openfoodfacts/product-database",
    split="food",
    streaming=True,
)

# Collect the first 1000 streamed records into a list, then tabulate them.
sample_list = [record for record in train_iter.take(1000)]
df = pd.DataFrame(sample_list)

# Quick sanity check: row count and the first few rows.
print(f"Loaded {len(df)} items.")
print(df.head())

Loaded 1000 items.
   additives_n additives_tags allergens_tags   brands_tags   brands  \
0          NaN           None      [en:nuts]  [xx:bovetti]  Bovetti   
1          0.0             []             []      [lagg-s]   Lagg's   
2          0.0             []             []      [lagg-s]   Lagg's   
3          0.0             []             []   [xx:lagg-s]   Lagg's   
4          0.0             []             []      [lagg-s]   Lagg's   

                                          categories  \
0  Petit-déjeuners,Produits à tartiner,Produits à...   
1                                               null   
2  Plant-based foods and beverages, Beverages, Ho...   
3  Beverages and beverages preparations, Plant-ba...   
4                                               None   

                                     categories_tags  \
0  [en:breakfasts, en:spreads, en:sweet-spreads, ...   
1                                          [en:null]   
2  [en:plant-based-foods-and-beverages, en:bevera

In [32]:
# List every column that came back from the sample.
column_names = df.columns.tolist()
print("Columns:", column_names)

# Count missing cells in the product_name column.
# NOTE(review): at this point product_name presumably still holds the raw
# per-language entries, so this only counts cells that are entirely missing,
# not products that lack a usable name — confirm.
missing_counts = df[['product_name']].isnull().sum()
print("\nMissing values:")
print(missing_counts)


Missing values:
product_name    0
dtype: int64


## Extract `product_name` from the list of language-tagged dictionaries

In [33]:
def extract_product_name(product_name):
    """Reduce a raw ``product_name`` value to a single name.

    Parameters
    ----------
    product_name : list or any
        Either a list of entries (dicts carrying ``'lang'`` and ``'text'``
        keys are the ones considered) or any other value.

    Returns
    -------
    The ``'text'`` of the first list entry whose ``'lang'`` is ``'main'`` or
    ``'en'``; ``None`` when the input is a list with no such entry; the input
    itself, unchanged, when it is not a list.
    """
    # Anything that is not a list is passed through untouched.
    if not isinstance(product_name, list):
        return product_name

    # Lazily walk the entries in their original order; malformed entries
    # (non-dicts, or dicts missing either key) are ignored.
    matching_texts = (
        entry['text']
        for entry in product_name
        if isinstance(entry, dict)
        and 'lang' in entry
        and 'text' in entry
        and entry['lang'] in ('main', 'en')
    )
    # First match wins; fall back to None when nothing qualifies.
    return next(matching_texts, None)

# Run the extractor over every row, then overwrite the raw column with the result.
extracted_names = df['product_name'].apply(extract_product_name)
df['product_name'] = extracted_names

## Drop duplicate names, assign label = 1 (every item is a food product), and keep only `product_name` and `label`

In [34]:
# Keep one row per distinct product_name, then tag every remaining row
# with the constant label 1.
df = (
    df
    .drop_duplicates(subset=['product_name'])
    .assign(label=1)
)

# Narrow the frame to the two columns that are kept.
df_clean = df.loc[:, ['product_name', 'label']]

# Summarise the result: shape, label counts, and a few example rows.
print(f"Cleaned DataFrame shape: {df_clean.shape}")

print("\nLabel distribution:")
label_counts = df_clean['label'].value_counts()
print(label_counts)

print("\nSample:")
print(df_clean.head())

Cleaned DataFrame shape: (850, 2)

Label distribution:
label
1    850
Name: count, dtype: int64

Sample:
                                        product_name  label
0  Véritable pâte à tartiner noisettes chocolat noir      1
1                               Chamomile Herbal Tea      1
2                     Lagg's, herbal tea, peppermint      1
3                                 Linden Flowers Tea      1
4                               Herbal Tea, Hibiscus      1


In [35]:
# Write the cleaned frame to CSV; index=False leaves the row index out of the file.
df_clean.to_csv(path_or_buf='food_sample.csv', index=False)
print("Saved to 'food_sample.csv'")

Saved to 'food_sample.csv'
