## Non-food sample from the Open Food Facts dataset

In [None]:
import pandas as pd
from datasets import load_dataset

Load a small sample of the dataset: the first 10,000 records of the `beauty` split, read in streaming mode.

In [9]:
# Open the "beauty" split in streaming mode (records are iterated, not bulk-loaded).
train_iter = load_dataset(
    "openfoodfacts/product-database",
    split="beauty",
    streaming=True,
)

# Materialise the first 10,000 streamed records, then build the DataFrame from them.
sample_list = list(train_iter.take(10_000))
df = pd.DataFrame(sample_list)

# Quick sanity check: row count and a preview of the first rows.
print(f"Loaded {len(df)} items.")
print(df.head())

Loaded 10000 items.
   additives_n additives_tags                         allergens_tags  \
0          NaN             []                                     []   
1          NaN             []                                     []   
2          NaN             []                                     []   
3          NaN             []                                     []   
4          0.0             []  [en:citral, en:limonene, en:linalool]   

         brands_tags        brands                 categories  \
0                 []                                            
1                 []                                            
2  [xx:milton-brown]  Milton Brown                       None   
3        [xx:awapul]        Awapul                       None   
4     [xx:fragonard]     Fragonard  Parfums, Eaux de toilette   

                     categories_tags categories_properties checkers_tags  \
0                                 []                  None            []   
1   

In [4]:
# Show every column name available in the sample.
print("Columns:", list(df.columns))

# Count null entries in product_name.
# NOTE(review): isna() only flags None/NaN; empty lists or empty strings count as present.
print("\nMissing values:")
print(df[['product_name']].isna().sum())


Missing values:
product_name    0
dtype: int64


## Extract `product_name` from the list of per-language dictionaries

In [11]:
def extract_product_name(product_name):
    """Return a plain-text name from a raw ``product_name`` value.

    Args:
        product_name: Either a list of entries (dicts carrying ``'lang'`` and
            ``'text'`` keys are the ones considered) or any other value.

    Returns:
        For a list: the ``'text'`` of the first dict entry whose ``'lang'`` is
        ``'main'`` or ``'en'``, or ``None`` when no entry qualifies.
        For anything that is not a list: the value itself, unchanged.
    """
    # Non-list values (already-extracted strings, None, ...) pass straight through.
    if not isinstance(product_name, list):
        return product_name

    accepted_langs = ('main', 'en')
    candidates = (
        item['text']
        for item in product_name
        if isinstance(item, dict)
        and 'lang' in item
        and 'text' in item
        and item['lang'] in accepted_langs
    )
    # First qualifying entry in list order wins; None signals "nothing usable".
    return next(candidates, None)

# Replace each raw product_name value with the text picked by extract_product_name.
extracted_names = df['product_name'].apply(extract_product_name)
df['product_name'] = extracted_names

## Deduplicate by `product_name`, assign label = 0 (all items are non-food), and keep only `product_name` and `label`

In [12]:
# Rows whose name could not be extracted (None/NaN) carry no text, so drop them first:
# drop_duplicates treats all nulls as one value and would otherwise keep one such row,
# which would then be labelled and written out with an empty product_name.
df = (
    df.dropna(subset=['product_name'])
      .drop_duplicates(subset=['product_name'])
      .assign(label=0)  # every item in this sample is non-food
)

# Keep only the two stored columns; .copy() makes df_clean independent of df.
df_clean = df[['product_name', 'label']].copy()

print(f"Cleaned DataFrame shape: {df_clean.shape}")
print("\nLabel distribution:")
print(df_clean['label'].value_counts())
print("\nSample:")
print(df_clean.head())

Cleaned DataFrame shape: (8378, 2)

Label distribution:
label
0    8378
Name: count, dtype: int64

Sample:
                                  product_name  label
0                          Kiehls - Cream #gel      0
1              huile de douche Daniel jouvence      0
2  Rejuvenating Arctic shajio body moisturiser      0
3                                        Spray      0
4                        Cèdre eau de toilette      0


In [13]:
# Write the cleaned name/label pairs to disk without the DataFrame index column.
output_path = 'non_food_sample.csv'
df_clean.to_csv(output_path, index=False)
print(f"Saved to '{output_path}'")

Saved to 'non_food_sample.csv'
