In [33]:
import requests
import pandas as pd
import numpy as np

In [8]:
def fetch_classifications(pages=120):
    base_url = "https://world.openfoodfacts.org/api/v2/search"
    records = []
    
    for page in range(1, pages + 1):   
        params = {
            "categories": "chocolates",
            "fields": "code,product_name,brands,nutriments",
            "page_size": 100,  
            "page": page
        }
        
        response = requests.get(base_url, params=params)
        if response.status_code == 200:
            data = response.json()
            products = data.get("products", [])
            records.extend(products)
              
    return pd.DataFrame(records, columns=["code", "product_name", "brands", "nutriments"])

In [9]:
data=fetch_classifications(pages=2)

In [10]:
data.head()

Unnamed: 0,code,product_name,brands,nutriments
0,6111035000430,Sidi Ali,Sidi Ali,"{'carbohydrates': 42, 'carbohydrates_100g': 4...."
1,6111242100992,Perly,Jaouda,"{'calcium': 0.25, 'calcium_100g': 0.25, 'calci..."
2,6111035002175,Sidi Ali,sidi ali,"{'chloride': 0.014, 'chloride_100g': 0.014, 'c..."
3,6111035000058,Eau minérale naturelle,"Les Eaux Minérales d'oulmès,Sidi Ali",{'fruits-vegetables-legumes-estimate-from-ingr...
4,6111252421568,اكوافينا,AQUAFINA,"{'alcohol': 0, 'alcohol_100g': 0, 'alcohol_ser..."


In [11]:
data=fetch_classifications(pages=120)

In [12]:
data

Unnamed: 0,code,product_name,brands,nutriments
0,6111035000430,Sidi Ali,Sidi Ali,"{'carbohydrates': 42, 'carbohydrates_100g': 4...."
1,6111242100992,Perly,Jaouda,"{'calcium': 0.25, 'calcium_100g': 0.25, 'calci..."
2,6111035002175,Sidi Ali,sidi ali,"{'chloride': 0.014, 'chloride_100g': 0.014, 'c..."
3,6111035000058,Eau minérale naturelle,"Les Eaux Minérales d'oulmès,Sidi Ali",{'fruits-vegetables-legumes-estimate-from-ingr...
4,6111252421568,اكوافينا,AQUAFINA,"{'alcohol': 0, 'alcohol_100g': 0, 'alcohol_ser..."
...,...,...,...,...
11992,6111069000819,Thé vert en grain najma,Sultan,"{'carbohydrates': 0, 'carbohydrates_100g': 0, ..."
11993,0612322001612,nairns dark chocolate chip oat biscuits,Nairn’s,"{'carbohydrates': 63.8, 'carbohydrates_100g': ..."
11994,5010478014510,Drumstick Squashies,Swizzels,"{'carbohydrates': 84, 'carbohydrates_100g': 84..."
11995,0070177067779,twinings tea,"Twinings, Sainsburys","{'carbohydrates': 0, 'carbohydrates_100g': 0, ..."


In [13]:
data.isnull().sum()

code              0
product_name    254
brands          536
nutriments        0
dtype: int64

In [14]:
data['nutriments']

0        {'carbohydrates': 42, 'carbohydrates_100g': 4....
1        {'calcium': 0.25, 'calcium_100g': 0.25, 'calci...
2        {'chloride': 0.014, 'chloride_100g': 0.014, 'c...
3        {'fruits-vegetables-legumes-estimate-from-ingr...
4        {'alcohol': 0, 'alcohol_100g': 0, 'alcohol_ser...
                               ...                        
11992    {'carbohydrates': 0, 'carbohydrates_100g': 0, ...
11993    {'carbohydrates': 63.8, 'carbohydrates_100g': ...
11994    {'carbohydrates': 84, 'carbohydrates_100g': 84...
11995    {'carbohydrates': 0, 'carbohydrates_100g': 0, ...
11996    {'carbohydrates': 3.2, 'carbohydrates_100g': 3...
Name: nutriments, Length: 11997, dtype: object

In [15]:
data.keys()

Index(['code', 'product_name', 'brands', 'nutriments'], dtype='object')

In [16]:
data['nutriments'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 11997 entries, 0 to 11996
Series name: nutriments
Non-Null Count  Dtype 
--------------  ----- 
11997 non-null  object
dtypes: object(1)
memory usage: 93.9+ KB


In [17]:
required_keys = [
    "energy-kcal_value",
    "energy-kj_value",
    "carbohydrates_value",
    "sugars_value",
    "fat_value",
    "saturated-fat_value",
    "proteins_value",
    "fiber_value",
    "salt_value",
    "sodium_value",
    "nova-group",
    "nutrition-score-fr",
    "fruits-vegetables-nuts-estimate-from-ingredients_100g"
]

if "nutriments" in data.columns:
    nutriments_filtered = data["nutriments"].apply(
        lambda x: {k: x.get(k) for k in required_keys} if isinstance(x, dict) else {}
    )

    nutriments_df = pd.json_normalize(nutriments_filtered)

    
    nutriments_df = nutriments_df.loc[:, ~nutriments_df.columns.duplicated()]

   
    data = pd.concat([data.drop(columns=["nutriments"]), nutriments_df], axis=1)


data = data.loc[:, ~data.columns.duplicated()]

print("✅ Final columns:", data.columns.tolist())

✅ Final columns: ['code', 'product_name', 'brands', 'energy-kcal_value', 'energy-kj_value', 'carbohydrates_value', 'sugars_value', 'fat_value', 'saturated-fat_value', 'proteins_value', 'fiber_value', 'salt_value', 'sodium_value', 'nova-group', 'nutrition-score-fr', 'fruits-vegetables-nuts-estimate-from-ingredients_100g']


In [18]:
data.head()

Unnamed: 0,code,product_name,brands,energy-kcal_value,energy-kj_value,carbohydrates_value,sugars_value,fat_value,saturated-fat_value,proteins_value,fiber_value,salt_value,sodium_value,nova-group,nutrition-score-fr,fruits-vegetables-nuts-estimate-from-ingredients_100g
0,6111035000430,Sidi Ali,Sidi Ali,0.0,20.0,42.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,,1.0,0.0
1,6111242100992,Perly,Jaouda,97.0,406.0,9.4,,3.0,,8.0,,,,3.0,,0.0
2,6111035002175,Sidi Ali,sidi ali,,,,,,,,,65.0,26.0,,0.0,0.0
3,6111035000058,Eau minérale naturelle,"Les Eaux Minérales d'oulmès,Sidi Ali",,,,,,,,,65.0,26.0,1.0,0.0,0.0
4,6111252421568,اكوافينا,AQUAFINA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00508,0.002032,,0.0,0.0


In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11997 entries, 0 to 11996
Data columns (total 16 columns):
 #   Column                                                 Non-Null Count  Dtype  
---  ------                                                 --------------  -----  
 0   code                                                   11997 non-null  object 
 1   product_name                                           11743 non-null  object 
 2   brands                                                 11461 non-null  object 
 3   energy-kcal_value                                      11229 non-null  float64
 4   energy-kj_value                                        6436 non-null   float64
 5   carbohydrates_value                                    11229 non-null  float64
 6   sugars_value                                           11025 non-null  float64
 7   fat_value                                              11257 non-null  float64
 8   saturated-fat_value                           

In [20]:
null_percentage = (data.isnull().sum() / len(data)) * 100

print(null_percentage)

code                                                      0.000000
product_name                                              2.117196
brands                                                    4.467784
energy-kcal_value                                         6.401600
energy-kj_value                                          46.353255
carbohydrates_value                                       6.401600
sugars_value                                              8.102026
fat_value                                                 6.168209
saturated-fat_value                                       8.718846
proteins_value                                            6.251563
fiber_value                                              31.607902
salt_value                                                9.068934
sodium_value                                              9.068934
nova-group                                               10.585980
nutrition-score-fr                                       14.06

In [21]:
data[["product_name", "brands"]] = data[["product_name", "brands"]].ffill()



In [22]:
data[["energy-kcal_value", "energy-kj_value"]] = data[["energy-kcal_value", "energy-kj_value"]].bfill()
data["sodium_value"] = data["sodium_value"].bfill()


In [23]:
for col in ["carbohydrates_value", "sugars_value", "fat_value"]:
    data[col] = data[col].fillna(data[col].mean())
    

In [24]:
data["salt_value"] = data["salt_value"].fillna(data["salt_value"].mean())

In [25]:
data["proteins_value"] = data["proteins_value"].fillna(data["proteins_value"].median())
data["fiber_value"] = data["fiber_value"].fillna(data["fiber_value"].median())
data["saturated-fat_value"] = data["saturated-fat_value"].fillna(data["saturated-fat_value"].median())

In [26]:
data["nova-group"] = data["nova-group"].fillna(data["nova-group"].mode()[0])


In [27]:
data["nutrition-score-fr"] = data["nutrition-score-fr"].fillna(0)
data["fruits-vegetables-nuts-estimate-from-ingredients_100g"] = (
    data["fruits-vegetables-nuts-estimate-from-ingredients_100g"].fillna(-1)
)

In [29]:
data["energy-kj_value"] = data["energy-kj_value"].fillna(0)

In [30]:
data.isnull().sum()

code                                                     0
product_name                                             0
brands                                                   0
energy-kcal_value                                        0
energy-kj_value                                          0
carbohydrates_value                                      0
sugars_value                                             0
fat_value                                                0
saturated-fat_value                                      0
proteins_value                                           0
fiber_value                                              0
salt_value                                               0
sodium_value                                             0
nova-group                                               0
nutrition-score-fr                                       0
fruits-vegetables-nuts-estimate-from-ingredients_100g    0
dtype: int64

In [31]:
data.head()

Unnamed: 0,code,product_name,brands,energy-kcal_value,energy-kj_value,carbohydrates_value,sugars_value,fat_value,saturated-fat_value,proteins_value,fiber_value,salt_value,sodium_value,nova-group,nutrition-score-fr,fruits-vegetables-nuts-estimate-from-ingredients_100g
0,6111035000430,Sidi Ali,Sidi Ali,0.0,20.0,42.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0
1,6111242100992,Perly,Jaouda,97.0,406.0,9.4,11.399435,3.0,1.4,8.0,2.5,18.509833,26.0,3.0,0.0,0.0
2,6111035002175,Sidi Ali,sidi ali,0.0,0.0,29.469532,11.399435,14.873864,1.4,5.6,2.5,65.0,26.0,4.0,0.0,0.0
3,6111035000058,Eau minérale naturelle,"Les Eaux Minérales d'oulmès,Sidi Ali",0.0,0.0,29.469532,11.399435,14.873864,1.4,5.6,2.5,65.0,26.0,1.0,0.0,0.0
4,6111252421568,اكوافينا,AQUAFINA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00508,0.002032,4.0,0.0,0.0


In [28]:
data.duplicated().sum()

np.int64(0)

In [32]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11997 entries, 0 to 11996
Data columns (total 16 columns):
 #   Column                                                 Non-Null Count  Dtype  
---  ------                                                 --------------  -----  
 0   code                                                   11997 non-null  object 
 1   product_name                                           11997 non-null  object 
 2   brands                                                 11997 non-null  object 
 3   energy-kcal_value                                      11997 non-null  float64
 4   energy-kj_value                                        11997 non-null  float64
 5   carbohydrates_value                                    11997 non-null  float64
 6   sugars_value                                           11997 non-null  float64
 7   fat_value                                              11997 non-null  float64
 8   saturated-fat_value                           

In [34]:
data["sugar_to_carb_ratio"] = data.apply(
    lambda row:row["sugars_value"]/ row["carbohydrates_value"]
    if row["carbohydrates_value"] not in [0,np.nan] else np.nan,
    axis=1
)


In [35]:
def calorie_category(kcal):
    if pd.isna(kcal):
        return np.nan
    elif kcal < 150:
        return "Low"
    elif kcal < 400:
        return "Moderate"
    else:
        return "High"

data["calorie_category"] = data["energy-kcal_value"].apply(calorie_category)

In [36]:
def sugar_category(sugars):
    if pd.isna(sugars):
        return np.nan
    elif sugars < 5:
        return "Low Sugar"
    elif sugars < 15:
        return "Moderate Sugar"
    else:
        return "High Sugar"

data["sugar_category"] = data["sugars_value"].apply(sugar_category)

In [37]:
data["is_ultra_processed"] = data["nova-group"].apply(
    lambda x: "Yes" if x == 4 else ("No" if not pd.isna(x) else np.nan)
)


In [38]:
data.info() #Structure

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11997 entries, 0 to 11996
Data columns (total 20 columns):
 #   Column                                                 Non-Null Count  Dtype  
---  ------                                                 --------------  -----  
 0   code                                                   11997 non-null  object 
 1   product_name                                           11997 non-null  object 
 2   brands                                                 11997 non-null  object 
 3   energy-kcal_value                                      11997 non-null  float64
 4   energy-kj_value                                        11997 non-null  float64
 5   carbohydrates_value                                    11997 non-null  float64
 6   sugars_value                                           11997 non-null  float64
 7   fat_value                                              11997 non-null  float64
 8   saturated-fat_value                           

In [41]:
data["sugar_to_carb_ratio"] = data["sugar_to_carb_ratio"].fillna(0)

In [43]:
null_percentage = (data.isnull().sum() / len(data)) * 100

print(null_percentage)

code                                                     0.0
product_name                                             0.0
brands                                                   0.0
energy-kcal_value                                        0.0
energy-kj_value                                          0.0
carbohydrates_value                                      0.0
sugars_value                                             0.0
fat_value                                                0.0
saturated-fat_value                                      0.0
proteins_value                                           0.0
fiber_value                                              0.0
salt_value                                               0.0
sodium_value                                             0.0
nova-group                                               0.0
nutrition-score-fr                                       0.0
fruits-vegetables-nuts-estimate-from-ingredients_100g    0.0
sugar_to_carb_ratio     

In [46]:
data.to_csv("Choco_Crunch.csv",index =False)