# ChocoCrunch Analytics

## Data Extracting

In [1]:
import requests
import pandas as pd

all_products = []
base_url = "https://world.openfoodfacts.org/api/v2/search"
params = {
    "categories": "chocolates",
    "fields": "code,product_name,brands,nutriments",
    "page_size": 100,
}

# Fetch data page by page
for page in range(1, 122):  # Pages 1 to 120
    params["page"] = page
    response = requests.get(base_url, params=params)

    if response.status_code == 200:
        data = response.json()
        products = data.get("products", [])
        all_products.extend(products)

df_all_products = pd.DataFrame(all_products)
df_all_products.head()

Unnamed: 0,brands,code,nutriments,product_name
0,Sidi Ali,6111035000430,"{'carbohydrates': 42, 'carbohydrates_100g': 4....",Sidi Ali
1,Jaouda,6111242100992,"{'calcium': 0.25, 'calcium_100g': 0.25, 'calci...",Perly
2,sidi ali,6111035002175,"{'chloride': 0.014, 'chloride_100g': 0.014, 'c...",Sidi Ali
3,"Les Eaux Minérales d'oulmès,Sidi Ali",6111035000058,{'fruits-vegetables-legumes-estimate-from-ingr...,Eau minérale naturelle
4,AQUAFINA,6111252421568,"{'alcohol': 0, 'alcohol_100g': 0, 'alcohol_ser...",اكوافينا


In [4]:
df_nutriments = pd.json_normalize(df_all_products['nutriments'])
df_selected = df_nutriments[['energy-kcal_value','energy-kj_value','carbohydrates_value','sugars_value','fat_value','saturated-fat_value','proteins_value','fiber_value',
'salt_value','sodium_value','nova-group','nutrition-score-fr','fruits-vegetables-nuts-estimate-from-ingredients_100g']]
Data= pd.concat([df_all_products.drop(columns=['nutriments']), df_selected], axis=1)
Data.isnull().sum()

brands                                                    557
code                                                        0
product_name                                              273
energy-kcal_value                                         790
energy-kj_value                                          5618
carbohydrates_value                                       786
sugars_value                                              997
fat_value                                                 760
saturated-fat_value                                      1075
proteins_value                                            771
fiber_value                                              3842
salt_value                                               1114
sodium_value                                             1114
nova-group                                               1292
nutrition-score-fr                                       1723
fruits-vegetables-nuts-estimate-from-ingredients_100g     751
dtype: i

## Null Handling

In [5]:
from sklearn.impute import KNNImputer
import pandas as pd
import numpy as np

cols = [
    'carbohydrates_value',
    'sugars_value',
    'fiber_value',
    'fat_value',
    'saturated-fat_value',   
    'proteins_value',
    'salt_value',
    'nova-group',
    'nutrition-score-fr',
    'fruits-vegetables-nuts-estimate-from-ingredients_100g'
]
imputer = KNNImputer(n_neighbors=5)
Data[cols] = (pd.DataFrame(imputer.fit_transform(Data[cols]), columns=cols)).round(2)
Data['energy-kcal_value'] = (
    Data['carbohydrates_value'].fillna(0) * 4 +
    Data['proteins_value'].fillna(0) * 4 +
    Data['fat_value'].fillna(0) * 9 +
    Data['fiber_value'].fillna(0) * 2
).round(2)
Data['energy-kj_value'] = (Data['energy-kcal_value'].fillna(0) * 4.184).round(2)
#Data['nova-group'] = Data['nova-group'].round().astype(int)
Data['sodium_value'] = Data['sodium_value'].fillna(Data['salt_value'] / 2.5).round(2)
#Data['fiber_value'] = Data.groupby('nova-group')['fiber_value'].transform(lambda x: x.fillna(x.median()))
Data['proteins_value'] = Data.groupby('nova-group')['proteins_value'].transform(lambda x: x.fillna(x.median()))
Data['product_name'] = Data.groupby('brands')['product_name'].transform(lambda x: x.fillna(x.mode().iloc[0]) if not x.mode().empty else x)
Data['product_name'] = Data['product_name'].fillna('Unknown Product')
Data['brands'] = Data.groupby('product_name')['brands'].transform(lambda x: x.fillna(x.mode().iloc[0]) if not x.mode().empty else x)
Data.isnull().sum()

brands                                                   0
code                                                     0
product_name                                             0
energy-kcal_value                                        0
energy-kj_value                                          0
carbohydrates_value                                      0
sugars_value                                             0
fat_value                                                0
saturated-fat_value                                      0
proteins_value                                           0
fiber_value                                              0
salt_value                                               0
sodium_value                                             0
nova-group                                               0
nutrition-score-fr                                       0
fruits-vegetables-nuts-estimate-from-ingredients_100g    0
dtype: int64

## Column Derive

In [6]:
#sugar_to_carb_ratio
Data['sugar_to_carb_ratio'] = (Data['sugars_value'] / Data['carbohydrates_value'].replace(0, pd.NA)).fillna(0).round(2)
#calorie_category
def classify_calorie(kcal):
    if pd.isnull(kcal):
        return 'Unknown'
    elif kcal <= 100:
        return 'Low'
    elif kcal <= 300:
        return 'Medium'
    else:
        return 'High'

Data['calorie_category'] = Data['energy-kcal_value'].apply(classify_calorie)
#sugar_category
def classify_sugar(sugar):
    if pd.isnull(sugar):
        return 'Unknown'
    elif sugar <= 5:
        return 'Low'
    elif sugar <= 15:
        return 'Medium'
    else:
        return 'High'
Data['sugar_category'] = Data['sugars_value'].apply(classify_sugar)
#is_ultra_processed
def is_ultra(nova):
    if pd.isnull(nova):
        return None  # or 'Unknown'
    return nova == 4

Data['is_ultra_processed'] = Data['nova-group'].apply(is_ultra)
Data.isnull().sum()

  Data['sugar_to_carb_ratio'] = (Data['sugars_value'] / Data['carbohydrates_value'].replace(0, pd.NA)).fillna(0).round(2)


brands                                                   0
code                                                     0
product_name                                             0
energy-kcal_value                                        0
energy-kj_value                                          0
carbohydrates_value                                      0
sugars_value                                             0
fat_value                                                0
saturated-fat_value                                      0
proteins_value                                           0
fiber_value                                              0
salt_value                                               0
sodium_value                                             0
nova-group                                               0
nutrition-score-fr                                       0
fruits-vegetables-nuts-estimate-from-ingredients_100g    0
sugar_to_carb_ratio                                     

In [7]:
Data.duplicated().sum()

np.int64(0)

In [8]:
Data_cleaned = Data.drop_duplicates(ignore_index=True)
Data_cleaned.duplicated().sum()

np.int64(0)

In [10]:
Data_cleaned.head()

Unnamed: 0,brands,code,product_name,energy-kcal_value,energy-kj_value,carbohydrates_value,sugars_value,fat_value,saturated-fat_value,proteins_value,fiber_value,salt_value,sodium_value,nova-group,nutrition-score-fr,fruits-vegetables-nuts-estimate-from-ingredients_100g,sugar_to_carb_ratio,calorie_category,sugar_category,is_ultra_processed
0,Sidi Ali,6111035000430,Sidi Ali,168.0,702.91,42.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,3.4,1.0,0.0,0.33,Medium,Medium,False
1,Jaouda,6111242100992,Perly,2444497000000.0,10227770000000.0,9.4,4.7,3.0,0.5,8.0,1222248000000.0,30.67,12.27,3.0,13.4,0.0,0.5,High,Low,False
2,sidi ali,6111035002175,Sidi Ali,272.56,1140.39,17.42,25.62,20.44,1.94,3.4,2.66,65.0,26.0,2.2,0.0,0.0,1.47,Medium,High,False
3,"Les Eaux Minérales d'oulmès,Sidi Ali",6111035000058,Eau minérale naturelle,206.52,864.08,27.76,2.16,0.96,0.94,19.86,3.7,65.0,26.0,1.0,0.0,0.0,0.08,Medium,Low,False
4,AQUAFINA,6111252421568,اكوافينا,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,2.8,0.0,0.0,0.0,Low,Low,False


In [11]:
Data_cleaned.to_csv(r"C:\Users\lenovo\OneDrive\Desktop\My Project 2\ChocoCrunch.csv",index = False)