In [None]:
# !pip install -r ../requirements.txt 

In [None]:
import requests
import pandas as pd

In [None]:
def extract_products_by_category(category, session=None, timeout=30):
    """Fetch up to 100 Open Food Facts products tagged with ``category``.

    Args:
        category: Category name to search for (e.g. "Breads"). It is sent as a
            query parameter, so spaces and reserved characters are escaped.
        session: Optional object exposing a requests-compatible
            ``get(url, params=..., timeout=...)`` method (for example a
            ``requests.Session``). Defaults to the module-level ``requests`` API.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A DataFrame with one row per returned product, or ``None`` when the
        request fails, the status code is not 200, or the response carries no
        "products" entry.
    """
    http = requests if session is None else session
    query = {
        "action": "process",
        "tagtype_0": "categories",
        "tag_contains_0": "contains",
        "tag_0": category,
        "page_size": 100,
        "json": "true",
    }

    try:
        response = http.get(
            "https://world.openfoodfacts.org/cgi/search.pl",
            params=query,
            timeout=timeout,
        )
    except requests.RequestException as error:
        # Connection errors and timeouts are reported the same way as a bad status.
        print(f"Failed to fetch data for {category}: {error}")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch data for {category}")
        return None

    try:
        payload = response.json()
    except ValueError:
        # A 200 response whose body is not JSON.
        print(f"Failed to fetch data for {category}: response is not valid JSON")
        return None

    products_data = payload.get("products") if isinstance(payload, dict) else None
    if products_data is None:
        print(f"Failed to fetch data for {category}: no 'products' in response")
        return None

    return pd.DataFrame(products_data)

categories = [
    "Biscuits",
    "Breads",
    "Nuts",
    "Sandwiches",
    "Snacks",
    "Meat alternatives",
    "Chocolate candies",
    "Breakfast cereals",
    "Fruits",
]

# Extract products for each category, collecting the per-category frames so
# they can be concatenated once instead of re-copying the result every iteration.
category_frames = []
for category in categories:
    products_df = extract_products_by_category(category)
    if products_df is not None and not products_df.empty:
        category_frames.append(products_df)

# pd.concat raises on an empty list, so fall back to an empty frame when
# every request failed.
if category_frames:
    combined_products = pd.concat(category_frames, ignore_index=True)
else:
    combined_products = pd.DataFrame()

# Remove duplicates (a product can be listed under several categories).
# The column check avoids a KeyError when nothing was fetched.
# NOTE(review): drop_duplicates treats missing names as equal, so all unnamed
# products collapse into a single row - confirm this is intended.
if "product_name" in combined_products.columns:
    combined_products = combined_products.drop_duplicates(subset="product_name", keep="first")


In [None]:
# Write the combined product table to the shared data folder (no index column).
combined_file_name = "combined_products.csv"
combined_products.to_csv(f"../data/{combined_file_name}", index=False)
print(f"Combined products saved to {combined_file_name}")

# Data Preprocessing

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np

In [None]:
# Reload the combined products from the CSV written earlier in this notebook.
# The 'nutriments' column comes back as text, which is why it is decoded with
# json.loads further down.
df = pd.read_csv("../data/combined_products.csv")

In [None]:
# Preview the first two rows of the reloaded data.
df.head(2)

In [None]:
# Keep only the identifier, image, brand, food group, nutrient and Nutri-Score
# columns. The explicit copy makes df2 independent of df, because later cells
# modify df2 (dropping rows and reassigning columns).
df2 = df[["_id", "image_url", "brands", "pnns_groups_2", "nutriments", "nutriscore_data", "nutrition_grade_fr"]].copy()

In [None]:
# Preview the first ten rows of the selected columns.
df2.head(10)

In [None]:
# Drop rows that have no 'nutriments' value, then renumber the rows 0..n-1.
# The flattening step below joins a freshly numbered frame onto df2 by index,
# so gaps left by the dropped rows would pair nutrients with the wrong products.
df2 = df2.dropna(subset=['nutriments']).reset_index(drop=True)

In [None]:
# Preview the rows that remain after removing missing 'nutriments'.
df2.head()

# Flatten the `nutriments` column

In [None]:
from pandas import json_normalize
import json

In [None]:
# 'nutriments' holds one dict-like string per row. Flatten it into one column
# per nutrient key.

# pd.json_normalize returns rows numbered 0..n-1 and pd.concat(axis=1) aligns
# on index labels, so df2 must be numbered the same way first. Without this,
# rows dropped earlier leave gaps and nutrients land on the wrong products.
df2 = df2.reset_index(drop=True)

# Replace single quotes with double quotes so the text parses as JSON, then decode.
# NOTE(review): the blanket quote swap breaks on values containing apostrophes
# or Python-only literals (None/True/False); json.loads will raise on such rows.
nutriments_parsed = df2['nutriments'].str.replace("'", '"').apply(json.loads)

# Flatten the decoded dicts into separate columns (one row per product, same order).
df_nutriments = pd.json_normalize(nutriments_parsed.tolist())

# Swap the original 'nutriments' column for the flattened columns.
df2 = pd.concat([df2.drop(columns=['nutriments']), df_nutriments], axis=1)

In [None]:
# Preview df2 with the flattened nutriment columns.
df2.head()

In [None]:
# Keep only rows that carry 'nutriscore_data' and renumber them 0..n-1, so the
# positional lookups and the index-aligned concat in the following cells see a
# gap-free index.
df2 = df2.dropna(subset=['nutriscore_data']).reset_index(drop=True)
df2.head()

In [None]:
# Retrieve the nutriscore data for one sample: the first remaining row, taken
# by position so it still works when the row labelled 0 has been dropped.
df2['nutriscore_data'].iloc[0]

In [None]:
def validate_json(json_str):
    """Decode a JSON (or Python-repr) string into a Python object.

    Decoding is attempted in this order:
      1. strict JSON;
      2. a Python literal (``{'a': None}``), which also handles values that
         contain apostrophes or ``None``/``True``/``False``;
      3. JSON after replacing single quotes with double quotes.

    Args:
        json_str: Text to decode. A dict or list is returned unchanged; any
            other non-text value (``None``, NaN, numbers) yields ``None``.

    Returns:
        The decoded object, or ``None`` when the input cannot be decoded.
    """
    import ast  # local import so this cell does not depend on another import cell

    if isinstance(json_str, (dict, list)):
        return json_str  # already decoded
    if not isinstance(json_str, (str, bytes, bytearray)):
        return None  # e.g. NaN for a missing cell

    try:
        return json.loads(json_str)
    except ValueError:  # json.JSONDecodeError is a ValueError
        pass

    if not isinstance(json_str, str):
        return None  # undecodable bytes: the text-based fallbacks do not apply

    try:
        return ast.literal_eval(json_str)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        pass

    # Last resort: swap single quotes for double quotes and retry as JSON.
    try:
        return json.loads(json_str.replace("'", '"'))
    except ValueError:
        return None  # Return None for invalid JSON strings


In [None]:
# 'nutriscore_data' holds one dict-like string per row. Decode it and flatten
# it into one column per key.
nutriscore_parsed = df2['nutriscore_data'].apply(validate_json)

# validate_json returns None for rows it cannot decode, and pd.json_normalize
# needs dicts, so keep only the rows that decoded to a dict.
has_nutriscore = nutriscore_parsed.apply(lambda value: isinstance(value, dict))

# Renumber the kept rows 0..n-1. pd.json_normalize numbers its output the same
# way and pd.concat(axis=1) aligns on index labels, so both sides must match
# or the scores end up on the wrong products.
df_products = df2.loc[has_nutriscore].drop(columns=['nutriscore_data']).reset_index(drop=True)

# Flatten the decoded dicts into separate columns.
df_nutriscore_data = pd.json_normalize(nutriscore_parsed[has_nutriscore].tolist())

# Combine the product columns with the flattened Nutri-Score columns.
# NOTE(review): keys inside nutriscore_data may share names with existing
# columns, which would produce duplicate column labels - confirm.
df3 = pd.concat([df_products, df_nutriscore_data], axis=1)

In [None]:
# Inspect 'negative_points' for the row whose index label is 0.
df3['negative_points'][0]

In [None]:
# Preview df3 with the flattened Nutri-Score columns.
df3.head()

In [None]:
# Save the flattened table (before the negative_points filter below) without the index column.
df3.to_csv('../data/combined_products_v2.csv', index=False)

In [None]:
# Keep only the rows whose negative_points is at most 11.
df3 = df3.loc[df3['negative_points'].le(11)]

In [None]:
# Preview the rows that passed the negative_points filter.
df3.head()

In [None]:
# Count how many rows fall on each positive_points value.
df3['positive_points'].value_counts()

In [None]:
# Columns retained in the final DataFrame, in output order.
to_keep_columns = [
    # product identity
    "_id",
    "image_url",
    "brands",
    "pnns_groups_2",
    # per-100g nutrient values
    "energy-kj_100g",
    "sugars_100g",
    "salt_100g",
    "saturated-fat_100g",
    "proteins_100g",
    "fiber_100g",
    "fruits-vegetables-nuts-estimate-from-ingredients_100g",
    "sodium_100g",
    # score, grade and point totals
    "nutrition-score-fr_100g",
    "nutrition_grade_fr",
    "negative_points",
    "positive_points",
]

In [None]:
# Restrict df3 to the columns listed in to_keep_columns.
df_final = df3.loc[:, to_keep_columns]

In [None]:
# Row count of the final DataFrame.
df_final.shape[0]

# Randomly sample 300 items, spread across the `pnns_groups_2` groups

In [None]:
# Size of the sample to draw and the seed that makes the draw repeatable.
TARGET_SAMPLE_SIZE = 300
SAMPLE_SEED = 42

# Filter out rows where any column contains NaN values
df_no_na = df_final.dropna()

# Per-group quota: split the target evenly across the distinct 'pnns_groups_2'
# values (0 when there are no rows, which avoids dividing by zero).
num_groups = df_no_na['pnns_groups_2'].nunique()
num_samples = TARGET_SAMPLE_SIZE // num_groups if num_groups else 0

# Draw up to the quota from every group; groups smaller than the quota
# contribute all of their rows. The frames are concatenated once at the end.
group_samples = [
    group_data.sample(n=min(num_samples, len(group_data)), random_state=SAMPLE_SEED)
    for _, group_data in df_no_na.groupby('pnns_groups_2', sort=False)
]
sampled_data = pd.concat(group_samples) if group_samples else df_no_na.iloc[0:0]

# If the groups did not fill the target, top up with rows that have not been
# picked yet, so no product appears twice in the sample.
remaining_samples = TARGET_SAMPLE_SIZE - len(sampled_data)
if remaining_samples > 0:
    unused_rows = df_no_na.loc[~df_no_na.index.isin(sampled_data.index)]
    remaining_data = unused_rows.sample(
        n=min(remaining_samples, len(unused_rows)), random_state=SAMPLE_SEED
    )
    sampled_data = pd.concat([sampled_data, remaining_data])

# Shuffled copy of the sample. frac=1 keeps every row, so this also works when
# fewer than TARGET_SAMPLE_SIZE rows are available (sample(n=300) would raise).
final_sample = sampled_data.sample(frac=1, random_state=SAMPLE_SEED)

In [None]:
# Save the sampled rows without the index column.
# NOTE(review): this writes sampled_data, not the shuffled final_sample built
# in the previous cell - confirm which one is intended.
sampled_data.to_csv("../data/final_preprocessed_data.csv", index=False)