In [58]:
import requests
import pandas as pd
import time

def fetch_categories(timeout=60):
    """Fetch the list of category tags from Open Food Facts.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        The list stored under the ``'tags'`` key of the JSON response, or an
        empty list when the request fails, the status is not 200, or the body
        is not valid JSON.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    url = "https://world.openfoodfacts.org/categories.json"

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts follow the same best-effort path
        # as a non-200 status instead of crashing the caller.
        print(f"Failed to fetch categories: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch categories: {response.status_code}")
        return []

    try:
        return response.json().get('tags', [])
    except ValueError as exc:
        # A 200 response can still carry a non-JSON body (e.g. an HTML page).
        print(f"Failed to fetch categories: invalid JSON ({exc})")
        return []

def fetch_products_for_category(category_id, country, max_retries=5, retry_delay=60, timeout=60):
    """Fetch one page of products for a category/country pair.

    Args:
        category_id: Category identifier placed in the URL path.
        country: Country slug placed in the URL path.
        max_retries: How many times to retry after an HTTP 429 before giving
            up. Bounding this prevents endless retries (and unbounded
            recursion) when the rate limit never clears.
        retry_delay: Seconds to sleep between rate-limited attempts.
        timeout: Seconds to wait for the server on each request.

    Returns:
        The list stored under the ``'products'`` key of the JSON response, or
        an empty list when the request fails, the status is neither 200 nor a
        recoverable 429, the body is not valid JSON, or retries run out.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    url = f"https://world.openfoodfacts.org/category/{category_id}/country/{country}.json?page_size=200"

    # One initial attempt plus up to ``max_retries`` retries.
    for attempt in range(max_retries + 1):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Failed to fetch products for category {category_id} in {country}: {exc}")
            return []

        if response.status_code == 200:
            try:
                return response.json().get('products', [])
            except ValueError as exc:
                # A 200 response can still carry a non-JSON body.
                print(f"Failed to fetch products for category {category_id} in {country}: invalid JSON ({exc})")
                return []

        if response.status_code != 429:
            print(f"Failed to fetch products for category {category_id} in {country}: {response.status_code}")
            return []

        if attempt < max_retries:
            print(f"Rate limit exceeded for category {category_id} in {country}. Retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)

    print(f"Rate limit still exceeded for category {category_id} in {country} after {max_retries} retries. Giving up.")
    return []

def extract_relevant_fields(products):
    """Project each product dict onto a fixed set of keys.

    Args:
        products: Iterable of dict-like product records.

    Returns:
        A new list with one dict per input product, containing exactly the
        keys listed below (in that order). A key missing from the product is
        filled with an empty string; present values are copied unchanged.
    """
    wanted_keys = (
        'brands',
        'product_name',
        'categories',
        'countries',
        'food_groups_tags',
        'food_groups',
        'ingredients_text',
    )
    return [
        {key: record.get(key, '') for key in wanted_keys}
        for record in products
    ]

def main():
    """Fetch products for every category/country pair and save them as CSV.

    For each category returned by ``fetch_categories`` and each entry in the
    country list, fetches products, keeps only the relevant fields, tags every
    row with the category name and country, and finally writes all rows to a
    single CSV file.
    """
    # Local import keeps this function self-contained; pathlib is stdlib.
    from pathlib import Path

    output_path = Path('../data/openfoodfacts_products_world.csv')

    categories = fetch_categories()
    # Country slugs queried for each category.
    # NOTE(review): 'world' is used as a country slug here - confirm the API
    # treats it as "all countries".
    countries = ['canada', 'united-states', 'world']
    all_products = []

    for category in categories:
        category_id = category['id']
        category_name = category['name']
        for country in countries:
            print(f"Fetching products for category: {category_name} in {country}")
            products = fetch_products_for_category(category_id, country)
            filtered_products = extract_relevant_fields(products)
            for product in filtered_products:
                product['category_name'] = category_name
                product['country'] = country
                all_products.append(product)
            time.sleep(1)  # Rate limiting by adding a delay

    df = pd.DataFrame(all_products)
    # Create the target directory first so a missing folder does not throw
    # away the results of the whole scrape at the very last step.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)
    # Report the path actually written (the old message named a different file).
    print(f"Data saved to '{output_path}'.")

# Run the full fetch-and-export only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()


Fetching products for category: Plant-based foods and beverages in canada
Fetching products for category: Plant-based foods and beverages in united-states
Fetching products for category: Plant-based foods and beverages in world
Fetching products for category: Plant-based foods in canada
Fetching products for category: Plant-based foods in united-states
Fetching products for category: Plant-based foods in world
Rate limit exceeded for category en:plant-based-foods in world. Retrying in 60 seconds...
Fetching products for category: Snacks in canada
Fetching products for category: Snacks in united-states
Fetching products for category: Snacks in world
Fetching products for category: Sweet snacks in canada
Fetching products for category: Sweet snacks in united-states
Rate limit exceeded for category en:sweet-snacks in united-states. Retrying in 60 seconds...
Fetching products for category: Sweet snacks in world
Fetching products for category: Beverages in canada
Fetching products for categ