In [None]:
# !pip install transformers
# !pip install langchain
# !pip install langchain-openai
# !pip install langchain-community
# !pip install langchain-huggingface
# !pip install accelerate

In [None]:
import json
import os
from datetime import datetime, UTC

import pandas as pd
import requests
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain_huggingface import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from tqdm.notebook import tqdm
from transformers import pipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline


In [None]:
class FoodakaiAPILoader(BaseLoader):
    """Load incidents from the FOODAKAI search API as LangChain documents.

    Each call to :meth:`load` issues a single POST request and converts every
    returned hit into a ``Document`` whose text is a small labelled record
    (title, description, publication date, source URL, data source and
    notification type) and whose metadata carries the incident id.
    """

    def __init__(
        self,
        apikey: str,
        url: str = "https://api.foodakai.com/search-api-1.0/search/",
        timeout: int = 60,
    ):
        """Store the connection settings.

        Args:
            apikey: API key sent in the request body of every search.
            url: Search endpoint to POST to.
            timeout: Timeout passed to ``requests.post``.
        """
        self._apikey = apikey
        self._url = url
        self._timeout = timeout
        self._headers = {"Accept": "application/json", "Content-Type": "application/json"}

    def load(self, **filters) -> list[Document]:
        """Query the search endpoint and return one document per hit.

        Only a single request is made, so at most ``page_size`` hits are
        returned; anything beyond that is not fetched.

        Keyword Args:
            data_from: Lower bound of the date range (default ``"2000-01-01"``).
            data_until: Upper bound of the date range (default: today, UTC,
                formatted as ``YYYY-MM-DD``).
            product: Value for the ``product.value.keyword`` strict filter.
            hazard: Value for the ``hazards.value.keyword`` strict filter.
            origin: Value for the ``origin.value.keyword`` strict filter.
            page_size: Requested page size (default ``1000``).

        Unrecognised keyword arguments are ignored.

        Returns:
            The documents built from ``response["hits"]["hits"]``; an empty
            list when the response has no hits.

        Raises:
            requests.HTTPError: If the API answers with an error status.
        """
        data_from = filters.get("data_from", "2000-01-01")
        data_until = filters.get("data_until", datetime.now(UTC).strftime("%Y-%m-%d"))
        product = filters.get("product")
        hazard = filters.get("hazard")
        origin = filters.get("origin")
        page_size = filters.get("page_size", 1000)

        payload: dict = {
            "apikey": self._apikey,
            "pageSize": page_size,
            "from": data_from,
            "to": data_until,
            "entityType": "incident",
            "detail": True,
            "published": True,
            "strictQuery": {},
        }

        # Only filters that were actually supplied are added to the strict query.
        if product:
            payload["strictQuery"]["product.value.keyword"] = product

        if hazard:
            payload["strictQuery"]["hazards.value.keyword"] = hazard

        if origin:
            payload["strictQuery"]["origin.value.keyword"] = origin

        response = requests.post(
            self._url,
            json=payload,
            headers=self._headers,
            timeout=self._timeout,
        )
        response.raise_for_status()

        data = response.json()

        # Echo the query for debugging, with the API key masked so the secret
        # never ends up in saved cell output.
        print({**payload, "apikey": "***"})

        hits = data.get("hits", {}).get("hits", [])

        return [self._create_document(hit) for hit in hits]

    @staticmethod
    def _create_document(doc: dict) -> Document:
        """Convert one search hit into a ``Document``.

        Missing or empty fields are rendered as ``None`` in the text instead of
        raising, so that one incomplete hit does not abort the whole load.
        The hit's ``_id`` is still required.

        Args:
            doc: A single hit with ``_id`` and ``_source`` entries.

        Returns:
            A document whose ``page_content`` is the labelled record and whose
            metadata is ``{"id": doc["_id"]}``.

        Raises:
            KeyError: If the hit has no ``_id``.
        """
        source = doc.get("_source") or {}
        # Only the first original source and the first notification type are used.
        original_sources = source.get("originalSource") or [{}]
        primary_source = original_sources[0] or {}
        notification_types = source.get("notificationType") or [None]

        attrs = {
            "title": source.get("title"),
            "description": source.get("description"),
            "published": source.get("createdOn"),
            "url": primary_source.get("url"),
            "data_source": primary_source.get("dataSource"),
            "notification_type": notification_types[0],
        }

        # TODO: do not summarize documents with less than x size
        content = (
            f"Title: {attrs['title']}\n"
            f"Description: {attrs['description']}\n"
            f"Published: {attrs['published']}\n"
            f"URL: {attrs['url']}\n"
            f"Data Source: {attrs['data_source']}\n"
            f"Notification Type: {attrs['notification_type']}\n"
        )
        return Document(
            page_content=content,
            metadata={"id": doc["_id"]},
        )

In [None]:
# Read the API key from the FOODAKAI_API_KEY environment variable so the secret
# is never stored in the notebook; a KeyError is raised if it is not set.
loader = FoodakaiAPILoader(apikey=os.environ["FOODAKAI_API_KEY"])

In [None]:
# TODO: every doc should belong to at least one customization
# Fetch incidents starting at 2025-01-14; the end date, page size and the
# product/hazard/origin filters are left at the loader's defaults.
documents = loader.load(data_from="2025-01-14")

{'apikey': '<redacted>', 'pageSize': 1000, 'from': '2025-01-14', 'to': '2025-02-14', 'entityType': 'incident', 'detail': True, 'published': True, 'strictQuery': {}}


In [None]:
# Number of incidents returned by the loader.
# NOTE(review): the recorded result (1000) equals the loader's default page size,
# so the result set may have been truncated — confirm.
len(documents)

1000

In [None]:
# Inspect the text of the first document to check the record layout built by the loader.
documents[0].page_content

"Title: Incorrect expiry date in prepared salads by E.Leclerc Morlaix from France\nDescription: MACÉDOINE DE LÉGUMES Accueil Alimentation Plats préparés et snacks 1 visuels du produit rappelé Caractéristiques principales du produit rappelé Plats préparés et snacks vendredi 14 février 2025 MACÉDOINE DE LÉGUMES Marque\xa0: E.Leclerc Morlaix Réf. Fiche\xa0: 2025-02-0102 № de Version\xa0: 1 Origine de la fiche\xa0: MORLAIX DISTRIBUTION E.Leclerc Morlaix Nature juridique du rappel\xa0: Volontaire (sans arrêté préfectoral) Informations transmises par le professionnel Générer une affichette récapitulative (au format PDF)\xa0 Si ce produit est toujours en rayon, faire un signalement sur\xa0 Informations d'identification du produit rappelé Catégorie de produit Alimentation Sous-catégorie de produit Plats préparés et snacks Nom de la marque du produit E.Leclerc Morlaix Noms des modèles ou références Macédoine de légumes Identification des produits GTIN Lot Date 211035026631 042 Date limite de co

In [None]:
# TODO: explore other models
model_id = "Qwen/Qwen2.5-1.5B-Instruct"

# Load the tokenizer
# NOTE(review): trust_remote_code=True lets code shipped with the model repo run
# locally — confirm it is actually required for this model.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

# Load the model
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    device_map="auto"  # Let accelerate map model to GPU if available
)

# Create a text-generation pipeline
generate_text = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=500,
    temperature=0.2,
    do_sample=True,
    # Return only the newly generated text; without this the prompt is echoed
    # back in front of every summary.
    return_full_text=False,
)

# TODO: account for context length, map-phase summary new tokens and number of summaries
# TODO: experiment with transformer model fine-tuned for summarization
# TODO: eval with BLEU, ROUGE metrics (initial text, final summary)

Device set to use cuda:0


In [None]:
# Wrap the transformers text-generation pipeline in LangChain's HuggingFacePipeline
# so it can be passed to the summarization chains.
llm = HuggingFacePipeline(pipeline=generate_text)

In [None]:
# The map and reduce phases each get their own "stuff" summarization chain,
# both built from the same LLM with identical settings.
map_chain, reduce_chain = (
    load_summarize_chain(llm, chain_type="stuff", verbose=False) for _ in range(2)
)
# TODO: compare map-reduce with refinement approach

In [None]:
# Map phase: summarize every incident on its own and store the chain result
# under the incident id. Entries are written one at a time, so an interrupted
# run keeps whatever was already computed.
summaries = dict()

for doc in tqdm(documents):
    incident_id = doc.metadata["id"]
    summaries[incident_id] = map_chain.invoke([doc])

  0%|          | 0/1000 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Spot-check the generated text for one incident. The id is hard-coded, so this
# raises KeyError unless that incident has already been summarized.
# NOTE(review): the recorded output begins with the summarization prompt itself,
# i.e. the prompt is echoed together with the generated summary.
summaries['FDK_23758287']["output_text"]

'Write a concise summary of the following:\n\n\n"Title: Aflatoxin and aflatoxin B1 in pistachios from United States\nDescription: None\nPublished: 2025-02-13T00:00:00\nURL: https://webgate.ec.europa.eu/rasff-window/screen/notification/744589\nData Source: RASFF\nNotification Type: border rejection\n"\n\n\nCONCISE SUMMARY: The European Food Safety Authority (EFSA) has detected high levels of aflatoxins, specifically aflatoxin B1, in pistachio samples imported from the United States. This finding led to their rejection at the EU border due to safety concerns.\n\n**Summary:** EFSA detects high aflatoxin levels in US-imported pistachios, leading to border rejection.'

In [None]:
# Number of incidents that have a map-phase summary so far.
# NOTE(review): the recorded value is well below len(documents), presumably
# because the summarization loop was interrupted before finishing.
len(summaries)

32

In [None]:
# Example users and their alert customizations. Each customization lists the
# products, hazards and origins the user is interested in; an empty list means
# no filter is set for that field.
users = [
    dict(
        name="John Doe",
        customization=dict(
            products=[],
            hazards=["salmonella"],
            origins=[],
        ),
    ),
    dict(
        name="Enzo Ferrari",
        customization=dict(
            products=[],
            hazards=["listeria monocytogenes"],
            origins=[],
        ),
    ),
]

In [None]:
def _join_filter(values):
    """Join customization values with "||"; return None when the list is empty."""
    return "||".join(values) if values else None


# Reduce phase: for every user, fetch the incidents matching their customization
# and condense the already computed per-incident summaries into one summary.
# NOTE(review): no date range is passed here, so the loader's default start date
# applies, whereas `summaries` was built from a narrower window — confirm that
# the two queries are meant to differ.
user_summaries = {}
for user in users:
    customization = user["customization"]

    user_docs = loader.load(
        product=_join_filter(customization["products"]),
        hazard=_join_filter(customization["hazards"]),
        origin=_join_filter(customization["origins"]),
    )

    # Keep only incidents that already have a map-phase summary, and take the
    # generated text out of the chain result (a dict) so it can be joined.
    relevant_summaries = [
        summaries[doc.metadata["id"]]["output_text"]
        for doc in user_docs
        if doc.metadata["id"] in summaries
    ]

    if not relevant_summaries:
        print(f"User {user['name']} does not have relevant incidents")
        continue

    relevant_summaries_text = "\n".join(relevant_summaries)
    # The summarize chain expects a list of documents, not a bare string.
    user_summaries[user["name"]] = reduce_chain.invoke(
        [Document(page_content=relevant_summaries_text)]
    )

{'apikey': '<redacted>', 'pageSize': 1000, 'from': '2000-01-01', 'to': '2025-02-14', 'entityType': 'incident', 'detail': True, 'published': True, 'strictQuery': {'hazards.value.keyword': 'salmonella'}}


TypeError: sequence item 0: expected str instance, dict found