## Run the local LLM pipeline to get structured output

In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the hellofresh_extractor package) are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import glob
import os
from PIL import Image
from pillow_heif import register_heif_opener
from dotenv import load_dotenv
import pandas as pd
import time

import tempfile
import torch
import pandas as pd
import outlines
from transformers import pipeline

from hellofresh_extractor.llm.prompts import multimodal_system_prompt, multimodal_user_query, structured_system_prompt
from hellofresh_extractor.llm.utils import convert_structured_result_to_df
from hellofresh_extractor.llm.output_schemas import ExtractedMeal
from hellofresh_extractor.gsuite.drive.GoogleDriveHelper import GoogleDriveHelper
from hellofresh_extractor.llm.MultiModalModel import MultiModalModel
from hellofresh_extractor.llm.StructuredOutputModel import StructuredOutputModel

In [None]:
# Load environment variables via python-dotenv (by default from a .env file)
# before building the models: HF_TOKEN is read from os.environ when the
# multimodal pipeline is created.
load_dotenv()

In [None]:
# Both models are placed on the same accelerator backend.
target_device = "mps"

# Image + text -> text pipeline (Gemma 3 4B instruct); the Hugging Face token
# is taken from the HF_TOKEN environment variable.
multimodal_pipe = pipeline(
    "image-text-to-text",
    model="google/gemma-3-4b-it",
    device=target_device,
    torch_dtype=torch.bfloat16,
    token=os.environ.get('HF_TOKEN'),
    use_fast=True,
)

# Smaller text-only model wrapped by outlines for structured generation.
# NOTE(review): these settings are forwarded as `model_kwargs`; confirm that
# outlines actually applies them as sampling parameters.
structured_model_kwargs = {"temperature": 0.1, "do_sample": True}
structured_model = outlines.models.transformers(
    "HuggingFaceTB/SmolLM2-1.7B-Instruct",
    device=target_device,
    model_kwargs=structured_model_kwargs,
)

In [None]:
# Wrap the raw models in the project's caller classes; the extraction loop
# talks to these wrappers through their `invoke` methods.
multimodal_model_caller = MultiModalModel(model_pipe=multimodal_pipe)
structured_model_caller = StructuredOutputModel(
    model=structured_model,
    outputmodel=ExtractedMeal,
)

In [None]:
# Locate the input photos: every *.HEIC file inside the "images" folder of
# the current working directory.
this_path = os.getcwd()
images_path = os.path.join(this_path, "images")
heic_pattern = os.path.join(images_path, "*.HEIC")
images = glob.glob(heic_pattern)

In [None]:
# Extraction loop. For every HEIC photo:
#   1. convert it to a temporary JPEG on disk,
#   2. ask the multimodal model for a free-text extraction of the image,
#   3. feed that text to the structured-output model,
#   4. convert the structured result to a DataFrame tagged with the image path.
# Failures are reported per image and the loop moves on to the next one.
all_meals = []
register_heif_opener()  # register pillow_heif so PIL's Image.open can read HEIC
t0 = time.time()
for i, image in enumerate(images):
    print(f"At image {i}")

    # The JPEG lives in a per-image temporary directory that is removed
    # automatically when this block exits (also on errors), so no temporary
    # files are left behind.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_filename = os.path.join(temp_dir, "image.jpg")
        # Close the source image as soon as the JPEG copy has been written.
        with Image.open(image) as heic_image:
            heic_image.convert("RGB").save(temp_filename, format="JPEG")

        user_message = [
            {"type": "image", "image": temp_filename},
            {"type": "text", "text": multimodal_user_query},
        ]

        print("Running multimodal model")
        try:
            multimodal_result = multimodal_model_caller.invoke(
                system_message=multimodal_system_prompt,
                user_messages=user_message,
            )
            multimodal_extracted_text = multimodal_result[0]["generated_text"]
        except Exception as e:
            print(f"Error in multimodal model inference: {e}")
            # Skip the structured step for this image: without this, the text
            # extracted from the PREVIOUS image (or an undefined name on the
            # first iteration) would be structured and stored under this
            # image's path.
            continue

    print("Running structured output model")
    try:
        structured_result = structured_model_caller.invoke(
            system_message=structured_system_prompt,
            text_to_extract=multimodal_extracted_text,
            user_query=multimodal_user_query,
        )
        df = convert_structured_result_to_df(structured_result)
        df["image_path"] = image
        all_meals.append(df)
    except Exception as e:
        print(f"Error in structured model inference: {e}")
t1 = time.time()
# Mean wall-clock seconds per image; guard against an empty image list, which
# would otherwise raise ZeroDivisionError.
mean_process_time = (t1 - t0) / len(images) if images else 0.0

In [None]:
# Show the mean wall-clock time per image, in seconds (total loop time divided
# by the number of images).
mean_process_time

In [None]:
# Merge the per-image DataFrames collected by the extraction loop into a
# single table.
# - pd.concat raises ValueError on an empty list, so fall back to an empty
#   DataFrame when no image produced a result.
# - The isinstance check makes the cell safe to re-run: once all_meals has
#   been replaced by a DataFrame there is nothing left to concatenate.
if isinstance(all_meals, list):
    all_meals = pd.concat(all_meals) if all_meals else pd.DataFrame()

In [None]:
# Persist the combined table as a CSV (relative path, index column omitted).
output_csv = "test_hello_fresh_recipes_local.csv"
all_meals.to_csv(output_csv, index=False)

In [None]:
# Display the combined table of extracted results.
all_meals