### Dependencies 

In [1]:
import nest_asyncio
nest_asyncio.apply()

In [2]:
from starfish import StructuredLLM, data_factory
from starfish.common.env_loader import load_env_file

load_env_file()

[32m2025-04-14 09:25:12[0m | [1mINFO    [0m | [36mstarfish.common.env_loader[0m | [34menv_loader.py:52[0m | [1mLoaded 9 environment variables from /Users/zhengisamazing/1.python_dir/starfish/.env[0m


True

### Structured LLM - Single
#### 1. Model provider LLM Call

In [3]:
first_llm = StructuredLLM(
    model_name="openai/gpt-4o-mini",
    prompt="Facts about city {{city_name}}.",
    output_schema=[{"name": "question", "type": "str"}, {"name": "answer", "type": "str"}],
    model_kwargs={"temperature": 0.7},
)

first_response = await first_llm.run(city_name="New York")
first_response.data

[{'question': 'What is the population of New York City?',
  'answer': 'As of 2023, New York City has an estimated population of over 8.4 million people, making it the most populous city in the United States.'}]

In [4]:
print(first_llm.render_prompt_printable(city_name="New York", num_records=5))


📝 CONSTRUCTED MESSAGES:

Role: user
Content:
Facts about city New York.


You are asked to generate exactly 5 records and please return the data in the following JSON format: 
[
    {
    "question": ""  //  (required),
    "answer": ""  //  (required)
    }
    ...
]

Required fields: question, answer


End of prompt



#### 2. Customized Openai Compatible Model provider LLM Call

In [5]:
first_llm = StructuredLLM(
    model_name="hyperbolic/deepseek-ai/DeepSeek-V3-0324",
    prompt="Facts about city {{city_name}}.",
    output_schema=[{'name': 'question', 'type': 'str'}, 
                   {'name': 'answer', 'type': 'str'}],
    model_kwargs={"temperature": 0.7}
)

first_response = await first_llm.run(city_name="New York", num_records=5)
first_response.data

[{'question': 'What is the nickname of New York City?',
  'answer': 'The Big Apple'},
 {'question': 'Which famous statue is located in New York Harbor?',
  'answer': 'The Statue of Liberty'},
 {'question': 'What is the name of the largest park in Manhattan?',
  'answer': 'Central Park'},
 {'question': 'Which borough of New York City is known for its diverse culture and arts scene?',
  'answer': 'Brooklyn'},
 {'question': 'What is the name of the famous theater district in New York City?',
  'answer': 'Broadway'}]

#### 3. Local LLM

In [6]:
### Local model
first_llm = StructuredLLM(
    model_name="ollama/gemma3:1b",
    prompt="Facts about city {{city_name}}.",
    output_schema=[{"name": "question", "type": "str"}, {"name": "answer", "type": "str"}],
    model_kwargs={"temperature": 0.7},
)

first_response = await first_llm.run(city_name="New York", num_records=5)
first_response.data

[32m2025-04-13 23:58:00[0m | [1mINFO    [0m | [36mstarfish.core.llm.proxy.litellm_adapter[0m | [34mlitellm_adapter.py:94[0m | [1mEnsuring Ollama model gemma3:1b is ready...[0m
[32m2025-04-13 23:58:00[0m | [1mINFO    [0m | [36mstarfish.core.llm.backend.ollama_adapter[0m | [34mollama_adapter.py:63[0m | [1mStarting Ollama server...[0m
[32m2025-04-13 23:58:01[0m | [1mINFO    [0m | [36mstarfish.core.llm.backend.ollama_adapter[0m | [34mollama_adapter.py:79[0m | [1mOllama server started successfully[0m
[32m2025-04-13 23:58:01[0m | [1mINFO    [0m | [36mstarfish.core.llm.backend.ollama_adapter[0m | [34mollama_adapter.py:129[0m | [1mFound model gemma3:1b[0m
[32m2025-04-13 23:58:01[0m | [1mINFO    [0m | [36mstarfish.core.llm.backend.ollama_adapter[0m | [34mollama_adapter.py:232[0m | [1mModel gemma3:1b is already available[0m
[32m2025-04-13 23:58:01[0m | [1mINFO    [0m | [36mstarfish.core.llm.proxy.litellm_adapter[0m | [34mlitellm_adapter.py

[{'question': 'What is the population of New York City?',
  'answer': 'As of 2023, the population of New York City is approximately 8.8 million people.'}]

In [7]:
### Clean it up
from starfish.core.llm.backend.ollama_adapter import stop_ollama_server
await stop_ollama_server()

[32m2025-04-13 23:58:08[0m | [1mINFO    [0m | [36mstarfish.core.llm.backend.ollama_adapter[0m | [34mollama_adapter.py:254[0m | [1mStopping Ollama server...[0m
[32m2025-04-13 23:58:09[0m | [1mINFO    [0m | [36mstarfish.core.llm.backend.ollama_adapter[0m | [34mollama_adapter.py:305[0m | [1mOllama server stopped successfully[0m


True

### Structured LLM - Workflow
#### 1. Two LLM

In [8]:
from starfish.core.llm.utils import merge_structured_outputs
from starfish import StructuredLLM 
first_llm = StructuredLLM(
    model_name="openai/gpt-4o-mini",
    prompt="Facts about city {{city_name}}.",
    output_schema=[{"name": "question", "type": "str"}, {"name": "answer", "type": "str"}],
)

first_response = await first_llm.run(city_name="New York", num_records=5)


second_llm = StructuredLLM(
    model_name="openai/gpt-4o-mini",
    prompt="""You will be given a list of question and answer pairs, 
please rate each individually about its accuracy, funny and conciseness. 
rating are from 1 to 10, 1 being the worst and 10 being the best. 
lets also rank them among themself so from 1 being the best.
Here is question and answer pairs: {{QnA_pairs}}""",
    output_schema=[{"name": "accuracy", "type": "int"}, {"name": "funny", "type": "int"}, {"name": "conciseness", "type": "int"}, {"name": "rank", "type": "int"}],
    model_kwargs={"temperature": 1},
)

second_response = await second_llm.run(QnA_pairs=first_response.data)

### Merge result:
merge_structured_outputs(first_response.data, second_response.data)


[{'question': 'What is the population of New York City?',
  'answer': 'As of 2023, New York City has an estimated population of over 8.4 million people.',
  'accuracy': 9,
  'funny': 2,
  'conciseness': 9,
  'rank': 2},
 {'question': 'What is the official nickname of New York City?',
  'answer': "New York City is commonly known as 'The Big Apple.'",
  'accuracy': 10,
  'funny': 3,
  'conciseness': 10,
  'rank': 1},
 {'question': 'Which river borders New York City to the west?',
  'answer': 'The Hudson River borders New York City to the west.',
  'accuracy': 9,
  'funny': 1,
  'conciseness': 9,
  'rank': 4},
 {'question': 'What is the tallest building in New York City?',
  'answer': 'As of 2023, One World Trade Center is the tallest building in New York City, standing at 1,776 feet.',
  'accuracy': 10,
  'funny': 2,
  'conciseness': 10,
  'rank': 1},
 {'question': 'What famous park is located in the heart of Manhattan?',
  'answer': 'Central Park is the famous park located in the heart 

In [9]:
print(first_llm.render_prompt_printable(city_name="New York", num_records=5))


📝 CONSTRUCTED MESSAGES:

Role: user
Content:
Facts about city New York.


You are asked to generate exactly 5 records and please return the data in the following JSON format: 
[
    {
    "question": ""  //  (required),
    "answer": ""  //  (required)
    }
    ...
]

Required fields: question, answer


End of prompt



In [10]:
print(second_llm.render_prompt_printable(QnA_pairs=first_response.data))


📝 CONSTRUCTED MESSAGES:

Role: user
Content:
You will be given a list of question and answer pairs, 
please rate each individually about its accuracy, funny and conciseness. 
rating are from 1 to 10, 1 being the worst and 10 being the best. 
lets also rank them among themself so from 1 being the best.
Here is question and answer pairs: |QnA_pairs| : [{"question": "What is the population of New York City?", "answer": "As of 2023, New York City has an estimated population of over 8.4 million people."}, {"question": "What is the official nickname of New York City?", "answer": "New York City is commonly known as 'The Big Apple.'"}, {"question": "Which river borders New York City to the west?", "answer": "The Hudson River borders New York City to the west."}, {"question": "What is the tallest building in New York City?", "answer": "As of 2023, One World Trade Center is the tallest building in New York City, standing at 1,776 feet."}, {"question": "What famous park is located in the heart o

In [4]:
from starfish.core.llm.utils import merge_structured_outputs 
@data_factory(max_concurrency=50)
async def workflow(city_name, num_records_per_city):
    print(f"Processing city: {city_name}!")
    first_llm = StructuredLLM(
        model_name="openai/gpt-4o-mini",
        prompt="Facts about city {{city_name}}.",
        output_schema=[{"name": "question", "type": "str"}, {"name": "answer", "type": "str"}],
    )

    first_response = await first_llm.run(city_name=city_name, num_records=num_records_per_city)


    second_llm = StructuredLLM(
        model_name="openai/gpt-4o-mini",
        prompt="""You will be given a question and answer pair, 
                please rate each individually about accuracy, funny and conciseness. 
                rating are from 1 to 10, 1 being the worst and 10 being the best. 
                Here is question and answer pair: {{QnA_pairs}}""",
        output_schema=[{"name": "accuracy", "type": "int"}, {"name": "funny", "type": "int"}, {"name": "conciseness", "type": "int"}],
        model_kwargs={"temperature": 1},
    )

    second_response = await second_llm.run(QnA_pairs=first_response.data)

    ### Merge result:
    final_output = merge_structured_outputs(first_response.data, second_response.data)

    return final_output


final_output = workflow.run(data = [
                                        {'city_name': 'New York'},
                                        {'city_name': 'Los Angeles'},
                                        {'city_name': 'Chicago'},
                                        {'city_name': 'Houston'},
                                        {'city_name': 'Miami'}
                                    ], num_records_per_city=5)

[32m2025-04-14 09:26:12[0m | [1mINFO    [0m | [36mstarfish.core.data_factory.factory[0m | [34mfactory.py:180[0m | [1m
2. Creating master job...[0m
[32m2025-04-14 09:26:12[0m | [1mINFO    [0m | [36mstarfish.core.data_factory.job_manager[0m | [34mjob_manager.py:154[0m | [1mNo task to run, waiting for task to complete[0m
Processing city: New York!
Processing city: Los Angeles!
Processing city: Chicago!
Processing city: Houston!
Processing city: Miami!
[32m2025-04-14 09:26:13[0m | [1mINFO    [0m | [36mstarfish.core.data_factory.job_manager[0m | [34mjob_manager.py:154[0m | [1mNo task to run, waiting for task to complete[0m
[32m2025-04-14 09:26:14[0m | [1mINFO    [0m | [36mstarfish.core.data_factory.job_manager[0m | [34mjob_manager.py:154[0m | [1mNo task to run, waiting for task to complete[0m
[32m2025-04-14 09:26:15[0m | [1mINFO    [0m | [36mstarfish.core.data_factory.job_manager[0m | [34mjob_manager.py:154[0m | [1mNo task to run, waiting for

In [12]:
from typing import Any, Dict

from starfish import StructuredLLM, data_factory

# -> List[Dict] job hooks hook-up


# Add callback for error handling
def handle_error(err_str: str):
    print(f"Error occurred: {err_str}")


async def handle_record_complete(data: Any, state: Dict[str, Any]):
    print(f"Record complete: {data}")


async def handle_duplicate_record(data: Any, state: Dict[str, Any]):
    print(f"Record duplicated: {data}")


@data_factory(
    storage="local", max_concurrency=50, state={}, on_record_complete=[handle_record_complete, handle_duplicate_record], on_record_error=[handle_error]
)
async def get_city_info_wf(city_name, region_code):
    structured_llm = StructuredLLM(
        model_name="openai/gpt-4o-mini",
        prompt="Facts about city {{city_name}} in region {{region_code}}.",
        output_schema=[{"name": "question", "type": "str"}, {"name": "answer", "type": "str"}],
        model_kwargs={"temperature": 0.7},
    )
    output = await structured_llm.run(city_name=city_name, region_code=region_code)
    return output.data


# Execute with batch processing
results = get_city_info_wf.run(
    cities=["Paris", "Tokyo", "New York", "London"],
    num_facts=3
)

results = get_city_info_wf.run(
    city_name= ["Berlin", "Rome"],
    region_code=["DE", "IT"],
)


results = get_city_info_wf.run(
    data=[{"city_name": "Berlin"}, {"city_name": "Rome"}],
    region_code=["DE", "IT"],
    city_name="Beijing",  ### Overwrite the data key
    # num_records_per_city = 3
)


TypeError: data_factory() got an unexpected keyword argument 'state'