### Dependencies 

In [2]:
# nest_asyncio patches asyncio so that an already-running event loop can be
# re-entered (per the nest_asyncio documentation).
# NOTE(review): presumably required because the notebook kernel already runs a
# loop while the `.run()` calls below drive coroutines themselves — confirm.
import nest_asyncio
nest_asyncio.apply()

In [5]:
from starfish.core.structured_llm import StructuredLLM
from starfish.common.env_loader import load_env_file
from starfish.common.utils import merge_structured_outputs
from starfish.utils.data_factory import data_factory

# Load environment variables for this session; in the recorded run the logger
# reported that they were read from a `.env` file and the call returned True.
load_env_file()

[32m2025-04-07 22:04:50[0m | [1mINFO    [0m | [36mstarfish.common.env_loader[0m | [34menv_loader.py:52[0m | [1mLoaded 9 environment variables from /Users/zhengisamazing/1.python_dir/starfish/.env[0m


True

In [53]:
### Mock LLM call
import random
import asyncio

async def mock_llm_call(city_name, num_records_per_city, fail_rate=0.05, sleep_time=0.01):
    """Stand-in for an LLM request that waits briefly and fails at random.

    Args:
        city_name: Label used in the debug prints, in the error message and as
            the prefix of every generated record.
        num_records_per_city: Number of records to produce on success.
        fail_rate: Threshold compared against ``random.random()``; the call
            fails whenever the drawn value is below it (default 0.05).
        sleep_time: Seconds to ``await asyncio.sleep`` before anything else,
            imitating request latency.

    Returns:
        A list with ``num_records_per_city`` strings shaped like
        ``"<city_name>_<k>"`` where ``k`` comes from ``random.randint(1, 5)``.

    Raises:
        ValueError: If the random draw is below ``fail_rate``.
    """
    # Simulated latency so the coroutine behaves like a real async request.
    await asyncio.sleep(sleep_time)

    # Random failure, controlled entirely by `fail_rate`.
    should_fail = random.random() < fail_rate
    if should_fail:
        print(f"  {city_name}: Failed!")  # debug trace
        raise ValueError(f"Mock LLM failed to process city: {city_name}")

    print(f"{city_name}: Successfully processed!")  # debug trace

    records = []
    for _ in range(num_records_per_city):
        records.append(f"{city_name}_{random.randint(1, 5)}")
    return records

Test Case 1:  ✅

Settings:
- ✅ `Input Data`: data + broadcast variable
- ✅ `Decorator`: controlled concurrency
- ✅ `Retry Logic`: Retry if failed

In [62]:

@data_factory(max_concurrency=2)
async def test1(city_name, num_records_per_city, fail_rate=0.5, sleep_time=1):
    """Test case 1 workflow: hand one record to `mock_llm_call` and return its result.

    Defaults make the mock fail when its random draw is below 0.5 and wait
    1 second per call.
    """
    records = await mock_llm_call(
        city_name,
        num_records_per_city,
        fail_rate=fail_rate,
        sleep_time=sleep_time,
    )
    return records

# Five per-record inputs, plus `num_records_per_city` passed once as a keyword
# (the "broadcast variable" from the settings above).
test1.run(
    data=[
        {'city_name': city}
        for city in ('1. New York', '2. Los Angeles', '3. Chicago', '4. Houston', '5. Miami')
    ],
    num_records_per_city=5,
)

1. New York: Successfully processed!
  2. Los Angeles: Failed!
  3. Chicago: Failed!
  4. Houston: Failed!
  5. Miami: Failed!
2. Los Angeles: Successfully processed!
3. Chicago: Successfully processed!
  4. Houston: Failed!
5. Miami: Successfully processed!
  4. Houston: Failed!
  4. Houston: Failed!
  4. Houston: Failed!
4. Houston: Successfully processed!


Test Case 2: ❌

Expected behavior: `data` should be optional - the user can just pass kwargs variables

Settings:
- ❌ `Input Data`: kwargs list + broadcast variable



In [56]:

@data_factory(max_concurrency=2)
async def test1(city_name, num_records_per_city, fail_rate=0.5, sleep_time=1):
    """Test case 2 workflow: hand one record to `mock_llm_call` and return its result.

    Defaults: `fail_rate` 0.5 and `sleep_time` 1 second, both forwarded to the mock.
    """
    records = await mock_llm_call(
        city_name,
        num_records_per_city,
        fail_rate=fail_rate,
        sleep_time=sleep_time,
    )
    return records

# `data` is deliberately omitted: this case checks whether a list-valued keyword
# alone can drive the run. The keyword is named `city_name` so that it matches
# the workflow's parameter; `num_records_per_city` is the broadcast value.
test1.run(
    city_name=["1. New York", "2. Los Angeles", "3. Chicago", "4. Houston", "5. Miami"],
    num_records_per_city=5,
)

TypeError: default_input_converter() missing 1 required positional argument: 'data'

Test Case 3:  ❌

Description: Test if the system handles failures gracefully, ensuring it doesn't run indefinitely when the failure rate is 100%

Expected behavior: It should stop retrying a record after a certain number of failed attempts

Settings:
- ❌ `Handle failures`: No infinite loop

In [59]:

@data_factory(max_concurrency=2)
async def test1(city_name, num_records_per_city, fail_rate=1, sleep_time=0.05):
    """Test case 3 workflow: hand one record to `mock_llm_call` and return its result.

    `fail_rate` defaults to 1; the mock fails whenever `random.random()` is
    below that value, so with the default every call raises.
    """
    records = await mock_llm_call(
        city_name,
        num_records_per_city,
        fail_rate=fail_rate,
        sleep_time=sleep_time,
    )
    return records

# Same five records as the happy-path case; only the workflow's default
# `fail_rate` differs. In the recorded run this had to be interrupted manually.
test1.run(
    data=[
        {'city_name': city}
        for city in ('1. New York', '2. Los Angeles', '3. Chicago', '4. Houston', '5. Miami')
    ],
    num_records_per_city=5,
)

  1. New York: Failed!
  2. Los Angeles: Failed!
  3. Chicago: Failed!
  4. Houston: Failed!
  5. Miami: Failed!
  1. New York: Failed!
  2. Los Angeles: Failed!
  3. Chicago: Failed!
  4. Houston: Failed!
  5. Miami: Failed!
  1. New York: Failed!
  2. Los Angeles: Failed!
  3. Chicago: Failed!
  4. Houston: Failed!
  5. Miami: Failed!
  1. New York: Failed!
  2. Los Angeles: Failed!
  3. Chicago: Failed!
  4. Houston: Failed!
  5. Miami: Failed!
  1. New York: Failed!
  2. Los Angeles: Failed!
  3. Chicago: Failed!
  4. Houston: Failed!
  5. Miami: Failed!
  1. New York: Failed!
  2. Los Angeles: Failed!
  3. Chicago: Failed!
  4. Houston: Failed!
  5. Miami: Failed!
  1. New York: Failed!
  2. Los Angeles: Failed!
  3. Chicago: Failed!
  4. Houston: Failed!
  5. Miami: Failed!
  1. New York: Failed!
  2. Los Angeles: Failed!
  3. Chicago: Failed!
  4. Houston: Failed!
  5. Miami: Failed!
  1. New York: Failed!
  2. Los Angeles: Failed!
  3. Chicago: Failed!
  4. Houston: Failed!
  5

KeyboardInterrupt: 

  3. Chicago: Failed!
  4. Houston: Failed!


Test Case 4:  ✅

Description: Test kwargs keyword override by broadcast variables

Settings:
- ✅ `Input Data`:  data + broadcast variable override

In [66]:

@data_factory(max_concurrency=2)
async def test1(city_name, num_records_per_city, fail_rate=0.1, sleep_time=0.05):
    """Test case 4 workflow: hand one record to `mock_llm_call` and return its result.

    Defaults: `fail_rate` 0.1 and `sleep_time` 0.05 seconds, both forwarded to the mock.
    """
    records = await mock_llm_call(
        city_name,
        num_records_per_city,
        fail_rate=fail_rate,
        sleep_time=sleep_time,
    )
    return records

# A scalar `city_name` keyword is supplied alongside per-record `city_name`
# values; the recorded output shows the keyword value used for both records.
test1.run(
    data=[{'city_name': city} for city in ('1. New York', '2. Los Angeles')],
    city_name='override_city_name',
    num_records_per_city=1,
)

override_city_name: Successfully processed!
override_city_name: Successfully processed!


Test Case 5:  ✅

Description: Test kwargs keyword override by list variables

Settings:
- ✅ `Input Data`:  data + list variable override

In [71]:
@data_factory(max_concurrency=2)
async def test1(city_name, num_records_per_city, fail_rate=0.1, sleep_time=0.05):
    """Test case 5 workflow: hand one record to `mock_llm_call` and return its result.

    Defaults: `fail_rate` 0.1 and `sleep_time` 0.05 seconds, both forwarded to the mock.
    """
    records = await mock_llm_call(
        city_name,
        num_records_per_city,
        fail_rate=fail_rate,
        sleep_time=sleep_time,
    )
    return records

# A list-valued `city_name` keyword is supplied alongside per-record values;
# the recorded output shows one list element used per record.
test1.run(
    data=[{'city_name': city} for city in ('1. New York', '2. Los Angeles')],
    city_name=['1. override_city_name', '2. override_city_name'],
    num_records_per_city=1,
)

1. override_city_name: Successfully processed!
2. override_city_name: Successfully processed!


Test Case 6:  ❌

Description: Missing required kwargs

Expected behavior: It should raise an error about the missing required kwargs; instead, it runs indefinitely

Settings:
- ❌ `Missing required num_records_per_city`: 


In [None]:

@data_factory(max_concurrency=2)
async def test1(city_name, num_records_per_city, fail_rate=0.1, sleep_time=0.05):
    """Test case 6 workflow: hand one record to `mock_llm_call` and return its result.

    `num_records_per_city` has no default, so callers must supply it.
    """
    records = await mock_llm_call(
        city_name,
        num_records_per_city,
        fail_rate=fail_rate,
        sleep_time=sleep_time,
    )
    return records

# The required `num_records_per_city` argument is intentionally left out here.
test1.run(
    data=[{'city_name': city} for city in ('1. New York', '2. Los Angeles')],
    city_name='override_city_name',
)

KeyboardInterrupt: 

Test Case 7:  ❌

Description: Pass extra parameters that are not defined in the workflow

Expected behavior: It should raise an error about the extra kwargs; instead, it runs indefinitely

Settings:
- ❌ `Add additional parameters`


In [78]:

@data_factory(max_concurrency=2)
async def test1(city_name, num_records_per_city, fail_rate=0.1, sleep_time=0.05):
    """Test case 7 workflow: hand one record to `mock_llm_call` and return its result.

    The signature accepts only the four parameters listed; there is no `**kwargs`.
    """
    records = await mock_llm_call(
        city_name,
        num_records_per_city,
        fail_rate=fail_rate,
        sleep_time=sleep_time,
    )
    return records

# `random_param` is intentionally not a parameter of the workflow.
test1.run(
    data=[{'city_name': city} for city in ('1. New York', '2. Los Angeles')],
    num_records_per_city=1,
    random_param='random_param',
)

KeyboardInterrupt: 

Test Case 8:  ❌

Description: Test hooks that change the state of the workflow, and that the workflow can access the changed state

Expected behavior: The workflow should see the changed state, which it does - but it runs in an infinite loop

Settings:
- ❌ `Add additional parameters`


In [83]:

def test_hook(data, state):
    state['variable'] =  f'changed_state - {data}'
    return state


@data_factory(
    max_concurrency=2,
    on_record_complete=[test_hook],
    state={'variable': 'initial_state'},
)
async def test1(city_name, num_records_per_city, fail_rate=0.1, sleep_time=0.05):
    """Test case 8 workflow: print the shared state, then delegate to `mock_llm_call`.

    The state is read through `test1.state`, so the print shows whatever
    `test_hook` last stored under 'variable'.
    """
    print(f"Checking state: {test1.state['variable']}")
    records = await mock_llm_call(
        city_name,
        num_records_per_city,
        fail_rate=fail_rate,
        sleep_time=sleep_time,
    )
    return records

# Two records are enough to observe the state changing between calls.
test1.run(
    data=[{'city_name': city} for city in ('1. New York', '2. Los Angeles')],
    num_records_per_city=1,
)

Checking state: initial_state
Checking state: initial_state
1. New York: Successfully processed!
2. Los Angeles: Successfully processed!
Checking state: changed_state - ['2. Los Angeles_1']
Checking state: changed_state - ['2. Los Angeles_1']
1. New York: Successfully processed!
2. Los Angeles: Successfully processed!
Checking state: changed_state - ['2. Los Angeles_3']
Checking state: changed_state - ['2. Los Angeles_3']
1. New York: Successfully processed!
2. Los Angeles: Successfully processed!
Checking state: changed_state - ['2. Los Angeles_1']
Checking state: changed_state - ['2. Los Angeles_1']
1. New York: Successfully processed!
2. Los Angeles: Successfully processed!
Checking state: changed_state - ['2. Los Angeles_3']
Checking state: changed_state - ['2. Los Angeles_3']
  1. New York: Failed!
2. Los Angeles: Successfully processed!
Checking state: changed_state - ['2. Los Angeles_1']
Checking state: changed_state - ['2. Los Angeles_1']
1. New York: Successfully processed!
2.

KeyboardInterrupt: 