# Synthetic Devices

This notebook is for creating synthetic devices based on the previously seeded areas and homes.

## Model Client

In [2]:
import google.generativeai as genai

from home_assistant_datasets import secrets
from home_assistant_datasets import model_client

# Point the secrets helper at the secrets file one directory up before any
# key is looked up (presumably read by `get_secret` -- confirm in the helper).
secrets.DEFAULT_SECRETS_FILE = "../secrets.yaml"
genai.configure(api_key=secrets.get_secret("google_api_key"))

# Gemini Flash is a higher-quality and cheaper model than the GPT alternatives.
MODEL_ID = "gemini-1.5-flash"
model = model_client.GoogleClient(MODEL_ID)

## Read Seed Data

The seed data for the prompt is manually curated to use as examples for further synthetic data generation.

In [6]:
import pathlib
import synthetic_home.device_types
import yaml
import synthetic_home

SEEDS_DIR = pathlib.Path("./seeds")
SEED_DEVICES_FILE = SEEDS_DIR.joinpath("devices.yaml")

# One {"name", "desc"} record per device type known to the registry; these
# records are later rendered into the prompt as the list of valid types.
registry = synthetic_home.device_types.load_device_type_registry()
device_types = [
    dict(name=type_name, desc=type_info.desc)
    for type_name, type_info in registry.device_types.items()
]

# The seed file is a multi-document YAML stream; keep every document.
with SEED_DEVICES_FILE.open() as f:
    seed_text = f.read()
seed_devices = list(yaml.load_all(seed_text, Loader=yaml.Loader))


In [7]:
# Render the supported device types as a single YAML list. (An earlier
# per-item join was immediately overwritten by this assignment, so only this
# form ever reached the prompt.)
device_type_labels = yaml.dump(device_types, sort_keys=False, explicit_start=False)

# Prompt fragment listing the valid `device_type` values for the model.
device_type_prompt = f"""
The system uses a `device_type` to describe a type of device. Here are the valid `device_type` values.
device_types:
{device_type_labels}
"""

In [8]:
# Template for one input/output example. It is filled completely for the seed
# examples and with empty `comments`/`devices` when querying the model.
DEVICE_ENTITY_PROMPT_FORMAT = """
Input:
{home}
Output:
{comments}
{devices}
"""


def _render_seed_example(seed):
    """Format one seed document as a worked input/output example."""
    home_yaml = yaml.dump(seed["home"], sort_keys=False)
    # The chain of thought goes into YAML comments so that the example output
    # remains valid YAML.
    thought_lines = [f"# thought: {thought}" for thought in seed["devices-thoughts"]]
    devices_yaml = yaml.dump(seed["devices"], sort_keys=False)
    return DEVICE_ENTITY_PROMPT_FORMAT.format(
        home=home_yaml,
        devices=devices_yaml,
        comments="\n".join(thought_lines),
    )


item = [_render_seed_example(seed) for seed in seed_devices]

seed_devices_prompt = "\n".join(item)
print(seed_devices_prompt)


Input:
name: Coastal Bungalow
thoughts:
- The bungalow in a seaside village may be a vacation home or a tranquil retreat.
- The cozy living room with a fireplace hints at a cozy atmosphere, and there may
  be smart climate control for comfort.
- The outdoor shower is convenient for beach days, so there may be smart water heating
  systems.
desc: Bungalow in a seaside village in Norway
areas:
- Living Room
- Bedroom
- Porch
- Outdoor Shower

Output:
# thought: Since the costal bungalow is tranquil and cozy, we can suggest a modest amount of devices
Living Room:
- name: Living Room
  device_type: light-dimmable
  device_info:
    model: Enbrighten Z-Wave
    manufacturer: GE
    sw_version: 2.3.29-7
- name: Smart Speaker
  device_type: smart-speaker
  device_info:
    model: Echo Dot
    manufacturer: Amazon
    sw_version: 4th Gen
- name: Thermostat
  device_type: hvac
  device_info:
    model: Learning Thermostat
    manufacturer: Nest
    sw_version: 5.9.3
  attributes:
  - unit_of_m

In [12]:
# Instruction prompt passed as the first argument to `model.complete` for
# every home (presumably the system prompt -- confirm against the model
# client). It interpolates the list of valid device types
# (`device_type_prompt`) and the rendered seed examples
# (`seed_devices_prompt`), so both must be defined before this cell runs.
# The text is sent to the model verbatim; editing any wording changes the
# generated dataset.
DEVICE_TYPE_PROMPT = f"""
You are an expert in smart home automation and are generating data used to
evaluate the performance of a smart home system Home Assistant on tasks like
summarization, performing actions, or being an independent agent managing
automations and maintenance tasks.

You use your knowledge about the world to generate details about homes that
can be used for synthetic smart home automation data. For example, an apartment
may have a smart thermostat, a house may have a smart garage door opener or
smart lock and camera, and all houses may have a smart light or weather feed
air quality, or a smart speaker or television. The needs of a home owner
may vary if they are a single person or family, or where in the world they
live. For example, a high rise apartment probably does not have a backyard.

Do not be cliche. Do not assume everyone has a smart watch. Don't assume
every room has a smart light just because it is listed, but of course many will.

Below are example inputs and outputs for generating areas and devices for a home,
thinking step by step about the needs of the home.

The following device types are supported:
{device_type_prompt}

Below are example inputs and outputs for generating device entities for a home. The output
format is valid  yaml and you may think step by step with output in the comments prefixed
with `# thought:`.

{seed_devices_prompt}

Output yaml in plain text and do not output markdown.
"""

In [14]:
from tqdm.auto import tqdm
import random
import shutil

# Maximum number of homes to generate devices for.
N_DATAPOINTS = 40
# Fixed seed so the same subset of homes is sampled on every run.
RANDOM_SEED = 0
DATASET_DIR = pathlib.Path("../datasets")
AREAS_DIR = DATASET_DIR / "areas-v2"
DEVICES_OUTPUT_DIR = DATASET_DIR / "devices-v3"

# Wipe existing devices so files from a previous run never mix with new ones.
# `parents=True` lets the cell run even when the dataset directory is absent.
shutil.rmtree(DEVICES_OUTPUT_DIR, ignore_errors=True)
DEVICES_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)


# Load every area file as a (home_id, home_data) pair. Paths are sorted
# because glob order is filesystem-dependent and would otherwise defeat the
# fixed seed below.
homes = []
for path in sorted(AREAS_DIR.glob("*.yaml")):
    with path.open("r") as f:
        content = f.read()
    home_id = path.name.split(".")[0]  # remove the .yaml
    # NOTE(review): yaml.Loader can construct arbitrary Python objects; this is
    # acceptable only because these files are produced by this project.
    home_data = yaml.load(content, Loader=yaml.Loader)
    homes.append((home_id, home_data))

# Reproducible random sample of at most N_DATAPOINTS homes. Slicing already
# handles the case where fewer homes are available.
random.Random(RANDOM_SEED).shuffle(homes)
homes = homes[:N_DATAPOINTS]

def extract_toplevel_comments(content: str):
    """Return the comment-only lines of ``content`` in their original order.

    A line is kept when its first non-whitespace character is ``#``, so
    indented comment lines are returned too. Trailing comments that follow
    YAML content on the same line are not. Lines are returned unmodified.
    """
    comment_lines = []
    for raw_line in content.split("\n"):
        if raw_line.lstrip().startswith("#"):
            comment_lines.append(raw_line)
    return comment_lines

# How many times to query the model for one home before giving up on it.
MAX_ATTEMPTS = 3

# Number of homes for which no usable response was obtained.
skipped = 0
with tqdm(total=len(homes)) as pbar:
    for home_id, home in homes:
        # Render the home as YAML, exactly like the seed examples in the
        # prompt, instead of interpolating the Python dict repr. The prompt is
        # identical for every retry, so build it once per home.
        prompt = DEVICE_ENTITY_PROMPT_FORMAT.format(
            home=yaml.dump(home, sort_keys=False), devices="", comments=""
        )
        response = None
        response_obj = None
        for _ in range(MAX_ATTEMPTS):
            response = model.complete(DEVICE_TYPE_PROMPT, prompt)
            try:
                parsed = yaml.safe_load(response)
            except yaml.YAMLError:
                continue
            # The validation step iterates `devices` as an area -> device-list
            # mapping, so an empty or non-mapping reply is retried as well.
            if isinstance(parsed, dict):
                response_obj = parsed
                break

        if response_obj is None:
            # Count the home once, however many attempts failed.
            skipped += 1
        else:
            updated_home = home.copy()
            updated_home.update({"devices": response_obj})
            with open(DEVICES_OUTPUT_DIR / f"{home_id}.yaml", "w") as device_output:
                # Keep the model's "# thought:" comment lines at the top of the file.
                device_output.write("\n".join(extract_toplevel_comments(response)))
                device_output.write("\n")
                device_output.write(yaml.dump(updated_home, explicit_start=True, sort_keys=False))

        pbar.set_description(f"Skipped {skipped}")
        pbar.update(1)


Skipped 0: 100%|██████████| 40/40 [03:09<00:00,  4.73s/it]


## Validation

Examine the dataset and look at the data and statistics. This is also a chance to perform any manual cleaning if there are minor formatting issues in the generated data.


In [15]:
import itertools
from operator import itemgetter

# Reload every generated home file from disk so the statistics describe what
# was actually written, not what is left in memory.
homes = []
for path in DEVICES_OUTPUT_DIR.glob("*.yaml"):
    with path.open("r") as f:
        content = f.read()
    home_data = yaml.load(content, Loader=yaml.Loader)
    homes.append(home_data)


# Not populated in this cell.
entity_counts =  {}
domain_counts = {}
total_homes = len(homes)
total_areas = 0  # areas that contain at least one device
total_devices = 0
device_types = {}  # device_type -> number of devices of that type
device_mfgs = {}  # manufacturer -> number of devices
for home in homes:
    area_devices = home.get("devices") or {}
    for area, devices in area_devices.items():
        if not devices:
            continue
        total_areas += 1
        for device in devices:
            total_devices += 1
            device_type = device["device_type"]
            device_types[device_type] = device_types.get(device_type, 0) + 1
            # The manufacturer is nested under `device_info` in the seed
            # examples; a top-level lookup would report every device as
            # "unknown".
            device_info = device.get("device_info") or {}
            device_mfg = device_info.get("manufacturer", "unknown")
            device_mfgs[device_mfg] = device_mfgs.get(device_mfg, 0) + 1


# Guard the averages so an empty dataset reports 0.00 instead of raising
# ZeroDivisionError.
devices_per_area = total_devices / total_areas if total_areas else 0.0
devices_per_home = total_devices / total_homes if total_homes else 0.0

print(f"Total homes: {total_homes}")
print(f"Total devices: {total_devices} (average {devices_per_area:0.2f} per area, {devices_per_home:0.2f} per home)")
print(f"Total unique devices: {len(device_types)}")

# Top 15 device types by share of all devices.
sorted_dict = dict(sorted(device_types.items(), key=itemgetter(1), reverse=True))
device_type_rank = [ (k, f"{(v / total_devices)*100:.0f}%") for k, v in itertools.islice(sorted_dict.items(), 15) ]

print("Devices:")
device_type_rank

Total homes: 40
Total devices: 571 (average 1.88 per area, 14.28 per home)
Total unique devices: 26
Devices:


[('light', '36%'),
 ('light-dimmable', '15%'),
 ('smart-speaker', '9%'),
 ('smart-plug', '6%'),
 ('hvac', '5%'),
 ('motion-sensor', '5%'),
 ('sensor', '4%'),
 ('smart-sprinkler', '3%'),
 ('smart-tv', '3%'),
 ('camera', '2%'),
 ('exhaust-fan', '2%'),
 ('switch', '1%'),
 ('weather-service', '1%'),
 ('door-sensor', '1%'),
 ('water-valve', '1%')]