In [6]:
import os
import json
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain.output_parsers import CommaSeparatedListOutputParser
from tqdm import tqdm

In [7]:
# System message for the sentence generator: asks the model for N unique,
# lowercase sentences describing an object from a single property/value pair,
# returned as one comma-separated list. The comma-separated reply format is
# what the CommaSeparatedListOutputParser at the end of `chain` parses.
# NOTE(review): a generated sentence that itself contains a comma would
# presumably be split into several items by that parser — confirm.
system_prompt = """### Instruction ###
You are an expert in generating synthetic data. Your task is to generate N unique sentences, all in lowercase, describing an object based on the provided property in the format "property: value". Avoid repeating sentences and do not include any additional information.

### Output Format ###
Your response should be a list of comma-separated values, e.g., `sentence1, sentence2`.

### Example ###

# Input #
property: color
value: red
N: 3

# Output #
a red object, an item that is colored red, something that has a red hue

Begin!"""

# Chat prompt: the fixed system instructions followed by a one-line human
# message carrying the attribute name, its value and the requested count.
# NOTE(review): the human message is a single "attribute: value, N: num" line,
# whereas the example inside system_prompt shows three separate lines
# (property / value / N) — confirm the mismatch is intentional.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{attribute}: {value}, N: {num}")]
)

# Chat model used for generation.
llm = ChatOpenAI(model="gpt-4o-mini")

# Full pipeline: format the prompt, call the model, parse the reply as a
# comma-separated list.
output_parser = CommaSeparatedListOutputParser()
chain = prompt | llm | output_parser

In [8]:
def generate_sentences(attribute, value, num, max_attempts=3, runnable=None):
    """Generate up to ``num`` distinct sentences describing ``attribute: value``.

    The runnable is invoked repeatedly, each time asking only for the number
    of sentences still missing, until ``num`` distinct sentences have been
    collected or ``max_attempts`` calls have been made. Duplicates (within a
    reply or across replies) are dropped while preserving first-seen order.

    Args:
        attribute: Name of the property (e.g. ``"color"``).
        value: Value of the property (e.g. ``"red"``).
        num: Number of sentences wanted.
        max_attempts: Upper bound on the number of ``invoke`` calls, so a
            model that keeps under-delivering cannot cause an endless loop.
        runnable: Object exposing ``invoke(dict) -> list``; defaults to the
            notebook-level ``chain``.

    Returns:
        A list of at most ``num`` distinct sentences. It may be shorter than
        ``num`` if the attempts are exhausted first.
    """
    # Fall back to the notebook-level chain when nothing is injected.
    runnable = chain if runnable is None else runnable

    collected = []
    seen = set()
    for _ in range(max_attempts):
        missing = num - len(collected)
        if missing <= 0:
            break
        batch = runnable.invoke({"attribute": attribute, "value": value, "num": missing})
        for sentence in batch:
            # Skip repeats so a top-up call cannot re-add earlier sentences.
            if sentence not in seen:
                seen.add(sentence)
                collected.append(sentence)

    # The model may return more than requested; never exceed num.
    return collected[:num]

In [9]:
# Read the attribute-name -> list-of-values mapping from disk.
with open("../data/attributes.json") as f:
    attributes = json.load(f)

# Append the generic "shelf" and "table" entries to the container values
# (in-place list extension), then display the resulting mapping.
attributes["container"] += ["shelf", "table"]
attributes

{'shape': ['cylinder', 'cube', 'torus', 'cone', 'sphere'],
 'color': ['blue',
  'magenta',
  'black',
  'red',
  'orange',
  'purple',
  'green',
  'yellow',
  'cyan'],
 'material': ['metal', 'rubber'],
 'container': ['floor',
  'shelf_bottom',
  'shelf_top',
  'table_top',
  'table_under',
  'crate',
  'box',
  'shelf',
  'table'],
 'size': ['small', 'big']}

In [10]:
# Build a nested mapping {attribute: {value: [sentence, ...]}}, requesting
# 100 sentences for every attribute value. The outer dict is filled as we go,
# so whatever was generated stays available if a later call fails.
sentences = {}
for attribute, values in tqdm(attributes.items()):
    per_value = {}
    sentences[attribute] = per_value
    for value in values:
        per_value[value] = generate_sentences(attribute, value, 100)

# Persist the generated sentences as indented JSON.
with open("../sentences.json", "w") as f:
    json.dump(sentences, f, indent=4)

100%|██████████| 5/5 [03:15<00:00, 39.05s/it]


In [None]:
# Flatten every generated sentence (all attributes, all values) into one set
# and report how many distinct sentences there are overall.
unique_sentences = {
    sentence
    for per_value in sentences.values()
    for sents in per_value.values()
    for sentence in sents
}
print(f"Unique sentences: {len(unique_sentences)}")