<h1> Set-up </h1>

In [1]:
import os
import sys
import json
import openai
from langchain.tools import tool
from pydantic.v1 import BaseModel, Field
from IPython.display import display, HTML
from langchain.chat_models import ChatOpenAI
from langchain.schema.agent import AgentFinish
from langchain.prompts import ChatPromptTemplate
from langchain.prompts import MessagesPlaceholder
from langchain.agents.format_scratchpad import format_to_openai_functions
from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser
from langchain_core.utils.function_calling import convert_to_openai_function
sys.path.append('../..')

In [2]:
# Read the OpenAI API key from the environment rather than hard-coding it in the notebook.
# Raises KeyError if OPENAI_API_KEY is not set.
openai.api_key = os.environ['OPENAI_API_KEY']

<h1> Initializing Tools </h1>

In [3]:
class Design_logo_input(BaseModel):
    # Argument schema for `design_logo_concepts`. The description is a runtime
    # string attached to the field, so its wording is kept exactly as-is.
    topic: str = Field(
        ...,
        description=(
            "The topic for which the logo should be designed for. For example, if the "
            "topic were 'ice-cream shop' then the logo returned would be a logo fit for "
            "the ice-cream shop to put on their sign and merchandise."
        ),
    )

@tool(args_schema=Design_logo_input)
def design_logo_concepts(topic: str) -> str:
    """Returns the url for a logo design that is based on the provided topic"""
    # Placeholder implementation: no design happens here. The topic length is
    # folded into a number between 1 and 5, which selects the file name.
    logo_number = len(topic) % 5 + 1
    return f"s3://designs/logo_{logo_number}.jpg"

In [4]:
class UI_Output_Request(BaseModel):
    # Argument schema for `ui_output_request`: a single free-text field.
    inpt: str = Field(
        ...,
        description=(
            "A description of the UI function you want and how you would call it "
            "along with the arguments you would use"
        ),
    )

@tool(args_schema=UI_Output_Request)
def ui_output_request(inpt: str) -> str:
    """Provides a convenient way for you to describe what UI function you want and what to call it with without actually calling a UI function."""
    # Identity tool: the description is returned unchanged.
    # NOTE(review): this tool is not added to `tools` or `ui_tools` in the cells shown here.
    return inpt

In [5]:
class Product_UI_input(BaseModel):
    # Argument schema for `produce_image_UI`: the image path, or "" for no image.
    path: str = Field(
        ...,
        description=(
            "The path of the image for the UI page. "
            "If no image is wanted or required provide an empty string."
        ),
    )

@tool(args_schema=Product_UI_input)
def produce_image_UI(path: str) -> str:
    """This is a UI function. Produces the JSON for a general UI page about an image."""
    # Describe one "Image" component whose `url` prop is the given path,
    # then serialize that description to a JSON string.
    image_component = {
        "component": "Image",
        "props": {"url": path},
    }
    return json.dumps(image_component)

<h1> Registering Tools </h1>

In [6]:
# Non-UI tools, plus the OpenAI function-calling schema derived from each one.
tools = [design_logo_concepts]
functions = list(map(convert_to_openai_function, tools))

In [7]:
# UI tools, plus the OpenAI function-calling schema derived from each one.
ui_tools = [produce_image_UI]
ui_functions = list(map(convert_to_openai_function, ui_tools))

In [8]:
# Lookup from the tool name in a model's function call to the tool object to run.
# `run_agent` indexes this with `result.tool`, so keys must match the tool names.
tool_dictionary = {
        "design_logo_concepts": design_logo_concepts
    }

In [9]:
# Lookup from UI tool name to tool object; `produce_ui_output` indexes this with `result.tool`.
ui_tool_dictionary = {
    "produce_image_UI": produce_image_UI
}

<h1> Setting up LLM Prompts </h1>

In [10]:
# System message placed first in the shared `prompt` template, so every chain built from it uses it.
agent_system_message = "You are helpful assistant."

In [11]:
# Yes/No gate question. `run_agent` appends the user's prompt directly after this text.
cognitivie_ui_instruction_I = (
    "Should the output to the following prompt be a html page or text? "
    "Answer 'Yes' if you think a html page is an appropriate option "
    "and 'No' if text is enough. The prompt is as follows:"
)

In [12]:
# Instruction used when the model must answer by calling one of the bound UI functions.
cognitive_ui_instruction_UI = (
    "Your task is to choose a function from the available options that best matches "
    "your ideal UI page that will serve as output for information from the scratch pad. "
    "Return a function call to it with the appropriate arguments and nothing else."
)

In [13]:
# Instruction used when the model writes the HTML page itself.
# Each fragment ends with its own trailing space so sentences cannot fuse where
# fragments join (the previous version produced "screen.Return").
cognitive_ui_instruction_html = (
    "Your task is to present the information from the scratch pad in html. Produce fully functioning "
    "and standalone html code (i.e. no references to images you don't have but feel free to use image references from the scratch "
    "pad but if there are no image references in the scratch pad don't make them up) that I can take and output immediately with no edits. "
    "Assume all the images are 1024 by 1024 pixels so make sure to properly size the images so they aren't too big on the screen. "
    "Return only the html code, and nothing else. Do not provide any commentary about the code or any of your commands."
)

In [14]:
# Instruction for the text-only branch of `run_agent`, where the scratchpad holds just the user's prompt.
cognitive_ui_instruction_text = "Answer what is in the scratchpad. Do not comment on this instruction."

<h1> Initializing Chains + Agent </h1>

In [15]:
# Chat model name shared by every ChatOpenAI instance created in this notebook.
gpt_version = "gpt-3.5-turbo"

In [16]:
# One deterministic (temperature 0) chat model with no functions attached,
# and two bindings of it that each advertise a different function set.
no_func_model = ChatOpenAI(model_name=gpt_version, temperature=0)
init_prompt_model = no_func_model.bind(functions=functions)
cognitive_ui_model = no_func_model.bind(functions=ui_functions)

  warn_deprecated(


In [93]:
# Shared template for every chain: system message, then the "{input}" instruction,
# then whatever messages the caller supplies as `agent_scratchpad`.
# NOTE(review): the recorded execution count for this cell (93) is out of sequence,
# i.e. it was re-run late in the session; it must run before the chains below.
prompt = ChatPromptTemplate.from_messages([
    ("system", agent_system_message),
    ("user", "{input}"),
    MessagesPlaceholder(variable_name="agent_scratchpad")
])

In [18]:
# Each chain is: shared prompt -> one of the models -> OpenAI-functions output parser.
# init_prompt_chain   : model bound to the non-UI `functions`
# cognitivie_ui_chain : model bound to the `ui_functions`
# no_func_chain       : model with no functions bound
init_prompt_chain = prompt | init_prompt_model | OpenAIFunctionsAgentOutputParser()
cognitivie_ui_chain =  prompt | cognitive_ui_model | OpenAIFunctionsAgentOutputParser()
no_func_chain = prompt | no_func_model | OpenAIFunctionsAgentOutputParser()

<h1> UI Output Generators </h1>

In [19]:
def produce_ui_output(user_input, intermediate_steps):
    """Let the UI-function model pick a UI function, run it, and return its output.

    Args:
        user_input: The user's original prompt; appended after the formatted steps.
        intermediate_steps: List of (agent action, observation) pairs from earlier tool calls.

    Returns:
        Whatever the selected UI tool's ``run`` returns.
    """
    scratchpad = format_to_openai_functions(intermediate_steps) + [user_input]
    decision = cognitivie_ui_chain.invoke({
        "input": cognitive_ui_instruction_UI,
        "agent_scratchpad": scratchpad,
    })
    # NOTE(review): this reads `.tool` / `.tool_input`, so it assumes the model
    # replied with a function call naming a key of `ui_tool_dictionary`.
    chosen_ui_tool = ui_tool_dictionary[decision.tool]
    return chosen_ui_tool.run(decision.tool_input)

In [20]:
def produce_html(user_input, intermediate_steps):
    """Ask the function-free model to write an HTML page from the scratchpad.

    Args:
        user_input: The user's original prompt; appended after the formatted steps.
        intermediate_steps: List of (agent action, observation) pairs from earlier tool calls.

    Returns:
        The parsed result object from ``no_func_chain`` (not a plain string);
        callers in this notebook read the HTML text from its ``.log`` attribute.
    """
    request = {
        "input": cognitive_ui_instruction_html,
        "agent_scratchpad": format_to_openai_functions(intermediate_steps) + [user_input],
    }
    return no_func_chain.invoke(request)

<h1> Agent Runnable </h1>

In [21]:
def run_agent(user_input, ui_producer=produce_ui_output):
    """
    Runs the UI agent which will return a tuple of (LLM Response, Is it calling a UI Function)

    Steps:
      1. Give the prompt to the tool-bound model; if it asks for a tool, run that
         tool once and record (action, observation) in the scratchpad.
      2. Ask the function-free model a Yes/No question: should the answer be a UI page?
      3. On "Yes", delegate to ``ui_producer``; otherwise ask for a plain-text answer.

    Args:
        user_input: The user's prompt.
        ui_producer: Callable ``(user_input, intermediate_steps)`` that builds the UI output.

    Returns:
        ``(ui_producer(...), True)`` on the UI path, or ``(text, False)`` where
        ``text`` is a ``str``. The type of the first element therefore depends on the flag.
    """
    intermediate_steps = []
    result = init_prompt_chain.invoke({
        "input": user_input,
        "agent_scratchpad": format_to_openai_functions(intermediate_steps)
    })
    # Are we calling the functions?
    if not isinstance(result, AgentFinish):
        # Run first non-UI function
        selected_tool = tool_dictionary[result.tool]
        observation = selected_tool.run(result.tool_input)
        intermediate_steps.append((result, observation))
    # Cognitive UI Steps
    result = no_func_chain.invoke({
        "input": cognitivie_ui_instruction_I + user_input,
        "agent_scratchpad": []
    })
    print(result)
    # Normalize before comparing: the model may answer "Yes.", "'Yes'" or
    # "Yes, a html page ..." and an exact == "yes" would treat all of those as "No".
    verdict = result.log.strip().lstrip("'\"").lower()
    if verdict.startswith("yes"):
        return ui_producer(user_input, intermediate_steps), True
    result = no_func_chain.invoke({
        "input": cognitive_ui_instruction_text,
        "agent_scratchpad": [user_input]
    })
    return result.log, False
        

<h1>Testing</h1>

In [22]:
# Plain greeting: in the recorded run the Yes/No gate answered 'No', so the text branch was used.
print(run_agent("Hello!"))

return_values={'output': 'No'} log='No'
('The scratchpad contains the word "Hello!"', False)


In [23]:
# Logo request: the recorded run returned Image-component JSON pointing at an s3://designs/ logo URL.
print(run_agent("Design me a big red logo"))

return_values={'output': 'Yes'} log='Yes'
('{"component": "Image", "props": {"url": "s3://designs/logo_3.jpg"}}', True)


In [24]:
# Informational request: in the recorded run the UI step returned an image path
# ('cornell_university.jpg') that no tool in this notebook produced.
print(run_agent("Tell me a bit about Cornell University, I hear they are also reffered to as Big Red."))

return_values={'output': 'Yes'} log='Yes'
('{"component": "Image", "props": {"url": "cornell_university.jpg"}}', True)


In [25]:
# Explicit UI request: the recorded run returned a logo URL as the page image.
print(run_agent("Make me a nice UI page about the movie 'Dune'"))

return_values={'output': 'Yes'} log='Yes'
('{"component": "Image", "props": {"url": "s3://designs/logo_1.jpg"}}', True)


<h1> Direct HTML Output by LLM </h1>

In [26]:
# run_agent returns (parsed result, True) on the UI path but (plain str, False) on
# the text path. Check the flag before reading `.log`, because a 'No' from the
# Yes/No gate would otherwise raise AttributeError on the str.
logo_response, logo_is_ui_output = run_agent("Design me a big red logo", produce_html)
html_code = logo_response.log if logo_is_ui_output else logo_response
display(HTML(html_code))

return_values={'output': 'Yes'} log='Yes'


In [36]:
user_input = (
    "Provide me with a colorful list of all the movies Christain Bale played in along with links"
    " to them in IMDB and pictures and a short description"
)
# run_agent's first element is a parsed result only when the flag is True; on the
# text path it is already a str, so `.log` must not be read unconditionally.
bale_response, bale_is_ui_output = run_agent(user_input, produce_html)
bale_html_code = bale_response.log if bale_is_ui_output else bale_response
display(HTML(bale_html_code))

return_values={'output': 'Yes'} log='Yes'


In [37]:
user_input = "Make me a colorful ad for a milk chocolate product I am making."
# Only read `.log` when run_agent took the UI path; on the text path the first
# element is already a plain string.
ad_response, ad_is_ui_output = run_agent(user_input, produce_html)
html_code = ad_response.log if ad_is_ui_output else ad_response
print(html_code)

return_values={'output': 'Yes'} log='Yes'
<!DOCTYPE html>
<html lang="en">

<head>
    <meta charset="UTF-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Milk Chocolate Ad</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            text-align: center;
            background-color: #f7f7f7;
            margin: 0;
            padding: 0;
        }

        .ad-container {
            max-width: 600px;
            margin: 20px auto;
            background-color: #fff;
            border-radius: 10px;
            box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
            padding: 20px;
        }

        h1 {
            color: #ff6600;
        }

        p {
            color: #333;
        }

        img {
            max-width: 100%;
            height: auto;
            border-radius: 5px;
            margin-top: 20px;
        }
    </style>
</head>

<body>
    

<h1>Conclusions</h1>

<h3> Analysis of following 'Should the output be UI?' (2/16/2024) </h3> 
I have discovered that giving ChatGPT a long and complicated write-up followed by options followed by the expectation to execute one of the options resulted in tremendously poor decision making (almost always choosing the first option). I have had more success with reducing the mental difficulty of each query into first a simple yes/no question and then dealing with each option separately depending on the answer. An interesting trait I've detected is that, even with the yes/no question, ChatGPT is more likely to say no when the user directly asks for a UI page/output and more likely to say yes to prompts asking for an explanation/description (e.g. 'Tell me what Cornell is?'). In any case, the prompt's wording is incredibly important and has to be fine-tuned to not make the model always say 'Yes' (UI output would be better), as ChatGPT does have a bias towards choosing the UI output format. One reason for this is that ChatGPT has a predisposition to be helpful and optimistic, so when your prompt includes phrases like 'benefits the user' (which mine does) it is more likely to agree so as to benefit the user the most, even if it is overkill. That being said, it becomes very cautious on long prompts and is difficult to persuade to choose the UI option (Laziness?). Interestingly enough, it is more likely to say yes when it has some former function call in its scratchpad.
<br/><br/>
Interestingly enough, it is far more likely to say yes to a UI format for a complicated prompt when you make it pretend it is a customer. As soon as you put in a command word like 'Make' it largely ignores the Yes/No question and answers directly. Telling it is an all powerful LLM makes it say yes to the UI page every time.
<br/><br/>
The system message, I realized, is incredibly powerful and the LLM's answers depend significantly on what is in the system message.
<br/><br/>
Putting the prompt or scratchpad info at the end makes the LLM much more likely to answer the question at hand than start concentrating on what the scratchpad is asking. This indicates it places heavy emphasis on commands that come first.

<h3>Analysis of the initial attempt to make the LLM produce its own HTML code directly (2/16/2024)</h3>

In general it is able to construct a simple html file quite well. I had to give it explicit instructions to not comment or talk about the code as it has a big tendency to explain what it is doing (possibly a sign of its fine-tuned cautiousness). Even for a long prompt, the smallest and seemingly inconsequential changes made it change its html output completely, and even though it was explicitly told to not hallucinate image references it struggled with this command in particular.
<br/><br/>
It also fails to grasp all the details of the prompt. If I say I want a colorful list then it tends to ignore it but has the ability to make very nice html pages that reflect the main points of my request (look at the Christian Bale output above). It is smart enough to make dummy buttons and knows not to make up random links to connect the button to.
<br/><br/>
Very impressively, even though it was never explicitly told to do so, it proactively made CSS styles.

<h1> Additional Tests & Expansion of Capabilities (2/17/2024-3/4/2024) </h1>

<h2> Custom Imports for this section </h2>

In [79]:
import time
import requests
import ast

<h2> Testing Image Generation</h2>

In [45]:
class LLM_image_generate_input(BaseModel):
    # Argument schema for `image_generate`: only `prompt` is declared here.
    prompt: str = Field(
        ...,
        description="A detailed description of the image you want to generate",
    )

@tool(args_schema=LLM_image_generate_input)
def image_generate(prompt: str, model="dall-e-2", size="1024x1024", should_time=False):
    """
    Generates images based on the provided prompt
    """
    # The docstring above is left untouched on purpose: `@tool` attaches it to
    # the tool, so rewording it would change the tool rather than just the docs.
    # `client` is a module-level OpenAI client and must exist before the first call.
    request_options = {
        "model": model,
        "prompt": prompt,
        "size": size,
        "quality": "standard",
        "n": 1,
    }
    started_at = time.perf_counter()
    response = client.images.generate(**request_options)
    elapsed = time.perf_counter() - started_at
    if should_time:
        print(f"Time it took to produce {model}'s Image:{elapsed}s")
    # Exactly one image is requested (n=1), so return the URL of the first result.
    return response.data[0].url

# OpenAI function-calling schema for the tool, used when binding it to a chat model.
llm_ready_image_generate = convert_to_openai_function(image_generate)

In [31]:
### Trying to create images via DALLE-3 Prompts
client = openai.OpenAI()

# DALLE-3
dalle_3_image_url = image_generate("A white siamese cat", "dall-e-3", should_time=True)
# DALLE-2
dalle_2_image_url = image_generate("A white siamese cat", should_time=True)

Time it took to produce dall-e-3's Image:12.316742292000527s
Time it took to produce dall-e-2's Image:8.440609070000392s


In [32]:
# Show the URL returned for the DALL-E 3 image.
print(dalle_3_image_url)

https://oaidalleapiprodscus.blob.core.windows.net/private/org-CRWrBq65zdTy03WsZ2bR8nw7/user-YJtuVKh5srkktPqUJ94RfYo9/img-CHnDTOjgiP7Tr1cYZFiZmIa4.png?st=2024-03-03T18%3A33%3A53Z&se=2024-03-03T20%3A33%3A53Z&sp=r&sv=2021-08-06&sr=b&rscd=inline&rsct=image/png&skoid=6aaadede-4fb3-4698-a8f6-684d7786b067&sktid=a48cca56-e6da-484e-a814-9c849652bcb3&skt=2024-03-03T14%3A23%3A07Z&ske=2024-03-04T14%3A23%3A07Z&sks=b&skv=2021-08-06&sig=GFg1WLHHkvi5n54VxKV1XaRtnwM13GDAWnOy0TOtJ/I%3D


In [33]:
# Show the URL returned for the DALL-E 2 image.
print(dalle_2_image_url)

https://oaidalleapiprodscus.blob.core.windows.net/private/org-CRWrBq65zdTy03WsZ2bR8nw7/user-YJtuVKh5srkktPqUJ94RfYo9/img-sMRPq4VcnvKm5znQCkbq0SOP.png?st=2024-03-03T18%3A34%3A02Z&se=2024-03-03T20%3A34%3A02Z&sp=r&sv=2021-08-06&sr=b&rscd=inline&rsct=image/png&skoid=6aaadede-4fb3-4698-a8f6-684d7786b067&sktid=a48cca56-e6da-484e-a814-9c849652bcb3&skt=2024-03-03T15%3A11%3A51Z&ske=2024-03-04T15%3A11%3A51Z&sks=b&skv=2021-08-06&sig=ePOe3GDYpPXByNdrIgvavbHFlMAleGxmMgnUH0ORhl8%3D


In [34]:
### DALL-E 3 is on average slower than DALL-E 2, with only marginal photo improvements
### (there are also a number of limits on the throughput of DALL-E 3 image production). Thus I decided to proceed
### with DALL-E 2 as our image producer.

<h2>False Image Detection in Produced HTML Code</h2>

<h3> Implementation </h3>

In [132]:
# Prompts
# Asks the model to list every <img> src in the HTML as a Python list literal.
# The answer is parsed with ast.literal_eval, hence "only the list".
# Fragments carry their own trailing spaces so words do not fuse where they join
# (the previous version produced "(no code blocks),no other").
image_detection_prompt = (
    "Below you will be provided with html code, please provide all the image sources (src) as strings in a python list format "
    "in the order they are found in the html document. Provide only the list (with proper square brackets) as normal text (no code blocks), "
    "no other commentary or text. The html code:"
)
def image_correction_prompt(number):
    """Build the instruction asking the model to replace one image in the scratchpad HTML.

    Args:
        number: 1-based position of the image, in document order.

    Returns:
        The instruction string, using a proper English ordinal ("1st", "2nd",
        "3rd", "4th", ...) rather than always appending "th".
    """
    if isinstance(number, int):
        # 11, 12 and 13 take "th" even though they end in 1, 2 and 3.
        if 11 <= number % 100 <= 13:
            suffix = "th"
        else:
            suffix = {1: "st", 2: "nd", 3: "rd"}.get(number % 10, "th")
    else:
        # Non-integer input keeps the previous generic suffix.
        suffix = "th"
    return (
        f"Replace the {number}{suffix} image in the html code provided to you in the scratchpad (I am working in the order images are found in the html"
        " document) with an accurate and believable replacement image produced by a function you have access to by calling that function. Make sure"
        " the prompt is acceptable under openAI image generation guidelines and thus copyright free"
    )
def image_insertion_prompt(number, img_src_list):
    """Build the instruction asking the model to swap in a new list of image sources.

    Args:
        number: How many images the HTML in the scratchpad contains.
        img_src_list: Replacement sources in document order; embedded via ``str()``.

    Returns:
        The instruction string, ending with the stringified source list.
    """
    # The third fragment ends with a space so "only" and "the" stay separate
    # (the previous version produced "Provide onlythe html code").
    return (
        f"There are {number} images in html code provided to you in the scratchpad. I want you to take the following ordered list of image sources"
        " (ordered in the order they are found in the html document) and replace the image sources in html code and return the updated html code. While"
        " you are at it. All the new images are 1024x1024 so adjust the html code to make the images look nice in the html page. Provide only "
        "the html code, no commentary or discussion or affirmations, only the code. The ordered list of replacement image sources:"
        + str(img_src_list)
    )

In [133]:
# Model
# Chat model whose only bound function is the image generator, so a function
# call from it carries the arguments for `image_generate`.
image_correction_model = ChatOpenAI(model_name=gpt_version, temperature=0).bind(functions=[llm_ready_image_generate])
# Chains
# Same shared prompt and output parser as the other chains in this notebook.
image_correction_chain = prompt | image_correction_model | OpenAIFunctionsAgentOutputParser()

In [134]:
# Helper Functions
def is_image_url_valid(url, timeout=10):
    """Return True when a GET request to ``url`` answers with HTTP 200.

    Args:
        url: The address to probe. Values requests cannot handle (for example a
            bare file name with no scheme) count as invalid.
        timeout: Seconds to wait before giving up. Without a timeout a single
            unresponsive host could block the whole notebook indefinitely.

    Returns:
        True for a 200 response; False for any other status or request error.
    """
    try:
        # stream=True avoids downloading the image body just to read the status;
        # the context manager releases the connection afterwards.
        with requests.get(url, timeout=timeout, stream=True) as response:
            return response.status_code == 200
    except requests.exceptions.RequestException:
        return False

In [135]:
# Agent
def image_modification_agent(input_html_code):
    """Replace unreachable image sources in ``input_html_code`` with generated images.

    Steps:
      1. Ask the model for the list of image sources, in document order.
      2. For each source that fails ``is_image_url_valid``, ask the image-correction
         chain for a generation request and call ``image_generate`` with it.
      3. Ask the model to rewrite the HTML using the resulting list of sources.

    Args:
        input_html_code: HTML text to inspect and correct.

    Returns:
        The parsed result of the final rewrite call; the HTML text is in its ``.log``.

    Raises:
        ValueError: If the model's answer in step 1 is not a Python literal.
    """
    intermediate_steps = []
    result = init_prompt_chain.invoke({
        "input": image_detection_prompt + input_html_code,
        "agent_scratchpad": intermediate_steps
    })
    # From here on the scratchpad carries the HTML that later prompts refer to.
    intermediate_steps.append(input_html_code)
    try:
        url_list = ast.literal_eval(result.log)
    except (ValueError, SyntaxError) as exc:
        raise ValueError(
            f"Image detection did not return a Python list: {result.log!r}"
        ) from exc
    new_url_list = []
    # 1-based positions, because the correction prompt speaks of the "Nth image".
    for position, image_url in enumerate(url_list, start=1):
        if is_image_url_valid(image_url):
            new_url_list.append(image_url)
            continue
        result = image_correction_chain.invoke({
            # Name the image actually being replaced; previously every broken
            # image was described as the first one.
            "input": image_correction_prompt(position),
            "agent_scratchpad": intermediate_steps
        })
        if isinstance(result, AgentFinish):
            # No function call came back, so there is nothing to generate from.
            # Keep the original source so the list stays aligned with the document.
            new_url_list.append(image_url)
            continue
        new_url_list.append(image_generate(result.tool_input))
    result = init_prompt_chain.invoke({
        "input": image_insertion_prompt(len(new_url_list), new_url_list),
        "agent_scratchpad": intermediate_steps
    })
    return result

In [136]:
# Combine agents
def gen_html(user_input):
    """Run the agent in HTML mode and show the result.

    On the UI path the generated HTML goes through ``image_modification_agent``
    and is rendered inline; on the text path the plain answer is printed.

    Args:
        user_input: The user's prompt.
    """
    agent_output, is_ui_output = run_agent(user_input, produce_html)
    if not is_ui_output:
        # Text path: `agent_output` is already a plain string.
        print(agent_output)
        return
    corrected_output = image_modification_agent(agent_output.log)
    display(HTML(corrected_output.log))

<h2> Testing Image Generation </h2>

In [126]:
# Recipe request with no explicit ask for visuals; the recorded run still chose the UI path ('Yes').
gen_html("How to make spaghetti bolognese")

return_values={'output': 'Yes'} log='Yes'


In [127]:
# Same recipe, phrased with the word "visual".
gen_html("A visual guide to making Spaghetti Bolognese")

return_values={'output': 'Yes'} log='Yes'


In [137]:
# Same recipe, explicitly asking for steps with accompanying images.
gen_html("A step-by-step guide to making Spaghetti Bolognese with accompanying images")

return_values={'output': 'Yes'} log='Yes'


In [131]:
# Poster request for a sci-fi movie called 'Sands'.
gen_html("Make a cool poster for a sci-fi movie set in a desert planet that I am making called 'Sands'")

return_values={'output': 'Yes'} log='Yes'


<h2> Analysis </h2>

<h3> Random Decision Making</h3>
Even with the same prompt at 0 temperature, ChatGPT frequently flip-flops between deciding that a prompt needs a UI response and deciding that it doesn't, further complicating the quality of the decision process.

<h3> Mis-handling Images </h3>
Even when explicitly told to do so and given exact image dimensions, the LLM poorly sizes the images, often giving the entire screen over to them, leading to a very unpleasant looking UI page. In general, the LLM seems to poorly combine images into UI pages, as seen in the tests above, showing a distinct lack of spatial and aesthetic awareness, which shouldn't be surprising for an LLM (which neither has nor was trained to have such awareness).

<h3> Over-dependence on Images </h3>
Prompts that use words like 'visual' trigger the LLM to put all contents of a UI page into the image which DALLe-2 cannot handle leading to a very barren and empty page with just a massive central image