In [None]:
import sys; sys.path.append("..")

import os
import dotenv
import uuid
from IPython.display import display, HTML

from google.cloud import texttospeech
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableLambda, RunnableBranch
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
from langchain.prompts import PromptTemplate
from langchain_community.utilities.dalle_image_generator import DallEAPIWrapper


# Load variables from a local .env file (if any) into the process environment.
dotenv.load_dotenv()
# NOTE(review): OPENAI_API_KEY is not referenced again in the cells shown here;
# presumably the OpenAI clients read the variable from the environment
# themselves -- confirm before removing.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

## Prompts Configuration

These variables configure the prompts used by the LLMs in the chain.

In [None]:
def gcp_tts_generator(voice="en-US-Neural2-C", language="en-US", output_folder="./audio"):
    """Build a callable that turns a prompt's text into an MP3 file on disk.

    Args:
        voice: Voice name passed to the Text-to-Speech voice selection.
        language: Language code passed to the Text-to-Speech voice selection.
        output_folder: Directory the MP3 files are written to. It is created
            on demand when it does not exist yet.

    Returns:
        A function that takes a prompt value exposing a ``text`` attribute,
        synthesizes that text and returns the path of the MP3 file written.
    """
    def generator(prompt):
        # Instantiates a client
        client = texttospeech.TextToSpeechClient()

        # Set the text input to be synthesized
        synthesis_input = texttospeech.SynthesisInput(text=prompt.text)

        # Build the voice request, select the language code and voice name
        voice_config = texttospeech.VoiceSelectionParams(
            language_code=language, name=voice
        )

        # Select the type of audio file you want returned
        audio_config = texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.MP3,
            pitch=4.0,
            speaking_rate=1.20,
        )

        # Perform the text-to-speech request on the text input with the selected
        # voice parameters and audio file type
        response = client.synthesize_speech(
            input=synthesis_input, voice=voice_config, audio_config=audio_config
        )

        # Make sure the target directory exists; otherwise open() below raises
        # FileNotFoundError on a fresh checkout. An empty folder means "current
        # directory", which needs no creation.
        if output_folder:
            os.makedirs(output_folder, exist_ok=True)

        # The response's audio_content is binary. A random UUID file name keeps
        # successive generations from overwriting each other.
        output_file = os.path.join(output_folder, f"{uuid.uuid4()}.mp3")
        with open(output_file, "wb") as out:
            # Write the response to the output file.
            out.write(response.audio_content)
        return output_file
    return generator


In [None]:
# Constants
# Maps placeholder names to values. build_writer_prompt replaces every
# occurrence of each key in the writer prompt texts with str(value), and
# build_speech_prompt reads MAX_PAGES to detect the last page.
CONSTANTS = dict(
    # Number of pages of the story
    MAX_PAGES = 6,
)


# Prompts components
## Writer
# Describes the page-by-page writing workflow; passed to build_writer_prompt,
# which places it under the "[task]" label of the system message.
writer_task = """
You are story writer. You help the user write a page of a story.
The workflow of the story writing is the following:
1. The first step is to understand the <context> for setting up a engaging story.
2. The user will respond with the <action> he wants to take from the last page. If it is the first page, the action should be "start".
3. Write the next page of the story for <action> taken by the user and the current <karma_points>, write the consecuence of the action and current state in the page's <description>, a list of 2 possible <next_action>s, and the <karma_points> change for the story.
4. Repeat the step 2 and 3 until you reach the max length of the story. If you reach the ending page, there should be only one <next_action> with the "End" action.
"""

# Constraints on the generated page. The MAX_PAGES tokens below are
# placeholders: build_writer_prompt substitutes them with the value from the
# constants mapping it receives.
writer_rules = """
The writer rules are:
1. The first page (number 1) of the story should always start by a "start" action. No other action is allowed.
2. The last page (number MAX_PAGES) of the story should always end by one "End" next_action. No other next_action is allowed.
3. An action is a string of max 30 characters, with only the description of the action.
4. A description is a string of max 250 characters, with only the description of the page.
5. The karma_points is a list of 4 float numbers, representing the karma points change for the story. The values are between -1 and 1. The sum of all karma points is the total karma points of the story. The karma points are:
- The dimension of technology. Higher is more advanced. Lower is no technology.
- The dimension of happiness. Higher is humans are happier. Lower is humans are unhappier.
- The dimension of safety. Higher is humans are safer. Lower is humans doesn't exist.
- The dimension of control. Higher is humans have more control. Lower is AGI has more control.
6. The max length of the story is MAX_PAGES pages, so the last pages should end the story. Take in account the rythm of the story and the length of the pages, so the story is engaging.
7. Use the JSON output format defined in the [output_format] section.
"""

# JSON keys the writer model is asked to emit; the writer chain parses the
# reply with JsonOutputParser and later cells read the "description" key.
writer_output_format = """
The output format is the following JSON keys:
- "description": The description of the current page.
- "next_actions": The list of the next actions. Each action is a string of max 30 characters.
- "karma_points": A list of 4 float numbers, representing the karma points change for the story.
"""

# Characters, genre and reference works offered to the writer as inspiration.
writer_knowledge = """
The writer has the following knowledge for story inspiration:
- The writer knows the following characters: "Sebastian", "Fran"
- The story should by a "sci-fi" story, "utopia" or "dystopia".
- Ispired by the following books: "1984", "Life 3.0: Alpha team tale"
- Ispired by the following movies: "The Matrix", "The Terminator"
- Ispired by the following games: "Detroit: Become Human", "Deus Ex"
- Ispired by the following TV series: "Westworld", "Black Mirror"
- Ispired by the following anime: "Ghost in the Shell", "Serial Experiments Lain"
"""

## Drawer
# Visual style text that build_drawer_prompt substitutes for the STYLE token.
drawer_style = "90s aesthetics, with a dark and gritty style and pixel art graphics. Using the following colors: #000000, #ffffff, #ff0000, #00ff00, #0000ff, #ffff00, #ff00ff, #00ffff"

# Template for the image-prompt request. STYLE and SEED are plain-text tokens
# replaced by build_drawer_prompt (the brackets around SEED are kept), while
# {description} remains a template variable filled in when the chain runs.
drawer_task = """
Generate a short prompt to generate an image based on:
1. Scene description: {description}
2. Use this style: STYLE
3. Generate consistent images with this seed: [SEED]
4. The length of the prompt should not be more than 1000 characters.
5. Not render any text in the image!!!
"""



# Prompts utils functions
def build_writer_prompt(task, rules, knowledge, output_format, constants):
    """Assemble the chat prompt that drives the story-writer model.

    Every key of ``constants`` found in the four text sections is replaced by
    the string form of its value (plain substring replacement). The sections
    are then laid out in a system message under the ``[task]``, ``[rules]``,
    ``[knowledge]`` and ``[output_format]`` labels.

    Args:
        task: Text describing the writing workflow.
        rules: Text listing the constraints on a page.
        knowledge: Text listing the inspiration sources.
        output_format: Text describing the expected JSON keys.
        constants: Mapping of placeholder name to value.

    Returns:
        A ChatPromptTemplate with the input variables ``context``, ``pages``,
        ``karma_points``, ``page_number`` and ``action``.
    """
    def substitute(text):
        # Apply the placeholders in the mapping's iteration order.
        for name, value in constants.items():
            text = text.replace(name, str(value))
        return text

    task = substitute(task)
    rules = substitute(rules)
    knowledge = substitute(knowledge)
    output_format = substitute(output_format)

    messages = [
        ("system",
            "[task]: " + task + "\n"
            "[rules]: " + rules + "\n"
            "[knowledge]: " + knowledge + "\n"
            "[output_format]: " + output_format + "\n"
            "[context]: {context}\n"
        ),
        ("ai",
            # Adjacent literals are concatenated verbatim, so each fragment
            # carries its own trailing space or newline; without them the
            # sentences ran together ("[knowledge]and the [context]Here ...").
            "I will write the next page of the story as "
            "defined in the [task] in the format defined in [rules] and using the [knowledge] "
            "and the [context].\n"
            "Here a list of the previous pages: {pages}\n"
            "This is the current karma points: {karma_points}\n"
            "This is the page number {page_number}"
        ),
        ("human", "I choice the action {action}"),
    ]
    return ChatPromptTemplate.from_messages(messages)


def build_drawer_prompt(task, style):
    """Fill the STYLE and SEED tokens of ``task`` and wrap it as a chat prompt.

    The seed is the first 8 characters of a random UUID drawn on each call, so
    every prompt built by one call carries the same seed text.
    """
    seed_token = str(uuid.uuid4())[:8]
    # STYLE is substituted first, then SEED, matching the original order.
    with_style = task.replace("STYLE", style)
    filled_task = with_style.replace("SEED", seed_token)
    return ChatPromptTemplate.from_template(filled_task)


def build_speech_prompt():
    """Return a branch choosing the narration text template for a page.

    The first page and the page numbered CONSTANTS["MAX_PAGES"] narrate only
    the description; every other page is prefixed with the chosen action.
    """
    def is_first_page(inputs):
        return inputs["page_number"] == 1

    def is_last_page(inputs):
        return inputs["page_number"] == CONSTANTS["MAX_PAGES"]

    description_only = "{description}"
    with_action = "You choose: {action}. {description}."

    return RunnableBranch(
        (is_first_page, PromptTemplate.from_template(description_only)),
        (is_last_page, PromptTemplate.from_template(description_only)),
        PromptTemplate.from_template(with_action),
    )


# Chains
# Image pipeline: the drawer prompt is sent to gpt-4 (temperature 0.0), the
# reply is reduced to a string and handed to DALL-E 3. A new DallEAPIWrapper
# is constructed on every invocation because it is created inside the lambda.
# NOTE(review): the result is stored under "image_url" and used as an <img>
# source further down, so run() presumably returns a URL -- confirm.
image_generator = (
    build_drawer_prompt(drawer_task, drawer_style)
    | ChatOpenAI(model="gpt-4", temperature=0.0)
    | StrOutputParser()
    | RunnableLambda(lambda x: DallEAPIWrapper(model="dall-e-3", size="1024x1024", quality="standard").run(x))
)


# Audio pipeline: pick the narration text for the page, then synthesize it
# with the default voice, language and output folder of gcp_tts_generator.
# The result is the path of the MP3 file written.
audio_generator = (
    build_speech_prompt()
    | RunnableLambda(gcp_tts_generator())
)


# Writer pipeline: the writer prompt is sent to gpt-4 (temperature 0.9) and
# the reply is parsed as JSON.
writer_chain = (
    build_writer_prompt(writer_task, writer_rules, writer_knowledge, writer_output_format, CONSTANTS)
    | ChatOpenAI(model="gpt-4", temperature=0.9)
    | JsonOutputParser()
)

# Full pipeline: the writer output is added to the input dict under "page",
# then three outputs are derived from that dict:
#   - "page": the parsed writer output, passed through unchanged
#   - "image_url": the image pipeline fed with the page dict
#   - "audio_file": the audio pipeline fed with the page description plus the
#     original "action" and "page_number" inputs
chain = (
    RunnablePassthrough.assign(page=writer_chain)
    | {
        "page": RunnableLambda(lambda x: x["page"]),
        "image_url": RunnableLambda(lambda x: x["page"]) | image_generator,
        "audio_file": RunnableLambda(lambda x: {"description": x["page"]["description"], "action": x["action"], "page_number": x["page_number"]}) | audio_generator,
    }
)

In [None]:
# Initial state for the first page of the story.
# NOTE(review): the writer rules describe karma_points as a list of 4 floats,
# while this is a dict keyed by dimension name -- confirm which shape is intended.
kp = {"technology": 0.0, "happiness": 0.0, "safety": 0.0, "control": 0.0}
context = 'Sebastian, a reputable scientist in the year 2100, discovers an old AI named Fran buried in the archives of his lab.'
curr_page = 1
# No previous pages exist yet.
pages = []

# Run the full chain for page 1; the writer rules require the "start" action here.
response = chain.invoke(
    {
        "context": context,
        "page_number": curr_page,
        "action": "start",
        "karma_points": kp,
        "pages": pages,
    }
)
# Bare expression so the notebook displays the result
# (keys: "page", "image_url", "audio_file").
response

In [None]:
# In two columns, display the image on the left and the text and audio on the right. Use html to style the text and audio.
# Only `escape` is imported so the stdlib module name does not clash with the
# `html` variable below.
from html import escape

html="""
<div style="display: flex; flex-direction: row;">
    <div style="width: 25%;">
        <img src="{image_url}" style="width: 100%;"/>
    </div>
    <div style="width: 75%;">
        <div style="font-size: 20px; font-weight: bold;">Text:</div>
        <p>Description: {description}</p>
        <div style="font-size: 20px; font-weight: bold;">Audio:</div>
        <br/>
        <audio controls>
          <source src="{audio_file}" type="audio/mpeg">
        </audio>
    </div>
</div>
"""
# The description is model-generated text and the URL/path end up inside
# attribute values, so every value is HTML-escaped before interpolation.
# Otherwise characters such as <, & or " in them could break the markup.
html = html.format(
    image_url=escape(str(response["image_url"]), quote=True),
    description=escape(str(response["page"]["description"])),
    audio_file=escape(str(response["audio_file"]), quote=True),
)
display(HTML(html))