In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [23]:
import lmql
import asyncio
import chromadb
from chromadb.utils import embedding_functions
import requests
import re
import os
import pandas as pd
from functools import lru_cache
from dataclasses import dataclass, field
from datetime import datetime
from typing import (
    Any,
    Union,
    ClassVar,
    Dict,
    Generator,
    List,
    Optional,
    Protocol,
    Tuple,
    Type,
    Optional,
    TypeVar,
    Callable,
    AsyncGenerator,
    TypedDict,
    Generic,
)
from itertools import chain
from uuid import UUID, uuid4
from glob import glob
from pathlib import Path
from pydantic import BaseModel, Field

# Base URL that the `/metrics` and `/nodes/<name>` requests below are sent to.
DJ_URL = "http://localhost:8000"

<IPython.core.display.Javascript object>

In [3]:
@lru_cache(maxsize=1)
def get_chroma():
    """
    Return the Chroma client used by this notebook.

    The call is memoized, so the client is created on first use and the same
    instance is handed back on every later call.
    """
    client = chromadb.Client()
    return client

<IPython.core.display.Javascript object>

In [4]:
@dataclass
class VectorStore:
    """
    Thin wrapper around one named Chroma collection.

    After construction the instance exposes:
        client: the shared client returned by ``get_chroma()``.
        collection: the collection called ``collection_name``, fetched or
            created with a sentence-transformer embedding function.
    """

    collection_name: str

    def __post_init__(self):
        # Embedding function is built first, exactly as before, then the client.
        sentence_embedder = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name="all-MiniLM-L6-v2"
        )
        chroma_client = get_chroma()
        self.client = chroma_client
        self.collection = chroma_client.get_or_create_collection(
            self.collection_name, embedding_function=sentence_embedder
        )

<IPython.core.display.Javascript object>

In [5]:

# The key is read from the environment of the process that started the kernel.
# The previous cell assigned from a name (`OPENAI_API_KEY`) that is not defined
# anywhere in the notebook, which fails with NameError on a fresh kernel; a
# secret should not be stored in the notebook source anyway.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before starting the notebook kernel."
    )

<IPython.core.display.Javascript object>

In [6]:
# Download the JSON payload of the `/metrics` endpoint and tabulate it.
metrics_response = requests.get(f"{DJ_URL}/metrics")
metrics_json = metrics_response.json()
metrics = pd.DataFrame(metrics_json)

<IPython.core.display.Javascript object>

In [7]:
# Distinct prefixes (the text before the first ".") across every entry of the
# `dimensions` column; `.sum()` is used to chain the per-row values together.
dimensions = {entry.split(".")[0] for entry in metrics.dimensions.sum()}

<IPython.core.display.Javascript object>

In [8]:
# Request `/nodes/<name>` for each dimension name and keep the decoded JSON.
# `dimensions` is only rebound once every request has completed.
dimension_nodes = []
for dimension_name in dimensions:
    node_response = requests.get(f"{DJ_URL}/nodes/{dimension_name}")
    dimension_nodes.append(node_response.json())
dimensions = dimension_nodes

<IPython.core.display.Javascript object>

In [9]:
# Rebind `dimensions` from the list of node payloads to a DataFrame.
# Not idempotent: re-running this cell feeds the DataFrame back into itself.
dimensions = pd.DataFrame(dimensions)

<IPython.core.display.Javascript object>

In [10]:
# Invert the metric -> dimensions relation: for each dimension prefix, collect
# the names of the metrics whose `dimensions` entries start with it.
dimensions_metrics = {}
for metric_name, metric_dimensions in zip(metrics.name, metrics.dimensions):
    dimension_prefixes = {entry.split(".")[0] for entry in metric_dimensions}
    for prefix in dimension_prefixes:
        dimensions_metrics.setdefault(prefix, []).append(metric_name)

<IPython.core.display.Javascript object>

In [11]:
# Attach, per dimension row, the metric names gathered in `dimensions_metrics`;
# names missing from that mapping become NaN via `Series.map`.
dimensions["metrics"] = dimensions.name.map(dimensions_metrics)

<IPython.core.display.Javascript object>

In [12]:
# One collection per kind of document, created in the same order as before.
metrics_vectorstore, dimensions_vectorstore, knowledge_vectorstore = (
    VectorStore(collection_name=store_name)
    for store_name in ("metrics", "dimensions", "knowledge")
)

Using embedded DuckDB without persistence: data will be transient


<IPython.core.display.Javascript object>

In [13]:
def window_document(
    file_name: str, document_text: str, window_size: int = 200, overlap: int = 50
):
    """
    Splits a document into overlapping windows of fixed size.

    A "title" is built from the pieces of ``file_name`` (split on ``.``, ``_``
    and ``-``) followed by the first ten words of the document's first line.
    The first full-size window is emitted without the title; every later
    window, and the single window of a document shorter than ``window_size``,
    is prefixed with it.

    Words are split on any whitespace, so leading/trailing whitespace never
    produces empty "words". A trailing window is only emitted when it contains
    at least one word that the previous window did not already cover.

    Args:
        file_name (str): Name of the source file; its parts go into the title.
        document_text (str): The document to split.
        window_size (int): The word size of each window.
        overlap (int): The amount of word overlap between adjacent windows.

    Returns:
        List[str]: A list of overlapping windows (empty if there are no words).

    Raises:
        ValueError: If ``window_size`` is not positive or ``overlap`` is not in
            ``[0, window_size)``; such values would otherwise never advance.
    """
    if window_size <= 0:
        raise ValueError("window_size must be a positive number of words")
    if not 0 <= overlap < window_size:
        raise ValueError("overlap must satisfy 0 <= overlap < window_size")

    words = document_text.split()
    if not words:
        return []

    name_parts = [part for part in re.split(r"[._-]+", file_name) if part]
    title = name_parts + document_text.split("\n")[0].split()[:10]

    step = window_size - overlap
    windows = []
    start = 0
    while start + window_size <= len(words):
        prefix = title if start != 0 else []
        windows.append(" ".join(prefix + words[start : start + window_size]))
        start += step

    if not windows:
        # Document shorter than one window: emit it whole, with the title.
        windows.append(" ".join(title + words))
    elif start + overlap < len(words):
        # The previous window ended at index ``start + overlap``; only emit a
        # remainder that reaches past it, otherwise it would be pure overlap.
        windows.append(" ".join(title + words[start:]))
    return windows

<IPython.core.display.Javascript object>

In [14]:
# Paths of the `.txt` knowledge documents, relative to the notebook's working
# directory. `glob` does not guarantee any particular ordering.
knowledge_files = glob("../examples/knowledge/*.txt")

<IPython.core.display.Javascript object>

In [15]:
# Map each knowledge file's name (everything before its last ".") to its text.
knowledge_doc_texts = {}
for kd in knowledge_files:
    knowledge_path = Path(kd)
    doc_key = ".".join(knowledge_path.name.split(".")[:-1])
    knowledge_doc_texts[doc_key] = knowledge_path.read_text()

<IPython.core.display.Javascript object>

In [16]:
# One row per passage: the id is "<file>_<part>", the passage text is the
# document, and the metadata records which file and part it came from.
knowledge_docs = pd.DataFrame(
    [
        {
            "ids": f"{doc_name}_{part}",
            "documents": passage,
            "metadatas": {"file": doc_name, "part": part},
        }
        for doc_name, doc_text in knowledge_doc_texts.items()
        for part, passage in enumerate(window_document(doc_name, doc_text))
    ]
)

<IPython.core.display.Javascript object>

In [17]:
# `to_dict(orient="list")` turns the frame into {"ids": [...], "documents": [...],
# "metadatas": [...]}, which is passed as keyword arguments to `collection.add`.
knowledge_vectorstore.collection.add(**knowledge_docs.to_dict(orient="list"))

<IPython.core.display.Javascript object>

In [18]:
# One record per metric row: the description is the indexed text, while the
# name, query and stringified dimensions travel along as metadata.
metric_records = []
for _row_label, metric_row in metrics.iterrows():
    metric_records.append(
        {
            "ids": str(metric_row["id"]),
            "documents": metric_row["description"],
            "metadatas": {
                "name": metric_row["name"],
                "query": metric_row["query"],
                "dimensions": str(metric_row["dimensions"]),
            },
        }
    )
metric_docs = pd.DataFrame(metric_records)

metrics_vectorstore.collection.add(**metric_docs.to_dict(orient="list"))

<IPython.core.display.Javascript object>

In [19]:
# One record per dimension row: the description is the indexed text, while the
# name, query and stringified metric list travel along as metadata.
dimension_docs = pd.DataFrame(
    [
        {
            "ids": str(d.node_revision_id),
            "documents": d.description,
            "metadatas": {
                "name": d["name"],
                "query": d.query,
                "metrics": str(d.metrics),
            },
        }
        for _, d in dimensions.iterrows()
    ]
)

# Index the dimension documents. The previous version passed `metric_docs`
# here, which filled the dimensions collection with metric descriptions.
dimensions_vectorstore.collection.add(**dimension_docs.to_dict(orient="list"))

<IPython.core.display.Javascript object>

In [20]:
T = TypeVar("T")


def required_value(message: str, return_type: Type[T]) -> Callable[[], T]:
    """
    Build a zero-argument factory that always fails with ``ValueError(message)``.

    Used as a dataclass ``default_factory`` for fields that must be supplied:
    omitting the field triggers the factory and therefore the error.
    ``return_type`` only serves the type checker and is not used at runtime.
    """

    def _fail() -> T:
        raise ValueError(message)

    return _fail


class Stringable(Protocol):
    """Structural type for objects that define ``__str__``."""

    def __str__(self) -> str:
        ...


async def cli_agent_input() -> AsyncGenerator[str, None]:
    """
    Yield lines typed on standard input until an empty line is entered.

    The previous loop tested a sentinel that was never reassigned, so the
    empty-line stop condition could never trigger. ``input()`` blocks the
    event loop while it waits for the user.
    """
    while True:
        user = input()
        if not user:
            return
        yield user


SchemaDict = Dict[str, Union[Type[str], Type[int], "SchemaDict"]]


@dataclass
class ToolSchema:
    """
    Input schema of a tool, compiled once into prompt-template text.

    ``schema_dict`` is a ``TypedDict`` class; its annotations map each key to
    ``str``, ``int`` or a nested dict of the same shape. Compilation produces:

    * ``body``  - a JSON-like template in which every leaf value is a
      bracketed placeholder, e.g. ``{"query": [NAME_QUERY_<uid>]}``.
    * ``code``  - ``body`` with the brackets removed, so it can be evaluated
      once the placeholder names are bound to values.
    * ``where`` - the per-placeholder constraints joined with ``" and "``.

    The placeholder names carry a random suffix generated at compile time, so
    the result is cached: ``body``, ``code`` and ``where`` always refer to the
    same names.
    """

    schema_dict: TypedDict
    _compiled: bool = field(init=False, default=False)
    _body: Optional[str] = field(init=False, default=None)
    _where: Optional[str] = field(init=False, default=None)

    @property
    def body(self):
        """Template text with bracketed placeholders."""
        self._compile()
        return self._body

    @property
    def code(self):
        """``body`` with every ``[`` and ``]`` removed."""
        self._compile()
        return self.body.replace("[", "").replace("]", "")

    @property
    def where(self):
        """Constraint expression covering every placeholder in ``body``."""
        self._compile()
        return self._where

    def _compile(self):
        """Populate ``_body`` and ``_where`` on first use; later calls are no-ops."""
        if self._compiled:
            return
        schema_dict = self.schema_dict.__annotations__
        if not schema_dict:
            self._body = ""
            self._where = ""
            self._compiled = True
            return
        where = []
        prefix = self.schema_dict.__name__ + "_"
        uid = str(uuid4()).replace("-", "")

        def _placeholder(key):
            # The same full name is used in the body and in the constraints.
            return (prefix + key).upper() + "_" + uid

        def _helper(
            schema,
            key,
        ):
            if schema == int:
                variable = _placeholder(key)
                where.append(f'INT({variable}) and STOPS_AT({variable}, ",")')
                return f"[{variable}]"
            if schema == str:
                variable = _placeholder(key)
                where.append(rf"STOPS_AT({variable}, '\"')")
                return f"[{variable}]"
            if not isinstance(schema, dict):
                raise Exception(f"Unnacceptable type in schema: `{schema}`")
            entries = []
            for child_key, child_schema in schema.items():
                if "[" in child_key or "]" in child_key:
                    raise Exception("schema keys cannot have `[` or `]`")
                rendered = _helper(child_schema, key=child_key)
                entries.append(f'"{child_key}": {rendered}')
            # join() also yields a well-formed "{}" for an empty nested dict.
            return "{" + ", ".join(entries) + "}"

        self._body = _helper(schema_dict, key="")
        self._where = " and ".join(where)
        self._compiled = True


@dataclass
class Tool:
    """
    Base class for tools an agent can call.

    ``default_description``, ``default_ref_name`` and ``input_schema`` are
    class-level attributes; the optional instance fields ``description_`` and
    ``ref_name_`` take precedence over the defaults whenever they are truthy.
    """

    default_description: ClassVar[str]
    default_ref_name: ClassVar[str]
    input_schema: ClassVar[ToolSchema]
    model_identifier: Optional[str] = None
    description_: Optional[str] = None
    ref_name_: Optional[str] = None

    @property
    def description(self):
        """Instance description if set, otherwise the class default."""
        if self.description_:
            return self.description_
        return self.default_description

    @property
    def ref_name(self):
        """Instance reference name if set, otherwise the class default."""
        if self.ref_name_:
            return self.ref_name_
        return self.default_ref_name

    async def __call__(self, input: dict) -> "Observation":
        """Run the tool; this base implementation always raises."""
        raise NotImplementedError()


@dataclass
class Utterance:
    """
    One piece of text in a conversation, optionally linked to the utterance
    that preceded it through ``parent``.

    ``str(utterance)`` is ``marker`` followed by ``utterance``.
    """

    utterance: str
    session_id_: Optional[UUID] = None
    marker: str = ""
    timestamp: datetime = field(default_factory=datetime.utcnow)
    context: str = ""
    parent: Optional["Utterance"] = None
    utterance_id: UUID = field(default_factory=uuid4)

    def __str__(self):
        return self.marker + self.utterance

    def history(self, n: Optional[int] = None) -> Generator:
        """
        Yield this utterance, then its parent, grandparent, and so on.

        Args:
            n: Maximum number of utterances to yield (this one included).
                ``None`` means no limit; ``0`` yields nothing.
        """
        remaining = float("inf") if n is None else n
        curr = self
        while remaining > 0 and curr is not None:
            yield curr
            # Step from the current node; stepping from `self` would revisit
            # the first parent indefinitely.
            curr = curr.parent
            remaining -= 1

    @property
    def session_id(self):
        """Own ``session_id_`` if set, otherwise the nearest ancestor's, else None."""
        if self.session_id_ is not None:
            return self.session_id_
        if self.parent is not None:
            return self.parent.session_id
        return None


@dataclass
class Observation(Utterance):
    """
    Value produced from a tool
    """

    # Every inherited field has a default, so `tool` needs one too; its factory
    # raises ValueError, which makes the field required in practice.
    tool: Tool = field(
        default_factory=required_value("`tool` is required for an Observation.", Tool)
    )


@dataclass
class VectorStoreMemory:
    """
    Conversation memory backed by a ``VectorStore`` collection.

    ``utterance`` anchors the memory to a session (through its
    ``session_id``); ``vector_store`` is created on first write when it was
    not supplied, using the session id as the collection name.
    """

    utterance: Optional[Utterance] = None
    vector_store: Optional[VectorStore] = None
    default_k: int = 3

    @property
    def session_id(self) -> Optional[UUID]:
        """Session id of the anchoring utterance, or None without one."""
        return self.utterance and self.utterance.session_id

    async def add_memories(self, utterances: List[Utterance]):
        """
        Store ``utterances`` in the collection.

        Raises:
            Exception: If this memory has a session id and any utterance
                belongs to a different session.
        """
        for utterance in utterances:
            if self.session_id is not None and utterance.session_id != self.session_id:
                raise Exception(
                    "utterances must belong to the same session as this memory!"
                )
        if self.vector_store is None:
            # `Chroma` was never defined in this notebook; `VectorStore` is the
            # wrapper the other cells use.
            self.vector_store = VectorStore(collection_name=str(self.session_id))
        if not utterances:
            return
        # Same keyword layout (ids / documents / metadatas) as the other
        # `collection.add` calls in this notebook.
        self.vector_store.collection.add(
            ids=[str(u.utterance_id) for u in utterances],
            documents=[u.utterance for u in utterances],
            metadatas=[
                {"session_id": str(u.session_id), "marker": u.marker}
                for u in utterances
            ],
        )

    async def search(self, query: str, k: Optional[int] = None):
        """
        Query the collection for the ``k`` nearest stored utterances.

        Returns:
            The raw result of ``collection.query``, or None when nothing has
            been stored yet (no vector store exists).
        """
        k = k or self.default_k
        if self.vector_store is None:
            return None
        return self.vector_store.collection.query(query_texts=[query], n_results=k)


@dataclass
class Agent:
    """
    Base class for agents.

    ``description`` and ``ref_name`` are class-level attributes supplied by
    subclasses. ``__call__`` raises ``NotImplementedError`` here and is meant
    to be overridden.
    """

    description: ClassVar[str]
    ref_name: ClassVar[str]
    # NOTE(review): annotated as a list of Tool classes, but BasicQAAgent reads
    # `tool.ref_name` (a property) and awaits `tool(input_dict)`, which looks
    # like instance usage - confirm whether this should be List[Tool].
    tools: List[Type[Tool]]
    model_identifier: str
    memory: Optional[VectorStoreMemory] = None
    # Prefixes used to label who is speaking.
    human_marker: str = "User: "
    agent_marker: str = "Agent: "

    async def __call__(self, input: AsyncGenerator[str, None]) -> Utterance:
        raise NotImplementedError()


@dataclass
class Thought(Utterance):
    """
    Value produced from an agent
    """

    # Every inherited field has a default, so `agent` needs one too; its
    # factory raises ValueError, which makes the field required in practice.
    agent: Agent = field(
        default_factory=required_value("`agent` is required for a Thought.", Agent)
    )
    # The annotation is what makes this a dataclass field override. Without it
    # the generated __init__ keeps the inherited default ("") and assigns that
    # on every instance, hiding the class attribute.
    marker: str = "Thought: "


@dataclass
class Answer(Utterance):
    """
    Final answer value produced from an agent
    """

    # Every inherited field has a default, so `agent` needs one too; its
    # factory raises ValueError, which makes the field required in practice.
    agent: Agent = field(
        default_factory=required_value("`agent` is required for a Answer.", Agent)
    )
    # The annotation is what makes this a dataclass field override. Without it
    # the generated __init__ keeps the inherited default ("") and assigns that
    # on every instance, hiding the class attribute.
    marker: str = "Answer: "

<IPython.core.display.Javascript object>

In [21]:
@dataclass
class KnowledgeSearchTool(Tool):
    """
    Tool that looks up passages in the ``knowledge`` collection.

    Attributes:
        n_docs: Number of passages requested per query.
        threshold: Not read anywhere in this class.
            NOTE(review): presumably a relevance cut-off - confirm or remove.
    """

    default_description = "Search for knowledge documents."
    default_ref_name = "knowledge_search"
    input_schema = ToolSchema(TypedDict("KnowledgeQuery", {"query": str}))
    n_docs: int = 3
    threshold: float = 0.0

    async def __call__(self, input) -> Observation:
        """
        Search for ``input["query"]`` and return the hits as an Observation
        whose text has one ``"<metadata>: <document>"`` line per hit.
        """
        query = input["query"]
        results = knowledge_vectorstore.collection.query(
            query_texts=[query], n_results=self.n_docs
        )
        # Query results hold one inner list per query text; a single query was
        # sent, so the hits live at index 0. Zipping the outer lists would
        # produce one line containing the entire result lists.
        metadatas = results["metadatas"][0]
        documents = results["documents"][0]
        res = "".join(f"{meta}: {doc}\n" for meta, doc in zip(metadatas, documents))
        return Observation(tool=self, utterance=res)

<IPython.core.display.Javascript object>

In [24]:
async def cli_input(session_id: UUID) -> AsyncGenerator[Utterance, Optional[Answer]]:
    """
    Prompt on the command line and yield each entry as a ``User: `` utterance.

    The generator ends on an empty entry or on ``KeyboardInterrupt``. A
    consumer may push a reply back with ``await gen.asend(answer)``; the reply
    is printed and acknowledged with a bare ``yield``, so the next iteration
    prompts for fresh input. Consumers that only iterate never see ``None``.

    An async generator cannot ``return`` a value (that is a SyntaxError), so
    the last answer is no longer returned when the generator finishes.
    ``input()`` blocks the event loop while waiting for the user.
    """
    while True:
        try:
            text = input("User: ")
            if not text:
                return
            user = Utterance(text, marker="User: ", session_id_=session_id)
            answer = yield user
            if answer is not None:
                print(answer)
                # Value handed back to the `asend` caller.
                yield
        except KeyboardInterrupt:
            return


class BasicQAAgent(Agent):
    """
    Question-answering agent whose reasoning step is written as an LMQL query.

    ``run`` handles one utterance; ``__call__`` feeds every utterance of an
    input generator through ``run`` and sends answers back to the generator.
    """

    description = "An agent that answers basic questions about metrics."
    ref_name = "basic_qa"

    # NOTE(review): the query text below still looks like a draft - the prompt
    # lines are unquoted, `{model_identifier}` and `{tool_conditions}` are
    # interpolated into the `from`/`where` clauses, and the observation built
    # on the "use tool" path is never returned. Confirm against the LMQL
    # version in use before relying on it.
    @lmql.query
    async def run(self, utterance: Utterance) -> Utterance:
        # Newline after the heading so the first tool is not glued to it.
        tools_prompt = "Here are the tools you choose from:\n" + "\n".join(
            f"{tool.ref_name}: {tool.description}" for tool in self.tools
        )
        tool_refs = {tool.ref_name: tool for tool in self.tools}
        tool_conditions = " and\n".join(tool.input_schema.where for tool in self.tools)
        # `history` yields the current utterance first and then its ancestors,
        # so reverse it for chronological order; chaining `[utterance]` on top
        # would repeat the current utterance.
        convo = "\n".join(str(u) for u in reversed(list(utterance.history(3)))) + "\n"
        """
        argmax
            The following is a conversation between a User and an AI Agent.
            The Agent is talkative and provides lots of specific details from its context.
            The Agent has Thoughts, uses Tools by providing Tool Input, and ultimately provides Answers.
            If the Agent does not know the answer to a question, it truthfully says it does not know. 
            The Agent always uses thoughful reasoning like so:

            Thought: use tool
            Tool: agent selects appropriate tool
            Tool Input: {...}
            ===
            Thought: final answer
            Answer: agent describes the answer
            ===
            Thought: no answer
            Answer: Agent explains why it could not find an answer

            {tools_prompt}

            Conversation:
            {convo}
            "Thought: [THOUGHT]"
            thought = Thought(utterance = THOUGHT, agent = self, parent = utterance)
            if THOUGHT == 'use tool':
                "Tool: [TOOL]\n"
                tool = tool_refs.get(TOOL)
                "Tool Input: {tool.input_schema.body}"
                input_dict = eval(tool.input_schema.code)
                observation = await tool(input_dict)
                observation.parent = thought
            elif THOUGHT == 'final answer':
                "Answer: [ANSWER]"
                return Answer(utterance = ANSWER, agent = self)

            else:
                return Answer(utterance = f"I cannot find an answer to `{utterance.utterance}`.", agent = self)



        from
            {model_identifier}
        where
            THOUGHT in ["use tool", "final answer", "no answer"] and
            TOOL in set(tool_refs.keys()) and
            STOPS_AT(THOUGHT, "\n") and
            STOPS_AT(TOOL, "\n") and
            {tool_conditions}
        """

    async def __call__(
        self, input: AsyncGenerator[Utterance, Optional[Answer]]
    ):
        """
        Answer every utterance produced by ``input``.

        ``input`` is an already-created async generator, so it is iterated
        directly rather than called. Answers are pushed back with ``asend``
        because async generators have no ``send`` method.
        """
        user = input
        async for utterance in user:
            if utterance is None:
                # Some generators emit a bare `yield` between utterances.
                continue
            response = await self.run(utterance)
            if isinstance(response, Answer):
                await user.asend(response)

<IPython.core.display.Javascript object>

In [None]:
import asyncio

async def cli_input(session_id: UUID) -> AsyncGenerator[Utterance, Optional[Answer]]:
    """
    Prompt on the command line and yield each entry as a ``User: `` utterance.

    The blocking ``input`` call runs in the default executor so the event loop
    stays responsive (``asyncio`` has no ``ainput`` function). The generator
    ends on an empty entry or on ``KeyboardInterrupt``. A consumer may push a
    reply back with ``await gen.asend(answer)``; the reply is printed and
    acknowledged with a bare ``yield``, so the next iteration prompts for
    fresh input.

    An async generator cannot ``return`` a value (that is a SyntaxError), so
    the last answer is no longer returned when the generator finishes.
    """
    running_loop = asyncio.get_running_loop()
    while True:
        try:
            text = await running_loop.run_in_executor(None, input, "User: ")
            if not text:
                return
            user = Utterance(text, marker="User: ", session_id_=session_id)
            answer = yield user
            if answer is not None:
                print(answer)
                # Value handed back to the `asend` caller.
                yield
        except KeyboardInterrupt:
            return


class BasicQAAgent(Agent):
    """
    Question-answering agent whose reasoning step is written as an LMQL query.

    ``run`` handles one utterance; ``__call__`` feeds every utterance of an
    input generator through ``run`` and sends answers back to the generator.
    """

    description = "An agent that answers basic questions about metrics."
    ref_name = "basic_qa"

    # NOTE(review): the query text below still looks like a draft - the prompt
    # lines are unquoted, `{model_identifier}` and `{tool_conditions}` are
    # interpolated into the `from`/`where` clauses, and the observation built
    # on the "use tool" path is never returned. Confirm against the LMQL
    # version in use before relying on it.
    @lmql.query
    async def run(self, utterance: Utterance) -> Utterance:
        # Newline after the heading so the first tool is not glued to it.
        tools_prompt = "Here are the tools you choose from:\n" + "\n".join(
            f"{tool.ref_name}: {tool.description}" for tool in self.tools
        )
        tool_refs = {tool.ref_name: tool for tool in self.tools}
        tool_conditions = " and\n".join(tool.input_schema.where for tool in self.tools)
        # `history` yields the current utterance first and then its ancestors,
        # so reverse it for chronological order; chaining `[utterance]` on top
        # would repeat the current utterance.
        convo = "\n".join(str(u) for u in reversed(list(utterance.history(3)))) + "\n"
        """
        argmax
            The following is a conversation between a User and an AI Agent.
            The Agent is talkative and provides lots of specific details from its context.
            The Agent has Thoughts, uses Tools by providing Tool Input, and ultimately provides Answers.
            If the Agent does not know the answer to a question, it truthfully says it does not know. 
            The Agent always uses thoughful reasoning like so:

            Thought: use tool
            Tool: agent selects appropriate tool
            Tool Input: {...}
            ===
            Thought: final answer
            Answer: agent describes the answer
            ===
            Thought: no answer
            Answer: Agent explains why it could not find an answer

            {tools_prompt}

            Conversation:
            {convo}
            "Thought: [THOUGHT]"
            thought = Thought(utterance = THOUGHT, agent = self, parent = utterance)
            if THOUGHT == 'use tool':
                "Tool: [TOOL]\n"
                tool = tool_refs.get(TOOL)
                "Tool Input: {tool.input_schema.body}"
                input_dict = eval(tool.input_schema.code)
                observation = await tool(input_dict)
                observation.parent = thought
            elif THOUGHT == 'final answer':
                "Answer: [ANSWER]"
                return Answer(utterance = ANSWER, agent = self)

            else:
                return Answer(utterance = f"I cannot find an answer to '{utterance.utterance}'.", agent = self)



        from
            {model_identifier}
        where
            THOUGHT in ["use tool", "final answer", "no answer"] and
            TOOL in set(tool_refs.keys()) and
            STOPS_AT(THOUGHT, "\n") and
            STOPS_AT(TOOL, "\n") and
            {tool_conditions}
        """

    async def __call__(
        self, input: AsyncGenerator[Utterance, Optional[Answer]]
    ):
        """
        Answer every utterance produced by ``input``.

        Answers are pushed back with ``asend`` because async generators have
        no ``send`` method.
        """
        user = input
        async for utterance in user:
            if utterance is None:
                # Some generators emit a bare `yield` between utterances.
                continue
            response = await self.run(utterance)
            if isinstance(response, Answer):
                await user.asend(response)

In [29]:
async def send_input(input_generator):
    """
    Forward lines typed by the user into ``input_generator`` until "quit".

    The blocking ``input`` call runs in the default executor of the running
    event loop, so no module-level ``loop`` variable is needed. Both plain
    generators (``send``) and async generators (``asend``) are supported. The
    function also stops quietly if the generator finishes.
    """
    running_loop = asyncio.get_running_loop()
    # Loop until the user enters "quit"
    while True:
        # Prompt the user for input
        value = await running_loop.run_in_executor(None, input, "Enter input: ")
        if value.strip() == "quit":
            break

        # Send input to the generator
        try:
            if hasattr(input_generator, "asend"):
                await input_generator.asend(value)
            else:
                input_generator.send(value)
        except (StopIteration, StopAsyncIteration):
            break

<IPython.core.display.Javascript object>

In [28]:
# Event loop handle used by `send_input` and the cell below.
# NOTE(review): the execution counts show this cell ran before `send_input` was
# defined although it sits after it - on a top-to-bottom run it comes too late.
loop = asyncio.get_event_loop()

<IPython.core.display.Javascript object>

In [None]:
# NOTE(review): `cli_input` is an `async def` containing `yield`, so calling it
# produces an async generator, while `run_until_complete` expects a future,
# coroutine or other awaitable. Confirm what this cell was meant to drive
# (for example an agent's `__call__` coroutine).
loop.run_until_complete(cli_input(uuid4()))

In [27]:
ipt = cli_input(uuid4())
async for user in ipt:
    print(user)
    ipt.send("fuck you")

User: hello
User: hello


<IPython.core.display.Javascript object>

In [None]:
from langchain.agents import initialize_agent, Tool
from langchain.agents import AgentType
from langchain.tools import BaseTool
from langchain.llms import OpenAI, GPT4All
from langchain.agents import (
    create_json_agent,
)
from langchain.agents.agent_toolkits import JsonToolkit
from langchain.tools.json.tool import JsonSpec
from langchain.agents import ZeroShotAgent, Tool, AgentExecutor, ConversationalChatAgent
from langchain import OpenAI, SerpAPIWrapper, LLMChain, ConversationChain
from langchain.memory import ConversationBufferMemory

from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI

from langchain.text_splitter import CharacterTextSplitter, Document
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain
import pandas as pd
import json
import re
from typing import Optional
from typing import Dict, Any, Tuple
from langchain.document_loaders import DataFrameLoader

from langchain.embeddings.huggingface import HuggingFaceEmbeddings
import requests
import os

# Same base URL as the definition near the top of the notebook.
DJ_URL = "http://localhost:8000"



<IPython.core.display.Javascript object>

In [12]:
# NOTE(review): `LCDoc` is not defined or imported in any cell shown here; the
# stored output prints a `Document`, so it is presumably an alias for
# langchain's `Document` - confirm, and import it explicitly.
LCDoc(page_content="hello")

Document(page_content='hello', metadata={})

<IPython.core.display.Javascript object>

In [18]:
# NOTE(review): the `VectorStore` class defined in this notebook has no
# `asimilarity_search` method, so this cell presumably dates from an earlier
# version of `metrics_vectorstore` - confirm, or query `.collection` instead.
(await metrics_vectorstore.asimilarity_search("repair"))[0]

Document(page_content='Average repair price', metadata={'id': 22, 'name': 'avg_repair_price', 'display_name': 'Avg Repair Price', 'current_version': 'v1.0', 'created_at': '2023-04-24T19:04:50.305986+00:00', 'updated_at': '2023-04-24T19:04:50.307624+00:00', 'query': 'SELECT avg(price) as avg_repair_price FROM repair_order_details', 'dimensions': "['dispatcher.company_name', 'dispatcher.dispatcher_id', 'dispatcher.phone', 'hard_hat.address', 'hard_hat.birth_date', 'hard_hat.city', 'hard_hat.contractor_id', 'hard_hat.country', 'hard_hat.first_name', 'hard_hat.hard_hat_id', 'hard_hat.hire_date', 'hard_hat.last_name', 'hard_hat.manager', 'hard_hat.postal_code', 'hard_hat.state', 'hard_hat.title', 'municipality_dim.contact_name', 'municipality_dim.contact_title', 'municipality_dim.local_region', 'municipality_dim.municipality_id', 'municipality_dim.municipality_type_desc', 'municipality_dim.municipality_type_id', 'municipality_dim.state_id', 'repair_order.dispatcher_id', 'repair_order.hard_h

<IPython.core.display.Javascript object>

In [20]:
# The dangling attribute access (`metrics_vectorstore.`) was a syntax error;
# display the store object instead.
metrics_vectorstore

<IPython.core.display.Javascript object>

In [75]:
from pydantic import BaseModel


class Person(BaseModel):
    """Minimal pydantic model used to demonstrate parsing from JSON."""

    name: str
    age: int


json_str = '{"name": "Alice", "age": 30}'

# Create an instance of the model from the JSON string (parsing validates it)
person = Person.parse_raw(json_str)

# Validate the input data. `validate` is a classmethod that takes the value to
# check, so it is called on the model class; `person.validate()` is missing
# that argument.
validated_person = Person.validate(person)

# Access the fields of the validated model
print(validated_person.name)  # Output: Alice
print(validated_person.age)  # Output: 30

<IPython.core.display.Javascript object>

In [278]:
import lmql

# Module-level value that the query's prompt interpolates as `{someone}`.
someone = "the Python interpreter"


# Minimal LMQL example: the docstring is the query itself. The prompt
# interpolates `someone` and the `someone_else` argument, fills the `WHO` hole
# subject to `len(WHO) < 20`, and returns `Utterance(WHO)`. The stored output
# below shows the awaited call producing a one-element list.
@lmql.query
async def greet(someone_else):
    """
    argmax
        "Greet {someone} and {someone_else}: [WHO]"
        return Utterance(WHO)
    from "openai/text-ada-001"
    where
        len(WHO) < 20
    """


(await greet("the World"))



[Utterance(utterance='\n\nHello, I am here!', session_id_=None, marker='', timestamp=datetime.datetime(2023, 4, 25, 19, 35, 8, 111501), context='', parent=None, utterance_id=UUID('73697e79-8add-42cc-9f2b-7088907d3532'))]

<IPython.core.display.Javascript object>

In [107]:
@dataclass
class LMQLMeta:
    """
    Builds the source code of an ``@lmql.query`` function from its parts.

    Attributes:
        model: Model identifier placed, quoted, in the ``from`` clause.
        body: Lines of the query body, joined with newlines.
        signature: Decorator and ``def`` line of the generated function.
        decode: Decoder keyword that opens the query.
        where: Constraints, joined with ``" and\\n"``; the clause is left out
            when the list is empty.
    """

    model: str
    body: List[str]
    signature: str = "@lmql.query\nasync def _lmql_meta():\n"
    decode: str = "argmax"
    where: List[str] = field(default_factory=list)

    def __str__(self):
        """Return the generated function's source code."""
        body = "\n".join(self.body)
        where = ("where" if self.where else "") + "\n\t" + " and\n".join(self.where)
        prompt = f"""
{self.signature}
    \"\"\"
    {self.decode}
        {body}
    from
        "{self.model}"
    {where}
    \"\"\"
    """
        return prompt

    def compile(self):
        """
        Execute the generated source and return the function it defines.

        The function name is taken from ``signature``. ``re.search`` is used
        because the signature starts with the decorator line rather than with
        ``def``.

        Raises:
            AssertionError: If no function name can be found in ``signature``.
        """
        import lmql

        match = re.search(r"def\s+([a-zA-Z_]\w*)\s*\(", self.signature)
        assert (
            match is not None
        ), "LMQLMeta signature does not have an identifiable function name."
        function_name = match.group(1)
        # Collect the definition in an explicit namespace: names that `exec`
        # writes into a function's `locals()` are not reliably visible
        # afterwards. Module globals stay available to the generated code.
        scope = {"lmql": lmql}
        exec(str(self), globals(), scope)
        return scope.get(function_name)

<IPython.core.display.Javascript object>