In [1]:
from langchain.chat_models import ChatOpenAI, ChatAnthropic
import pandas as pd
from io import StringIO
import json

In [2]:
# Load the evaluation cases; later cells read each item's "input" and "output" keys.
# JSON text is UTF-8, so decode it explicitly rather than relying on the
# platform's default encoding (which differs between operating systems).
with open("evals.json", encoding="utf-8") as f:
    data = json.load(f)

In [3]:
from langsmith import Client
client = Client()
dataset_name = "Plates Dataset"

# Drop any previous copy of the dataset so that re-running this cell does not
# add the same examples a second time.
try:
    client.delete_dataset(dataset_name=dataset_name)
except Exception:
    # Best-effort cleanup: presumably this fails only when the dataset does not
    # exist yet. Catching `Exception` (instead of a bare `except:`) still lets
    # KeyboardInterrupt / SystemExit propagate.
    pass
# Storing inputs in a dataset lets us
# run chains and LLMs over a shared set of examples.
dataset = client.create_dataset(
    dataset_name=dataset_name, description="Plates Data.",
)
for d in data:
    # Each example must be unique and have inputs defined.
    # Outputs are optional
    # The expected answer is stored as a JSON string so it can be compared
    # with the model's string output.
    client.create_example(
        inputs={"input": d["input"]},
        outputs={"output": json.dumps(d["output"])},
        dataset_id=dataset.id,
    )

In [4]:
from langchain.output_parsers.json import parse_json_markdown

In [5]:
import re
from typing import Any, Optional
import json

from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.evaluation import StringEvaluator


class JSONEvaluator(StringEvaluator):
    """String evaluator that checks whether two JSON lists hold the same items.

    Prediction and reference are both parsed with ``parse_json_markdown``,
    their items are sorted into a canonical order, and the sorted lists are
    compared for equality, so the order of items does not affect the score.

    Scores:
        1  -- prediction and reference contain the same items
        0  -- both sides parsed, but the items differ
        -1 -- parsing, sorting or comparing raised an exception
    """

    @property
    def requires_input(self) -> bool:
        """Declare that the evaluator wants the run input (the comparison itself ignores it)."""
        return True

    @property
    def requires_reference(self) -> bool:
        """Declare that a reference answer is required."""
        return True

    @property
    def evaluation_name(self) -> str:
        """Identifier of this evaluator."""
        return "json_equivalence"

    def _evaluate_strings(
        self,
        prediction: str,
        input: Optional[str] = None,
        reference: Optional[str] = None,
        **kwargs: Any
    ) -> dict:
        """Compare ``prediction`` with ``reference`` as order-insensitive JSON lists.

        Args:
            prediction: Model output containing JSON (optionally inside a
                markdown code fence).
            input: Unused.
            reference: Expected answer as a JSON string.
            **kwargs: Ignored.

        Returns:
            ``{"score": 1}``, ``{"score": 0}`` or ``{"score": -1}`` as described
            in the class docstring.
        """

        def _canonical_items(text):
            # sort_keys=True makes the sort key independent of the order in
            # which each item's keys were written. Without it, two equal
            # dicts serialised with different key order could land at
            # different positions and turn a correct answer into a mismatch.
            return sorted(
                parse_json_markdown(text),
                key=lambda item: json.dumps(item, sort_keys=True),
            )

        try:
            pred = _canonical_items(prediction)
            ref = _canonical_items(reference)
            return {"score": 1 if pred == ref else 0}
        except Exception:
            # Unparseable / unsortable output is reported as -1 rather than
            # aborting the evaluation run. `Exception` (not bare `except:`)
            # keeps KeyboardInterrupt and SystemExit working.
            return {"score": -1}

In [6]:
from langchain.prompts import ChatPromptTemplate
from langchain.schema import SystemMessage, HumanMessage, AIMessage
from langchain.schema.output_parser import StrOutputParser

In [7]:
# System prompt: tells the model how to recognise plate-shaped regions in
# tabular data and asks for a JSON list of 0-indexed row/column bounds plus a
# `contents` label for each plate.
FIND_PLATES_PLATE_ORIENTATION = """Your job is to identify where in a dataframe plate data exists

Plate based data is always rectangular in nature, typically consisting of 24 (4x6), 96 (8x12), 384 (16x24), or 1536 (32 x 48) wells, and may be located anywhere within a dataset.
- There may be gaps in the data or surrounding rows and columns
- The plate data may be partial. This means that there may be columns or rows that have missing values. These still count and you should make sure to include them
- There may be multiple plates.
- Use heuristics, like looking for numbers and patterns consistent with plate dimensions.
- The sequence of the wells might also be a clue, as they are usually arranged in rows or columns and numbered sequentially.
- The header row for each plate (often consisting of monotonically increasing integers) should NOT be considered as the start of the plate.
- Likewise, the column with the column labels should NOT be considered as the start of the plate.
- Each chunk of data might contain multiple plates and there may be multiple chunks.

CONTENTS:

Use the `contents` field to describe the contents of the plate.
You can often look for text near the table to induce the `contents` field. You should use this text EXACTLY, do not add or change this in any way.

PARTIAL DATA:

Often times, you may have partial data. You can tell if you have partial data when a header row is longer than the plate data.

OUTPUT FORMATTING:

Use 0-indexing for row and column numbers in the final JSON output.

BEGIN!

Where are the plates in this data?

Be concise.

Produce your output as JSON. The format should be:
```json
[{"row_start": 25, "row_end": 30, "col_start": 1, "col_end": 12, "contents": "Entity ID"}]
```
""" 
# Sample CSV sheet with four small plates (Chemical, Control, Dox Concentration,
# Primer); row C of the Chemical plate is left empty.
USER_PLATE_ORIENTATION = """,,,,\nChemical,,1,2,3\n,A,SB123,SB124,SB125\n,B,SB123,SB124,SB126\n,C,,,\n,D,SB123,SB124,SB128\n,Control,1,2,3\n,A,Negative,Positive,Library\n,B,Negative,Positive,Library\n,C,Negative,Positive,Library\n,D,Negative,Positive,Library\nDox Concentration,,,,\n,1,2,3,\nA,0.005,0.005,0.005,\nB,0.005,0.005,0.005,\nC,0.005,0.005,0.005,\nD,0.005,0.005,0.005,\n,,,,\n,,,,\nPrimer,1,2,3,\nA,PR-001,PR-001,PR-001,\nB,PR-002,PR-002,PR-002,\nC,PR-003,PR-003,PR-003,\nD,PR-004,PR-004,PR-004,\n"""  # noqa: E501

# Hand-written answer paired with USER_PLATE_ORIENTATION; in this notebook it is
# only referenced from commented-out code.
# NOTE(review): every row bound here is one lower than in the model output
# recorded for USER_PLATE_ORIENTATION later in the notebook (2-5, 7-10, 13-16,
# 20-23) -- confirm which row indexing this label is meant to follow.
ASSISTANT_PLATE_ORIENTATION = """
[{"row_start": 1, "row_end": 4, "col_start": 2, "col_end": 4, "contents": "Chemical"}, {"row_start": 6, "row_end": 9, "col_start": 2, "col_end": 4, "contents": "Control"},{"row_start": 12, "row_end": 15, "col_start": 1, "col_end": 3, "contents": "Dox Concentration"},{"row_start": 19, "row_end": 22, "col_start": 1, "col_end": 3, "contents": "Primer"}]
"""  # noqa: E501


In [44]:
# Synthetic few-shot case 1: fourteen columns whose last three rows are empty,
# followed by one entirely empty column. "bar" is prepended to the CSV text.
_bar_columns = {str(col): [1, 2, None, None, None] for col in range(14)}
_bar_columns[14] = [None] * 5
example1 = "bar" + pd.DataFrame(_bar_columns).to_csv()
label1 = '[{"row_start": 1, "row_end": 5, "col_start":1, "col_end": 15, "contents":"bar"}]'

# Synthetic few-shot case 2: two filled columns followed by two empty ones.
# "foo" is prepended to the CSV text.
_foo_columns = {
    "A": [1, 2, 3, 4, 5],
    "B": [3, 4, 5, 6, 7],
    "C": [None] * 5,
    "D": [None] * 5,
}
example2 = "foo" + pd.DataFrame(_foo_columns).to_csv()
label2 = '[{"row_start": 1, "row_end": 5, "col_start":1, "col_end": 4, "contents":"foo"}]'

In [78]:
def _encode(_str):
    _df = pd.read_csv(StringIO(_str), header=None)
    _df.columns=[f"{i}" for i in _df.columns]
    return _df.to_csv() + f"\n\nmax row_end: {_df.shape[0] - 1}; max col_end: {_df.shape[1] - 1}" 

In [79]:
# Few-shot demonstrations: each synthetic CSV goes through the same `_encode`
# step as real inputs and is immediately followed by its expected JSON answer.
examples = [
    message
    for raw_csv, expected_json in ((example1, label1), (example2, label2))
    for message in (
        HumanMessage(content=_encode(raw_csv), example=True),
        AIMessage(content=expected_json, example=True),
    )
]
# Disabled experiment: a real eval sample as an additional example pair.
#HumanMessage(content=pd.read_csv(StringIO(data[1]['input']), header=None).to_markdown(),example=True),
#AIMessage(content = str(data[1]['output']),example=True),

In [80]:
# Message order: system instructions, the few-shot pairs, then the templated
# user input.
prompt_messages = [
    SystemMessage(content=FIND_PLATES_PLATE_ORIENTATION),
    *examples,
    ("human", "{input}"),
]
prompt = ChatPromptTemplate.from_messages(prompt_messages)
# Disabled experiment: the partial-data sheet as one more example pair, placed
# just before the user input.
#HumanMessage(content="This has an example with partial data: " + pd.read_csv(StringIO(USER_PLATE_ORIENTATION), header=None).to_markdown(), example=True),
#AIMessage(content=ASSISTANT_PLATE_ORIENTATION, example=True),

In [81]:
# gpt-4 is tried first; gpt-3.5-turbo-16k is registered as the fallback model.
primary_llm = ChatOpenAI(temperature=0, model="gpt-4")
fallback_llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k")
llm = primary_llm.with_fallbacks([fallback_llm])
#llm = ChatAnthropic(temperature=0, model="claude-2")

In [82]:
# Pipeline: encode the raw CSV -> fill the prompt -> call the model -> plain string.
encode_step = {"input": lambda x: _encode(x["input"])}
chain = encode_step | prompt | llm | StrOutputParser()

In [85]:
# Run the chain on the eval sample at index 1 and display the raw string answer.
chain.invoke({"input": data[1]['input']})

'[{"row_start": 2, "row_end": 9, "col_start": 3, "col_end": 14, "contents": "Chemical"}, {"row_start": 14, "row_end": 21, "col_start": 3, "col_end": 14, "contents": "Control"}]'

In [86]:
# Stored expected answer for the eval sample at index 1.
data[1]['output']

[{'row_start': 2,
  'row_end': 9,
  'col_start': 3,
  'col_end': 14,
  'contents': 'Chemical'},
 {'row_start': 14,
  'row_end': 21,
  'col_start': 3,
  'col_end': 14,
  'contents': 'Control'}]

In [77]:
# Show the encoded text produced by `_encode` for the eval sample at index 2.
print(_encode(data[2]['input']))

max row_end: 19; max col_end: 12

,0,1,2,3,4,5,6,7,8,9,10,11,12
0,Chemical,,,,,,,,,,,,
1,,1,2,3,4,5,6,7,8,9,10,11,12
2,A,SB-001,SB-002,SB-023,SB-024,SB-025,SB-026,SB-027,SB-028,SB-029,SB-030,SB-031,SB-032
3,B,SB-001,SB-002,SB-023,SB-024,SB-025,SB-026,SB-027,SB-028,SB-029,SB-030,SB-031,SB-032
4,C,SB-001,SB-002,SB-023,SB-024,SB-025,SB-026,SB-027,SB-028,SB-029,SB-030,SB-031,SB-032
5,D,SB-001,SB-002,SB-023,SB-024,SB-025,SB-026,SB-027,SB-028,SB-029,SB-030,SB-031,SB-032
6,E,SB-001,SB-002,SB-023,SB-024,SB-025,SB-026,SB-027,SB-028,SB-029,SB-030,SB-031,SB-032
7,F,SB-001,SB-002,SB-023,SB-024,SB-025,SB-026,SB-027,SB-028,SB-029,SB-030,SB-031,SB-032
8,G,SB-001,SB-002,SB-023,SB-024,SB-025,SB-026,SB-027,SB-028,SB-029,SB-030,SB-031,SB-032
9,H,SB-001,SB-002,SB-023,SB-024,SB-025,SB-026,SB-027,SB-028,SB-029,SB-030,SB-031,SB-032
10,Control,,,,,,,,,,,,
11,,1,2,3,4,5,6,7,8,9,10,11,12
12,A,Negative,Positive,Library,Library,Library,Library,Library,Library,Library,Library,Library,Library
13,B,Negative,Positive

In [257]:
# Display the eval sample at index 1 as a DataFrame, parsed without a header row.
pd.read_csv(StringIO(data[1]['input']), header=None)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,,,,,,,,,,,,,,,
1,,Chemical,,1,2,3,4,5,6,7,8,9,10,11,12
2,,,A,SB-001,SB-002,SB-013,SB-014,SB-015,SB-016,SB-017,SB-018,SB-019,SB-020,SB-021,SB-022
3,,,B,SB-001,SB-002,SB-013,SB-014,SB-015,SB-016,SB-017,SB-018,SB-019,SB-020,SB-021,SB-022
4,,,C,SB-001,SB-002,SB-013,SB-014,SB-015,SB-016,SB-017,SB-018,SB-019,SB-020,SB-021,SB-022
5,,,D,SB-001,SB-002,SB-013,SB-014,SB-015,SB-016,SB-017,SB-018,SB-019,SB-020,SB-021,SB-022
6,,,E,SB-001,SB-002,SB-013,SB-014,SB-015,SB-016,SB-017,SB-018,SB-019,SB-020,SB-021,SB-022
7,,,F,,,,,,,,,,,,
8,,,G,,,,,,,,,,,,
9,,,H,,,,,,,,,,,,


In [73]:
from langchain.smith import RunEvalConfig, run_on_dataset

def create_chain():
    """Chain factory passed to ``run_on_dataset``; returns the module-level ``chain``."""
    return chain

# Run the chain over the dataset, scoring the runs with the custom JSONEvaluator.
eval_config = RunEvalConfig(custom_evaluators=[JSONEvaluator()])
run_on_dataset(
    client=client,
    dataset_name=dataset_name,
    llm_or_chain_factory=create_chain,
    evaluation=eval_config,
    verbose=True,
)

View the evaluation results for project '4caa8d9cf54b4427a601a80953935948-RunnableSequence' at:
https://dev.langchain.plus/projects/p/ba80a6d6-aceb-43c6-94e0-980f9ec3f480?eval=true


{'project_name': '4caa8d9cf54b4427a601a80953935948-RunnableSequence',
 'results': <Task pending name='Task-5' coro=<_arun_on_examples() running at /Users/harrisonchase/.pyenv/versions/3.10.1/envs/plate-chain/lib/python3.10/site-packages/langchain/smith/evaluation/runner_utils.py:816>>}

Processed examples: 8

In [71]:
chain.invoke({"input": USER_PLATE_ORIENTATION})

AIMessage(content='```json\n[\n  {"row_start": 2, "row_end": 5, "col_start": 2, "col_end": 4, "contents": "Chemical"},\n  {"row_start": 7, "row_end": 10, "col_start": 2, "col_end": 4, "contents": "Control"},\n  {"row_start": 13, "row_end": 16, "col_start": 1, "col_end": 3, "contents": "Dox Concentration"},\n  {"row_start": 20, "row_end": 23, "col_start": 1, "col_end": 3, "contents": "Primer"}\n]\n```', additional_kwargs={}, example=False)

In [35]:
from langchain.chains import create_extraction_chain

In [36]:
# Extraction schema: four integer bounds plus a free-text `contents` label,
# all of them required.
_BOUND_FIELDS = ("row_start", "row_end", "col_start", "col_end")
schema = {
    "properties": {
        **{field: {"type": "integer"} for field in _BOUND_FIELDS},
        "contents": {"type": "string"},
    },
    "required": [*_BOUND_FIELDS, "contents"],
}

In [37]:
# NOTE: this rebinds `chain`, replacing the prompt-based chain built earlier.
extraction_llm = ChatOpenAI(temperature=0, model="gpt-4")
chain = create_extraction_chain(schema, extraction_llm)

In [39]:
# `df` was never defined in this notebook, so this cell only worked because of
# leftover kernel state. Build the frame here so the cell runs on a fresh kernel.
# NOTE(review): the sample sheet USER_PLATE_ORIENTATION is assumed to be the
# intended input -- confirm.
df = pd.read_csv(StringIO(USER_PLATE_ORIENTATION), header=None)
output = chain.invoke({"input": df.to_markdown()})

In [43]:
from kor import Object, Text, Number
from kor import create_extraction_chain as kor_extract


In [51]:
# Guidance text handed to kor as the description of the `plate_data` object.
plate_data_description = """Plate based data is always rectangular in nature, typically consisting of 24 (4x6), 96 (8x12), 384 (16x24), or 1536 (32 x 48) wells, and may be located anywhere within a dataset.
- There may be gaps in the data or surrounding rows and columns
- The plate data may be partial
- There may be multiple plates.
- Use heuristics, like looking for numbers and patterns consistent with plate dimensions.
- The sequence of the wells might also be a clue, as they are usually arranged in rows or columns and numbered sequentially.
- The header row for each plate (often consisting of monotonically increasing integers) should NOT be considered as the start of the plate.
- Likewise, the column with the column labels should NOT be considered as the start of the plate.

Each chunk of data might contain multiple plates and there may be multiple chunks.
Use the `contents` field to describe the contents of the plate.
You can often look for text near the table to induce the `contents` field. You should use this text EXACTLY, do not add or change this in any way.

Use 0-indexing for row and column numbers in the final output.
"""

# (id, description) of the four numeric bounds, in declaration order.
_numeric_bounds = [
    ("row_start", "Start of plate data"),
    ("row_end", "End of plate data"),
    ("col_start", "Start of plate data"),
    ("col_end", "End of plate data"),
]

# kor schema: a repeated (`many=True`) object with four numeric bounds and a
# text label; no examples are supplied for any attribute.
schema = Object(
    id="plate_data",
    description=plate_data_description,
    attributes=[
        *(
            Number(id=field_id, description=field_text, examples=[])
            for field_id, field_text in _numeric_bounds
        ),
        Text(id="contents", description="contents of plate data", examples=[]),
    ],
    many=True,
)

# NOTE: rebinds `chain` to the kor extraction chain (JSON encoder).
kor_llm = ChatOpenAI(temperature=0, model="gpt-4")
chain = kor_extract(kor_llm, schema, encoder_or_encoder_class='json')

In [52]:
# `df` was never defined in this notebook, so this cell only worked because of
# leftover kernel state. Build the frame here so the cell runs on a fresh kernel.
# NOTE(review): the sample sheet USER_PLATE_ORIENTATION is assumed to be the
# intended input -- confirm.
df = pd.read_csv(StringIO(USER_PLATE_ORIENTATION), header=None)
chain.run(df.to_markdown())

{'data': {},
 'raw': '<json>[\n{\n"row_start": 2,\n"row_end": 5,\n"col_start": 2,\n"col_end": 4,\n"contents": "Chemical"\n},\n{\n"row_start": 7,\n"row_end": 10,\n"col_start": 2,\n"col_end": 4,\n"contents": "Control"\n},\n{\n"row_start": 13,\n"row_end": 16,\n"col_start": 1,\n"col_end": 3,\n"contents": "Dox Concentration"\n},\n{\n"row_start": 20,\n"row_end": 23,\n"col_start": 1,\n"col_end": 3,\n"contents": "Primer"\n}\n]</json>',
 'errors': [kor.exceptions.ParseError('The LLM has returned structured data which does not match the expected schema. Providing additional examples may help improve the parse.')],
 'validated_data': {}}

In [50]:
import json
# Sanity check: parse a JSON list literal with the standard-library parser and
# display the resulting Python objects.
json.loads('[\n{\n"row_start": 2,\n"row_end": 5,\n"col_start": 2,\n"col_end": 4,\n"contents": "Chemical"\n},\n{\n"row_start": 7,\n"row_end": 10,\n"col_start": 2,\n"col_end": 4,\n"contents": "Control"\n},\n{\n"row_start": 13,\n"row_end": 16,\n"col_start": 1,\n"col_end": 3,\n"contents": "Dox Concentration"\n},\n{\n"row_start": 20,\n"row_end": 23,\n"col_start": 1,\n"col_end": 3,\n"contents": "Primer"\n}\n]')

[{'row_start': 2,
  'row_end': 5,
  'col_start': 2,
  'col_end': 4,
  'contents': 'Chemical'},
 {'row_start': 7,
  'row_end': 10,
  'col_start': 2,
  'col_end': 4,
  'contents': 'Control'},
 {'row_start': 13,
  'row_end': 16,
  'col_start': 1,
  'col_end': 3,
  'contents': 'Dox Concentration'},
 {'row_start': 20,
  'row_end': 23,
  'col_start': 1,
  'col_end': 3,
  'contents': 'Primer'}]

In [202]:
from langchain.schema.runnable import RunnableMap

# Two otherwise identical prompt -> model chains; chain1 additionally starts
# with a RunnableMap step that maps the "bar" input onto the "foo" variable.
rename_step = RunnableMap({"foo": lambda x: x["bar"]})
chain1 = rename_step | ChatPromptTemplate.from_template("{foo}") | ChatOpenAI()
chain2 = ChatPromptTemplate.from_template("{foo}") | ChatOpenAI()

In [211]:
%%time
# Time a single call through chain1 (the variant with the RunnableMap step in front).
chain1.invoke({"bar": "hi"})

CPU times: user 56.1 ms, sys: 12.6 ms, total: 68.7 ms
Wall time: 2.21 s


AIMessage(content='Hello! How can I assist you today?', additional_kwargs={}, example=False)

In [215]:
%%time
# Time a single call through chain2 (prompt and model only).
chain2.invoke({"foo": "hi"})

CPU times: user 36.4 ms, sys: 8.47 ms, total: 44.9 ms
Wall time: 934 ms


AIMessage(content='Hello! How can I assist you today?', additional_kwargs={}, example=False)