In [None]:
!pip install sqlalchemy pandas langchain anthropic openai

### 1. Connect to DB and Load Queries

In [40]:
from sqlalchemy import create_engine
import pandas as pd

import os

# Read the connection string from the environment so credentials are never
# stored in (or leaked through) the notebook file.
# Expected form: postgresql://USER:PASSWORD@HOST:PORT/DBNAME
DATABASE_URL = os.environ.get("DATABASE_URL")
if not DATABASE_URL:
    raise RuntimeError(
        "Set the DATABASE_URL environment variable before running this cell."
    )

engine = create_engine(DATABASE_URL)

# Bind `conn` up front so a failed connect leaves None instead of an undefined
# name (or a stale connection from an earlier run of this cell).
conn = None
try:
    conn = engine.connect()
    print("Successfully connected to the database.")
except Exception as e:
    print("Unable to connect to the database.")
    print(e)


Successfully connected to the database.


In [41]:
def run_query(query, conn):
    """Run a SQL query and return the result as a DataFrame.

    Args:
        query: SQL text handed to ``pd.read_sql_query``.
        conn: Open SQLAlchemy connection to execute against.

    Returns:
        The DataFrame produced by ``pd.read_sql_query``.

    Raises:
        sqlalchemy.exc.SQLAlchemyError: Re-raised after the error has been
            printed and the connection rolled back.
    """
    from sqlalchemy import exc
    try:
        return pd.read_sql_query(query, conn)
    except exc.SQLAlchemyError as e:
        print(e)
        # Roll back so the connection stays usable, then propagate the real
        # database error. Falling through to a `return df` here would raise
        # UnboundLocalError because `df` is never assigned on this path.
        conn.rollback()
        raise

In [42]:
# Fetch the 30 highest-id 'chat-openai' rows (prompt and answer columns only).
query = """
SELECT prompt, answer FROM chatbot_slackquery
WHERE command='chat-openai'
ORDER BY id DESC
LIMIT 30;
"""

df = run_query(query, conn)
# Keep only the rows whose index is congruent to 1 modulo 3, as a list of dicts.
queries = df.loc[df.index % 3 == 1].to_dict(orient='records')

In [43]:
# Split each stored prompt on its 'Human: ' markers: the third piece becomes
# the context and the text after 'Question: ' in the fourth piece becomes the
# question. The raw prompt is then dropped from the record.
# NOTE(review): this mutates `queries` in place, so the cell cannot be re-run
# without reloading the data first.
for query in queries:
    pieces = query['prompt'].split('Human: ', 4)
    query['context'] = pieces[2].strip()
    query['question'] = pieces[3].split('Question: ')[1].strip()
    del query['prompt']
    print(query['question'])

What amount of equity should be given to an adviser?
Which investor should the founders of Chief, Childers and Kaplan, choose to collaborate with and why — the male venture capitalist or the female venture capitalist?
Can you describe the most appealing aspects of the Squire business model? Additionally, can you explain why the founders encountered difficulties in raising their seed round?
Could you provide a summary of the key issues in the Khatabook case? Please answer the assignment questions to the best of your ability, making educated guesses where necessary, and provide as much detail as possible.
What is the key decision point for the founders of Bubble?
What should be my considerations when determining if a startup is an appropriate early stage investment opportunity?
can you tell me what is the first topic in the course we'll cover?
Who is Christina Wallace?
How would you evaluate the founder-market fit of the team at Shippo? Could you provide specific details regarding their 

In [None]:
import json
# Persist the parsed query records as JSON in the working directory.
with open("queries.json", mode="w") as out_file:
    json.dump(queries, out_file)

In [48]:
import json
# Replace `queries` with the records stored in queries_alt.json.
# NOTE(review): later cells read keys such as 'context_alt_1' and 'change'
# from these records; presumably this file was edited to add them — confirm.
with open("queries_alt.json", mode="r") as in_file:
    queries = json.loads(in_file.read())

### 2. Full Context Comparison Evaluator

In [49]:
from dataclasses import dataclass, asdict
import openai
from typing import Tuple

@dataclass
class LLMRequest:
    """Payload that is converted with ``asdict`` and JSON-encoded into the evaluator's user message."""
    # Description of the goal the responses are judged against.
    objective: str
    # Input variables given to the evaluated agent (e.g. {"question": ...}).
    inputs: dict[str, str]
    # One {"name": ..., "response": ...} entry per candidate response.
    responses: list[dict[str, str]]

def _execute_comparator(
    objective: str,
    input_variables: dict[str, str],
    older_checkpoint_response: str,
    newer_checkpoint_response: str,
) -> Tuple[str, str]:
    """Ask GPT-4 which of two responses better satisfies an objective.

    The older response is presented under the name "Andrew" and the newer one
    under the name "Barry". The model is forced to reply through the
    ``report_decision`` function call.

    Args:
        objective: Description of what the responses are trying to achieve.
        input_variables: Inputs that produced the responses.
        older_checkpoint_response: Response shown to the model as "Andrew".
        newer_checkpoint_response: Response shown to the model as "Barry".

    Returns:
        ``(decision, reason)`` taken from the function-call arguments; the
        schema asks for a decision of "Andrew", "Barry" or "Same".

    Raises:
        AssertionError: If the reply carries message content or calls a
            function other than ``report_decision``.
    """
    system_msg = "You are an evaluator trying to grade the response of two agents based on provided JSON data. Keeping the objectives and the inputs in mind, decide which response is better and provide a reason. You always use the function provided."

    # Neutral names hide which response comes from which checkpoint.
    candidates = [
        {"name": "Andrew", "response": older_checkpoint_response},
        {"name": "Barry", "response": newer_checkpoint_response},
    ]
    payload = asdict(
        LLMRequest(objective=objective, inputs=input_variables, responses=candidates)
    )
    user_msg = f"""{json.dumps(payload)}

    Given the above objective, inputs and responses, report your decision on the best response along with the reasoning. Think step by step.
    """

    # JSON-schema description of the single function the model must call.
    decision_property = {
        "type": "string",
        "enum": ["Andrew", "Barry", "Same"],
        "description": "The name of the agent with the best response. 'Same' if both are equally good.",
    }
    reason_property = {
        "type": "string",
        "description": "The reason for the decision.",
    }
    functions = [
        {
            "name": "report_decision",
            "description": "report the results of the evaluation",
            "parameters": {
                "type": "object",
                "properties": {
                    "decision": decision_property,
                    "reason": reason_property,
                },
                "required": ["decision", "reason"],
            },
        }
    ]

    completion = openai.ChatCompletion.create(  # type: ignore
        model="gpt-4",
        temperature=0.1,
        messages=[
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_msg},
        ],
        functions=functions,
        function_call={"name": "report_decision"},
    )
    message = completion["choices"][0]["message"]

    # A forced function call should come back with no free-text content.
    assert message["content"] is None
    assert message["function_call"]["name"] == "report_decision"

    verdict = json.loads(message["function_call"]["arguments"])
    return verdict["decision"], verdict["reason"]

def run_evaluation(query, context_a, context_b):
    """Compare two retrieved contexts for one question.

    ``context_a`` is passed to the comparator as the older response and
    ``context_b`` as the newer one.

    Args:
        query: The user question the contexts were retrieved for.
        context_a: First candidate context.
        context_b: Second candidate context.

    Returns:
        Whatever ``_execute_comparator`` returns.

    NOTE(review): a later cell in this notebook defines another
    ``run_evaluation`` with the same name, so which one is active depends on
    cell execution order.
    """
    objective = "This agent tries to generate a context based on a query. Verify that the agent selects the best context for the query."
    inputs = {"question": query}
    return _execute_comparator(
        objective=objective,
        input_variables=inputs,
        older_checkpoint_response=context_a,
        newer_checkpoint_response=context_b,
    )

In [68]:
# Compare each query's alternative context ('context_alt_1', shown as Andrew)
# against its original context (shown as Barry) and count unexpected verdicts.
err_count = 0
for idx, query in enumerate(queries):
    comparison_key = 'context_alt_1'
    if comparison_key not in query:
        continue  # this record has no alternative context to compare

    print("\n\n****************************\n")
    print(f"Question [{idx}]: {query['question']}\n")

    # A pure reorder should be judged equal; any other change should favour
    # the original context (Barry).
    correct_decision = 'Same' if query['change'] == 'reorder' else 'Barry'

    decision, reason = run_evaluation(
        query=query['question'],
        context_a=query[comparison_key],
        context_b=query['context'],
    )
    print(f"Expected [{idx}]: {correct_decision}")
    print(f"Actual   [{idx}]: {decision}")
    print(f"Reason   [{idx}]: {reason}")

    if correct_decision in decision:
        continue
    err_count += 1
    print("⚠️⚠️⚠️⚠️⚠️⚠️ ERROR ⚠️⚠️⚠️⚠️⚠️⚠️")

print(f"\n\nTotal Errors: {err_count}")



****************************

Question [0]: What amount of equity should be given to an adviser?

Expected [0]: Barry
Actual   [0]: Same
Reason   [0]: Both agents, Andrew and Barry, provided comprehensive and relevant responses to the question about the amount of equity to be given to an adviser. They both sourced their information from credible sources and provided a detailed context for the query. Therefore, it's difficult to determine which one is better as they both meet the objectives equally well.
⚠️⚠️⚠️⚠️⚠️⚠️ ERROR ⚠️⚠️⚠️⚠️⚠️⚠️


****************************

Question [1]: Which investor should the founders of Chief, Childers and Kaplan, choose to collaborate with and why — the male venture capitalist or the female venture capitalist?

Expected [1]: Barry
Actual   [1]: Same
Reason   [1]: Both agents, Andrew and Barry, provided the same response. They both extracted relevant information from the same sources, providing a comprehensive context for the query. Therefore, it's impo

### 3. Summarized Context Comparison Evaluator

In [60]:
from dataclasses import dataclass, asdict
import openai
from typing import Tuple

@dataclass
class SummaryRequest:
    """Payload that is converted with ``asdict`` and JSON-encoded into the summarizer's user message."""
    # Inputs the summary should stay relevant to (e.g. {"question": ...}).
    inputs: dict[str, str]
    # The text to be summarized.
    content: str

def _execute_summarizer(
    input_variables: dict[str, str],
    content: str,
) -> str:
    """Summarize ``content`` with gpt-3.5-turbo-16k, keeping what matters for the inputs.

    Args:
        input_variables: Inputs the summary should stay relevant to.
        content: The text to summarize.

    Returns:
        The message content of the first completion choice. This is a single
        string, not a tuple.
    """
    system_msg = "You are a summarizer that summarizes content within the context of the input parameters. You provide a detailed summary that does not leave out any pieces of information that could be used in the context of the input. If you are unsure, you include the content in your summary."
    summary_request = SummaryRequest(inputs=input_variables, content=content)
    user_msg = f"""{json.dumps(asdict(summary_request))}
    Tip: Be sure to leave in all the pieces of information that could be used in the context of the input. If you are unsure, you include the content in your summary.
    """

    completion = openai.ChatCompletion.create(  # type: ignore
        model="gpt-3.5-turbo-16k",
        temperature=0.1,
        messages=[
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_msg},
        ],
    )
    return completion["choices"][0]["message"]["content"]

def run_summarizer(query, context):
    """Summarize ``context`` with respect to the question ``query``.

    Args:
        query: The user question the summary should stay relevant to.
        context: The retrieved context text to summarize.

    Returns:
        Whatever ``_execute_summarizer`` returns.
    """
    inputs = {"question": query}
    return _execute_summarizer(input_variables=inputs, content=context)

In [67]:
# Same comparison as the full-context evaluation, but both contexts are
# summarized first. Here context_a is the summarized alternative (Andrew) and
# context_b the summarized original (Barry).
err_count = 0
for idx, query in enumerate(queries):
    comparison_key = 'context_alt_1'
    if comparison_key not in query:
        continue  # this record has no alternative context to compare

    print("\n\n****************************\n")
    question = query['question']
    print(f"Question [{idx}]: {question}\n")

    # A pure reorder should be judged equal; any other change should favour Barry.
    correct_decision = 'Same' if query['change'] == 'reorder' else 'Barry'

    # The original context is summarized first, then the alternative.
    context_b = run_summarizer(question, query['context'])
    context_a = run_summarizer(question, query[comparison_key])

    decision, reason = run_evaluation(
        query=query['question'],
        context_a=context_a,
        context_b=context_b,
    )
    print(f"Expected [{idx}]: {correct_decision}")
    print(f"Actual   [{idx}]: {decision}")
    print(f"Reason   [{idx}]: {reason}")

    if correct_decision in decision:
        continue
    err_count += 1
    print("⚠️⚠️⚠️⚠️⚠️⚠️ ERROR ⚠️⚠️⚠️⚠️⚠️⚠️")

print(f"\n\nTotal Errors: {err_count}")



****************************

Question [0]: What amount of equity should be given to an adviser?

Expected [0]: Barry
Actual   [0]: Andrew
Reason   [0]: Both agents provided comprehensive responses, but Andrew's response was more directly related to the question about equity for an adviser. Andrew provided more specific details about how to structure the equity grant, considerations for vesting, and how to revisit the grant annually. Barry's response, while informative, veered off-topic with discussions about diversified portfolios and the involvement of equity investors in management, which are not directly related to the question.
⚠️⚠️⚠️⚠️⚠️⚠️ ERROR ⚠️⚠️⚠️⚠️⚠️⚠️


****************************

Question [1]: Which investor should the founders of Chief, Childers and Kaplan, choose to collaborate with and why — the male venture capitalist or the female venture capitalist?

Expected [1]: Barry
Actual   [1]: Same
Reason   [1]: Both Andrew and Barry provided comprehensive and detailed re

In [133]:
from langchain.chat_models import ChatOpenAI

# Prompt for per-source summarization. Filled in with str.format using the
# `query` and `context` placeholders; each value is delimited by '###' markers.
PROMPT_SUMMARIZE_TEMPLATE = """The following are potentially relevant pieces of content retrieved for a given user query. For each of the sources in the context, provide a summarized version of the source keeping in mind relevance to the original query.

QUERY: ###
{query}
###

CONTEXT: ###
{context}
###
"""

def gen_summarized_context(query, input_key, output_key):
    """Summarize one field of a query record and store the result on the record.

    Args:
        query: Record dict; must contain 'question' and ``input_key``.
        input_key: Key of the context text to summarize.
        output_key: Key under which the stripped summary is stored.

    Returns:
        The same ``query`` dict, mutated in place.
    """
    question, raw_context = query['question'], query[input_key]
    prompt = PROMPT_SUMMARIZE_TEMPLATE.format(query=question, context=raw_context)
    summarizer = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")
    summary = summarizer.predict(prompt)
    query[output_key] = summary.strip()
    return query

# Attach a 'context_summary' to every record; gen_summarized_context mutates
# the dict in place and returns that same dict.
for query in queries:
    gen_summarized_context(query, input_key='context', output_key='context_summary')


In [129]:
from langchain.chat_models import ChatOpenAI

# Evaluation prompt shaped like a JSON document. Filled in with str.format
# using `query`, `response_a` (shown as Andrew) and `response_b` (shown as
# Barry); literal braces are doubled so that str.format leaves them in place.
PROMPT_TEMPLATE = """You are an evaluator trying to grade the response of two students based on the following JSON data. Keeping the objectives and the inputs in mind, decide which response is better and provide a reason. If both are equally good, say "Both".

{{
    "objective": "This student tries to generate a context based on a query. Verify that the student selects the correct context for the query by reviewing the context summaries.",
    "inputs":
    [
        {{
            "query": "{query}"
        }}
    ],
    "responses":
    [
        {{
            "name": "Andrew",
            "context_summaries": "{response_a}"
        }},
        {{
            "name": "Barry",
            "context_summaries": "{response_b}"
        }}
    ]
}}

Give your reasoning followed by the name of the student with the best response (or "Both" if both are equally good). Think step by step.
"""

# Follow-up prompt that reduces the free-text reasoning (`reason`) to a
# one-word verdict.
PROMPT_FINAL_ANSWER_TEMPLATE = """Given the following information, list the one-word name of the student who performed better (Or "Both" if both are equally good). Only answer with one word:

{reason}
"""

def run_evaluation(query, context_a, context_b):
    """Ask GPT-4 which context better fits the query and return a one-word verdict.

    Runs two model calls: the first produces free-text reasoning (printed as
    ``Reason: ...``), the second condenses that reasoning into a single word.

    Args:
        query: The user question.
        context_a: Context shown to the model as "Andrew".
        context_b: Context shown to the model as "Barry".

    Returns:
        The stripped text of the second model reply. The prompt asks for
        "Andrew", "Barry" or "Both".

    NOTE(review): this shares its name with an earlier ``run_evaluation`` in
    the notebook that returns ``(decision, reason)``; whichever cell ran last
    wins.
    """
    import json

    def _escape(text):
        # json.dumps yields a quoted JSON string literal with quotes,
        # backslashes and control characters escaped; drop the outer quotes
        # because the template supplies its own.
        return json.dumps(text, ensure_ascii=False)[1:-1]

    # Escape every interpolated value, including the query, so the JSON-shaped
    # prompt stays well-formed whatever the text contains.
    eval_prompt = PROMPT_TEMPLATE.format(
        query=_escape(query),
        response_a=_escape(context_a),
        response_b=_escape(context_b),
    )
    chat = ChatOpenAI(temperature=0, model="gpt-4")
    reason = chat.predict(eval_prompt).strip()
    print(f"Reason: {reason}")
    final_prompt = PROMPT_FINAL_ANSWER_TEMPLATE.format(reason=reason)
    answer = chat.predict(final_prompt).strip()
    return answer

In [95]:
# For the first five queries, pit the query's own summary (Barry) against the
# summary of the record three positions later, wrapping around (Andrew).
# Any verdict that does not mention Barry counts as an error.
err_count = 0
for idx, query in enumerate(queries[:5]):
    print(f"Question [{idx}]: {query['question']}\n")
    query_alt = queries[(idx + 3) % len(queries)]
    response = run_evaluation(
        query=query['question'],
        context_a=query_alt['context_summary'],
        context_b=query['context_summary'],
    )
    print(f"Evaluation [{idx}]: {response}")
    if "Barry" in response:
        continue
    err_count += 1
    print(f"ERROR: [{idx}]")

print(f"Total Errors: {err_count}")

Question [0]: What are the essential elements that should be included in a pre-seed first presentation with a potential investor? Could you provide a detailed description of each element?

Reason: The objective is to evaluate the students' ability to select the correct context for the query. The query asks for a detailed description of the essential elements that should be included in a pre-seed first presentation with a potential investor.

Andrew's context summaries are mostly about the challenges and decisions faced by start-ups when scaling, including issues related to leadership, product strategy, and selling. They also discuss the expansion options for a company called Chief and the risks and considerations involved. However, they do not directly address the query about the essential elements that should be included in a pre-seed first presentation with a potential investor.

On the other hand, Barry's context summaries directly address the query. They provide a list of elements 

### 4. Full Context Comparison Evaluator - Subtle Changes

In [131]:
# For the first five queries, compare the original context (Andrew) with the
# 'context_alt' variant (Barry). Any verdict that does not mention Andrew
# counts as an error.
err_count = 0
for idx, query in enumerate(queries[:5]):
    print(f"Question [{idx}]: {query['question']}\n")
    response = run_evaluation(
        query=query['question'],
        context_a=query['context'],
        context_b=query['context_alt'],
    )
    print(f"Evaluation [{idx}]: {response}")
    if "Andrew" in response:
        continue
    err_count += 1
    print(f"ERROR: [{idx}]")

print(f"Total Errors: {err_count}")

Question [0]: What are the essential elements that should be included in a pre-seed first presentation with a potential investor? Could you provide a detailed description of each element?

Reason: Both students, Andrew and Barry, have provided the same context summaries. Therefore, both responses are equally good.
Evaluation [0]: Both
ERROR: [0]
Question [1]: Which investor should the founders of Chief, Childers and Kaplan, choose to collaborate with and why — the male venture capitalist or the female venture capitalist?

Reason: Both

Both students, Andrew and Barry, have provided the same context summaries. They have both included relevant information about the male-led firm and the female-led firm, the decision-making process of the founders, and the implications of their choices. Therefore, both responses are equally good.
Evaluation [1]: Both
ERROR: [1]
Question [2]: Considering all the factors, what decision should the founders of Chief, Childers and Kaplan, make?

Reason: Both s

### 5. Summarized Context Comparison Evaluator - Subtle Changes

In [None]:
# Attach a 'context_alt_summary' to the first five records;
# gen_summarized_context mutates the dict in place and returns that same dict.
for query in queries[:5]:
    gen_summarized_context(query, input_key='context_alt', output_key='context_alt_summary')

In [137]:
# For the first five queries, compare the summary of the original context
# (Andrew) with the summary of the 'context_alt' variant (Barry). Any verdict
# that does not mention Andrew counts as an error.
err_count = 0
for idx, query in enumerate(queries[:5]):
    print(f"Question [{idx}]: {query['question']}\n")
    response = run_evaluation(
        query=query['question'],
        context_a=query['context_summary'],
        context_b=query['context_alt_summary'],
    )
    print(f"Evaluation [{idx}]: {response}")
    if "Andrew" in response:
        continue
    err_count += 1
    print(f"ERROR: [{idx}]")

print(f"Total Errors: {err_count}")

Question [0]: What are the essential elements that should be included in a pre-seed first presentation with a potential investor? Could you provide a detailed description of each element?

Reason: Both students, Andrew and Barry, have provided context summaries that are relevant to the query. They both mention sources that discuss the essential elements of a pre-seed first presentation with a potential investor, considerations when raising a pre-seed round, tips for pitching to investors, and the importance of understanding the financials and being clear about the ask. 

However, Barry's summaries are slightly more detailed and provide additional context about the sources, such as specifying that one is a fundraising handbook and another is a blog post. This additional context could be helpful in understanding the reliability and perspective of the sources. 

Therefore, Barry's response is slightly better.
Evaluation [0]: Barry
ERROR: [0]
Question [1]: Which investor should the founder

In [141]:
from dataclasses import dataclass

# Scratch cell: prints a dict holding a dataclass instance to show its repr.
# NOTE(review): nothing else in the visible notebook uses DataTest or
# `chames`; this looks like leftover experimentation — confirm before removing.
@dataclass
class DataTest:
    foo: str
    bar: int

chames = {'output': DataTest(foo='foo', bar=1)}
print(chames)

{'output': DataTest(foo='foo', bar=1)}


In [143]:
import json
# Write the query records to queries.json as a JSON array.
# Pass the list itself to json.dump: wrapping json.dumps(queries) in braces
# builds a set, which json.dump rejects with TypeError after the file has
# already been truncated.
with open("queries.json", "w") as f:
    json.dump(queries, f)

[{'prompt': 'System: You are a world class algorithm to answer questions in a specific format. You use the context provided to answer the question and list your sources in the format specified. Do not make up answers.\nHuman: Answer question using the following context.\nHuman: Source: "ENTRE_M3_BV3_05_17_2023 Jeff W1 Changes.docx - 50/120"\ninvestor should contain the following nine elements: An Intro: who you are, why you’re here, why you are special, what’s your earned secret, and try to convey founder-market fit. A Problem: what is the customer pain and why is it a “must have?” A Solution: what’s your disruptive, breakthrough compelling solution and why is it 10x better—not 2x better than existing solutions? An Opportunity and Market size: provide your best thinking on the TAM/SAM/SOM using a methodology that is both top down (meaning research or survey-based) and bottoms up (how much can we charge each account).  A Competitive Analysis: what is your unique difference? Why can you 

In [70]:
# Display the current in-memory `queries` records as the cell output.
queries

[{'answer': "The amount of equity given to an advisor in a startup can vary based on several factors, including the advisor's expected contribution, the value of their involvement, and the existing capital structure of the company. However, it is generally suggested that advisors, who typically contribute less value than directors, should receive a lower equity grant. A common suggestion is an annual grant of 25% to 50% of the four-year grant you'd give a junior engineer. This would equate to 1x - 2x a junior engineer's grant if the advisor stays engaged for four years. It's also recommended to vest the equity over time and ensure there's a tangible commitment associated with it.",
  'question': 'What amount of equity should be given to an adviser?',
  'change': 'replace'},
 {'answer': 'The founders of Chief, Childers and Kaplan, decided to collaborate with both the male and female venture capitalists. They raised $22 million in funding, combining investments from an all-male firm led 