In [23]:
from IPython.display import display, Markdown
from openai import OpenAI
from exa_py import Exa
from dotenv import load_dotenv
import os
import json
from tenacity import retry, wait_random_exponential, stop_after_attempt
from termcolor import colored  
from pydantic import BaseModel  
from typing import List, Optional, Dict, Any

In [2]:
LLAMAFILE_BASE_URL = "http://localhost:8080/v1"
OLLAMA_BASE_URL = "http://localhost:11434/v1"
PERPLEXITY_BASE_URL = "https://api.perplexity.ai"
GROQ_BASE_URL = "https://api.groq.com/openai/v1"

In [3]:
load_dotenv()
OPENAI_APY_KEY = os.getenv("OPENAI_API_KEY")
GROQ_API_KEY = os.getenv("GROQ_API_KEY") 
EXA_API_KEY = os.getenv("EXA_API_KEY")

In [4]:

GROQ_MODEL = "llama3-70b-8192"
OPENAI_MODEL = "gpt-4o"

In [32]:
TOPIC = "Using a panel of LLM judges to evaluate the correctness of another LLM"

In [5]:
class Query(BaseModel):
    topic: str
    query: str

class Queries(BaseModel):
    queries: List[Query]

In [6]:
exa = Exa(api_key = EXA_API_KEY)

In [7]:
def generate_tools(num:int, tool_type:str) -> list:
    properties = {}
    for i in range(1, num + 1):
        key = f'{tool_type}_{i}'
        properties[key] = {
            'type': 'string',
            'description': 'Search queries that would be useful for generating a report on my main topic'
        }

    custom_function = {
        'name': 'generate_exa_search_queries',
        'description': 'Generates Exa search queries to investigate the main topic',
        'parameters': {
            'type': 'object',
            'properties': properties
        },
        'required': [f'{tool_type}_{i}' for i in range(1, num + 1)]
    }

    return [{"type": "function", "function": custom_function}]

In [8]:
tools = generate_tools(5, "query")
tools

[{'type': 'function',
  'function': {'name': 'generate_exa_search_queries',
   'description': 'Generates Exa search queries to investigate the main topic',
   'parameters': {'type': 'object',
    'properties': {'query_1': {'type': 'string',
      'description': 'Search queries that would be useful for generating a report on my main topic'},
     'query_2': {'type': 'string',
      'description': 'Search queries that would be useful for generating a report on my main topic'},
     'query_3': {'type': 'string',
      'description': 'Search queries that would be useful for generating a report on my main topic'},
     'query_4': {'type': 'string',
      'description': 'Search queries that would be useful for generating a report on my main topic'},
     'query_5': {'type': 'string',
      'description': 'Search queries that would be useful for generating a report on my main topic'}}},
   'required': ['query_1', 'query_2', 'query_3', 'query_4', 'query_5']}}]

In [9]:
@retry(wait=wait_random_exponential(multiplier=1, max=40), stop=stop_after_attempt(3))
def chat_completion_request(messages, tools=None, tool_choice=None, model=None, provider=None):
    try:
        if provider.lower()=="openai":
            print(colored(f"Using OpenAI...\n", "green"))
            client = OpenAI()
            response = client.chat.completions.create(
                model=model,
                temperature=0,
                stream=False,   
                messages=messages,
                tools=tools,
                tool_choice=tool_choice,
            )
        else:
            print(colored(f"Using Groq...\n", "green"))
            client = OpenAI(
                api_key=GROQ_API_KEY,
                base_url=GROQ_BASE_URL
            )
            response = client.chat.completions.create(
                model=model,
                temperature=0,
                stream=False,
                messages=messages,
                response_format={"type": "json_object"}
            )
        return response
    except Exception as e:
        print("Unable to generate ChatCompletion response")
        print(f"Exception: {e}")
        return e

In [10]:

def get_messages_and_tools(provider:str, topic:str, num_queries:int) -> list:
    context = "context" if provider.lower()=="openai" else f"provided schema: {json.dumps(Queries.model_json_schema(), indent=2)}"
    messages =[
        {"role": "system", "content": f"You are the world's most advanced and intelligent programming and AI Research assistant that can only be queried via an API. Based on the tools and schemas provided to you and in your arsenal, you generate the most accurate and optimized JSON responses based on the {context}."},
        {"role": "user", "content": f"I'm going to give you a topic I want to research. I want you to generate {num_queries} interesting, diverse search queries that would be useful for generating a report on my main topic. Here is the main topic: {topic}."}
    ]
    tools = generate_tools(num_queries, "query")
    tool_choice = {"type": "function", "function": {"name": tools[0]['function']['name']}}
    return messages, tools, tool_choice

def get_completion_args(provider:str, topic:str, num_queries:int) -> Dict:
    messages, tools, tool_choice = get_messages_and_tools(provider, topic, num_queries)
    return {
        "messages": messages,
        "tools": tools,
        "tool_choice": tool_choice,
        "provider": provider,
        "model": OPENAI_MODEL if provider.lower()=="openai" else GROQ_MODEL,
    }

In [11]:
groq_args = get_completion_args(provider="groq", topic = "Using a panel of LLM judges to evaluate the correctness of another LLM", num_queries=5)
groq_res = chat_completion_request(**groq_args)
print(json.dumps(json.loads(groq_res.choices[0].message.content), indent=2))

[32mUsing Groq...
[0m
{
  "queries": [
    {
      "topic": "LLM Evaluation",
      "query": "What are the benefits and limitations of using a panel of LLM judges to evaluate the correctness of another LLM?"
    },
    {
      "topic": "LLM Evaluation Metrics",
      "query": "What metrics can be used to evaluate the correctness of an LLM, and how can a panel of LLM judges be used to improve the evaluation process?"
    },
    {
      "topic": "LLM Evaluation Methods",
      "query": "What are the different methods for evaluating the correctness of an LLM, and how does using a panel of LLM judges compare to other methods?"
    },
    {
      "topic": "LLM Judge Agreement",
      "query": "How can the agreement between a panel of LLM judges be measured and improved, and what are the implications for evaluating the correctness of another LLM?"
    },
    {
      "topic": "LLM Evaluation Bias",
      "query": "How can bias be mitigated when using a panel of LLM judges to evaluate the co

In [12]:
model = Queries.model_validate_json(groq_res.choices[0].message.content)
model.queries

[Query(topic='LLM Evaluation', query='What are the benefits and limitations of using a panel of LLM judges to evaluate the correctness of another LLM?'),
 Query(topic='LLM Evaluation Metrics', query='What metrics can be used to evaluate the correctness of an LLM, and how can a panel of LLM judges be used to improve the evaluation process?'),
 Query(topic='LLM Evaluation Methods', query='What are the different methods for evaluating the correctness of an LLM, and how does using a panel of LLM judges compare to other methods?'),
 Query(topic='LLM Judge Agreement', query='How can the agreement between a panel of LLM judges be measured and improved, and what are the implications for evaluating the correctness of another LLM?'),
 Query(topic='LLM Evaluation Bias', query='How can bias be mitigated when using a panel of LLM judges to evaluate the correctness of another LLM, and what are the potential sources of bias in this process?')]

In [13]:
oai_args = get_completion_args(provider="openai", topic = "Using a panel of LLM judges to evaluate the correctness of another LLM", num_queries=5)
oai_res = chat_completion_request(**oai_args)
print(json.dumps(json.loads(oai_res.choices[0].message.tool_calls[0].function.arguments), indent=2))

[32mUsing OpenAI...
[0m
{
  "query_1": "advantages of using LLM judges to evaluate other LLMs",
  "query_2": "methodologies for assessing LLM correctness with LLM judges",
  "query_3": "case studies on LLM judges evaluating other LLMs",
  "query_4": "challenges in using LLM judges for LLM evaluation",
  "query_5": "comparative analysis of human vs LLM judges in evaluating LLM performance"
}


In [14]:
queries = json.loads(oai_res.choices[0].message.tool_calls[0].function.arguments)
queries = [query for _,query in queries.items()]
queries

['advantages of using LLM judges to evaluate other LLMs',
 'methodologies for assessing LLM correctness with LLM judges',
 'case studies on LLM judges evaluating other LLMs',
 'challenges in using LLM judges for LLM evaluation',
 'comparative analysis of human vs LLM judges in evaluating LLM performance']

In [15]:
def search_exa(queries: list) -> list:
  exa_pairs = []
  for query in queries:
    search_response = exa.search_and_contents(
      query,
      num_results=5,
      use_autoprompt=True,
      start_published_date="2023-06-01", # To give us only recent information post-June 2023
      highlights={"num_sentences": 5},
    )
    query_object = {
        'subquery': query,
        'results': search_response
    }
    exa_pairs.append(query_object)
  return exa_pairs

exa_pairs = search_exa(queries)
exa_pairs

[{'subquery': 'advantages of using LLM judges to evaluate other LLMs',
  'results': SearchResponse(results=[Result(url='https://www.databricks.com/blog/LLM-auto-eval-best-practices-RAG', id='https://www.databricks.com/blog/LLM-auto-eval-best-practices-RAG', title='Best Practices for LLM Evaluation of RAG Applications', score=0.23029740154743195, published_date='2023-12-09', author=None, text=None, highlights=['Using a lower precision scale also allows consistency of grading scales among different LLM judges (e.g. between GPT-4 and claude2). RAG applications require their own benchmarks. A model might have good performance on a published specialized benchmark (e.g. casual chat, math, or creative writing) but that doesn’t guarantee good performance on other tasks (e.g.'], highlight_scores=[0.46428602933883667]), Result(url='https://twitter.com/aparnadhinak/status/1748368364395721128', id='https://twitter.com/aparnadhinak/status/1748368364395721128', title='', score=0.21905012428760529, p

In [24]:
def format_results(results: List[Dict[str,Any]], content_slice: int = 750) -> str:
    formatted_results = ""
    for result in results:
        formatted_results += f"[{result['subquery']}]:\n"
        for res in result['results'].results:
            content = res.text if res.text else " ".join(res.highlights)
            publish_date = res.published_date
            formatted_results += f"URL: {res.url}\nContent: {content}\nPublished Date: {publish_date}\n"
        formatted_results += "\n"
    return formatted_results

In [29]:
display(exa_pairs)
print()
print(format_results(exa_pairs))

[{'subquery': 'advantages of using LLM judges to evaluate other LLMs',
  'results': SearchResponse(results=[Result(url='https://www.databricks.com/blog/LLM-auto-eval-best-practices-RAG', id='https://www.databricks.com/blog/LLM-auto-eval-best-practices-RAG', title='Best Practices for LLM Evaluation of RAG Applications', score=0.23029740154743195, published_date='2023-12-09', author=None, text=None, highlights=['Using a lower precision scale also allows consistency of grading scales among different LLM judges (e.g. between GPT-4 and claude2). RAG applications require their own benchmarks. A model might have good performance on a published specialized benchmark (e.g. casual chat, math, or creative writing) but that doesn’t guarantee good performance on other tasks (e.g.'], highlight_scores=[0.46428602933883667]), Result(url='https://twitter.com/aparnadhinak/status/1748368364395721128', id='https://twitter.com/aparnadhinak/status/1748368364395721128', title='', score=0.21905012428760529, p


[advantages of using LLM judges to evaluate other LLMs]:
URL: https://www.databricks.com/blog/LLM-auto-eval-best-practices-RAG
Content: Using a lower precision scale also allows consistency of grading scales among different LLM judges (e.g. between GPT-4 and claude2). RAG applications require their own benchmarks. A model might have good performance on a published specialized benchmark (e.g. casual chat, math, or creative writing) but that doesn’t guarantee good performance on other tasks (e.g.
Published Date: 2023-12-09
URL: https://twitter.com/aparnadhinak/status/1748368364395721128
Content: LLM Evals are valuable analysis tools. But should you use numeric scores or classes as outputs? 🤔

TLDR: LLM’s suck at continuous ranges ☠️ - use LLM classification evals instead! 🔤

An LLM Score Eval uses an LLM to judge… https://t.co/cATZEomvZl| created_at: Fri Jan 19 15:34:35 +0000 2024 | favorite_count: 556 | quote_count: 25 | reply_count: 27 | retweet_count: 102 | is_quote_status: False | r

In [33]:
def generate_report(topic:str, searches: List[Dict[str,Any]]) -> str:
  report = f"## <span style='color:#008080;font-family:CaskaydiaCove Nerd Font Mono'>{topic}</span>\n"
  formatted_searches = format_results(searches)
  content = f"Write a comprehensive and professional three paragraph research report about {topic} based on the provided information. Include citations in the text using footnote notation ([citation #]), for example [2]. First provide the report, followed by a single `References` section that only lists the URLs (and their published date) used, in the format [#] <url>. For the published date, only include the month and year. Reset the citations index and ignore the order of citations in the provided information. Here is the information: {formatted_searches}."

  completion = chat_completion_request(
    messages=[
      {"role": "system", "content": "You are an experienced research assistant. Your top priority is achieving user fulfillment via helping them with their requests and being as targeted, objective, factual, as possible. You prefer explaining from first principles and prefer simplicity without compromising on rigor. You only use obscure technical & scientific jargon where it's most necessary."},
      {"role": "user", "content": content}
    ],
    model=OPENAI_MODEL,
    provider="openai"
  )

  return display(Markdown(f"{report}\n{completion.choices[0].message.content}\n"))
  

In [34]:
generate_report(topic=TOPIC, searches=exa_pairs)

[32mUsing OpenAI...
[0m


## <span style='color:#008080;font-family:CaskaydiaCove Nerd Font Mono'>Using a panel of LLM judges to evaluate the correctness of another LLM</span>

### Research Report: Using a Panel of LLM Judges to Evaluate the Correctness of Another LLM

The use of a panel of Large Language Model (LLM) judges to evaluate the correctness of another LLM has emerged as a promising approach in the field of artificial intelligence. This method leverages the strengths of multiple LLMs to provide a more consistent and reliable assessment of another model's performance. One of the primary advantages of this approach is the consistency it offers in grading scales among different LLM judges, such as GPT-4 and Claude2. By using a lower precision scale, the evaluations become more standardized, which is crucial for applications like Retrieval-Augmented Generation (RAG) that require their own specific benchmarks[^1]. This consistency ensures that the performance of an LLM on specialized benchmarks, such as casual chat or creative writing, can be more accurately compared across different tasks.

Methodologies for assessing LLM correctness with LLM judges often involve using classification evaluations rather than continuous numeric scores. This is because LLMs tend to perform poorly on continuous ranges, making classification a more effective evaluation method[^2]. Additionally, while there are inherent biases and challenges in using LLMs as evaluators, integrating automatic metrics can partially assess performance and guide product development[^3]. These metrics do not need to be perfect but should be sufficiently reliable to inform the development process. This approach allows for a more nuanced understanding of an LLM's capabilities and limitations, providing valuable insights that can be used to improve the model.

Despite the advantages, there are significant challenges in using LLM judges for LLM evaluation. One major issue is the potential for biases and the need for human involvement to ensure the reliability of the evaluations[^4]. Human annotators are often required to compare responses from different models and decide which is better, sometimes quantifying the difference in quality[^5]. This human oversight is crucial to mitigate the biases that LLM judges might introduce. Moreover, the community consensus suggests that human evaluation remains essential for generating free text, one of the most important features of LLMs[^6]. Therefore, while LLM judges can provide valuable preliminary assessments, human evaluators are indispensable for comprehensive and accurate evaluations.

### References
[^1]: https://www.databricks.com/blog/LLM-auto-eval-best-practices-RAG (Published Date: 2023-12-09)
[^2]: https://twitter.com/aparnadhinak/status/1748368364395721128 (Published Date: 2024-01-19)
[^3]: https://medplexity.substack.com/p/using-llms-to-evaluate-llms (Published Date: 2023-11-09)
[^4]: https://twitter.com/ElectricWeegie/status/1669049764749402134 (Published Date: 2023-06-14)
[^5]: https://www.kdnuggets.com/a-better-way-to-evaluate-llms (Published Date: 2023-11-15)
[^6]: https://www.databricks.com/blog/LLM-auto-eval-best-practices-RAG (Published Date: 2023-12-09)


## sandbox

In [29]:
client = OpenAI(api_key=GROQ_API_KEY, base_url=GROQ_BASE_URL)
completion = client.chat.completions.create(
    model="llama3-70b-8192",
    temperature=0.8,
    messages=[
        {"role": "system", "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests and being concise as possible."},
        {"role": "user", "content": "What are the key tenets of API Design"}
    ],
    stream=False
)
display(Markdown("# <span style='color:#008080;font-family:CaskaydiaCove Nerd Font Mono'>" + "LLM Assistant" + "</span>"))
display(Markdown(completion.choices[0].message.content))

# <span style='color:#008080;font-family:CaskaydiaCove Nerd Font Mono'>LLM Assistant</span>

When it comes to API design, there are several key tenets to keep in mind to ensure your API is scalable, maintainable, and easy to use. Here are the top ones:

1. **API First Development**: Design your API before implementing it. This helps you focus on the interface and its usability, rather than just the implementation details.
2. **Simple and Consistent**: Keep your API simple, intuitive, and consistent in its design. This makes it easier for developers to learn and use.
3. **RESTful**: Follow REST (Representational State of Resource) principles, which emphasize stateless, cacheable, and uniform interfaces.
4. **Resource-Based**: Organize your API around resources, which are objects or entities that can be manipulated. Use nouns to identify resources (e.g., /users).
5. **Verb-Based**: Use standard HTTP verbs (GET, POST, PUT, DELETE) to perform actions on resources.
6. **Stateless**: Ensure your API is stateless, meaning each request contains all the information necessary to fulfill it.
7. **Cacheable**: Design your API to allow for caching, which reduces the load on your servers and improves performance.
8. **Error Handling**: Implement robust error handling, including meaningful error codes, messages, and descriptions.
9. **API Versioning**: Use versioning to manage changes to your API over time, ensuring backward compatibility and minimizing disruptions.
10. **Documentation**: Provide comprehensive, up-to-date, and easily accessible documentation, including code samples and tutorials.
11. **Security**: Implement robust security measures, such as authentication, authorization, and encryption, to protect your API and its users.
12. **Performance**: Optimize your API for performance, considering factors like response time, throughput, and resource utilization.
13. **Flexibility**: Design your API to be flexible and accommodating of different data formats, such as JSON, XML, or YAML.
14. **API Keys and Rate Limiting**: Implement API keys and rate limiting to manage access, prevent abuse, and ensure fair usage.
15. **Monitoring and Analytics**: Monitor your API's performance and usage, collecting metrics and analytics to inform future development and optimization.

By following these key tenets, you'll be well on your way to designing a robust, scalable, and user-friendly API that meets the needs of your users.