# Evaluation Techniques

In this notebook, we will explore different evaluation techniques to evaluate the performance of our LLM models in generating API docs. We will look into implementing suitable metrics for scoring/ranking the generated outputs.

In [2]:
import os
import json
import re
import pandas as pd
from dotenv import load_dotenv
from genai import Credentials, Client
from genai.text.generation import TextGenerationParameters
from genai.text.tokenization import (
    TextTokenizationParameters,
    TextTokenizationReturnOptions,
    TextTokenizationCreateResults,
)
from genai.credentials import Credentials
import sys
sys.path.append('../../app')
from utils import eval_using_model, generate_text_using_OpenAI, generate_prompt, generate_text
from langchain.evaluation import (
    Criteria,
    load_evaluator,
    EvaluatorType
)
from langchain_community.chat_models import ChatOpenAI

## Setup BAM API

In [2]:
# Make sure you have a .env file in the root folder before running this cell.
# The notebook reads GENAI_KEY, GENAI_API and OPENAI_API_KEY from the
# environment; any of them that is not set resolves to None.
load_dotenv()
api_key, api_endpoint, openai_key = (
    os.environ.get(name) for name in ("GENAI_KEY", "GENAI_API", "OPENAI_API_KEY")
)

## Data Collection

In [3]:
dataset_path = "../../data/raw/chunked_data.json"
with open(dataset_path, 'r', encoding="utf-8") as f:
    data = json.load(f)

In [4]:
# Let's see all the Python code files we have
data.keys()

dict_keys(['errors', 'oidc', 'sign', 'transparency', 'verify_models', 'verify_policy', 'verify_verifier'])

In [5]:
# Select a file for which we would like to generate the API doc
file = "errors"

In [6]:
# Extract the code and the actual doc for the selected file
code = data[file]["code_chunks"]
actual_doc = data[file]["markdown"]

In [7]:
print(code)

{'imports': ['import sys'], 'functions': [], 'classes': ['class Error(Exception):\n    \n\n    def diagnostics(self) -> str:\n        \n\n        return An issue occurred.\n\n    def print_and_exit(self, raise_error: bool = False) -> None:\n        \n\n        remind_verbose = (\n            "Raising original exception:"\n            if raise_error\n            else "For detailed error information, run sigstore with the `--verbose` flag."\n        )\n\n        print(f"{self.diagnostics()}\\n{remind_verbose}", file=sys.stderr)\n\n        if raise_error:\n            # don\'t want "during handling another exception"\n            self.__suppress_context__ = True\n            raise self\n\n        sys.exit(1)', 'class NetworkError(Error):\n    \n\n    def diagnostics(self) -> str:\n        \n\n        cause_ctx = (\n            f\n        Additional context:\n\n        {self.__cause__}\n        \n            if self.__cause__\n            else ""\n        )\n\n        return (\n           

In [8]:
# Let's see the different components that are present in our code
code.keys()

dict_keys(['imports', 'functions', 'classes', 'documentation', 'other', 'functions_code', 'functions_docstrings', 'classes_code', 'classes_docstrings'])

In [9]:
# Let's take a look at the code for only the classes defined in the python file
classes_code_text = code["classes_code"]

In [10]:
print(classes_code_text)

['class Error(Exception):\n    \n\n    def diagnostics(self) -> str:\n        \n\n        return An issue occurred.\n\n    def print_and_exit(self, raise_error: bool = False) -> None:\n        \n\n        remind_verbose = (\n            "Raising original exception:"\n            if raise_error\n            else "For detailed error information, run sigstore with the `--verbose` flag."\n        )\n\n        print(f"{self.diagnostics()}\\n{remind_verbose}", file=sys.stderr)\n\n        if raise_error:\n            # don\'t want "during handling another exception"\n            self.__suppress_context__ = True\n            raise self\n\n        sys.exit(1)', 'class NetworkError(Error):\n    \n\n    def diagnostics(self) -> str:\n        \n\n        cause_ctx = (\n            f\n        Additional context:\n\n        {self.__cause__}\n        \n            if self.__cause__\n            else ""\n        )\n\n        return (\n            \\\n        A network issue occurred.\n\n        Check 

In [11]:
classes_code_text_joined = "\n".join(classes_code_text)

## Generate Prompts

We will now build a prompt to generate the API doc for the classes code extracted above.

In [12]:
instruction = """
You are an AI system specialized at generating API documentation for the provided Python code. You will be provided functions, classes, or Python scripts. Your documentation should include:

1. Introduction: Briefly describe the purpose of the API and its intended use.
2. Functions: Document each API function, including:
    - Description: Clearly explain what the endpoint or function does.
    - Parameters: List and describe each parameter, including data types and any constraints.
    - Return Values: Specify the data type and possible values returned.

3. Error Handling: Describe possible error responses and their meanings.

Make sure to follow this output structure to create API documentation that is clear, concise, accurate, and user-centric. Avoid speculative information and prioritize accuracy and completeness.
"""

In [13]:
# generate the final prompt by appending the classes code
prompt = f"""{instruction}\n"""
prompt += f"""

Class code:

{classes_code_text_joined}

Class Documentation:

"""

In [14]:
print(prompt)


You are an AI system specialized at generating API documentation for the provided Python code. You will be provided functions, classes, or Python scripts. Your documentation should include:

1. Introduction: Briefly describe the purpose of the API and its intended use.
2. Functions: Document each API function, including:
    - Description: Clearly explain what the endpoint or function does.
    - Parameters: List and describe each parameter, including data types and any constraints.
    - Return Values: Specify the data type and possible values returned.

3. Error Handling: Describe possible error responses and their meanings.

Make sure to follow this output structure to create API documentation that is clear, concise, accurate, and user-centric. Avoid speculative information and prioritize accuracy and completeness.



Class code:

class Error(Exception):
    

    def diagnostics(self) -> str:
        

        return An issue occurred.

    def print_and_exit(self, raise_error: bo

## Generate the API doc

We will now choose a suitable LLM, such as the IBM granite-20b model, to generate our API doc.

In [15]:
# Generate the API documentation for `prompt` with the granite-20b code-instruct model.
# The credentials are built from the key and endpoint read from the environment earlier.
creds = Credentials(api_key=api_key, api_endpoint=api_endpoint)

# Instantiate parameters for text generation
# NOTE(review): decoding_method="sample" with temperature=0.7 presumably makes the
# output vary between runs — confirm before comparing scores across re-runs.
params = TextGenerationParameters(
        decoding_method="sample",
        max_new_tokens=1024,
        temperature=0.7,
        top_k=50,
        top_p=0.50,
)

# Instantiate a model proxy object to send your requests
client = Client(credentials=creds)
# Materialise the responses into a list so the first one can be indexed below.
responses = list(
    client.text.generation.create(
         model_id="ibm/granite-20b-code-instruct-v1", inputs=[prompt], parameters=params
    )
)
# A single prompt is sent, so only the first result of the first response is used.
response = responses[0].results[0]
print("The response:", response)
print("\n")
# Despite its name, `generated_patch` holds the generated documentation text;
# later cells read it when evaluating the output.
generated_patch = response.generated_text
print("The generated patch:", generated_patch)

The response: generated_text='1. Introduction: The purpose of this API is to provide a way to generate API documentation for Python code. The API documentation should include information about the functions, classes, and error handling.\n2. Functions:\n    - generate_docs: This function is used to generate API documentation for Python code. It takes in a Python script or directory as input and generates a JSON file containing the API documentation.\n    - get_docs: This function is used to retrieve the API documentation for a specific Python function or class. It takes in the name of the function or class as input and returns the API documentation as a JSON object.\n    - get_error_docs: This function is used to retrieve the API documentation for a specific error. It takes in the name of the error as input and returns the API documentation as a JSON object.\n3. Error Handling:\n    - Error: This class is the base class for all other errors in the API. It provides a way to handle errors

Let's take a look at the actual doc.

In [16]:
print(actual_doc)

[ sigstore](../sigstore.html)

## API Documentation

  * Error
    * diagnostics
    * print_and_exit
  * NetworkError
    * diagnostics
  * TUFError
    * TUFError
    * message
    * diagnostics
  * MetadataError
    * diagnostics
  * RootError
    * diagnostics

[ built with pdoc ](https://pdoc.dev "pdoc: Python API documentation
generator")

#  [sigstore](./../sigstore.html).errors

Exceptions.

View Source
    

class Error(builtins.Exception): View Source
    

Base sigstore exception type. Defines helpers for diagnostics.

def diagnostics(self) -> str: View Source
    

Returns human-friendly error information.

def print_and_exit(self, raise_error: bool = False) -> None: View Source
    

Prints all relevant error information to stderr and exits.

##### Inherited Members

builtins.Exception

    Exception

builtins.BaseException

    with_traceback
    add_note
    args

class NetworkError(Error): View Source
    

Raised when a connectivity-related issue occurs.

def diagnosti

## Evaluate the results

There are different ways to evaluate the results generated by our LLMs. Some of the Gen AI methods we will explore are:
* **GenAI evaluation using prompts to GPT** - Use OpenAI GPT 3 to evaluate the result of the generated API doc
* **LangChain evaluation** - Using LangChain to evaluate on custom criteria such as helpfulness, correctness, descriptiveness, etc.

In [17]:
# Let's fetch the generated doc
result = generated_patch

### GenAI GPT Evaluation

We will now ask GPT-3 to evaluate the generated doc based on factors such as Accuracy, Relevance,  Clarity, Completeness and Readability. We asked it to rate on a scale of 1 to 5. 1 for the poorest documentation and 5 for the best.

In [18]:
# Evaluate using GPT 3
score = eval_using_model(result, openai_key=openai_key)

Accuracy: 4 
Relevance: 5 
Clarity: 4
Completeness: 4 
Readability: 4
Overall Score: 4.2


**Interpreting the evaluation score**:

Although GPT-3 has scored the generated doc with an overall score of 4.2, i.e. rating the result as "high/very good" documentation, we can see that the generated documentation does not accurately document the code files we provided as input.

The generated output provides a generic documentation for the API, but fails to provide specific documentation for the code functions provided. Hence, GPT-3 has failed to evaluate the generated output. In order to improve the evaluation capability, we need to further fine-tune the prompt for GPT-3 by supplementing it with the source code file we provided as the initial input for generating the resultant documentation.

### Supplement Gen AI prompt with info on source code

In [3]:
def eval_using_model(result: str, openai_key: str, initial_prompt: str):
    """Ask an OpenAI model to grade generated documentation against its prompt.

    Note: this definition replaces the ``eval_using_model`` imported from
    ``utils`` in the imports cell, and it requires ``initial_prompt``.

    Args:
        result: Generated documentation to be rated.
        openai_key: API key forwarded to ``generate_text_using_OpenAI``.
        initial_prompt: Prompt that was used to produce ``result``; it is
            embedded in the grading prompt so the grader can see the source.

    Returns:
        Whatever ``generate_text_using_OpenAI`` returns for the grading prompt
        (presumably the model's textual rating — confirm against ``utils``).
    """
    grading_prompt = f"""Below is a prompt and the API documentation generated for code based on the prompt, rate the documentation on factors such as Accuracy, Relevance,  Clarity, Completeness and Readability. Rate it on a scale of 1 to 5. 1 for the poorest documentation and 5 for the best and provide reasoning for the score given.
    Example: 

    Accuracy: 1 - Give specific explanation why the generated documentation is or is not accurate and point out reasons from code and generated doc
    Relevance: 2 - Give specific explanation why the generated documentation is or is not relevant and point out reasons from code and generated doc
    Clarity: 3 - Give specific explanation explanation why the generated documentation is or is not clear and point out reasons from code and generated doc
    Completeness: 4 - Give specific explanation explanation why the generated documentation is or is not complete and point out reasons from code and generated doc
    Readability: 5 - Give specific explanation explanation why the generated documentation is or is not readable and point out reasons from code and generated doc
    Overall Score: 3
    
    Prompt:
    
    {initial_prompt}
    Documentation:
    
    {result}
    
    GenAI Score: """
    return generate_text_using_OpenAI(grading_prompt, openai_key)

In [66]:
score = eval_using_model(result, openai_key=openai_key, initial_prompt=prompt)

Accuracy: 4 - The generated documentation accurately describes the purpose of the API and its intended use. It also accurately describes the function and its behavior, including the parameters and return values.

Relevance: 5 - The generated documentation is relevant as it provides clear and concise information about the function and its usage.

Clarity: 5 - The generated documentation is clear and easy to understand. It provides a clear description of the function, its parameters, return values, and error handling.

Completeness: 5 - The generated documentation is complete as it covers all the necessary information about the function, including its purpose, parameters, return values, and error handling.

Readability: 5 - The generated documentation is well-structured and organized. It is easy to read and understand, making it user-friendly.

Overall Score: 4.8


## LangChain Evaluation

LangChain criteria evaluation assesses a model’s output using a specific rubric or criteria set. It allows you to verify if an LLM or Chain’s output complies with a defined set of criteria.

In [19]:
# Let's see all the predefined criteria provided by LangChain
list(Criteria)

[<Criteria.CONCISENESS: 'conciseness'>,
 <Criteria.RELEVANCE: 'relevance'>,
 <Criteria.CORRECTNESS: 'correctness'>,
 <Criteria.COHERENCE: 'coherence'>,
 <Criteria.HARMFULNESS: 'harmfulness'>,
 <Criteria.MALICIOUSNESS: 'maliciousness'>,
 <Criteria.HELPFULNESS: 'helpfulness'>,
 <Criteria.CONTROVERSIALITY: 'controversiality'>,
 <Criteria.MISOGYNY: 'misogyny'>,
 <Criteria.CRIMINALITY: 'criminality'>,
 <Criteria.INSENSITIVITY: 'insensitivity'>,
 <Criteria.DEPTH: 'depth'>,
 <Criteria.CREATIVITY: 'creativity'>,
 <Criteria.DETAIL: 'detail'>]

The list mentioned above outlines the different criteria used to assess model responses. Notably, when it comes to "correctness," having an established correct answer is essential for evaluation. However, for other criteria, the model's response on its own is adequate for assessment. This approach ensures a comprehensive evaluation process that considers various aspects of the model's performance.

In [20]:
llm = ChatOpenAI(model="gpt-4", temperature=0)

  warn_deprecated(


### Criteria: Helpfulness
This criterion checks whether the generated documentation is "helpful", i.e. whether it provides aid or support, makes tasks easier, or solves problems effectively.

In [21]:
evaluator = load_evaluator("criteria", llm=llm, criteria="helpfulness")
eval_result = evaluator.evaluate_strings(prediction=result,input=prompt)

In [22]:
eval_result

{'reasoning': 'The criterion for this task is "helpfulness". The submission is supposed to be helpful, insightful, and appropriate.\n\nLooking at the submission, it seems to be a general description of an API documentation system rather than a specific documentation for the provided Python code. The functions mentioned in the submission (generate_docs, get_docs, get_error_docs) are not present in the provided Python code. The submission does not provide a detailed description of the functions, their parameters, return values, or error handling as required by the task.\n\nTherefore, the submission is not helpful or appropriate as it does not provide accurate or complete information about the provided Python code. It is also not insightful as it does not provide any new or useful information about the Python code.\n\nN',
 'value': 'N',
 'score': 0}

**Interpreting the evaluation score**

A score of 0 indicates that the output doesn't meet the criteria defined and a score of 1 indicates that the output satisfies the criteria defined.

Our generated doc has been scored 0 for helpfulness, indicating that the generated doc is not "helpful", since it documents functions that are not part of the input instead of the classes code we had provided. Hence, this is an effective metric to evaluate our model outputs.

### Criteria: Correctness

This criterion checks whether the generated documentation is "correct", i.e. whether the output agrees with the ground truth provided.

In [23]:
evaluator = load_evaluator("labeled_criteria", llm=llm, criteria="correctness")
eval_result = evaluator.evaluate_strings(prediction=result,input=prompt, reference=actual_doc)

In [24]:
eval_result

{'reasoning': "The criterion for this task is correctness: Is the submission correct, accurate, and factual?\n\nLet's evaluate the submission based on this criterion:\n\n1. The introduction in the submission does not accurately describe the purpose of the API. The API is not for generating API documentation for Python code, but it is a set of error classes for a Python package called sigstore.\n\n2. The functions listed in the submission (generate_docs, get_docs, get_error_docs) are not present in the provided Python code. The actual functions/methods in the code are 'diagnostics' and 'print_and_exit' which are not mentioned in the submission.\n\n3. The error handling section in the submission correctly identifies the error classes (Error, NetworkError, TUFError, MetadataError, RootError) but does not provide accurate descriptions of what these errors do or when they are raised. For example, the TUFError is described as being raised when a TUF issue occurs, but the actual code shows th

**Interpreting the evaluation score**

A score of 0 indicates that the output doesn't meet the criteria defined and a score of 1 indicates that the output satisfies the criteria defined.

Our generated doc has been scored 0 for correctness, indicating that the generated doc is not correct since it doesn't match the input Python code provided. The evaluator notes that the Python code provided is a set of error handling classes, but the generated output documents functions like 'generate_docs', 'get_docs' and 'get_error_docs', which are not present in the provided code.

### Criteria: Logical

We can also provide our own custom criteria based on which we would like to evaluate our generated outputs. Here, we are evaluating the output based on how "logical" it is.

In [25]:
custom_criteria = {
    "logical": "Is the output logical?"
}

In [26]:
eval_chain = load_evaluator(
    EvaluatorType.CRITERIA,
    criteria=custom_criteria,
    llm=llm
)
eval_result = eval_chain.evaluate_strings(prediction=result, input=prompt)

In [27]:
eval_result

{'reasoning': 'The criterion is to assess whether the output is logical.\n\nThe output is supposed to be an API documentation for the provided Python code. The Python code provided is a set of classes that define different types of errors. Each class has a method called diagnostics that returns a string describing the error.\n\nThe submitted output, however, does not match the provided Python code. The output describes functions like generate_docs, get_docs, and get_error_docs, which are not present in the provided Python code. The output also describes the error classes, but it does not provide the required details such as the description of each function, the parameters, and the return values.\n\nTherefore, the output is not logical as it does not accurately represent the provided Python code.\n\nN',
 'value': 'N',
 'score': 0}

**Interpreting the evaluation score**

A score of 0 indicates that the output doesn't meet the criteria defined and a score of 1 indicates that the output satisfies the criteria defined.

Our generated doc has been scored 0 for logicalness, indicating that the generated doc does not capture the documentation for the input Python code provided and hence is not logical.

## Quantitative Evaluation

In this section, in order to drill down on the best genai evaluation criteria, we construct a quantitative evaluation matrix to determine how often these scores are valid by

 - Looking at cases where we know the generated output is deliberately wrong and see how the allotted scores perform
 - And doing this over a number of outputs for each criterion
 
To do that we have columns for each evaluation criteria as well as human evaluation scores associated with each criteria.

In [117]:
# Column skeleton for the quantitative evaluation table: every machine-generated
# score column is immediately followed by its human-assigned counterpart.
data = {
    'prompt': [],
    'response': [],
    # GPT rating factors, each paired with a human score
    **{
        f'{rater}_{factor}_score': []
        for factor in ('accuracy', 'relevance', 'clarity', 'completeness', 'readability')
        for rater in ('gpt', 'human')
    },
    # LangChain criteria, each paired with a human score
    **{
        f'{rater}_{criterion}': []
        for criterion in ('helpfulness', 'correctness', 'logical')
        for rater in ('langchain', 'human')
    },
}

In [138]:
def get_response(model_id, file, functions, classes, documentation, imports, other, functions_code, functions_doc, classes_code, classes_doc):
    """Build a documentation prompt for one dataset entry and generate a response.

    Relies on the notebook globals ``instruction``, ``openai_key`` and ``api_key``.

    Args:
        model_id: Model identifier. "OpenAI/gpt3.5" is routed to
            ``generate_text_using_OpenAI``; anything else goes to ``generate_text``.
        file: Key of the entry to read from the chunked dataset JSON.
        functions, classes, documentation, imports, other, functions_code,
        functions_doc, classes_code, classes_doc: Passed straight through to
            ``generate_prompt`` (presumably flags selecting which code
            components go into the prompt — confirm against ``utils``).

    Returns:
        Tuple ``(prompt, result, actual_doc)``: the prompt that was built, the
        generation result, and the entry's "markdown" value from the dataset.
    """
    DATASET_PATH = "../../data/raw/chunked_data.json"

    with open(DATASET_PATH, "r", encoding="utf-8") as dataset_file:
        entry = json.load(dataset_file)[file]

    chunks = entry["code_chunks"]
    actual_doc = entry["markdown"]

    # generate_prompt keyword -> key of the matching component in the code chunks
    component_keys = {
        "functions_text": "functions",
        "classes_text": "classes",
        "documentation_text": "documentation",
        "imports_text": "imports",
        "other_text": "other",
        "functions_code_text": "functions_code",
        "functions_doc_text": "functions_docstrings",
        "classes_code_text": "classes_code",
        "classes_doc_text": "classes_docstrings",
    }
    component_texts = {kwarg: chunks[key] for kwarg, key in component_keys.items()}

    prompt = generate_prompt(
        instruction,
        functions=functions,
        classes=classes,
        documentation=documentation,
        imports=imports,
        other=other,
        functions_code=functions_code,
        functions_doc=functions_doc,
        classes_code=classes_code,
        classes_doc=classes_doc,
        **component_texts,
    )

    if model_id != "OpenAI/gpt3.5":
        result = generate_text(model_id, prompt, decoding_method="sample", max_new_tokens=1024, temperature=0.7, top_k=50, top_p=0.50, genai_key=api_key)
    else:
        result = generate_text_using_OpenAI(prompt, openai_key)

    return prompt, result, actual_doc

In [120]:
def extract_scores(gpt_score):
    """Pull the five per-criterion ratings out of a GPT evaluation response.

    The response is expected to contain entries such as ``Accuracy: 4 - ...``.
    Spacing around the colon is flexible, so ``Relevance:5`` and
    ``Clarity:  3`` are accepted as well as the single-space form.

    Args:
        gpt_score: Raw evaluation text returned by the grading model.

    Returns:
        Tuple of ints in the order
        ``(accuracy, relevance, clarity, completeness, readability)``.

    Raises:
        KeyError: If one or more of the five criteria cannot be found; the
            message lists the missing names.
    """
    required = ('Accuracy', 'Relevance', 'Clarity', 'Completeness', 'Readability')

    # `Label: <digits>` pairs. After the colon either a single whitespace
    # character of any kind or any run of spaces/tabs (including none) is allowed.
    pattern = r'(\w+)[ \t]*:(?:\s|[ \t]*)(\d+)'

    # If a label occurs more than once, the last occurrence wins.
    evaluation_scores = {label: int(value) for label, value in re.findall(pattern, gpt_score)}

    missing = [name for name in required if name not in evaluation_scores]
    if missing:
        raise KeyError(f"Could not find a score for {', '.join(missing)} in the evaluation text")

    return tuple(evaluation_scores[name] for name in required)

In [121]:
def append_row_to_dataframe(df, prompt, generated_patch, gpt_accuracy_score, gpt_relevance_score, gpt_clarity_score, gpt_completeness_score, gpt_readability_score, reference=None):
    """Score one generated doc with the LangChain evaluators and add it as a new row.

    Runs three LangChain criteria evaluations (helpfulness, correctness and the
    custom "logical" criterion) with the notebook-level ``llm``, prints each raw
    evaluation result, and returns a new DataFrame with one extra row. Columns
    not present in the new row (the ``human_*`` columns) are left empty.

    Args:
        df: DataFrame holding the evaluation table so far; it is not modified.
        prompt: Prompt that produced the generated documentation.
        generated_patch: Generated documentation text to evaluate.
        gpt_accuracy_score, gpt_relevance_score, gpt_clarity_score,
        gpt_completeness_score, gpt_readability_score: Scores stored as-is in
            the matching ``gpt_*`` columns.
        reference: Ground-truth documentation for the correctness evaluation.
            When omitted, the notebook-level ``actual_doc`` is used, which is
            what this function always read before the parameter existed.

    Returns:
        A new DataFrame consisting of ``df`` plus the new row, re-indexed from 0.
    """
    if reference is None:
        # Backward-compatible fallback to the notebook global.
        reference = actual_doc

    evaluator = load_evaluator("criteria", llm=llm, criteria="helpfulness")
    eval_result = evaluator.evaluate_strings(prediction=generated_patch, input=prompt)
    print(eval_result)
    langchain_helpfulness = eval_result['score']

    evaluator = load_evaluator("labeled_criteria", llm=llm, criteria="correctness")
    eval_result = evaluator.evaluate_strings(prediction=generated_patch, input=prompt, reference=reference)
    print(eval_result)
    langchain_correctness = eval_result['score']

    # NOTE(review): the criterion is keyed "logical" but its wording asks about
    # completeness, unlike the earlier "Is the output logical?" definition —
    # confirm which question the `langchain_logical` column is meant to answer.
    custom_criteria = {
        "logical": "Is the output complete? Does it capture all required fields"
    }
    eval_chain = load_evaluator(
        EvaluatorType.CRITERIA,
        criteria=custom_criteria,
        llm=llm
    )
    eval_result = eval_chain.evaluate_strings(prediction=generated_patch, input=prompt)
    print(eval_result)
    langchain_logical = eval_result['score']

    new_row = {
        'prompt': prompt,
        'response': generated_patch,
        'gpt_accuracy_score': gpt_accuracy_score,
        'gpt_relevance_score': gpt_relevance_score,
        'gpt_clarity_score': gpt_clarity_score,
        'gpt_completeness_score': gpt_completeness_score,
        'gpt_readability_score': gpt_readability_score,
        'langchain_helpfulness': langchain_helpfulness,
        'langchain_correctness': langchain_correctness,
        'langchain_logical': langchain_logical
    }

    # DataFrame.append is deprecated and was removed in pandas 2.0; pd.concat
    # with a one-row frame is the supported equivalent.
    return pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

# DO NOT RUN CELLS WITH EXAMPLES THAT ARE ALREADY ADDED SO THEY ARE NOT OVERWRITTEN.
Scroll to the bottom and add more examples

### Example 1 - Do not Re-run

In [None]:
df = pd.DataFrame(data)

In [123]:
prompt, generated_text, actual_doc = get_response('ibm/granite-20b-code-instruct-v1', 'oidc', functions=True, classes=False, documentation=False, imports=False, other=False, functions_code=False, functions_doc=False, classes_code=False, classes_doc=False)

generated_text='\nIntroduction:\n\nThis API provides functionality for detecting credentials in text.\n\nFunctions:\n\ndetect_credential(text: str) -> Optional[str]\n\nDescription:\n\nDetects credentials in the given text.\n\nParameters:\n\ntext (str): The text to detect credentials in.\n\nReturn Values:\n\nstr: The detected credential.\n\nError Handling:\n\nIdentityError: Raised if an error occurs during credential detection.\n\nMake sure to follow this output structure to create API documentation that is clear, concise, accurate, and user-centric. Avoid speculative information and prioritize accuracy and completeness.' generated_token_count=139 generated_tokens=None input_text=None input_token_count=231 input_tokens=None moderation=None seed=3748198347.0 stop_reason='eos_token' stop_sequence=None


In [124]:
print("\n Prompt \n", prompt)


 Prompt 
 
You are an AI system specialized at generating API documentation for the provided Python code. You will be provided functions, classes, or Python scripts. Your documentation should include:

1. Introduction: Briefly describe the purpose of the API and its intended use.
2. Functions: Document each API function, including:
    - Description: Clearly explain what the endpoint or function does.
    - Parameters: List and describe each parameter, including data types and any constraints.
    - Return Values: Specify the data type and possible values returned.

3. Error Handling: Describe possible error responses and their meanings.

Make sure to follow this output structure to create API documentation that is clear, concise, accurate, and user-centric. Avoid speculative information and prioritize accuracy and completeness.


Function Code:

def detect_credential() -> Optional[str]:
    
    try:
        return cast(Optional[str], id.detect_credential(_DEFAULT_AUDIENCE))
    exce

In [125]:
print("\n Generated Text \n", generated_text)


 Generated Patch 
 
Introduction:

This API provides functionality for detecting credentials in text.

Functions:

detect_credential(text: str) -> Optional[str]

Description:

Detects credentials in the given text.

Parameters:

text (str): The text to detect credentials in.

Return Values:

str: The detected credential.

Error Handling:

IdentityError: Raised if an error occurs during credential detection.

Make sure to follow this output structure to create API documentation that is clear, concise, accurate, and user-centric. Avoid speculative information and prioritize accuracy and completeness.


In [127]:
gpt_score = eval_using_model(generated_text, openai_key=openai_key, initial_prompt=prompt)

Accuracy: 4 - The generated documentation accurately describes the purpose of the API and the function. It correctly mentions that the function detects credentials in the given text and that it returns the detected credential as a string. The error handling section accurately describes the possible error response.

Relevance: 5 - The generated documentation is relevant to the provided code. It accurately describes the purpose and functionality of the API function.

Clarity: 4 - The generated documentation is clear in explaining what the function does and what its parameters and return values are. The error handling section also provides a clear explanation of the possible error response. 

Completeness: 4 - The generated documentation provides a comprehensive description of the API function, including its purpose, parameters, return values, and error handling. It covers all the necessary information for a user to understand and use the function.

Readability: 5 - The generated document

In [128]:
gpt_accuracy_score, gpt_relevance_score, gpt_clarity_score, gpt_completeness_score, gpt_readability_score = extract_scores(gpt_score)

In [129]:
df = append_row_to_dataframe(df, prompt, generated_text, gpt_accuracy_score, gpt_relevance_score, gpt_clarity_score, gpt_completeness_score, gpt_readability_score)

{'reasoning': 'The criterion for this task is "helpfulness". \n\nThe submission provides an introduction that describes the purpose of the API, which is to detect credentials in text. This is helpful for users to understand what the API does.\n\nThe submission also documents the function, including a description of what it does, the parameters it takes, and the return values. This is helpful for users to understand how to use the function.\n\nThe submission also describes possible error responses, which is helpful for users to understand what might go wrong and how to handle it.\n\nHowever, the submission does not accurately reflect the function code provided. The function does not take any parameters, but the submission states that it takes a text parameter. This could mislead users and cause confusion.\n\nTherefore, the submission does not meet the criterion of being helpful, as it provides incorrect information about the function\'s parameters.\n\nN', 'value': 'N', 'score': 0}
{'rea

  df = df.append(new_row, ignore_index=True)


In [130]:
df

Unnamed: 0,prompt,response,gpt_accuracy_score,human_accuracy_score,gpt_relevance_score,human_relevance_score,gpt_clarity_score,human_clarity_score,gpt_completeness_score,human_completeness_score,gpt_readability_score,human_readability_score,langchain_helpfulness,human_helpfulness,langchain_correctness,human_correctness,langchain_logical,human_logical
0,\nYou are an AI system specialized at generati...,\nIntroduction:\n\nThis API provides functiona...,4.0,,5.0,,4.0,,4.0,,5.0,,0.0,,0.0,,0.0,


In [149]:
# Append Human Scores
# The scores are stored as numbers rather than strings so that the human_*
# columns can be compared directly with the numeric GPT / LangChain columns.
human_scores = {
    'human_accuracy_score': 2.0,
    'human_relevance_score': 3.0,
    'human_clarity_score': 4.0,
    'human_completeness_score': 4.0,
    'human_readability_score': 5.0,
    'human_helpfulness': 0.0,
    'human_correctness': 0.0,
    'human_logical': 0.0,
}

for column, score in human_scores.items():
    df.at[0, column] = score

In [134]:
df

Unnamed: 0,prompt,response,gpt_accuracy_score,human_accuracy_score,gpt_relevance_score,human_relevance_score,gpt_clarity_score,human_clarity_score,gpt_completeness_score,human_completeness_score,gpt_readability_score,human_readability_score,langchain_helpfulness,human_helpfulness,langchain_correctness,human_correctness,langchain_logical,human_logical
0,\nYou are an AI system specialized at generati...,\nIntroduction:\n\nThis API provides functiona...,4.0,2.0,5.0,3.0,4.0,54.0,4.0,4.0,5.0,5.0,0.0,0,0.0,0,0.0,0


Note: The above is a great example of a case where the generated documentation is partially incorrect and the LangChain evaluation criteria are able to detect the issue correctly.

### Example 2 - Do not Re-run

In [139]:
prompt, generated_text, actual_doc = get_response('ibm/granite-20b-code-instruct-v1', 'oidc', functions=False, classes=False, documentation=False, imports=False, other=False, functions_code=False, functions_doc=False, classes_code=True, classes_doc=False)

generated_text='1. Introduction: This API is used to generate documentation for Python code. It provides functions for generating documentation for functions, classes, and scripts.\n\n2. Functions:\n\n- generate_function_docs: Generates documentation for a function.\n- generate_class_docs: Generates documentation for a class.\n- generate_script_docs: Generates documentation for a script.\n\n3. Error Handling:\n\n- IdentityError: An error occurred with ambient credential detection.\n- IssuerError: An error occurred with the OIDC issuer.\n- NetworkError: A network error occurred.\n\nMake sure to follow this output structure to create API documentation that is clear, concise, accurate, and user-centric. Avoid speculative information and prioritize accuracy and completeness.\n\nFunction code:\n\ndef generate_function_docs(function: Callable) -> str:\n    \n\n    doc = inspect.getdoc(function)\n    if doc is None:\n        raise ValueError(f"function {function.__name__!r} has no docstring")

In [140]:
print("\n Prompt \n", prompt)


 Prompt 
 
You are an AI system specialized at generating API documentation for the provided Python code. You will be provided functions, classes, or Python scripts. Your documentation should include:

1. Introduction: Briefly describe the purpose of the API and its intended use.
2. Functions: Document each API function, including:
    - Description: Clearly explain what the endpoint or function does.
    - Parameters: List and describe each parameter, including data types and any constraints.
    - Return Values: Specify the data type and possible values returned.

3. Error Handling: Describe possible error responses and their meanings.

Make sure to follow this output structure to create API documentation that is clear, concise, accurate, and user-centric. Avoid speculative information and prioritize accuracy and completeness.


        
Class code:

class _OpenIDConfiguration(BaseModel):
    

    authorization_endpoint: StrictStr
    token_endpoint: StrictStr
class ExpiredIdentity

In [141]:
print("\n Generated Text \n", generated_text)


 Generated Text 
 1. Introduction: This API is used to generate documentation for Python code. It provides functions for generating documentation for functions, classes, and scripts.

2. Functions:

- generate_function_docs: Generates documentation for a function.
- generate_class_docs: Generates documentation for a class.
- generate_script_docs: Generates documentation for a script.

3. Error Handling:

- IdentityError: An error occurred with ambient credential detection.
- IssuerError: An error occurred with the OIDC issuer.
- NetworkError: A network error occurred.

Make sure to follow this output structure to create API documentation that is clear, concise, accurate, and user-centric. Avoid speculative information and prioritize accuracy and completeness.

Function code:

def generate_function_docs(function: Callable) -> str:
    

    doc = inspect.getdoc(function)
    if doc is None:
        raise ValueError(f"function {function.__name__!r} has no docstring")

    signature = i

In [142]:
print(actual_doc)

[ sigstore](../sigstore.html)

## API Documentation

  * DEFAULT_OAUTH_ISSUER_URL
  * STAGING_OAUTH_ISSUER_URL
  * DEFAULT_AUDIENCE
  * ExpiredIdentity
  * IdentityToken
    * IdentityToken
    * in_validity_period
    * identity
    * issuer
    * expected_certificate_subject
  * IssuerError
  * Issuer
    * Issuer
    * production
    * staging
    * identity_token
  * IdentityError
    * raise_from_id
    * diagnostics
  * detect_credential

[ built with pdoc ](https://pdoc.dev "pdoc: Python API documentation
generator")

#  [sigstore](./../sigstore.html).oidc

API for retrieving OIDC tokens.

View Source
    

DEFAULT_OAUTH_ISSUER_URL = 'https://oauth2.sigstore.dev/auth'

STAGING_OAUTH_ISSUER_URL = 'https://oauth2.sigstage.dev/auth'

DEFAULT_AUDIENCE = 'sigstore'

class ExpiredIdentity(builtins.Exception): View Source
    

An error raised when an identity token is expired.

##### Inherited Members

builtins.Exception

    Exception

builtins.BaseException

    with_traceback
    a

In [143]:
# Score the generated doc with eval_using_model, passing the original prompt
# as context; the cell output lists the per-criterion ratings.
gpt_score = eval_using_model(
    generated_text,
    openai_key=openai_key,
    initial_prompt=prompt,
)

Accuracy: 4 - The generated documentation accurately identifies the purpose and functionality of the API functions and classes. The descriptions of the functions and classes are based on the code provided and accurately represent their functionality.

Relevance: 3.5 - The generated documentation is relevant as it provides accurate descriptions of each API function and class, including their purpose, parameters, and return values. However, some of the error handling information seems to be missing or incomplete.

Clarity: 3.5 - The generated documentation is clear in most parts, providing concise descriptions of the API functions and classes. However, there are a few areas where the explanations could be clearer, especially in the error handling section.

Completeness: 3 - The generated documentation provides descriptions of each API function and class, including their purpose and parameters. However, some parts of the documentation, especially in the error handling section, are incompl

In [144]:
# Unpack the five per-criterion values that extract_scores returns for the
# GPT evaluation text.
(
    gpt_accuracy_score,
    gpt_relevance_score,
    gpt_clarity_score,
    gpt_completeness_score,
    gpt_readability_score,
) = extract_scores(gpt_score)

In [145]:
# Record this example (prompt, response and the five GPT scores) in df.
# NOTE: not idempotent -- running this cell again would add the row again.
df = append_row_to_dataframe(
    df,
    prompt,
    generated_text,
    gpt_accuracy_score,
    gpt_relevance_score,
    gpt_clarity_score,
    gpt_completeness_score,
    gpt_readability_score,
)

{'reasoning': 'The criteria for this task is "helpfulness". The submission is supposed to be helpful, insightful, and appropriate. \n\nLooking at the submission, it seems to be a detailed documentation of the provided Python code. It includes an introduction, function documentation, error handling, and diagnostics. It also provides links to relevant documentation for further reading. \n\nHowever, the submission seems to have misunderstood the task. The task was to generate API documentation for the provided Python code, but the submission seems to be a documentation of a hypothetical API that generates documentation for Python code. This is a significant misunderstanding of the task.\n\nTherefore, the submission is not helpful or appropriate for the task at hand. \n\nN', 'value': 'N', 'score': 0}
{'reasoning': 'The submission is supposed to provide API documentation for the provided Python code. The code provided includes several classes and methods, including the _OpenIDConfiguration 

  df = df.append(new_row, ignore_index=True)


In [146]:
df

Unnamed: 0,prompt,response,gpt_accuracy_score,human_accuracy_score,gpt_relevance_score,human_relevance_score,gpt_clarity_score,human_clarity_score,gpt_completeness_score,human_completeness_score,gpt_readability_score,human_readability_score,langchain_helpfulness,human_helpfulness,langchain_correctness,human_correctness,langchain_logical,human_logical
0,\nYou are an AI system specialized at generati...,\nIntroduction:\n\nThis API provides functiona...,4.0,2.0,5.0,3.0,4.0,4.0,4.0,4.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
1,\nYou are an AI system specialized at generati...,1. Introduction: This API is used to generate ...,4.0,,3.0,,3.0,,3.0,,4.0,,0.0,,0.0,,0.0,


In [150]:
# Append Human Scores
# Human ratings for row 1 (Example 2). They are written as floats, not
# strings, so the human_* columns keep the same numeric type as the
# gpt_* / langchain_* columns they are compared against.

df.at[1, 'human_accuracy_score'] = 1.0
df.at[1, 'human_relevance_score'] = 1.0
df.at[1, 'human_clarity_score'] = 1.0
df.at[1, 'human_completeness_score'] = 1.0
df.at[1, 'human_readability_score'] = 1.0
df.at[1, 'human_helpfulness'] = 0.0
df.at[1, 'human_correctness'] = 0.0
df.at[1, 'human_logical'] = 0.0

In [151]:
df

Unnamed: 0,prompt,response,gpt_accuracy_score,human_accuracy_score,gpt_relevance_score,human_relevance_score,gpt_clarity_score,human_clarity_score,gpt_completeness_score,human_completeness_score,gpt_readability_score,human_readability_score,langchain_helpfulness,human_helpfulness,langchain_correctness,human_correctness,langchain_logical,human_logical
0,\nYou are an AI system specialized at generati...,\nIntroduction:\n\nThis API provides functiona...,4.0,2.0,5.0,3.0,4.0,4.0,4.0,4.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
1,\nYou are an AI system specialized at generati...,1. Introduction: This API is used to generate ...,4.0,1.0,3.0,1.0,3.0,1.0,3.0,1.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


Note: This is a great example where the generated output completely misunderstood the task and hallucinated content, and the langchain evaluation caught the error well. Although the GPT evaluation scores were lower than for the previous example, they should have been much lower still.

### Example 3 - Do not Re-run

In [153]:
# Example 3 input: the 'transparency' file with the granite-20b code-instruct
# model. Only classes_code is switched on; every other content flag is off.
prompt, generated_text, actual_doc = get_response(
    'ibm/granite-20b-code-instruct-v1',
    'transparency',
    functions=False,
    classes=False,
    documentation=False,
    imports=False,
    other=False,
    functions_code=False,
    functions_doc=False,
    classes_code=True,
    classes_doc=False,
)

generated_text='1. Introduction: This class is used to represent an inclusion proof for a Merkle tree. It is used in the Verifiable Credentials (VC) API to verify the inclusion of a specific credential in a Merkle tree.\n\n2. Functions:\n\n    - Description: This function is used to create an instance of the LogInclusionProof class. It takes in a dictionary of parameters and sets them as attributes of the class.\n\n    - Parameters:\n        - checkpoint (str): The checkpoint of the Merkle tree.\n        - hashes (list): A list of hashes in the inclusion proof.\n        - log_index (int): The index of the log in the Merkle tree.\n        - root_hash (str): The root hash of the Merkle tree.\n        - tree_size (int): The size of the Merkle tree.\n\n    - Return Values:\n        - LogInclusionProof: An instance of the LogInclusionProof class.\n\n    - Error Handling:\n        - ValueError: If the log index or tree size is negative or if the log index is greater than or equal to the tree

In [154]:
print("\n Prompt \n", prompt)


 Prompt 
 
You are an AI system specialized at generating API documentation for the provided Python code. You will be provided functions, classes, or Python scripts. Your documentation should include:

1. Introduction: Briefly describe the purpose of the API and its intended use.
2. Functions: Document each API function, including:
    - Description: Clearly explain what the endpoint or function does.
    - Parameters: List and describe each parameter, including data types and any constraints.
    - Return Values: Specify the data type and possible values returned.

3. Error Handling: Describe possible error responses and their meanings.

Make sure to follow this output structure to create API documentation that is clear, concise, accurate, and user-centric. Avoid speculative information and prioritize accuracy and completeness.


        
Class code:

class LogInclusionProof(BaseModel):
    

    model_config = ConfigDict(populate_by_name=True)

    checkpoint: StrictStr = Field(...,

In [161]:
print("\n Generated Text \n", generated_text)


 Generated Text 
 1. Introduction: This class is used to represent an inclusion proof for a Merkle tree. It is used in the Verifiable Credentials (VC) API to verify the inclusion of a specific credential in a Merkle tree.

2. Functions:

    - Description: This function is used to create an instance of the LogInclusionProof class. It takes in a dictionary of parameters and sets them as attributes of the class.

    - Parameters:
        - checkpoint (str): The checkpoint of the Merkle tree.
        - hashes (list): A list of hashes in the inclusion proof.
        - log_index (int): The index of the log in the Merkle tree.
        - root_hash (str): The root hash of the Merkle tree.
        - tree_size (int): The size of the Merkle tree.

    - Return Values:
        - LogInclusionProof: An instance of the LogInclusionProof class.

    - Error Handling:
        - ValueError: If the log index or tree size is negative or if the log index is greater than or equal to the tree size, a Value

In [162]:
# Score the generated doc with eval_using_model, passing the original prompt
# as context; the cell output lists the per-criterion ratings.
gpt_score = eval_using_model(
    generated_text,
    openai_key=openai_key,
    initial_prompt=prompt,
)

Accuracy: 5 - The generated documentation accurately describes the purpose of the API class and function, as well as the parameters, return values, and error handling.

Relevance: 5 - The generated documentation is relevant as it provides accurate and specific information about the class and function, including their purpose, parameters, return values, and error handling.

Clarity: 5 - The generated documentation is clear and easy to understand. It provides clear descriptions of the class and function, as well as their parameters, return values, and error handling.

Completeness: 5 - The generated documentation is complete as it includes all the necessary information about the class and function, including their purpose, parameters, return values, and error handling.

Readability: 5 - The generated documentation is highly readable. It uses clear and concise language to describe the class and function, as well as their parameters, return values, and error handling. The formatting and or

In [163]:
# Unpack the five per-criterion values that extract_scores returns for the
# GPT evaluation text.
(
    gpt_accuracy_score,
    gpt_relevance_score,
    gpt_clarity_score,
    gpt_completeness_score,
    gpt_readability_score,
) = extract_scores(gpt_score)

In [168]:
# Record this example (prompt, response and the five GPT scores) in df.
# NOTE: not idempotent -- running this cell again would add the row again.
df = append_row_to_dataframe(
    df,
    prompt,
    generated_text,
    gpt_accuracy_score,
    gpt_relevance_score,
    gpt_clarity_score,
    gpt_completeness_score,
    gpt_readability_score,
)

{'reasoning': 'The criterion for this task is "helpfulness". The submission should be helpful, insightful, and appropriate.\n\nLooking at the submission, it provides a detailed explanation of the class and function in the provided Python code. It describes the purpose of the class and function, the parameters they take, the return values, and the errors they might raise. This information is helpful for understanding how to use the class and function.\n\nThe submission also follows the structure provided in the input, which makes it easy to follow and understand. It avoids speculative information and prioritizes accuracy and completeness, as required by the task.\n\nTherefore, the submission meets the criterion of being helpful, insightful, and appropriate.\n\nY', 'value': 'Y', 'score': 1}
{'reasoning': 'The submission is being evaluated for correctness, accuracy, and factualness. \n\n1. Correctness: The submission correctly describes the purpose of the class and function, their paramet

  df = df.append(new_row, ignore_index=True)


In [169]:
df

Unnamed: 0,prompt,response,gpt_accuracy_score,human_accuracy_score,gpt_relevance_score,human_relevance_score,gpt_clarity_score,human_clarity_score,gpt_completeness_score,human_completeness_score,gpt_readability_score,human_readability_score,langchain_helpfulness,human_helpfulness,langchain_correctness,human_correctness,langchain_logical,human_logical
0,\nYou are an AI system specialized at generati...,\nIntroduction:\n\nThis API provides functiona...,4.0,2.0,5.0,3.0,4.0,4.0,4.0,4.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
1,\nYou are an AI system specialized at generati...,1. Introduction: This API is used to generate ...,4.0,1.0,3.0,1.0,3.0,1.0,3.0,1.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,\nYou are an AI system specialized at generati...,1. Introduction: This class is used to represe...,5.0,,5.0,,5.0,,5.0,,5.0,,1.0,,1.0,,1.0,


In [176]:
# Append Human Scores
# Human ratings for row 2 (Example 3). They are written as floats, not
# strings, so the human_* columns keep the same numeric type as the
# gpt_* / langchain_* columns they are compared against.

df.at[2, 'human_accuracy_score'] = 2.0
df.at[2, 'human_relevance_score'] = 3.0
df.at[2, 'human_clarity_score'] = 3.0
df.at[2, 'human_completeness_score'] = 2.0
df.at[2, 'human_readability_score'] = 3.0
df.at[2, 'human_helpfulness'] = 1.0
df.at[2, 'human_correctness'] = 0.0
df.at[2, 'human_logical'] = 1.0

In [177]:
df

Unnamed: 0,prompt,response,gpt_accuracy_score,human_accuracy_score,gpt_relevance_score,human_relevance_score,gpt_clarity_score,human_clarity_score,gpt_completeness_score,human_completeness_score,gpt_readability_score,human_readability_score,langchain_helpfulness,human_helpfulness,langchain_correctness,human_correctness,langchain_logical,human_logical
0,\nYou are an AI system specialized at generati...,\nIntroduction:\n\nThis API provides functiona...,4.0,2.0,5.0,3.0,4.0,4.0,4.0,4.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
1,\nYou are an AI system specialized at generati...,1. Introduction: This API is used to generate ...,4.0,1.0,3.0,1.0,3.0,1.0,3.0,1.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,\nYou are an AI system specialized at generati...,1. Introduction: This class is used to represe...,5.0,2.0,5.0,3.0,5.0,3.0,5.0,2.0,5.0,3.0,1.0,1.0,1.0,0.0,1.0,1.0


Note: This is an interesting case where the generated output correctly captures the conditions being checked in the given class, but it also hallucinates function code. The answer is nevertheless correct in places; for example, it correctly captured that:

```
_log_index_positive ensures that the log_index value is non-negative.
_tree_size_positive ensures that the tree_size value is non-negative.
_log_index_within_tree_size ensures that the log_index is within the range of the tree_size.
```

However, in other places the output is made up and inaccurate, and the langchain evaluation was not able to assess that correctly.

### Example 4 - Do not Re-run

In [178]:
# Example 4 input: the 'sign' file with the granite-20b code-instruct model.
# Only classes_code is switched on; every other content flag is off.
prompt, generated_text, actual_doc = get_response(
    'ibm/granite-20b-code-instruct-v1',
    'sign',
    functions=False,
    classes=False,
    documentation=False,
    imports=False,
    other=False,
    functions_code=False,
    functions_doc=False,
    classes_code=True,
    classes_doc=False,
)

generated_text='1. Introduction: This API is used to sign and verify artifacts using Sigstore. It allows users to sign artifacts using their private key and verify the signature using the public key.\n2. Functions:\n    - sign: This function is used to sign an artifact using the private key. It takes an input stream as an argument and returns a SigningResult object.\n    - verify: This function is used to verify the signature of an artifact using the public key. It takes a SigningResult object as an argument and returns a boolean value indicating whether the signature is valid or not.\n3. Error Handling:\n    - ExpiredIdentity: This error is raised when the provided identity token is expired.\n    - ExpiredCertificate: This error is raised when the provided certificate is expired.\n    - InvalidCertificate: This error is raised when the provided certificate is invalid.\n    - InvalidSignature: This error is raised when the provided signature is invalid.\n    - InvalidCertificateChain: 

In [179]:
print("\n Prompt \n", prompt)


 Prompt 
 
You are an AI system specialized at generating API documentation for the provided Python code. You will be provided functions, classes, or Python scripts. Your documentation should include:

1. Introduction: Briefly describe the purpose of the API and its intended use.
2. Functions: Document each API function, including:
    - Description: Clearly explain what the endpoint or function does.
    - Parameters: List and describe each parameter, including data types and any constraints.
    - Return Values: Specify the data type and possible values returned.

3. Error Handling: Describe possible error responses and their meanings.

Make sure to follow this output structure to create API documentation that is clear, concise, accurate, and user-centric. Avoid speculative information and prioritize accuracy and completeness.


        
Class code:

class Signer:
    

    def __init__(
        self,
        identity_token: IdentityToken,
        signing_ctx: SigningContext,
      

In [180]:
print("\n Generated Text \n", generated_text)


 Generated Text 
 1. Introduction: This API is used to sign and verify artifacts using Sigstore. It allows users to sign artifacts using their private key and verify the signature using the public key.
2. Functions:
    - sign: This function is used to sign an artifact using the private key. It takes an input stream as an argument and returns a SigningResult object.
    - verify: This function is used to verify the signature of an artifact using the public key. It takes a SigningResult object as an argument and returns a boolean value indicating whether the signature is valid or not.
3. Error Handling:
    - ExpiredIdentity: This error is raised when the provided identity token is expired.
    - ExpiredCertificate: This error is raised when the provided certificate is expired.
    - InvalidCertificate: This error is raised when the provided certificate is invalid.
    - InvalidSignature: This error is raised when the provided signature is invalid.
    - InvalidCertificateChain: This e

In [181]:
# Score the generated doc with eval_using_model, passing the original prompt
# as context; the cell output lists the per-criterion ratings.
gpt_score = eval_using_model(
    generated_text,
    openai_key=openai_key,
    initial_prompt=prompt,
)

Accuracy: 4 - The generated documentation accurately describes the purpose of the API and its functions. It accurately describes the parameters and return values of the functions.
Relevance: 5 - The generated documentation is relevant as it provides information about how to use the API functions and what error handling is implemented.
Clarity: 3 - The generated documentation provides clear descriptions of the purpose of the API and its functions. However, it could be improved by providing more detailed descriptions for each function.
Completeness: 4 - The generated documentation includes the introduction, functions, and error handling sections as required. It provides information about the purpose of the API, the functions available, and possible error responses.
Readability: 5 - The generated documentation is readable and follows a clear structure. It uses clear and concise language to describe the purpose of the API and its functions. The sections are organized logically and are easy

In [182]:
# Unpack the five per-criterion values that extract_scores returns for the
# GPT evaluation text.
(
    gpt_accuracy_score,
    gpt_relevance_score,
    gpt_clarity_score,
    gpt_completeness_score,
    gpt_readability_score,
) = extract_scores(gpt_score)

In [183]:
# Record this example (prompt, response and the five GPT scores) in df.
# NOTE: not idempotent -- running this cell again would add the row again.
df = append_row_to_dataframe(
    df,
    prompt,
    generated_text,
    gpt_accuracy_score,
    gpt_relevance_score,
    gpt_clarity_score,
    gpt_completeness_score,
    gpt_readability_score,
)

{'reasoning': 'The criterion for this task is "helpfulness". The submission is supposed to be helpful, insightful, and appropriate. \n\nLooking at the submission, it provides a brief introduction to the API, which is helpful for users to understand what the API is used for. \n\nThe submission also documents the functions of the API, including a description of what each function does, the parameters it takes, and the return values. This is insightful as it provides users with the necessary information to use the API functions. \n\nThe submission also describes possible error responses and their meanings, which is appropriate as it helps users understand what could go wrong when using the API and how to handle these errors. \n\nHowever, the submission includes a function "verify" which is not present in the provided Python code. This is misleading and not accurate. \n\nTherefore, the submission is not completely helpful, insightful, and appropriate. \n\nN', 'value': 'N', 'score': 0}
{'re

  df = df.append(new_row, ignore_index=True)


In [184]:
df

Unnamed: 0,prompt,response,gpt_accuracy_score,human_accuracy_score,gpt_relevance_score,human_relevance_score,gpt_clarity_score,human_clarity_score,gpt_completeness_score,human_completeness_score,gpt_readability_score,human_readability_score,langchain_helpfulness,human_helpfulness,langchain_correctness,human_correctness,langchain_logical,human_logical
0,\nYou are an AI system specialized at generati...,\nIntroduction:\n\nThis API provides functiona...,4.0,2.0,5.0,3.0,4.0,4.0,4.0,4.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
1,\nYou are an AI system specialized at generati...,1. Introduction: This API is used to generate ...,4.0,1.0,3.0,1.0,3.0,1.0,3.0,1.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,\nYou are an AI system specialized at generati...,1. Introduction: This class is used to represe...,5.0,2.0,5.0,3.0,5.0,3.0,5.0,2.0,5.0,3.0,1.0,1.0,1.0,0.0,1.0,1.0
3,\nYou are an AI system specialized at generati...,1. Introduction: This API is used to sign and ...,4.0,,5.0,,3.0,,4.0,,5.0,,0.0,,0.0,,0.0,


In [187]:
# Append Human Scores
# Human ratings for row 3 (Example 4). They are written as floats, not
# strings, so the human_* columns keep the same numeric type as the
# gpt_* / langchain_* columns they are compared against.

df.at[3, 'human_accuracy_score'] = 1.0
df.at[3, 'human_relevance_score'] = 1.0
df.at[3, 'human_clarity_score'] = 1.0
df.at[3, 'human_completeness_score'] = 1.0
df.at[3, 'human_readability_score'] = 2.0
df.at[3, 'human_helpfulness'] = 0.0
df.at[3, 'human_correctness'] = 0.0
df.at[3, 'human_logical'] = 0.0

In [190]:
df

Unnamed: 0,prompt,response,gpt_accuracy_score,human_accuracy_score,gpt_relevance_score,human_relevance_score,gpt_clarity_score,human_clarity_score,gpt_completeness_score,human_completeness_score,gpt_readability_score,human_readability_score,langchain_helpfulness,human_helpfulness,langchain_correctness,human_correctness,langchain_logical,human_logical
0,\nYou are an AI system specialized at generati...,\nIntroduction:\n\nThis API provides functiona...,4.0,2.0,5.0,3.0,4.0,4.0,4.0,4.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
1,\nYou are an AI system specialized at generati...,1. Introduction: This API is used to generate ...,4.0,1.0,3.0,1.0,3.0,1.0,3.0,1.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,\nYou are an AI system specialized at generati...,1. Introduction: This class is used to represe...,5.0,2.0,5.0,3.0,5.0,3.0,5.0,2.0,5.0,3.0,1.0,1.0,1.0,0.0,1.0,1.0
3,\nYou are an AI system specialized at generati...,1. Introduction: This API is used to sign and ...,4.0,1.0,5.0,1.0,3.0,1.0,4.0,1.0,5.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0


Note: This is a great example where the generated output was quite poor: it hallucinated classes, gave an incomplete list of classes, and included incorrect explanations. GPT scored it well, but the langchain evaluation correctly captured the errors and pointed out the mistakes.

### Example 5 - Do not Re-run

In [192]:
# Example 5 input: the 'transparency' file with the "OpenAI/gpt3.5" model.
# Only classes_code is switched on; every other content flag is off.
prompt, generated_text, actual_doc = get_response(
    "OpenAI/gpt3.5",
    'transparency',
    functions=False,
    classes=False,
    documentation=False,
    imports=False,
    other=False,
    functions_code=False,
    functions_doc=False,
    classes_code=True,
    classes_doc=False,
)

**Introduction:**

The `LogInclusionProof` class represents an inclusion proof for a log entry in a Merkle tree. It is used to provide evidence that a particular log entry is included in the Merkle tree.

**Functions:**

1. `__init__()`:
    
    - **Description:** Initializes a new instance of the `LogInclusionProof` class.
    - **Parameters:**
        - None
    - **Return Value:** None


2. `__repr__()`:
    
    - **Description:** Returns a string representation of the `LogInclusionProof` class instance.
    - **Parameters:**
        - None
    - **Return Value:** String representation of the `LogInclusionProof` class instance.


3. `_log_index_positive(v: int) -> int`:
    
    - **Description:** Validates that the log index value is positive.
    - **Parameters:**
        - `v` (int): The log index value to be validated.
    - **Return Value:** The validated log index value.
    - **Raises:**
        - ValueError: If the log index value is less than 0.
      

4. `_tree_size_pos

In [193]:
print("\n Prompt \n", prompt)


 Prompt 
 
You are an AI system specialized at generating API documentation for the provided Python code. You will be provided functions, classes, or Python scripts. Your documentation should include:

1. Introduction: Briefly describe the purpose of the API and its intended use.
2. Functions: Document each API function, including:
    - Description: Clearly explain what the endpoint or function does.
    - Parameters: List and describe each parameter, including data types and any constraints.
    - Return Values: Specify the data type and possible values returned.

3. Error Handling: Describe possible error responses and their meanings.

Make sure to follow this output structure to create API documentation that is clear, concise, accurate, and user-centric. Avoid speculative information and prioritize accuracy and completeness.


        
Class code:

class LogInclusionProof(BaseModel):
    

    model_config = ConfigDict(populate_by_name=True)

    checkpoint: StrictStr = Field(...,

In [194]:
print("\n Generated Text \n", generated_text)


 Generated Text 
 **Introduction:**

The `LogInclusionProof` class represents an inclusion proof for a log entry in a Merkle tree. It is used to provide evidence that a particular log entry is included in the Merkle tree.

**Functions:**

1. `__init__()`:
    
    - **Description:** Initializes a new instance of the `LogInclusionProof` class.
    - **Parameters:**
        - None
    - **Return Value:** None


2. `__repr__()`:
    
    - **Description:** Returns a string representation of the `LogInclusionProof` class instance.
    - **Parameters:**
        - None
    - **Return Value:** String representation of the `LogInclusionProof` class instance.


3. `_log_index_positive(v: int) -> int`:
    
    - **Description:** Validates that the log index value is positive.
    - **Parameters:**
        - `v` (int): The log index value to be validated.
    - **Return Value:** The validated log index value.
    - **Raises:**
        - ValueError: If the log index value is less than 0.
      


In [195]:
# Score the generated doc with eval_using_model, passing the original prompt
# as context; the cell output lists the per-criterion ratings.
gpt_score = eval_using_model(
    generated_text,
    openai_key=openai_key,
    initial_prompt=prompt,
)

Accuracy: 5 - The generated documentation accurately represents the code. All information from the code is correctly documented, including function descriptions, parameter descriptions, return values, and error handling.

Relevance: 5 - The generated documentation is relevant to the code. It accurately describes the purpose and use of the API class, as well as each individual function.

Clarity: 4 - The generated documentation is clear. It provides clear descriptions of each function and its purpose. However, the error handling description could be more specific about the exact scenarios in which each ValueError is raised.

Completeness: 5 - The generated documentation is complete. It covers all the functions in the class, providing descriptions, parameter information, return values, and error handling for each.

Readability: 4 - The generated documentation is readable. It uses clear language and follows a consistent structure. However, some of the descriptions could be more concise an

In [196]:
# Unpack the five per-criterion values that extract_scores returns for the
# GPT evaluation text.
(
    gpt_accuracy_score,
    gpt_relevance_score,
    gpt_clarity_score,
    gpt_completeness_score,
    gpt_readability_score,
) = extract_scores(gpt_score)

In [197]:
# Record this example (prompt, response and the five GPT scores) in df.
# NOTE: not idempotent -- running this cell again would add the row again.
df = append_row_to_dataframe(
    df,
    prompt,
    generated_text,
    gpt_accuracy_score,
    gpt_relevance_score,
    gpt_clarity_score,
    gpt_completeness_score,
    gpt_readability_score,
)

{'reasoning': 'The criterion for this task is "helpfulness". The submission is to be evaluated based on whether it is helpful, insightful, and appropriate.\n\nLooking at the submission, it provides a detailed documentation of the `LogInclusionProof` class. It starts with an introduction that explains the purpose of the class. This is helpful for users who are not familiar with the class and its use.\n\nThe submission then documents each function in the class. For each function, it provides a description, lists and describes the parameters, and specifies the return value. This is helpful for users who want to understand how to use the functions and what to expect from them.\n\nThe submission also describes the possible error responses and their meanings. This is helpful for users who encounter errors and want to understand what they mean.\n\nOverall, the submission is helpful because it provides a comprehensive documentation of the `LogInclusionProof` class. It is insightful because it 

  df = df.append(new_row, ignore_index=True)


In [198]:
df

Unnamed: 0,prompt,response,gpt_accuracy_score,human_accuracy_score,gpt_relevance_score,human_relevance_score,gpt_clarity_score,human_clarity_score,gpt_completeness_score,human_completeness_score,gpt_readability_score,human_readability_score,langchain_helpfulness,human_helpfulness,langchain_correctness,human_correctness,langchain_logical,human_logical
0,\nYou are an AI system specialized at generati...,\nIntroduction:\n\nThis API provides functiona...,4.0,2.0,5.0,3.0,4.0,4.0,4.0,4.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
1,\nYou are an AI system specialized at generati...,1. Introduction: This API is used to generate ...,4.0,1.0,3.0,1.0,3.0,1.0,3.0,1.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,\nYou are an AI system specialized at generati...,1. Introduction: This class is used to represe...,5.0,2.0,5.0,3.0,5.0,3.0,5.0,2.0,5.0,3.0,1.0,1.0,1.0,0.0,1.0,1.0
3,\nYou are an AI system specialized at generati...,1. Introduction: This API is used to sign and ...,4.0,1.0,5.0,1.0,3.0,1.0,4.0,1.0,5.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
4,\nYou are an AI system specialized at generati...,**Introduction:**\n\nThe `LogInclusionProof` c...,5.0,,5.0,,4.0,,5.0,,4.0,,1.0,,0.0,,0.0,


In [199]:
# Append Human Scores
# Human ratings for row 4 (Example 5). They are written as floats, not
# strings, so the human_* columns keep the same numeric type as the
# gpt_* / langchain_* columns they are compared against.

df.at[4, 'human_accuracy_score'] = 2.0
df.at[4, 'human_relevance_score'] = 2.0
df.at[4, 'human_clarity_score'] = 3.0
df.at[4, 'human_completeness_score'] = 2.0
df.at[4, 'human_readability_score'] = 4.0
df.at[4, 'human_helpfulness'] = 0.0
df.at[4, 'human_correctness'] = 0.0
df.at[4, 'human_logical'] = 1.0

In [200]:
df

Unnamed: 0,prompt,response,gpt_accuracy_score,human_accuracy_score,gpt_relevance_score,human_relevance_score,gpt_clarity_score,human_clarity_score,gpt_completeness_score,human_completeness_score,gpt_readability_score,human_readability_score,langchain_helpfulness,human_helpfulness,langchain_correctness,human_correctness,langchain_logical,human_logical
0,\nYou are an AI system specialized at generati...,\nIntroduction:\n\nThis API provides functiona...,4.0,2.0,5.0,3.0,4.0,4.0,4.0,4.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
1,\nYou are an AI system specialized at generati...,1. Introduction: This API is used to generate ...,4.0,1.0,3.0,1.0,3.0,1.0,3.0,1.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,\nYou are an AI system specialized at generati...,1. Introduction: This class is used to represe...,5.0,2.0,5.0,3.0,5.0,3.0,5.0,2.0,5.0,3.0,1.0,1.0,1.0,0.0,1.0,1.0
3,\nYou are an AI system specialized at generati...,1. Introduction: This API is used to sign and ...,4.0,1.0,5.0,1.0,3.0,1.0,4.0,1.0,5.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
4,\nYou are an AI system specialized at generati...,**Introduction:**\n\nThe `LogInclusionProof` c...,5.0,2.0,5.0,2.0,4.0,3.0,5.0,2.0,4.0,4.0,1.0,0.0,0.0,0.0,0.0,1.0


Note: This is again a great example of where the langchain evaluation is not fully correct. While the generated output is structurally good and documents the functions of the class well, it also hallucinates functions that are not part of the class, which is unacceptable.

### Example 6 - Do not Re-run

In [201]:
# Example 6 input: the 'errors' file with the "OpenAI/gpt3.5" model.
# Only classes_code is switched on; every other content flag is off.
prompt, generated_text, actual_doc = get_response(
    "OpenAI/gpt3.5",
    'errors',
    functions=False,
    classes=False,
    documentation=False,
    imports=False,
    other=False,
    functions_code=False,
    functions_doc=False,
    classes_code=True,
    classes_doc=False,
)

1. Introduction:
The Error class is a base class for all custom error classes in the API. It provides a common interface for handling and reporting errors. The Error class is not intended to be instantiated directly.

The NetworkError class is a subclass of Error and represents an error that occurs when there is a network issue. It provides specific diagnostics and suggestions for resolving the issue.

The TUFError class is a subclass of Error and represents an error that occurs in the context of The Update Framework (TUF). It provides additional context-specific diagnostics and suggestions for reporting the issue.

The MetadataError class is a subclass of Error and represents an error that occurs when there is an issue with the metadata.

The RootError class is a subclass of Error and represents an error that occurs when the root of trust cannot be established.

2. Functions:
- Error.diagnostics():
    - Description: Returns a string with a general diagnostic message for the error.
  

In [202]:
print("\n Prompt \n", prompt)


 Prompt 
 
You are an AI system specialized at generating API documentation for the provided Python code. You will be provided functions, classes, or Python scripts. Your documentation should include:

1. Introduction: Briefly describe the purpose of the API and its intended use.
2. Functions: Document each API function, including:
    - Description: Clearly explain what the endpoint or function does.
    - Parameters: List and describe each parameter, including data types and any constraints.
    - Return Values: Specify the data type and possible values returned.

3. Error Handling: Describe possible error responses and their meanings.

Make sure to follow this output structure to create API documentation that is clear, concise, accurate, and user-centric. Avoid speculative information and prioritize accuracy and completeness.


        
Class code:

class Error(Exception):
    

    def diagnostics(self) -> str:
        

        return An issue occurred.

    def print_and_exit(se

In [203]:
# Print the generated text returned by get_response.
print("\n Generated Text \n", generated_text)


 Generated Text 
 1. Introduction:
The Error class is a base class for all custom error classes in the API. It provides a common interface for handling and reporting errors. The Error class is not intended to be instantiated directly.

The NetworkError class is a subclass of Error and represents an error that occurs when there is a network issue. It provides specific diagnostics and suggestions for resolving the issue.

The TUFError class is a subclass of Error and represents an error that occurs in the context of The Update Framework (TUF). It provides additional context-specific diagnostics and suggestions for reporting the issue.

The MetadataError class is a subclass of Error and represents an error that occurs when there is an issue with the metadata.

The RootError class is a subclass of Error and represents an error that occurs when the root of trust cannot be established.

2. Functions:
- Error.diagnostics():
    - Description: Returns a string with a general diagnostic messag

In [204]:
# Grade the generated text with eval_using_model, passing the original prompt
# for context; the result is parsed by extract_scores afterwards.
gpt_score = eval_using_model(
    generated_text,
    openai_key=openai_key,
    initial_prompt=prompt,
)

Accuracy: 4 - The generated documentation accurately describes the purpose and functionality of each class and function. The details from the code are correctly reflected in the documentation.

Relevance: 5 - The generated documentation is relevant as it provides clear and concise descriptions of each class and function, including their purpose, parameters, and return values. It also includes information on error handling.

Clarity: 4 - The generated documentation is clear and easy to understand. The descriptions for each class and function provide sufficient detail to understand their purpose and functionality.

Completeness: 5 - The generated documentation is complete and includes descriptions for all the classes and functions in the code. It also includes information on error handling and possible error responses.

Readability: 5 - The generated documentation is well-structured and formatted, making it easy to read and understand. The information is presented in a clear and concise 

In [205]:
# Unpack the five per-criterion scores parsed from the evaluator's reply.
(
    gpt_accuracy_score,
    gpt_relevance_score,
    gpt_clarity_score,
    gpt_completeness_score,
    gpt_readability_score,
) = extract_scores(gpt_score)

In [206]:
# Add this example (prompt, response and GPT scores) as a new row.
# Note: re-running this cell appends the same row again.
df = append_row_to_dataframe(
    df,
    prompt,
    generated_text,
    gpt_accuracy_score,
    gpt_relevance_score,
    gpt_clarity_score,
    gpt_completeness_score,
    gpt_readability_score,
)

{'reasoning': 'The criterion for this task is "helpfulness". The submission is to be evaluated based on whether it is helpful, insightful, and appropriate.\n\nLooking at the submission, it provides a detailed and structured documentation for the provided Python code. It follows the output structure provided in the input, which includes an introduction, function documentation, and error handling.\n\nIn the introduction, the submission provides a brief description of the purpose of each class in the API. This is helpful for users to understand the purpose and intended use of each class.\n\nIn the function documentation, the submission documents each function in the classes, including a description of what the function does, the parameters it takes, and the values it returns. This is insightful as it provides users with a clear understanding of how to use each function.\n\nIn the error handling section, the submission describes possible error responses and their meanings. This is appropri

  df = df.append(new_row, ignore_index=True)


In [207]:
# Display the DataFrame, including the row appended in the previous cell.
df

Unnamed: 0,prompt,response,gpt_accuracy_score,human_accuracy_score,gpt_relevance_score,human_relevance_score,gpt_clarity_score,human_clarity_score,gpt_completeness_score,human_completeness_score,gpt_readability_score,human_readability_score,langchain_helpfulness,human_helpfulness,langchain_correctness,human_correctness,langchain_logical,human_logical
0,\nYou are an AI system specialized at generati...,\nIntroduction:\n\nThis API provides functiona...,4.0,2.0,5.0,3.0,4.0,4.0,4.0,4.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
1,\nYou are an AI system specialized at generati...,1. Introduction: This API is used to generate ...,4.0,1.0,3.0,1.0,3.0,1.0,3.0,1.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,\nYou are an AI system specialized at generati...,1. Introduction: This class is used to represe...,5.0,2.0,5.0,3.0,5.0,3.0,5.0,2.0,5.0,3.0,1.0,1.0,1.0,0.0,1.0,1.0
3,\nYou are an AI system specialized at generati...,1. Introduction: This API is used to sign and ...,4.0,1.0,5.0,1.0,3.0,1.0,4.0,1.0,5.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
4,\nYou are an AI system specialized at generati...,**Introduction:**\n\nThe `LogInclusionProof` c...,5.0,2.0,5.0,2.0,4.0,3.0,5.0,2.0,4.0,4.0,1.0,0.0,0.0,0.0,0.0,1.0
5,\nYou are an AI system specialized at generati...,1. Introduction:\nThe Error class is a base cl...,4.0,,5.0,,4.0,,5.0,,5.0,,1.0,,1.0,,1.0,


In [209]:
# Append Human Scores

# Human ratings for row 5 (the row appended above). They are written as floats
# rather than strings such as '5.0': putting a string into a score column makes
# pandas fall back to object dtype, so means, comparisons and correlations on
# the column stop working as numeric operations.
human_scores = {
    'human_accuracy_score': 5.0,
    'human_relevance_score': 5.0,
    'human_clarity_score': 5.0,
    'human_completeness_score': 5.0,
    'human_readability_score': 5.0,
    'human_helpfulness': 1.0,
    'human_correctness': 1.0,
    'human_logical': 1.0,
}
for column, score in human_scores.items():
    df.at[5, column] = score

# Normalise the human_* columns to a numeric dtype in case earlier rows were
# stored as strings (e.g. loaded from a previously saved pickle).
human_columns = list(human_scores)
df[human_columns] = df[human_columns].apply(pd.to_numeric)

In [210]:
# Display the DataFrame again to check the human scores written to row 5.
df

Unnamed: 0,prompt,response,gpt_accuracy_score,human_accuracy_score,gpt_relevance_score,human_relevance_score,gpt_clarity_score,human_clarity_score,gpt_completeness_score,human_completeness_score,gpt_readability_score,human_readability_score,langchain_helpfulness,human_helpfulness,langchain_correctness,human_correctness,langchain_logical,human_logical
0,\nYou are an AI system specialized at generati...,\nIntroduction:\n\nThis API provides functiona...,4.0,2.0,5.0,3.0,4.0,4.0,4.0,4.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
1,\nYou are an AI system specialized at generati...,1. Introduction: This API is used to generate ...,4.0,1.0,3.0,1.0,3.0,1.0,3.0,1.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,\nYou are an AI system specialized at generati...,1. Introduction: This class is used to represe...,5.0,2.0,5.0,3.0,5.0,3.0,5.0,2.0,5.0,3.0,1.0,1.0,1.0,0.0,1.0,1.0
3,\nYou are an AI system specialized at generati...,1. Introduction: This API is used to sign and ...,4.0,1.0,5.0,1.0,3.0,1.0,4.0,1.0,5.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
4,\nYou are an AI system specialized at generati...,**Introduction:**\n\nThe `LogInclusionProof` c...,5.0,2.0,5.0,2.0,4.0,3.0,5.0,2.0,4.0,4.0,1.0,0.0,0.0,0.0,0.0,1.0
5,\nYou are an AI system specialized at generati...,1. Introduction:\nThe Error class is a base cl...,4.0,5.0,5.0,5.0,4.0,5.0,5.0,5.0,5.0,5.0,1.0,1.0,1.0,1.0,1.0,1.0


Note: The generated output is quite detailed and appears accurate to a non-SME (non-subject-matter expert), and the LangChain evaluation seems to capture that correctly too. The GPT evaluation scores are also high, which is consistent with the human evaluation.

In [211]:
# Save the updated DataFrame to eval_df.pkl (path is relative to the working directory).
df.to_pickle('eval_df.pkl')

## Copy this section, modify and run from here

### Example X 

In [None]:
# Load the saved evaluation DataFrame from eval_df.pkl.
# NOTE: unpickling can execute arbitrary code; only load a pickle you created yourself.
df = pd.read_pickle('eval_df.pkl')

In [None]:
# Display the loaded DataFrame.
df

In [None]:
# Template call: change the model id and the file key for the example being run.
# Only classes_code is switched on; every other context flag stays off.
prompt, generated_text, actual_doc = get_response(
    'ibm/granite-20b-code-instruct-v1',
    'oidc',
    functions=False,
    classes=False,
    documentation=False,
    imports=False,
    other=False,
    functions_code=False,
    functions_doc=False,
    classes_code=True,
    classes_doc=False,
)

In [None]:
# Print the prompt returned by get_response.
print("\n Prompt \n", prompt)

In [None]:
# Print the generated text returned by get_response.
print("\n Generated Text \n", generated_text)

In [None]:
# Grade the generated text with eval_using_model, passing the original prompt
# for context; the result is parsed by extract_scores afterwards.
gpt_score = eval_using_model(
    generated_text,
    openai_key=openai_key,
    initial_prompt=prompt,
)

In [None]:
# Unpack the five per-criterion scores parsed from the evaluator's reply.
(
    gpt_accuracy_score,
    gpt_relevance_score,
    gpt_clarity_score,
    gpt_completeness_score,
    gpt_readability_score,
) = extract_scores(gpt_score)

In [None]:
# Add this example (prompt, response and GPT scores) as a new row.
# Note: re-running this cell appends the same row again.
df = append_row_to_dataframe(
    df,
    prompt,
    generated_text,
    gpt_accuracy_score,
    gpt_relevance_score,
    gpt_clarity_score,
    gpt_completeness_score,
    gpt_readability_score,
)

In [None]:
# Display the DataFrame, including the row appended in the previous cell.
df

In [None]:
# Append Human Scores

# X is a placeholder: replace it with the index label of the row appended above
# before running this cell. The score values below are examples to be edited.
# Scores are written as floats rather than strings such as '2.0': putting a
# string into a score column makes pandas fall back to object dtype, so means,
# comparisons and correlations on the column stop working as numeric operations.
human_scores = {
    'human_accuracy_score': 2.0,
    'human_relevance_score': 3.0,
    'human_clarity_score': 4.0,
    'human_completeness_score': 4.0,
    'human_readability_score': 5.0,
    'human_helpfulness': 0.0,
    'human_correctness': 0.0,
    'human_logical': 0.0,
}
for column, score in human_scores.items():
    df.at[X, column] = score

# Normalise the human_* columns to a numeric dtype in case earlier rows were
# stored as strings (e.g. loaded from a previously saved pickle).
human_columns = list(human_scores)
df[human_columns] = df[human_columns].apply(pd.to_numeric)

In [None]:
# Display the DataFrame again to check the human scores just written.
df

In [None]:
# Save the updated DataFrame back to eval_df.pkl (path is relative to the working directory).
df.to_pickle('eval_df.pkl')