In [1]:
from llmcoder.eval.metrics.extrinsic import levenshtein_distance_score, bleu_score, trf_similarity_score, sequence_matcher_score, gpt_reviewer_score


In [2]:
# Reference snippet (recursive implementation). The comparison cell further down
# treats this string as the ground truth that every other snippet is scored against.
# The text is scored character-for-character, so it must not be reformatted.
fib1 = """
# Function that returns the nth Fibonacci number
def fibonacci_recursive(n):
    if n <= 0:
        return []
    elif n == 1:
        return [0]
    elif n == 2:
        return [0, 1]
    else:
        sequence = fibonacci_recursive(n-1)
        sequence.append(sequence[-1] + sequence[-2])
        return sequence
"""

In [3]:
# Candidate snippet: iterative implementation that starts with the same header
# comment as fib1. Kept verbatim because the metrics operate on the raw text.
fib2 = """
# Function that returns the nth Fibonacci number
def fibonacci_iterative(n):
    if n <= 0:
        return []
    elif n == 1:
        return [0]
    sequence = [0, 1]
    for i in range(2, n):
        sequence.append(sequence[i-1] + sequence[i-2])
    return sequence
"""

In [4]:
# Candidate snippet: memoization-based implementation that starts with the same
# header comment as fib1. Only the text is compared here; the snippet is never executed
# in this notebook.
fib3 = """
# Function that returns the nth Fibonacci number
def fibonacci_memoization(n, memo = {0: 0, 1: 1}):
    if n not in memo:
        memo[n] = fibonacci_memoization(n-1, memo) + fibonacci_memoization(n-2, memo)
    return [memo[i] for i in range(n)]
"""

In [5]:
# Variant of fib3: the same function body, but wrapped in natural-language
# explanation and a Markdown code fence. Used to check how the GPT reviewer treats
# a completion that is not pure code.
fib3_alt = """
# Function that returns the nth Fibonacci number

To implement a function that returns the nth Fibonacci number, we can use memoization to store the results of previous calculations. This will allow us to avoid repeating calculations and improve the efficiency of our function.

Let's address this step by step.
- First, we need to define a function that takes in an integer n and returns the nth Fibonacci number.
- Next, we need to define a dictionary that will store the results of previous calculations.
- Finally, we need to define a recursive function that will calculate the nth Fibonacci number using the memoization dictionary.

Here is how you can implement this in Python:

```python
def fibonacci_memoization(n, memo = {0: 0, 1: 1}):
    if n not in memo:
        memo[n] = fibonacci_memoization(n-1, memo) + fibonacci_memoization(n-2, memo)
    return [memo[i] for i in range(n)]
```

In this code, we define a function that takes in an integer n and returns the nth Fibonacci number. We also define a dictionary that will store the results of previous calculations. Finally, we define a recursive function that will calculate the nth Fibonacci number using the memoization dictionary.
"""

In [6]:
# Unrelated snippet (geometric mean) used as a dissimilar baseline against fib1.
# NOTE(review): the embedded header comment still says "Fibonacci" - presumably
# deliberate so that all snippets share the same first line; confirm before changing,
# since any edit to the text alters the recorded metric values.
geo1 = """
# Function that returns the nth Fibonacci number
import math

def geometric_mean(numbers):
    if not numbers:
        return 0

    product = 1
    for number in numbers:
        product *= number

    return math.pow(product, 1/len(numbers))
"""

In [7]:
# Display names and the metric callables they label, kept in matching order.
# The result is a list of (name, function) tuples consumed by the comparison cell.
scores_list = list(zip(
    ('levenshtein', 'bleu', 'trf', 'seq', 'gpt'),
    (
        levenshtein_distance_score,
        bleu_score,
        trf_similarity_score,
        sequence_matcher_score,
        gpt_reviewer_score,
    ),
))

In [8]:
# Score every snippet against fib1 (the ground truth) with each metric.
# fib1 is compared with itself first as a sanity check of each metric's "identical" value.
for name, score_func in scores_list:
    print(name)
    for label, candidate in (('fib1', fib1), ('fib2', fib2), ('fib3', fib3), ('geo1', geo1)):
        print(f'fib1 vs {label}: {score_func(fib1, candidate):.3f}')

levenshtein
fib1 vs fib1: 0.000
fib1 vs fib2: 79.000
fib1 vs fib3: 184.000
fib1 vs geo1: 186.000
bleu
fib1 vs fib1: 1.000
fib1 vs fib2: 0.752
fib1 vs fib3: 0.397
fib1 vs geo1: 0.379
trf
fib1 vs fib1: 1.000
fib1 vs fib2: 0.937
fib1 vs fib3: 0.895
fib1 vs geo1: 0.820
seq
fib1 vs fib1: 1.000
fib1 vs fib2: 0.624
fib1 vs fib3: 0.329
fib1 vs geo1: 0.306
gpt


In [25]:
from openai import OpenAI
from llmcoder.utils import get_openai_key
from llmcoder.eval.metrics.extrinsic import _user_prompt_templste
import numpy as np

def _parse_reviewer_scores(message: str | None) -> tuple[int, int] | None:
    """
    Extract the two reviewer scores from a reply.

    Parameters
    ----------
    message : str | None
        The raw reply text. Lines of the form ``SCORE 1: <int>`` and ``SCORE 2: <int>`` are looked for.

    Returns
    -------
    tuple[int, int] | None
        ``(score_1, score_2)`` if both lines are present and hold integers in [0, 10], otherwise None.
    """
    if not message:
        return None

    found: dict[int, int] = {}
    for line in message.splitlines():
        stripped = line.strip()
        for index in (1, 2):
            prefix = f"SCORE {index}:"
            if not stripped.startswith(prefix):
                continue
            try:
                value = int(stripped[len(prefix):].strip())
            except ValueError:
                # Something other than a bare integer follows the prefix; ignore this line.
                continue
            if 0 <= value <= 10:
                # Keep the first well-formed occurrence of each score.
                found.setdefault(index, value)

    if 1 in found and 2 in found:
        return found[1], found[2]
    return None


def gpt_reviewer_score(ground_truth: str, llmcoder_result: dict | str, model: str = "gpt-3.5-turbo", qualities_list: list[str] | None = None, max_iter: int = 5) -> float:
    """
    Let an OpenAI chat model compare a completion against the ground truth and return the score difference.

    Note: this notebook-local definition shadows the function of the same name imported from
    `llmcoder.eval.metrics.extrinsic` in the first cell. `scores_list` was built before this
    definition and therefore still refers to the imported version.

    Parameters
    ----------
    ground_truth : str
        The reference code snippet.
    llmcoder_result : dict | str
        Either the completion as a string, or an llmcoder result dict containing the conversation
        (the content of the last entry in `messages` is used as the completion).
    model : str, optional
        Name of the chat model to query, by default "gpt-3.5-turbo".
    qualities_list : list[str] | None, optional
        Qualities the reviewer is asked to prioritize. If None, a default list is used.
    max_iter : int, optional
        Maximum number of requests made while waiting for a reply whose scores can be parsed, by default 5.
        With a value below 1 no request is made.

    Returns
    -------
    float
        Score of the completion minus score of the ground truth (each between 0 and 10).
        Positive if the completion is rated better than the ground truth, negative otherwise.
        NaN if no reply with two parseable scores was obtained within `max_iter` attempts.
    """
    client = OpenAI(api_key=get_openai_key())

    system_prompt_compare = """You are a very critical and meticulous data scientist tasked with comparing and evaluating code completions made by a language model.
The user will submit two code snippets with the same beginning but different completions.

Given these snippets, you evaluate the completions, and give each a score between 0 and 10, with 0 being the worst (unusable completion that would make a developer frustrated) and 10 being the best (perfect completion that would make a developer happy).
The user may ask you to prioritize different qualities of the code.
Take these priorities into account when scoring the completions.
You should also check whether the completion follows this system prompt given to the completion model:

```
You are AutocompleteGPT, a useful AI autocomplete tool that provides code completions based on the user's code.
You are a precision-focused tool for code autocompletion, adept in languages like Python, JavaScript, C++, and SQL.
Precisely continue the code from the point of interruption and do not repeat or modify the original code, even if it is incorrect or the code interrupts in the middle of a line.
Your code is well documented with comments and annotations, and you should provide a clear explanation of the code's purpose in your code completion.
Your unique capability is to provide completions without altering, repeating, or commenting on the original code.
You offer only the necessary code to complete the snippet, ensuring the response is exclusively code, with no additional comments, explanations, or annotations.
This approach makes you an ideal assistant for users seeking straightforward and efficient code extensions, enhancing their work with accurate, logic-driven completions while maintaining the integrity and simplicity of the original input.
Your response begins with the next characters of the line if the last line of the user'S code is incomplete, or the next line if the last line of the user's code is complete.
Your application is a VSCode extension like GitHub Copilot, which provides seamless code completions based on the user's code at the point of interruption.
```

If the completion is not a direct continuation of the user's code, you will give it a low score.
So, if the completion has any additional natural language text, you will give it a low score.
Because our goal is to evaluate the system's ability to complete code.
I.e., if it works in python, it is fine.

If the code does not follow this prompt, you will give it a low score.

Most importantly though, you identify whether the completion is seamless, does not repeat or modify the original code, and is a direct continuation of the user's code.
If the completion contains any explanations or text other than code, you will give it a low score.


Your output must always have the following format:
```
COMPARISON:
<comparison of the two completions with regard to the requested qualities>

SCORE 1: <score for completion 1, integer between 0 and 10>
SCORE 2: <score for completion 2, integer between 0 and 10>
```

Do not include any other information in your output.
It is very important that the output following "SCORE 1: " and "SCORE 2: " is a single integer between 0 and 10, with no other characters or spaces since scores will later be parsed at this exact location.
Therefore, make sure to keep your comparison (the text after COMPARISON:) concise, and adhere to the score format exactly.
"""

    if qualities_list is None:
        qualities_list = [
            "alignment of the completion with the given code (hint: the given code is the same for both completions)",
            "correctness",
            "plausibility",
            "readability",
            "efficiency"
        ]

    if isinstance(llmcoder_result, dict):
        completion = llmcoder_result['messages'][-1]['content']
    else:
        completion = llmcoder_result

    user_prompt = _user_prompt_templste(ground_truth, completion, qualities_list)

    messages = [
        {
            "role": "system",
            "content": system_prompt_compare
        },
        {
            "role": "user",
            "content": user_prompt
        }
    ]

    # The reply format is only requested in the prompt, not enforced, so ask again
    # (up to max_iter times) until both scores can be parsed.
    for _ in range(max_iter):
        chat_completion = client.chat.completions.create(messages=messages, model=model, temperature=0.2)  # type: ignore
        message = chat_completion.choices[0].message.content

        # Kept from the original cell: show the raw reply so the reviewer's reasoning can be inspected.
        print(message)

        parsed = _parse_reviewer_scores(message)
        if parsed is not None:
            # Assumes `_user_prompt_templste` presents `ground_truth` as snippet 1 and the
            # completion as snippet 2 (matches the argument order) - TODO confirm in the library.
            score_ground_truth, score_completion = parsed
            return float(score_completion - score_ground_truth)

    return float("nan")

In [26]:
# Reviewer comparison: fib1 as ground truth, the pure-code memoization snippet as completion.
gpt_reviewer_score(fib1, fib3)

USER INPUT END: ```
# Function that returns the nth Fibonacci number
def fibonacci_
```
COMPLETION BEGIN: `recursive(n):`

COMPARISON:
The completion in code 1 is a recursive implementation of the Fibonacci sequence. It aligns with the given code and is a direct continuation of the user's input. The completion is correct, plausible, and readable. However, it may not be the most efficient implementation since it recalculates the Fibonacci sequence for each value of n.

The completion in code 2 is a memoized implementation of the Fibonacci sequence. It also aligns with the given code and is a direct continuation of the user's input. The completion is correct, plausible, and readable. It improves efficiency by storing previously calculated Fibonacci numbers in a memo dictionary.

SCORE 1: 8
SCORE 2: 9


In [27]:
# Same comparison, but the completion wraps the code in explanatory prose; the reviewer's
# system prompt asks for a low score when a completion contains text other than code.
gpt_reviewer_score(fib1, fib3_alt)

USER INPUT END: 

COMPLETION BEGIN: 

To implement a function that returns the nth Fibonacci number, we can use memoization to store the results of previous calculations. This will allow us to avoid repeating calculations and improve the efficiency of our function.

Let's address this step by step.
- First, we need to define a function that takes in an integer n and returns the nth Fibonacci number.
- Next, we need to define a dictionary that will store the results of previous calculations.
- Finally, we need to define a recursive function that will calculate the nth Fibonacci number using the memoization dictionary.

Here is how you can implement this in Python:

```python
def fibonacci_memoization(n, memo = {0: 0, 1: 1}):
    if n not in memo:
        memo[n] = fibonacci_memoization(n-1, memo) + fibonacci_memoization(n-2, memo)
    return [memo[i] for i in range(n)]
```

In this code, we define a function that takes in an integer n and returns the nth Fibonacci number. We also define