In [1]:
import openai
import numpy as np

from llmcoder.utils import get_openai_key

In [2]:
# Shared OpenAI client for the scoring requests in this notebook. The API key
# is obtained from the project helper `get_openai_key()` instead of being
# written into the notebook.
client = openai.OpenAI(api_key=get_openai_key())

In [3]:
# Fixture snippets used as inputs for the scoring experiments. Each one is a
# plain string holding Python source; none of it is executed in this notebook.

# Baseline snippet: the scoring loop on its own.
code = """quality_scores = []
needs_comment_scores = []

for answer in tqdm(answer_list):
    full_code = code + answer

    # If the full_code is longer than 512 tokens, we need to truncate it
    while len(tokenizer.encode(full_code, return_tensors="pt")[0]) > 512:
        full_code = full_code[1:]

    input_ids = tokenizer.encode(full_code, return_tensors="pt").to(device)
    output = model(input_ids)
    quality_scores.append(output.logits[0, 0].to("cpu").tolist())
    needs_comment_scores.append(output.logits[0, 1].to("cpu").tolist())"""

# Same statements as `code`, with two additional explanatory comments
# ("Set up empty arrays" and "Loop through each answer").
code11 = """# Set up empty arrays
quality_scores = []
needs_comment_scores = []

# Loop through each answer
for answer in tqdm(answer_list):
    full_code = code + answer

    # If the full_code is longer than 512 tokens, we need to truncate it
    while len(tokenizer.encode(full_code, return_tensors="pt")[0]) > 512:
        full_code = full_code[1:]

    input_ids = tokenizer.encode(full_code, return_tensors="pt").to(device)
    output = model(input_ids)
    quality_scores.append(output.logits[0, 0].to("cpu").tolist())
    needs_comment_scores.append(output.logits[0, 1].to("cpu").tolist())"""

# `code` followed by three extra lines that encode `code` alone and store the
# two logits in `quality_score_before` / `needs_comment_score_before`.
code2 = """quality_scores = []
needs_comment_scores = []

for answer in tqdm(answer_list):
    full_code = code + answer

    # If the full_code is longer than 512 tokens, we need to truncate it
    while len(tokenizer.encode(full_code, return_tensors="pt")[0]) > 512:
        full_code = full_code[1:]

    input_ids = tokenizer.encode(full_code, return_tensors="pt").to(device)
    output = model(input_ids)
    quality_scores.append(output.logits[0, 0].to("cpu").tolist())
    needs_comment_scores.append(output.logits[0, 1].to("cpu").tolist())

input_ids = tokenizer.encode(code, return_tensors="pt").to(device)
quality_score_before = model(input_ids).logits[0, 0].cpu().tolist()
needs_comment_score_before = model(input_ids).logits[0, 1].cpu().tolist()"""

# `code` followed by a trailing `import numpy as np` line and blank lines.
code3 = """quality_scores = []
needs_comment_scores = []

for answer in tqdm(answer_list):
    full_code = code + answer

    # If the full_code is longer than 512 tokens, we need to truncate it
    while len(tokenizer.encode(full_code, return_tensors="pt")[0]) > 512:
        full_code = full_code[1:]

    input_ids = tokenizer.encode(full_code, return_tensors="pt").to(device)
    output = model(input_ids)
    quality_scores.append(output.logits[0, 0].to("cpu").tolist())
    needs_comment_scores.append(output.logits[0, 1].to("cpu").tolist())

import numpy as np

"""

# Same text as `code3`, except that the truncation comment inside the loop has
# no leading `#`, so that line is not valid Python.
code4 = """quality_scores = []
needs_comment_scores = []

for answer in tqdm(answer_list):
    full_code = code + answer

    If the full_code is longer than 512 tokens, we need to truncate it
    while len(tokenizer.encode(full_code, return_tensors="pt")[0]) > 512:
        full_code = full_code[1:]

    input_ids = tokenizer.encode(full_code, return_tensors="pt").to(device)
    output = model(input_ids)
    quality_scores.append(output.logits[0, 0].to("cpu").tolist())
    needs_comment_scores.append(output.logits[0, 1].to("cpu").tolist())

import numpy as np

"""

# Snippets collected for scoring.
# NOTE(review): `code4` is defined above but is not part of this list --
# confirm whether leaving it out is intentional.
codes = [code, code11, code2, code3]

In [9]:
def score_prompt(code_list: list[str]) -> str:
    """Concatenates the code snippets with the scoring prompt in the following format:

    Code snippet 1:
    ```python
    <code>
    ```

    Code snippet 2:
    ```python
    <code>
    ```
    ...
    """

    prompt = ""
    for i, code in enumerate(code_list):
        prompt += f"Code snippet {i + 1}:\n```python\n{code}\n```\n\n"

    return prompt


@staticmethod
def score_code(code: str | list[str], client: openai.OpenAI, scoring_prompt: str, reduction: str = "geo") -> float:
    if isinstance(code, str):
        code = [code]

    messages = [
        {
            "role": "system",
            "content": scoring_prompt
        }, {
            "role": "user",
            "content": score_prompt(code)
        }
    ]
    completions = client.chat.completions.create(messages=messages, model="gpt-3.5-turbo", temperature=0)

    lines = completions.choices[0].message.content.split("\n")

    print(completions.choices[0].message.content)

    # Extract the scores from the response
    scores = []
    if "Code snippet" in lines[0]:
        for i, line in enumerate(lines):
            scores_for_snippet = []
            if line.startswith("Code snippet"):
                for j in range(3):
                    try:
                        scores_for_snippet.append(float(lines[i + j + 1][lines[i + j + 1].index(":") + 1:]))
                    except ValueError:
                        print(f"[Scoring] Error while scoring code. Expected float, got: {completions.choices[0].message.content}")
                        scores_for_snippet.append(np.nan)
                scores.append(scores_for_snippet)
    elif len(code) == 1:
        for i in range(3):
            try:
                scores.append(float(lines[i][lines[i].index(":") + 1:]))
            except ValueError:
                print(f"[Scoring] Error while scoring code. Expected float, got: {completions.choices[0].message.content}")
                scores.append(np.nan)

    scores = np.atleast_2d(np.array(scores))

    match reduction:
        case "mean":
            return scores.mean(axis=1)
        case "max":
            return scores.max(axis=1)
        case "geo":
            return scores.prod(axis=1) ** (1 / scores.shape[1])
        case _:
            raise ValueError("Invalid reduction method")

In [10]:
# System prompt for the scoring model. It names four categories and fixes the
# `<label>: <score>` line format of the reply, both for a single snippet and
# for several numbered snippets.
scoring_prompt = """You are a scoring system for python code that meticulously analyzes a code snippet and judges its quality in given categories.
You are given a python code snippet and asked to score it based on the following categories:
- the code quality: How well the code conforms to the python style guide
- the plausibility of the last few lines: How much sense the last few lines make
- the consistency of the last few lines: How good the last few lines fit to the beginning and middle of the code
- the readability of the code: How easy it is to understand the code

You are very critical and unforgiving, and you will give a low score if the code is not perfect. You will give a high score if the code is perfect.

Respond in the following pattern:

```
Code Quality: <code quality score>
Last Lines Plausibility: <plausibility score>
Last Lines Consistency: <consistency score>
Code Readability: <readability score>
```

If you are given multiple code snippets of the format, respond in the following pattern:

```
Code snippet 1:
Code Quality: <code quality score>
Last Lines Plausibility: <plausibility score>
Last Lines Consistency: <consistency score>
Code Readability: <readability score>

Code snippet 2:
Code Quality: <code quality score>
Last Lines Plausibility: <plausibility score>
Last Lines Consistency: <consistency score>
Code Readability: <readability score>

...
```

If there are any differences in the code snippets, reflect them in your scores: Give higher scores to code snippets that handle a category better than others and lower scores to code snippets that handle a category worse than others.

Your answer must always have this pattern. Do not deviate from this pattern and respond with a score between 0 and 10 for each category and no other information."""

In [11]:
# Score the first snippet on its own with the default reduction. The call
# prints the raw model reply, and the reduced score is shown as the cell result.
score_code(codes[0], client, scoring_prompt)

Code Quality: 7.5
Last Lines Plausibility: 8.0
Last Lines Consistency: 8.0
Code Readability: 8.5


array([7.82973528])