In [57]:
from llama_cpp import Llama
from human_eval.execution import check_correctness
from human_eval.data import HUMAN_EVAL, stream_jsonl

model = Llama(
    model_path="../phi-2.fp16.gguf",
    n_gpu_layers=-1,
    n_ctx=2048,
    n_threads=10,
)

problems = list(stream_jsonl(HUMAN_EVAL))
tid_map = {problem["task_id"]: problem for problem in problems}

def execute(problem, completion, timeout=10):
    result = check_correctness(problem, completion, timeout)
    return result

llama_model_loader: loaded meta data with 19 key-value pairs and 325 tensors from ../phi-2.fp16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = phi2
llama_model_loader: - kv   1:                               general.name str              = Phi2
llama_model_loader: - kv   2:                        phi2.context_length u32              = 2048
llama_model_loader: - kv   3:                      phi2.embedding_length u32              = 2560
llama_model_loader: - kv   4:                   phi2.feed_forward_length u32              = 10240
llama_model_loader: - kv   5:                           phi2.block_count u32              = 32
llama_model_loader: - kv   6:                  phi2.attention.head_count u32              = 32
llama_model_loader: - kv   7:               phi2.attention.head_count_kv u32              = 32
lla

In [87]:
prompt = problems[0]["prompt"]
print(prompt)
tokens = model.tokenize(prompt.encode("utf-8"))
print(tokens)
model.detokenize([220, 220, 220, 10352]).decode("utf-8")

from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """

[50256, 6738, 19720, 1330, 7343, 628, 198, 4299, 468, 62, 19836, 62, 68, 3639, 7, 77, 17024, 25, 7343, 58, 22468, 4357, 11387, 25, 12178, 8, 4613, 20512, 25, 198, 220, 220, 220, 37227, 6822, 611, 287, 1813, 1351, 286, 3146, 11, 389, 597, 734, 3146, 5699, 284, 1123, 584, 621, 198, 220, 220, 220, 1813, 11387, 13, 198, 220, 220, 220, 13163, 468, 62, 19836, 62, 68, 3639, 26933, 16, 13, 15, 11, 362, 13, 15, 11, 513, 13, 15, 4357, 657, 13, 20, 8, 198, 220, 220, 220, 10352, 198, 220, 220, 220, 13163, 468, 62, 19836, 62, 68, 3639, 26933, 16, 13, 15, 11, 362, 13, 23, 11, 513, 13, 15, 11, 604, 13, 15, 11, 642, 13, 15, 11, 362, 13, 15, 4357, 657, 13, 18,

'    False'

In [58]:
import outlines

@outlines.prompt
def few_shot_prompt(instructions, examples, question):
    """{{ instructions }}

    {% for example in examples %}
    Question:
    ```
    {{ example.prompt }}
    ```
    Answer:
    ```
    {{ example.canonical_solution }}
    ```
    {% endfor %}
    Question:
    ```
    {{ question }}
    ```
    Answer:
    ```
    """


instructions = "Please answer the following question following the examples. Always indent with four spaces for python code."
examples = problems[:2]

In [61]:
results = []
for problem in problems[11:12]:
    # prompt = few_shot_prompt(instructions, examples, problem["prompt"])
    prompt = problem["prompt"]
    output = model(
        prompt=prompt,
        max_tokens=256,
        temperature=0.2,
        stop=["```"],
    )
    completion = output["choices"][0]["text"]
    result = execute(problem, completion)
    result["prompt"] = prompt
    result["completion"] = completion
    results.append(result)

Llama.generate: prefix-match hit

llama_print_timings:        load time =    1368.49 ms
llama_print_timings:      sample time =      58.05 ms /   256 runs   (    0.23 ms per token,  4409.76 tokens per second)
llama_print_timings: prompt eval time =     294.37 ms /    81 tokens (    3.63 ms per token,   275.16 tokens per second)
llama_print_timings:        eval time =    8925.84 ms /   255 runs   (   35.00 ms per token,    28.57 tokens per second)
llama_print_timings:       total time =   10089.17 ms


In [62]:
print(results[0]["prompt"])
print(results[0]["completion"])

from typing import List


def string_xor(a: str, b: str) -> str:
    """ Input are two strings a and b consisting only of 1s and 0s.
    Perform binary XOR on these inputs and return result also as a string.
    >>> string_xor('010', '110')
    '100'
    """

if len(a) != len(b):
raise ValueError("Strings must be of same length")

result = ""
for i in range(len(a)):
if a[i] == b[i]:
result += "0"
else:
result += "1"

return result


def string_xor2(a: str, b: str) -> str:
""" Input are two strings a and b consisting only of 1s and 0s.
 Perform binary XOR on these inputs and return result also as a string.
 >>> string_xor('010', '110')
'100'
"""

if len(a) != len(b):
raise ValueError("Strings must be of same length")

result = ""
for i in range(len(a)):
result += str(int(a[i]) ^ int(b[i]))

return result


def string_xor3(a: str, b: str) -> str:
""" Input are two strings a and b consisting only of 1s and
