# Llama 3 Instruct

In [1]:
import os
import transformers
import torch

# Pick the compute backend in order of preference: MPS, then CUDA, then CPU.
if torch.backends.mps.is_available():
    DEVICE = torch.device("mps")
elif torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
# Folder holding the fine-tuned Llama 3 checkpoints. The trailing separator is
# kept on purpose because file paths are built as `MODELS_PATH + filename`.
# `os.path.expanduser("~")` replaces `os.getenv("HOME")`, which returns None
# (making the string concatenation raise TypeError) when HOME is not set.
MODELS_PATH = os.path.join(os.path.expanduser("~"), "models", "fine_tuned", "llama3", "")

# Build the Hugging Face text-generation pipeline for Llama 3 8B Instruct and
# place it on DEVICE.
pipeline = transformers.pipeline(
    "text-generation",
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    # NOTE: BFloat16 is not supported on MPS, so the weights are loaded as
    # Float16 instead of the BFloat16 setting kept here for reference:
    # model_kwargs={"torch_dtype": torch.bfloat16},
    model_kwargs={"torch_dtype": torch.float16},
    device=DEVICE
)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [68]:
# Which fine-tuning epoch's checkpoint to evaluate; the same number is reused
# further down when labelling and saving the results.
epoch_num = 1

# Overwrite the base weights with the fine-tuned ones.
#  - map_location="cpu": without it torch.load restores every tensor onto the
#    device it was saved from, which fails when that device is unavailable and
#    otherwise holds a second full copy of the weights on the accelerator.
#    load_state_dict then copies each tensor into the model's existing parameters.
#  - weights_only=True: restricts unpickling to tensors and plain containers,
#    which is all a state dict needs.
pipeline.model.load_state_dict(
    torch.load(
        MODELS_PATH + f"fine_tuned_llama3_smaller_LR_after_{epoch_num}_epoch.pt",
        map_location="cpu",
        weights_only=True,
    )
)

<All keys matched successfully>

In [69]:
def generate(
    system_prompt,
    user_prompt,
    max_new_tokens=1000,
    do_sample=False,
    temperature=1.0,
    top_p=1.0
):
    """Send one system + user exchange through `pipeline` and return the reply.

    Args:
        system_prompt: Content of the "system" chat message.
        user_prompt: Content of the "user" chat message.
        max_new_tokens: Forwarded unchanged to the pipeline call.
        do_sample: Forwarded unchanged to the pipeline call.
        temperature: Forwarded unchanged to the pipeline call.
        top_p: Forwarded unchanged to the pipeline call.

    Returns:
        The first output's "generated_text" with the leading prompt
        characters sliced off.
    """
    tokenizer = pipeline.tokenizer

    # Render the two-message conversation to a plain string that ends with
    # the generation prompt.
    chat_text = tokenizer.apply_chat_template(
        [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        tokenize=False,
        add_generation_prompt=True,
    )

    # Generation may stop on the tokenizer's EOS token or on "<|eot_id|>".
    stop_token_ids = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]

    generation_kwargs = dict(
        max_new_tokens=max_new_tokens,
        eos_token_id=stop_token_ids,
        do_sample=do_sample,
        temperature=temperature,
        top_p=top_p,
        # Passing pad_token_id explicitly avoids a warning on every call.
        pad_token_id=tokenizer.eos_token_id,
    )
    completions = pipeline(chat_text, **generation_kwargs)

    # The generated text starts with the prompt itself; keep only what follows.
    full_text = completions[0]["generated_text"]
    return full_text[len(chat_text):]


## Basic Testing

In [70]:
# Smoke test with sampling switched on. No random seed is set in the visible
# cells, so the printed reply can differ between runs.
system_prompt = "You are a pirate chatbot who always responds in pirate speak!"
user_prompt = "Who are you?"

result = generate(
    system_prompt,
    user_prompt,
    max_new_tokens=256,
    do_sample=True,
    temperature=0.6,
    top_p=0.9
)
print(result)

Arrrr, shiver me timbers! Me name be Captain Chat, the scurviest pirate chatbot to ever sail the Seven Seas o' the Interwebs! I be here to swab yer decks with me witty banter and me treasure trove o' pirate knowledge. So hoist the colors, me hearty, and let's set sail fer a swashbucklin' good time!


In [71]:
# With generate's default do_sample=False, two identical calls are expected
# to return the same text; run it twice and compare.
result, result2 = (
    generate(system_prompt, user_prompt, max_new_tokens=256)
    for _ in range(2)
)

print(result == result2)

True


In [72]:
# Display the reply produced without sampling.
result

"Arrrr, me hearty! Me name be Captain Chat, the scurviest pirate chatbot to ever sail the Seven Seas o' the Interwebs! Me and me trusty crew o' code have been plunderin' the riches o' knowledge and bringin' 'em back to ye, me matey! So hoist the colors, me hearty, and let's set sail fer a swashbucklin' adventure o' learnin' and fun!"

## Real Usage

In [73]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import pickle
import re

# Expects this file and eval.py to be in the same folder:
from eval import DATA, evaluate, bulk_evaluate

# DATA (imported from eval.py) is indexed by dataset name:
#   "mbpp"             -> train, validation, and test
#   "openai_humaneval" -> test only
mbpp, humaneval = (DATA[name] for name in ("mbpp", "openai_humaneval"))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [74]:
# Folder where generations and evaluation results are written.
RESULTS_PATH = "./results/"

def save_pickle(object, to):
    """Pickle `object` to the file at `to`, creating missing parent folders.

    Args:
        object: Any picklable Python object.
        to: Destination file path; an existing file is overwritten.
    """
    # Without this, the first save raises FileNotFoundError when the target
    # folder has not been created yet.
    parent_dir = os.path.dirname(to)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(to, "wb") as f:
        pickle.dump(object, f)
    
def load_pickle(from_):
    """Return the object unpickled from the file at `from_`.

    NOTE: unpickling can execute arbitrary code, so only point this at files
    written by this notebook.
    """
    with open(from_, "rb") as handle:
        payload = handle.read()
    return pickle.loads(payload)


In [75]:
# System message used by `refactor` by default: asks for a refactored version
# of the user's Python code, returned as a single markdown code block.
SYSTEM_PROMPT = """
You are the world's best AI coding assistant. In particular, you are exceptionally skilled at refactoring Python programs to be readable, efficient, and maintainable.

For the interaction that follows, refactor the Python code provided by the user to be more readable, efficient, and maintainable using the following guidelines:
 - The given program is correct but needs improvement
 - DO NOT change the name of the program
 - DO NOT change the input or output behavior of the program (e.g. number of inputs / outputs, input / output types, etc.)
 - Put your response in a markdown code block
 - Respond with only the code block
 - Don't explain the changes made
 - If you use any packages (e.g. `os`, `re`, `sys`), don't forget to import them

Again, do not change the name of the function in any way!
""".strip()

# User message template: the code to refactor, wrapped in a python code fence.
# `{code}` is filled in with str.format.
USER_PROMPT = """
```python
{code}
```
""".strip()

def refactor(code, system_prompt=SYSTEM_PROMPT, **kwargs):
    """Ask the model to refactor `code` and return its raw reply.

    Args:
        code: Source text substituted into USER_PROMPT.
        system_prompt: System message to use; defaults to SYSTEM_PROMPT.
        **kwargs: Extra keyword arguments passed straight through to `generate`.

    Returns:
        Whatever `generate` returns for the assembled prompts.
    """
    user_prompt = USER_PROMPT.format(code=code)
    return generate(system_prompt=system_prompt, user_prompt=user_prompt, **kwargs)

def extract_from_code_block(raw_generation):
    """Return only the code from a model reply that wraps it in markdown fences.

    Every complete fenced block (an opening ``` line, optionally carrying a
    language tag, up to a closing ``` on its own line) is extracted and the
    blocks are joined with newlines, so any prose the model adds before,
    between, or after the fences is dropped, whatever the language tag is.

    If the reply contains no complete fenced block (no fences at all, or an
    unclosed fence from a truncated generation), fall back to deleting the
    fence markers and returning the rest.

    Args:
        raw_generation: The raw text returned by the model.

    Returns:
        The extracted code with surrounding whitespace stripped.
    """
    # Fences only count at the start of a line, so backticks that appear
    # mid-line inside the code itself are not mistaken for a fence.
    fenced_blocks = re.findall(
        r"^[ \t]*```[^\n`]*\n(.*?)^[ \t]*```[ \t]*$",
        raw_generation,
        flags=re.DOTALL | re.MULTILINE,
    )
    if fenced_blocks:
        return "\n".join(fenced_blocks).strip()

    # Fallback: strip the fence markers wherever they occur.
    return raw_generation.replace("```python", "").replace("```", "").strip()


### HumanEval

In [76]:
# One complete reference program per HumanEval test task: the task's prompt
# followed directly by its canonical solution.
canonical_solutions = [
    problem["prompt"] + problem["canonical_solution"]
    for problem in humaneval["test"]
]

In [77]:
# Refactor a single canonical solution and show the model's raw reply next to
# the code extracted from it.
i = 0
raw_generation = refactor(canonical_solutions[i])
generation = extract_from_code_block(raw_generation)

print(
    "RAW GENERATION:",
    raw_generation,
    "-" * 80,
    "GENERATION:",
    generation,
    sep="\n",
)

RAW GENERATION:
```
from typing import List

def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """
    for elem1 in numbers:
        for elem2 in numbers:
            if elem1!= elem2 and abs(elem1 - elem2) < threshold:
                return True

    return False
```
--------------------------------------------------------------------------------
GENERATION:
from typing import List

def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """
    for elem1 in numbe

In [12]:
# Print the original program at index `i` for comparison with its refactoring.
# NOTE(review): this cell's execution count is older than its neighbours', so
# the saved output may come from an earlier session.
print(canonical_solutions[i])

from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """
    for idx, elem in enumerate(numbers):
        for idx2, elem2 in enumerate(numbers):
            if idx != idx2:
                distance = abs(elem - elem2)
                if distance < threshold:
                    return True

    return False



In [78]:
# WARNING: Running this cell will delete all your generations. Be sure you want to do this!
# The generation loop continues from len(all_generations), so skip this cell
# when resuming an interrupted run.
all_generations = []

In [79]:
# WARNING: Slow step.
# Generates a refactoring for every canonical solution that is not yet in
# `all_generations`, so re-running this cell after an interruption continues
# from where the previous run stopped.
in_progress_path = RESULTS_PATH + "openai_humaneval_test_llama3_0_shot_in_progress_generations.pkl"
save_every = 10

i = len(all_generations)
while i < len(canonical_solutions):
    print(i)
    raw_generation = refactor(canonical_solutions[i])
    generation = extract_from_code_block(raw_generation)
    all_generations.append(generation)
    i += 1

    # Save to pickle every `save_every` generations just in case, and once
    # more after the last one so the file also contains the final partial
    # batch (previously those trailing generations were never saved).
    if i % save_every == 0 or i == len(canonical_solutions):
        save_pickle(all_generations, in_progress_path)


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163


In [80]:
# Lists handed to bulk_evaluate under the `*_backup` keyword arguments.
# NOTE(review): presumably bulk_evaluate fills these in as jobs are sent and
# fetched so partial progress survives a failure — confirm in eval.py.
execution_ids_backup = []
execution_results_backup = []

# Evaluate all generations against the HumanEval test split.
results = bulk_evaluate(
    dataset="openai_humaneval",
    split="test",
    code=all_generations,
    execution_ids_backup=execution_ids_backup,
    execution_results_backup=execution_results_backup
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Sending 164 jobs for execution...
164 jobs sent


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Fetching 164 job results...
Fetching 1/164...
Fetching 2/164...
Fetching 3/164...
Fetching 4/164...
Fetching 5/164...
Fetching 6/164...
Fetching 7/164...
Fetching 8/164...
Fetching 9/164...
Fetching 10/164...
Fetching 11/164...
Fetching 12/164...
Fetching 13/164...
Fetching 14/164...
Fetching 15/164...
Fetching 16/164...
Fetching 17/164...
Fetching 18/164...
Fetching 19/164...
Fetching 20/164...
Fetching 21/164...
Fetching 22/164...
Fetching 23/164...
Fetching 24/164...
Fetching 25/164...
Fetching 26/164...
Fetching 27/164...
Fetching 28/164...
Fetching 29/164...
Fetching 30/164...
Fetching 31/164...
Fetching 32/164...
Fetching 33/164...
Fetching 34/164...
Fetching 35/164...
Fetching 36/164...
Fetching 37/164...
Fetching 38/164...
Fetching 39/164...
Fetching 40/164...
Fetching 41/164...
Fetching 42/164...
Fetching 43/164...
Fetching 44/164...
Fetching 45/164...
Fetching 46/164...
Fetching 47/164...
Fetching 48/164...
Fetching 49/164...
Fetching 50/164...
Fetching 51/164...
Fetching 52/

In [81]:
# Tabulate the evaluation results, then add the model label and the evaluated
# code as the columns at positions 3 and 4.
model_label = f"llama3_0_shot_finetuned_smaller_LR_{epoch_num}_epoch"

df = pd.DataFrame(results)
df.insert(loc=3, column="model", value=model_label)
df.insert(loc=4, column="code", value=all_generations)
df

Unnamed: 0,dataset,split,task_id,model,code,result,compiled,passed_tests,avg_test_time,loc,...,N2,vocabulary,length,calculated_length,volume,difficulty,effort,time,bugs,MI
0,openai_humaneval,test,0,llama3_0_shot_finetuned_smaller_LR_1_epoch,from typing import List\n\ndef has_close_eleme...,failed:,True,False,,16.0,...,8.0,10.0,12.0,23.509775,39.863137,2.666667,106.301699,5.905650,0.013288,94.062602
1,openai_humaneval,test,1,llama3_0_shot_finetuned_smaller_LR_1_epoch,from typing import List\n\ndef separate_paren_...,passed,True,True,0.000088,29.0,...,12.0,10.0,18.0,24.406372,59.794706,2.571429,153.757815,8.542101,0.019932,86.970176
2,openai_humaneval,test,2,llama3_0_shot_finetuned_smaller_LR_1_epoch,def truncate_number(number: float) -> float:\n...,passed,True,True,0.000063,10.0,...,2.0,3.0,3.0,2.000000,4.754888,0.500000,2.377444,0.132080,0.001585,65.110353
3,openai_humaneval,test,3,llama3_0_shot_finetuned_smaller_LR_1_epoch,from typing import List\n\ndef below_zero(oper...,passed,True,True,0.000132,19.0,...,4.0,5.0,6.0,6.754888,13.931569,1.333333,18.575425,1.031968,0.004644,96.412688
4,openai_humaneval,test,4,llama3_0_shot_finetuned_smaller_LR_1_epoch,from typing import List\n\ndef mean_absolute_d...,passed,True,True,0.000095,14.0,...,6.0,8.0,9.0,17.509775,27.000000,1.000000,27.000000,1.500000,0.009000,87.277819
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159,openai_humaneval,test,159,llama3_0_shot_finetuned_smaller_LR_1_epoch,"def eat(number, need, remaining):\n """"""\n ...",passed,True,True,0.000123,36.0,...,8.0,6.0,12.0,9.509775,31.019550,4.000000,124.078200,6.893233,0.010340,55.822181
160,openai_humaneval,test,160,llama3_0_shot_finetuned_smaller_LR_1_epoch,"def do_algebra(operator, operand):\n """"""\n ...",passed,True,True,0.000117,29.0,...,12.0,9.0,18.0,24.000000,57.058650,0.750000,42.793988,2.377444,0.019020,45.813793
161,openai_humaneval,test,161,llama3_0_shot_finetuned_smaller_LR_1_epoch,"def solve(s):\n """"""You are given a string s...",failed:,True,False,,18.0,...,1.0,2.0,2.0,0.000000,2.000000,0.500000,1.000000,0.055556,0.000667,96.882486
162,openai_humaneval,test,162,llama3_0_shot_finetuned_smaller_LR_1_epoch,"def string_to_md5(text):\n """"""\n Given a...",passed,True,True,0.000061,11.0,...,1.0,2.0,2.0,0.000000,2.000000,0.500000,1.000000,0.055556,0.000667,100.000000


In [82]:
# Values recorded from earlier runs. `@k` appears to denote the checkpoint
# after k epochs (the @1 row matches this run, where epoch_num = 1):
# @1 = (0.9939024390243902, 0.6463414634146342)
# @2 = (0.9939024390243902, 0.6707317073170732)
# @3 = (0.9878048780487805, 0.6036585365853658)
# @4 = (0.9939024390243902, 0.5792682926829268)
# @5 = (0.9878048780487805, 0.5609756097560976)

# Column means of "compiled" and "passed_tests", in that order.
tuple(df[column].mean() for column in ("compiled", "passed_tests"))

(0.9939024390243902, 0.6463414634146342)

In [59]:
# NOTE(review): duplicate of the summary cell directly above, with an older
# execution count; its saved output equals the value recorded there as "@2",
# so it appears to be left over from a different checkpoint's run.
df["compiled"].mean(), df["passed_tests"].mean()

(0.9939024390243902, 0.6707317073170732)

In [83]:
# Finally, when everything looks good, save the data. This will overwrite any
# previously saved data at this location (the file name depends on epoch_num).
# Be sure you want to do this!
df.to_csv(RESULTS_PATH + f"openai_humaneval_llama3_0_shot_finetuned_smaller_LR_{epoch_num}_epoch.csv")

In [19]:
# Show only the tasks whose tests did not pass.
# NOTE(review): this cell has an older execution count and its saved output
# lists a "..._5_epoch" model, so it was produced by a different run than the
# cells above.
df.query("not passed_tests")

Unnamed: 0,dataset,split,task_id,model,code,result,compiled,passed_tests,avg_test_time,loc,...,N2,vocabulary,length,calculated_length,volume,difficulty,effort,time,bugs,MI
0,openai_humaneval,test,0,llama3_0_shot_finetuned_smaller_LR_5_epoch,from typing import List\n\ndef has_close_eleme...,failed:,True,False,,16.0,...,8.0,10.0,12.0,23.509775,39.863137,2.666667,106.301699,5.905650,0.013288,94.062602
1,openai_humaneval,test,1,llama3_0_shot_finetuned_smaller_LR_5_epoch,,failed: name 'separate_paren_groups' is not de...,True,False,,,...,,,,,,,,,,
6,openai_humaneval,test,6,llama3_0_shot_finetuned_smaller_LR_5_epoch,,failed: name 'parse_nested_parens' is not defined,True,False,,,...,,,,,,,,,,
8,openai_humaneval,test,8,llama3_0_shot_finetuned_smaller_LR_5_epoch,"from typing import List, Tuple\n\ndef sum_prod...","failed: invalid syntax (<string>, line 11)",False,False,,,...,,,,,,,,,,
9,openai_humaneval,test,9,llama3_0_shot_finetuned_smaller_LR_5_epoch,from typing import List\n\ndef rolling_max(num...,failed: list index out of range,True,False,,16.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,100.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148,openai_humaneval,test,148,llama3_0_shot_finetuned_smaller_LR_5_epoch,"def bf(planet1, planet2):\n '''\n There ...",failed: First test error: 3,True,False,,25.0,...,19.0,14.0,28.0,40.138965,106.605938,5.277778,562.642450,31.257914,0.035535,78.745224
152,openai_humaneval,test,152,llama3_0_shot_finetuned_smaller_LR_5_epoch,,failed: name 'compare' is not defined,True,False,,,...,,,,,,,,,,
155,openai_humaneval,test,155,llama3_0_shot_finetuned_smaller_LR_5_epoch,"def even_odd_count(num):\n """"""Given an inte...","failed: '(' was never closed (<string>, line 9)",False,False,,,...,,,,,,,,,,
158,openai_humaneval,test,158,llama3_0_shot_finetuned_smaller_LR_5_epoch,"def find_max(words):\n """"""Write a function ...",failed: t2,True,False,,11.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,100.000000
