In [1]:
# Imports
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from huggingface_hub import login
import torch

# Bits and bytes quantizes CodeLlama. My testing saw a decrease of computation speed by a factor of 15x.
model_name = "codellama/CodeLlama-7b-Instruct-hf"
quantization_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_compute_dtype=torch.float16
)

# Login if Necessary
# hf_token = ""
# with open("hugging_face_token.txt","r") as f:
#     hf_token = f.readline()
# login(hf_token)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Tokenizer. An LLM model has some process of tokenizing text into numbers and this pretrained tokenizer needs to be imported.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [3]:
# Importing the model. The LLM model is downloaded and quantized with our config. This can take up to 10 minutes to download (model can be several gigabytes) 
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto",
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.33s/it]


In [8]:
# Prompt Engineering. Format the prompt such that the LLM will process it and give us a desired output. Selective prompts can decrease computation time. 
sys = "Write python code to solve the following coding problem that obeys the constraints. Please wrap your code answer using ``` and do not include explanations. "
user = "Write a function that prints numbers from 1-10"
prompt = f"<s> <<SYS>>\\n{sys}\\n<</SYS>>\\n\\n[INST]{user}[/INST]"

# Encode prompt
inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to("cuda")

# Generate output. Model parameters can be found: https://huggingface.co/docs/transformers/en/main_classes/text_generation
output = model.generate(
    inputs["input_ids"],
    do_sample=True,
    top_p=0.9,
    temperature=0.1,
)

# Decode and print output
output = output[0].to("cpu")
print(tokenizer.decode(output))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s>  <<SYS>>\nWrite python code to solve the following coding problem that obeys the constraints. Please wrap your code answer using ``` and do not include explanations. \n<</SYS>>\n\n[INST]Write a function that prints numbers from 1-10[/INST]  ```
def print_numbers(n):
    for i in range(1, n+1):
        print(i)
```</s>
