# Output Parsing with Code Llama

In [1]:
!export "CUDA_VISIBLE_DEVICES"=[1,2] jupyter notebook
!export "CUDA_VISIBLE_DEVICES"=[1,2]

In [2]:
import os

PROJECT_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
CACHE_DIR = os.path.join(PROJECT_DIR, '.cache')
RESPONSES_DIR = os.path.join(PROJECT_DIR, 'responses')

schemas_path = os.path.join(PROJECT_DIR, 'schemas/output-parsing_schemas.json')
template_path = os.path.join(PROJECT_DIR, 'prompt_templates/output-parsing_templates.json')
responses_path = os.path.join(RESPONSES_DIR, 'freetext_turbo0301_700dev_14081857.json')

In [3]:
import torch

device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=1)

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_id = "codellama/CodeLlama-7b-Instruct-hf" # 7b, 13b, 34b
quantization_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_compute_dtype=torch.float16
)

tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=CACHE_DIR)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    #device_map='auto',
    cache_dir=CACHE_DIR,
)

model

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32016, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm

In [5]:
import sys
sys.path.append("../utils/")

from prompting import get_llama_prompt
from output import build_parser
from langchain import PromptTemplate
import json

template_str = json.load(open(template_path, "r"))['v3']
template = PromptTemplate.from_template(template_str)

output_parser, format_instructions = build_parser(schemas_version="v3",
                                                  parser_type="structured",
                                                  only_json=False)

In [6]:
responses = json.load(open(responses_path, "r"))
responses = {int(idx): response.strip('\n') for idx, response in responses.items()}

In [7]:
format_instructions

'The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":\n\n```json\n{\n\t"answer_letter": string  // The letter corresponding to the final answer. For example, if the text mentions \'Answer G) penguing is the most likely\', then this field is \'G\'. If multiple final answers are given, such as \'G\' and \'U\', then this field is \'G,U\'. If a final answer is not chosen, then this field is an empty string.\n\t"answer_text": string  // The word or short phrase corresponding to the final answer. For example, if the text mentions \'Answer G) feeding penguings is the most likely\', then this field is \'feeding penguins\'. In case of multiple final answers, they are separate by a comma. If the answer_letter field is empty, then this field is an empty string.\n}\n```'

In [8]:
import pprint as pp

prompt = template.format(freetext_response=responses[2],
                         format_instructions=format_instructions)
pp.pprint(prompt, compact=True)

('Transform the free text into structured json, following the instructions and '
 'schema provided. Do not add information that is not from the original text. '
 'The text is the response to a multiple choice question, which includes the '
 'reasoning before choosing a final answer. Sometimes the deliberation does '
 'not lead to an answer being chosen as the final answer.\n'
 '\n'
 'Text: First, we need to identify what type of printed works are being '
 'referred to. The question mentions magazines specifically, but also mentions '
 '"many other printed works." This suggests that we are looking for a place '
 'where a variety of types of printed materials are available.\n'
 '\n'
 "Option A, a doctor's office, may have some magazines but is unlikely to have "
 'a wide variety of printed works.\n'
 '\n'
 'Option C, a market, may have some magazines available for purchase but is '
 'also unlikely to have a wide variety of printed works.\n'
 '\n'
 'Option E, a mortuary, is an unlikely pl

In [9]:
format_prompt = template_str.split('\n\n')[0]
sys_prompt = f"{format_prompt}\n\n{format_instructions}\n"

llama_prompt = get_llama_prompt(sys_prompt, responses[2])
pp.pprint(llama_prompt, compact=True)

('<s>[INST] <<SYS>>\n'
 'Transform the free text into structured json, following the instructions and '
 'schema provided. Do not add information that is not from the original text. '
 'The text is the response to a multiple choice question, which includes the '
 'reasoning before choosing a final answer. Sometimes the deliberation does '
 'not lead to an answer being chosen as the final answer.\n'
 '\n'
 'The output should be a markdown code snippet formatted in the following '
 'schema, including the leading and trailing "```json" and "```":\n'
 '\n'
 '```json\n'
 '{\n'
 '\t"answer_letter": string  // The letter corresponding to the final answer. '
 "For example, if the text mentions 'Answer G) penguing is the most likely', "
 "then this field is 'G'. If multiple final answers are given, such as 'G' and "
 "'U', then this field is 'G,U'. If a final answer is not chosen, then this "
 'field is an empty string.\n'
 '\t"answer_text": string  // The word or short phrase corresponding to 

In [10]:
'''Llama prompt'''

inputs = tokenizer(llama_prompt, return_tensors="pt").to(device)

output = model.generate(
    input_ids=inputs['input_ids'],
    attention_mask=inputs['attention_mask'],
    temperature=0.1,
    max_new_tokens=50,
    do_sample=False,
    top_p=0.5,
)
output = output[0].to('cpu')
output_decoded_llama = tokenizer.decode(output[inputs['input_ids'].shape[1]:])

print(output_decoded_llama)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


 {
	"answer_letter": "B",
	"answer_text": "bookstore"
}</s>


In [11]:
'''Normal prompt'''

inputs = tokenizer(prompt, return_tensors="pt").to(device)

output = model.generate(
    input_ids=inputs['input_ids'],
    attention_mask=inputs['attention_mask'],
    temperature=0.1,
    max_new_tokens=50,
    do_sample=False,
    top_p=0.5,
)
output = output[0].to('cpu')
output_decoded = tokenizer.decode(output[inputs['input_ids'].shape[1]:])

# return gen only (no prompt)
# tokenizer.decode(output[inputs['input_ids'].shape[1]:])
# tokenizer.batch_decode(gen_tokens[:, input_ids.shape[1]:])[0]

print(output_decoded)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.




For example, the output for the given text would be:

```json
{
	"answer_letter": "B",
	"answer_text": "bookstore"
}
```

### 


In [12]:
# langchain output parser cannot parse output from the llama prompt (not formatted as expected with ```json..)

formatted_response = output_parser.parse(output_decoded) # dict
formatted_response

{'answer_letter': 'B', 'answer_text': 'bookstore'}

## Adding examples (few-shot)

In [86]:
raw_examples = {
    "0": {
        "answer_letter": "A",
        "answer_text": "bank",
        "full_text": "Revolving doors are convenient for two-direction travel because they allow people to enter and exit a building without having to wait for another person to open or close a regular door. However, they can also serve as a security measure because they limit the number of people who can enter or exit at one time and can prevent unauthorized access. \n\nOut of the given options, a revolving door would most likely be used as a security measure at a bank, where it is important to control access to the building and protect the safety of employees and customers. Therefore, the correct answer is A) bank."
    },
    "1": {
        "answer_letter": "A,B",
        "answer_text": "completing the job, learning from each other",
        "full_text": "A) Completing the job is one aim that people have at work. This means finishing the tasks assigned to them within the given time frame and meeting the expectations of the employer.\n\nB) Learning from each other is another aim that people have at work. This means interacting with colleagues and gaining new skills and knowledge that can help in personal and professional growth.\n\nC) Killing animals is not an aim that people have at work unless they work in specific industries like hunting or meat processing.\n\nD) Wearing hats is not an aim that people have at work unless it is a part of their uniform or dress code.\n\nE) Talking to each other is a common activity that happens at work, but it may not necessarily be an aim for everyone. Some people prefer to work independently and limit their interactions with others. \n\nTherefore, the most common aims that people have at work are completing the job and learning from each other."
    },
    "3": {
        "answer_letter": "A",
        "answer_text": "fast food restaurant",
        "full_text": "- A fast food restaurant is a common place to find hamburgers, so A is a possible answer.\n- Pizza is not related to hamburgers, so B is not a valid answer.\n- C is technically correct, as hamburgers are made from ground beef, but it is not a common or appropriate answer option.\n- D is not a location, but rather where a person eats a hamburger, so it is not a valid answer option.\n- E is not a valid answer, as it is not a location where hamburgers are typically found.\n\nTherefore, the most likely answer is A) fast food restaurant."
    },
    "4": {
        "answer_letter": "A,E",
        "answer_text": "the midwest, Illinois",
        "full_text": "First, James is looking for farmland, which suggests that he wants to buy a piece of land for agricultural purposes. \n\nOption A, the midwest, is a popular region for farming in the United States, with states like Iowa, Illinois, and Indiana known for their fertile soil and agricultural production. So, A could be a good option for James to consider. \n\nOption B, the countryside, is a more general term and could refer to any rural area outside of urban centers. While there may be farmland in the countryside, it is not a specific location for finding farmland. \n\nOption C, an estate, typically refers to a large property or mansion, which may or may not include farmland. So, this option is not necessarily relevant to James's search. \n\nOption D, farming areas, is a more general term similar to the countryside, and may or may not lead James to specific farmland for sale. \n\nOption E, Illinois, is a specific state in the midwest region mentioned in option A. So, E is a viable option for James to consider if he wants to focus on a particular state.\n\nOverall, the best answers for James to find farmland would be A, the midwest, or E, a specific state like Illinois in the midwest region."
    },
    "6": {
        "answer_letter": "B",
        "answer_text": "Mexico",
        "full_text": "The question is asking for a Spanish-speaking North American country where you can get a great cup of coffee. \n\nOption A, Mildred's coffee shop, is not a country but a specific coffee shop. \n\nOption C, diner, and option E, canteen, are not countries but types of establishments where you could potentially get coffee. \n\nOption D, kitchen, is neither a country nor a type of establishment where you would typically get coffee. \n\nThat leaves us with option B, Mexico, as the correct answer. Mexico is a Spanish-speaking country located in North America, and it is well-known for its coffee production, particularly in the southern states of Chiapas and Oaxaca. So, if you're looking for a great cup of coffee in a Spanish-speaking North American country, Mexico is a great choice."
    },
    "9": {
        "answer_letter": "E",
        "answer_text": "making music",
        "full_text": "First of all, people typically play guitar to make music. So, we can eliminate options A, B, and D. \n\nWhile playing guitar, people may also sing along to the music they are playing, so option C is a valid choice. \n\nTherefore, the answer is E) making music, with the possibility of also singing."
    },
    "14": {
        "answer_letter": "",
        "answer_text": "",
        "full_text": "Firstly, it is important to note that drinking alcohol can impair judgment and decision-making abilities, so it is not recommended to drink excessively or to engage in any potentially dangerous activities while under the influence.\n\nAssuming the person is drinking responsibly, here are some options to stay busy:\n\n- Have a conversation with friends or family members who are also drinking (option A: reach tentative agreement).\n- Relax and unwind by reading a book, watching a movie, or listening to music in bed (option B: stay in bed).\n- Take a break from drinking and go for a bike ride (option C: stop bicycle).\n- Engage in a hobby or activity that doesn't require a lot of focus or coordination, such as coloring or organizing a closet (option D: examine thing).\n\nThe option of suicide (option E) is not only inappropriate but also a serious concern. If anyone is struggling with thoughts of self-harm, it is important to seek immediate help and support from a mental health professional or a crisis hotline."
    },
    "17": {
        "answer_letter": "",
        "answer_text": "",
        "full_text": "Glue sticks are commonly used for arts and crafts, as well as for bonding materials in various projects. Since adults may engage in such activities both at work and at home, it is possible that they use glue sticks in different settings. \n\nOption A mentions a classroom, indicating that adults may be teachers or educators who use glue sticks during their lessons. Option C also refers to school, which may include other settings such as a laboratory or a workshop where glue sticks can be used. \n\nOption D mentions an office, where adults may use glue sticks for various purposes such as paper crafts, scrapbooking, or labeling. Option E mentions a kitchen drawer, which may be relevant for adults who use glue sticks for DIY projects, repairs, or decorations at home.\n\nTherefore, the correct answer is: all of the above options could be correct, depending on the context and the specific adult's activities."
    },
    "25": {
        "answer_letter": "D",
        "answer_text": "lie",
        "full_text": "Perjury is defined as lying under oath in a court of law. So, the answer is D) lie. However, it is also considered a serious crime and can lead to consequences such as fines and imprisonment."
    },
    "28": {
        "answer_letter": "E",
        "answer_text": "slow down",
        "full_text": "The light turned yellow, which means it is about to turn red. Sean was in a rush, so he didn't want to stop. However, it is not safe to continue driving when the light turns red. Therefore, Sean was forced to slow down (E) and stop before the light turned red."
    },
    "34": {
        "answer_letter": "B,C,D",
        "answer_text": "front door, doorway, entrance porch",
        "full_text": "A doormat is a mat placed at the entrance of a building to prevent dirt or debris from being tracked inside. Therefore, it is most likely to be found in front of the front door, doorway, or entrance porch, where people enter and exit the building. So, the correct answer is either B) front door, C) doorway, or D) entrance porch."
    },
}

print(f"Number of examples: {len(raw_examples)}")

examples = []
for idx, response_dict in raw_examples.items():
    json_str = json.dumps({
        "answer_letter": response_dict['answer_letter'],
        "answer_text": response_dict['answer_text']
        }
    )
    examples.append(
        (response_dict['full_text'],
         json_str)
    )
    
examples

Number of examples: 11


[('Revolving doors are convenient for two-direction travel because they allow people to enter and exit a building without having to wait for another person to open or close a regular door. However, they can also serve as a security measure because they limit the number of people who can enter or exit at one time and can prevent unauthorized access. \n\nOut of the given options, a revolving door would most likely be used as a security measure at a bank, where it is important to control access to the building and protect the safety of employees and customers. Therefore, the correct answer is A) bank.',
  '{"answer_letter": "A", "answer_text": "bank"}'),
 ('A) Completing the job is one aim that people have at work. This means finishing the tasks assigned to them within the given time frame and meeting the expectations of the employer.\n\nB) Learning from each other is another aim that people have at work. This means interacting with colleagues and gaining new skills and knowledge that can

In [87]:
sys_prompt_examples = f"""{sys_prompt}\n
Examples:
"""

ex_num = 0
for ex in examples:
    text = ex[0].replace('\n', ' ')
    output = ex[1]
    sys_prompt_examples+=f"{text}\n"
    sys_prompt_examples+=f"\n```json\n{output}\n```\n\n"
    ex_num+=1
    if ex_num == 3:
        break

print(sys_prompt_examples)

Transform the free text into structured json, following the instructions and schema provided. Do not add information that is not from the original text. The text is the response to a multiple choice question, which includes the reasoning before choosing a final answer. Sometimes the deliberation does not lead to an answer being chosen as the final answer.

The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":

```json
{
	"answer_letter": string  // The letter corresponding to the final answer. For example, if the text mentions 'Answer G) penguing is the most likely', then this field is 'G'. If multiple final answers are given, such as 'G' and 'U', then this field is 'G,U'. If a final answer is not chosen, then this field is an empty string.
	"answer_text": string  // The word or short phrase corresponding to the final answer. For example, if the text mentions 'Answer G) feeding penguings is the most likel

In [88]:
llama_prompt_examples = get_llama_prompt(sys_prompt_examples, f"Input: {responses[66]}")
print(llama_prompt_examples)

<s>[INST] <<SYS>>
Transform the free text into structured json, following the instructions and schema provided. Do not add information that is not from the original text. The text is the response to a multiple choice question, which includes the reasoning before choosing a final answer. Sometimes the deliberation does not lead to an answer being chosen as the final answer.

The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":

```json
{
	"answer_letter": string  // The letter corresponding to the final answer. For example, if the text mentions 'Answer G) penguing is the most likely', then this field is 'G'. If multiple final answers are given, such as 'G' and 'U', then this field is 'G,U'. If a final answer is not chosen, then this field is an empty string.
	"answer_text": string  // The word or short phrase corresponding to the final answer. For example, if the text mentions 'Answer G) feeding penguings

## Batch calls

In [89]:
sys_prompt_examples_tokenized = tokenizer(f"{sys_prompt_examples}\n", return_tensors="pt").to(device)

inputs_tokenized = []
responses_to_use = [i for i in range(50,54)]

for idx, response in responses.items():
    if idx in responses_to_use:
        inputs_tokenized.append(
            tokenizer(response, return_tensors="pt").to(device)
        )
        
len(inputs_tokenized)

4

In [90]:
# sys_prompt_examples_tokenized is the prefix for all prompts
# inputs_tokenized contains the input_ids for each response

# prepend sys_prompt_examples_tokenized to each input_ids in inputs_tokenized

# list of tensors (one for each response to process)
combined_input_ids = [torch.cat(
    [sys_prompt_examples_tokenized['input_ids'], 
        input_tokenized['input_ids']],
        dim=1
    ) for input_tokenized in inputs_tokenized]

combined_attention_mask = [torch.cat(
    [sys_prompt_examples_tokenized['attention_mask'],
        input_tokenized['attention_mask']],
        dim=1
    ) for input_tokenized in inputs_tokenized]

In [91]:
len(combined_input_ids), len(combined_attention_mask)==len(combined_input_ids)

(4, True)

In [92]:
from transformers.tokenization_utils_base import BatchEncoding

max_len = max([x.squeeze().numel() for x in combined_input_ids])

# pad all tensors to have same length
padded_input_ids = [torch.nn.functional.pad(
    tensor, 
    pad=(0, max_len - tensor.numel()), 
    mode='constant', 
    value=0
) for tensor in combined_input_ids]

padded_attention_mask = [torch.nn.functional.pad(
    tensor,
    pad=(0, max_len - tensor.numel()),
    mode='constant',
    value=0
) for tensor in combined_attention_mask]

# make BatchEncoding object with keys 'input_ids' and 'attention_mask'
# where 'input_ids' is a tensor of shape (num_responses, num_input_ids)
# and 'attention_mask' is a tensor of shape (num_responses, num_attention_mask)
# avoid inputs_encoded['inputs_ids'] having 3 dimensions (only 2: num_responses, num_input_ids)
inputs_encoded = BatchEncoding(
    {'input_ids': torch.stack(padded_input_ids),
        'attention_mask': torch.stack(padded_attention_mask)}
)

# pop the second dimension 
# inputs_encoded['input_ids'] must have shape (num_responses, num_input_ids)
inputs_encoded['input_ids'] = inputs_encoded['input_ids'].squeeze(1)
inputs_encoded['attention_mask'] = inputs_encoded['attention_mask'].squeeze(1)

inputs_encoded['input_ids'].shape, inputs_encoded['attention_mask'].shape


(torch.Size([4, 1049]), torch.Size([4, 1049]))

In [93]:
type(inputs_encoded), inputs_encoded.keys()

(transformers.tokenization_utils_base.BatchEncoding,
 dict_keys(['input_ids', 'attention_mask']))

In [94]:
with torch.no_grad():
    output_batch = model.generate(
        **inputs_encoded, 
        max_new_tokens=50, 
        temperature=0.1, 
        do_sample=False, 
        top_p=0.5,
    )

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [95]:
output_batch.shape

torch.Size([4, 1099])

In [96]:
output_batch_decoded = tokenizer.batch_decode(output_batch[:, inputs_encoded['input_ids'].shape[1]:], skip_special_tokens=True)
output_batch_decoded

['.\n\n```json\n{"answer_letter": "B", "answer_text": "countryside"}\n```\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n',
 '.\n\n```json\n{"answer_letter": "C,D", "answer_text": "cinema, friend\'s house"}\n```\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n',
 '.\n\n```json\n{"answer_letter": "C", "answer_text": "sandy"}\n```\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n',
 '\n\n```json\n{"answer_letter": "A", "answer_text": "it depends on the individual and their specific curiosity"}\n```\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n']

In [99]:
output_parser.parse(output_batch_decoded[0])

{'answer_letter': 'B', 'answer_text': 'countryside'}