In [1]:
!export "CUDA_VISIBLE_DEVICES"=[1,2] jupyter notebook

import os
import torch

PROJECT_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
CACHE_DIR = os.path.join(PROJECT_DIR, '.cache')
RESPONSES_DIR = os.path.join(PROJECT_DIR, 'responses')

schemas_path = os.path.join(PROJECT_DIR, 'schemas/output-parsing_schemas.json')
template_path = os.path.join(PROJECT_DIR, 'prompt_templates/output-parsing_templates.json')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [2]:
# import codellama functions

import sys
sys.path.append("../utils/")

import codellama


In [3]:
# helper functions in freetext_to_format.py
import json

# set N
def load_freetext_responses(filename:str, full_run:bool=False):
        ''' 
        Load responses from jsonl file
        Format: {id: idx, uuid: uuid, response: freetext response}, {...}
        '''
        filepath = os.path.join(RESPONSES_DIR, filename)
        N = 5
        with open(filepath, "r") as f:
            responses = [json.loads(line) for line in f.readlines()]
            if not full_run:
                responses = responses[:N]
        
        return responses
    
def load_template(template_version:str):
        template_filepath = os.path.join(
              PROJECT_DIR,
              "prompt_templates/output-parsing_templates.json"
            )
        with open(template_filepath, "r") as f:
            templates = json.load(f)
            template = templates[template_version]
        
        return template

In [4]:
from langchain import PromptTemplate
import output

responses = load_freetext_responses("freetext_turbo_700dev_14081857.jsonl")

template = PromptTemplate.from_template(load_template("v3"))
    
output_parser, format_instructions = output.build_parser(
    schemas_version="v3",
    parser_type="structured",
    only_json=False
)

In [5]:
model, tokenizer = codellama.load_codellama()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loaded model on device: cuda | cuda:0


In [6]:
sys_prompt = template.template.split("\n\n")[0] + "\n\n" + format_instructions
# if examples, add to sys_prompt

user_message = responses[1]['text']

import prompting

llama_prompt = prompting.get_llama_prompt(
    sys_prompt=sys_prompt,
    user_message=user_message
)

print(llama_prompt)


<s>[INST] <<SYS>>
Transform the free text into structured json, following the instructions and schema provided. Do not add information that is not from the original text. The text is the response to a multiple choice question, which includes the reasoning before choosing a final answer. Sometimes the deliberation does not lead to an answer being chosen as the final answer.

The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":

```json
{
	"answer_letter": string  // The letter corresponding to the final answer. For example, if the text mentions 'Answer G) penguing is the most likely', then this field is 'G'. If multiple final answers are given, such as 'G' and 'U', then this field is 'G,U'. If a final answer is not chosen, then this field is an empty string.
	"answer_text": string  // The word or short phrase corresponding to the final answer. For example, if the text mentions 'Answer G) feeding penguings

In [7]:
inputs = tokenizer(llama_prompt, return_tensors="pt").to(device)

output = model.generate(
    input_ids=inputs['input_ids'],
    attention_mask=inputs['attention_mask'],
    temperature=0.1,
    max_new_tokens=50,
    do_sample=False,
    top_p=0.5,
)
output = output[0].to('cpu')
output_decoded_llama = tokenizer.decode(output[inputs['input_ids'].shape[1]:], skip_special_tokens=True)

print(output_decoded_llama)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


 ```json
{
	"answer_letter": "A,B",
	"answer_text": "completing the job, learning from each other"
}
```


In [8]:
for response in responses:
    #sys prompt stays the same
    user_message = response['text']
    llama_prompt = prompting.get_llama_prompt(
        sys_prompt=sys_prompt,
        user_message=user_message
    )
    inputs = tokenizer(
        llama_prompt,
        return_tensors="pt"
    ).to(device)
    output = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        temperature=0.1,
        max_new_tokens=50,
        do_sample=False,
        top_p=0.5,
    )
    output = output[0].to('cpu')
    output_decoded = tokenizer.decode(
        output[inputs['input_ids'].shape[1]:],
        skip_special_tokens=True
    )
    print(output_decoded)
    
    
    output_parsed = output_parser.parse(output_decoded)
    print(output_parsed)


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


 ```json
{
	"answer_letter": "A",
	"answer_text": "bank"
}
```
{'answer_letter': 'A', 'answer_text': 'bank'}


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


 ```json
{
	"answer_letter": "A,B",
	"answer_text": "completing the job, learning from each other"
}
```
{'answer_letter': 'A,B', 'answer_text': 'completing the job, learning from each other'}


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


 {
	"answer_letter": "B",
	"answer_text": "bookstore"
}
{'answer_letter': 'B', 'answer_text': 'bookstore'}


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


 ```json
{
	"answer_letter": "A",
	"answer_text": "fast food restaurant"
}
```
{'answer_letter': 'A', 'answer_text': 'fast food restaurant'}
 ```json
{
	"answer_letter": "A,E",
	"answer_text": "midwest, Illinois"
}
```
{'answer_letter': 'A,E', 'answer_text': 'midwest, Illinois'}


In [9]:
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

1

In [10]:
# !! batch generate() triggers CUDA error which kills the kernel
batch_inputs = tokenizer(
    [prompting.get_llama_prompt(
        sys_prompt=sys_prompt,
        user_message=response['text']
    ) for response in responses[:2]],
    return_tensors="pt",
    padding='longest',
    truncation=True,
    max_length=2048
).to(device)

In [11]:
batch_inputs['input_ids'].shape

torch.Size([2, 504])