In [1]:
import os
import torch
import json

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

device = 'cuda:0'
model_id = 'mistralai/Mistral-7B-Instruct-v0.2'
tokenizer = AutoTokenizer.from_pretrained(model_id)

### Load test set and store domains, inputs and corresponding outputs

In [2]:
with open('shot_set_simple_domain.jsonl', 'r') as infile:
    shot_data = json.load(infile)

domains, inputs, outputs = [], [], []

for i in range(len(shot_data['shots'])):
    domains.append(shot_data['shots'][i]['domain'])
    inputs.append(shot_data['shots'][i]['input'])
    outputs.append(shot_data['shots'][i]['output'])

with open('test_set.jsonl', 'r') as infile:
    test_data = json.load(infile)

test_domains, commands, solutions = [], [], []

for i in range(len(test_data['tests'])):
    test_domains.append(test_data['tests'][i]['domain'])
    commands.append(test_data['tests'][i]['command'])
    solutions.append(test_data['tests'][i]['solution'])

### System prompt

In [3]:
system_prompt = 'You are a helpful PDDL assistant that will list up the available instances, predicates and goals for the given domain and natural language command. You can only answer in the desired format.'

### Create file for storing test data

In [4]:
results_file = 'mistral_results.txt'
if os.path.exists(results_file):
    os.remove(results_file)

header = "******************** Mistral Large Language Model test ********************\n"
underline = "___________________________________________________________________________\n"

with open(results_file, 'a') as f:
    f.write(header)
    f.write(underline)

### Mistral 7B 4-bit quantization

In [5]:
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# torch.cuda.empty_cache()
# model_4bit = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=nf4_config)
# gpu_mem_loaded = torch.cuda.memory_allocated(device)

In [6]:
messages = [
        {'role': 'user', 'content': system_prompt},
        {'role': 'assistant', 'content': 'Please provide the domain.pddl and corresponding command.'},
        {'role': 'user', 'content': f'domain.pddl: {domains[0]}, command: {inputs[0]}'},
        {'role': 'assistant', 'content': 'Understood. What is the expected output format?'},
        {'role': 'user', 'content': f'Expected output: {outputs[0]} // Correct'},
    ]

for i in range(len(shot_data['shots'])):
    # Load model for every iteration to avoid full GPU memory
    torch.cuda.empty_cache()
    model_4bit = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=nf4_config)
    gpu_mem_loaded = torch.cuda.memory_allocated(device)
    print(gpu_mem_loaded)

    number_of_examples = i+1
    test_set_number = 1
    number_of_max_new_tokens = 250  # default

    # if the loop has been executed at least once, remove the last two messages which is the ones that have to be last
    # the new messages to be added is another example from the shot_data
    if i>0:
        del messages[-2:]

        messages.append({'role': 'assistant', 'content': 'Please provide the domain.pddl and corresponding command.'})
        messages.append({'role': 'user', 'content': f'domain.pddl: {domains[i]}, command: {inputs[i]}'})
        messages.append({'role': 'assistant', 'content': 'Understood. What is the expected output format?'})
        messages.append({'role': 'user', 'content': f'Expected output: {outputs[i]} // Correct'})

    messages.append({'role': 'assistant', 'content': 'Thank you. Ready for the new instruction.'})
    messages.append({'role': 'user', 'content': f'domain.pddl: {test_domains[0]}, command: {commands[0]}'})

    # start timer
    start = torch.cuda.Event(enable_timing=True)
    start.record()

    # inference
    encodeds = tokenizer.apply_chat_template(messages, return_tensors='pt')
    model_inputs = encodeds.to(device)
    generated_ids = model_4bit.generate(
        model_inputs,
        pad_token_id=tokenizer.eos_token_id,
        max_new_tokens=number_of_max_new_tokens,
        do_sample=True,
    )
    decoded = tokenizer.batch_decode(generated_ids)

    end = torch.cuda.Event(enable_timing=True)
    end.record()

    torch.cuda.synchronize()

    # compute GPU memory usage during inference and inference time
    gpu_mem_inference = torch.cuda.memory_allocated(device)
    timer = start.elapsed_time(end)/1000

    torch.cuda.empty_cache()

    # format the output string
    output_tokens = decoded[0]
    end_token = '[/INST]'

    end_tag_index = output_tokens.rfind(end_token)
    end_of_sentence = -4
    sliced_output = output_tokens[end_tag_index + len(end_token):end_of_sentence]

    delimiter = '|'
    last_delimiter = sliced_output.rfind(delimiter)
    output = sliced_output[:last_delimiter+1]

    # remove newlines in output
    output = output.replace('\n', ' ')
    output = output.replace('\t', ' ')
    output = output.replace('\r', ' ')
    output = output.replace('  ', ' ')

    # write the results to file
    f1_score = None
    semantic_similarity = None

    result = {
        'Model': model_id + '-4bit',
        'Max new tokens': number_of_max_new_tokens,
        'Test set #': test_set_number,
        'Number of examples': number_of_examples,
        'F1Score': f1_score,
        'Semantic similarity': semantic_similarity,
        'Inference time': timer,
        'GPU memory loaded [MB]': round(gpu_mem_loaded*1e-6, 5),
        'GPU memory inference [MB]': round(gpu_mem_inference*1e-6, 5),
        'Output': output,
        'Solution': solutions[0],
        'Simple': 'Yes',
    }

    with open(results_file, 'a') as outfile:
        for key, value in result.items():
            outfile.write(f'{key:<25}: {value}\n')
        outfile.write(underline)

    print(f'Mistral 7B instruct 4-bit finished on test {test_set_number} with {number_of_examples} examples.')

    # delete model to free up GPU memory
    del model_4bit
    print(torch.cuda.memory_allocated(device))

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

4663403520
Mistral 7B instruct 4-bit finished on test 1 with 1 examples.
4671933440


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

4670884864
Mistral 7B instruct 4-bit finished on test 1 with 2 examples.
4670898176


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

4670898176
Mistral 7B instruct 4-bit finished on test 1 with 3 examples.
4670907392


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

4670907392
Mistral 7B instruct 4-bit finished on test 1 with 4 examples.
4670912000


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

4670912000
Mistral 7B instruct 4-bit finished on test 1 with 5 examples.
4670931968


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

4670931968
Mistral 7B instruct 4-bit finished on test 1 with 6 examples.
4670944256


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

4670944256


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.70 GiB. GPU 0 has a total capacty of 11.75 GiB of which 1.55 GiB is free. Including non-PyTorch memory, this process has 10.05 GiB memory in use. Of the allocated memory 9.72 GiB is allocated by PyTorch, and 202.32 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF