In [93]:
import sys, os
os.environ["CUDA_VISIBLE_DEVICES"] = "6" 

from flask import Flask, request
from transformers import (LlamaForCausalLM, CodeLlamaTokenizer,
                          BitsAndBytesConfig, StoppingCriteriaList)
import torch

from tactic_gen.lm_example import LmExample 
from tactic_gen.train_codellama import (collate_input, CONF_NAME, load_config,
                                        get_tokenizer)
from model_deployment.serve_base_codellama import PeriodStoppingCriteria


In [2]:

quantization_config = BitsAndBytesConfig(load_in_4bit=True)

model_name = "codellama/CodeLlama-7b-hf"
model = LlamaForCausalLM.from_pretrained(
    model_name, quantization_config=quantization_config
)
tokenizer = CodeLlamaTokenizer.from_pretrained(model_name)

device = "cuda" 

Loading checkpoint shards: 100%|██████████| 2/2 [00:07<00:00,  3.90s/it]


In [3]:
period_stopping = PeriodStoppingCriteria.from_tokenizer(tokenizer)

In [4]:
reverse_tok_vocab = dict((v, k) for k, v in tokenizer.get_vocab().items())
[reverse_tok_vocab[i] for i in period_stopping.stop_tok_ids]

[').',
 '..',
 '...',
 '▁.',
 '."',
 '`.',
 '$.',
 '".',
 '».',
 '.,',
 '].',
 '}.',
 '.)',
 '▁...',
 '().',
 '").',
 "').",
 '._',
 '....',
 '.”',
 "'.",
 '}$.',
 '.:',
 ')$.',
 '.}',
 '▁`.',
 '_.',
 '.*',
 '.]',
 ".'",
 '.).',
 '/.',
 '▁..',
 './',
 '../',
 '.[',
 '.$',
 ',.',
 '}).',
 '.--',
 '.\\',
 ')).',
 '▁$.',
 '”.',
 '.;',
 '.~\\',
 '.-',
 '“.',
 '.):',
 '.),',
 '?.',
 '▁».',
 '*.',
 '........',
 '².',
 '▁".',
 '...)',
 '▁./',
 '(.',
 '/).',
 "▁$('.",
 "('.",
 '`).',
 '▁....',
 '.~',
 "'].",
 '()`.',
 '▁).',
 '.(',
 ')`.',
 '▁(.',
 '.“',
 ']).',
 "▁'.",
 '>.',
 '%.',
 '].[',
 '.»',
 '▁.=',
 '...,',
 '.’',
 '()).',
 '"].',
 '.__',
 '(".',
 '...]',
 '▁$(".',
 '.`',
 '..."',
 '.");',
 '.....',
 '▁%.',
 '$).',
 "▁'./",
 '(...',
 '.<',
 '.$$',
 '.",',
 ')..',
 '▁*.',
 '=.',
 '.</',
 '.@',
 '!.',
 '../../',
 '▁[...]',
 '.—',
 '".$',
 "'.$",
 ']$.',
 '▁}).',
 '.\r',
 '.")',
 '\\.',
 '})$.',
 '\\}$.',
 '.](',
 '.=',
 '!...',
 '................',
 ')».',
 '{.',
 '}}$.',
 '.*;',
 '.:\u2

In [6]:
tokenizer.encode(pad_token)

[1, 32007]

In [8]:
pad_token = "<PRE>" 
encoded_ids = tokenizer.encode(pad_token)
assert len(encoded_ids) == 2
assert encoded_ids[0] == tokenizer.bos_token_id

tokenizer.pad_token = pad_token
tokenizer.pad_token_id = encoded_ids[1] 

In [9]:
test_in = 'Lemma exists_min: forall (l : (list nat)), \n    (l <> nil) -> exists h, min(l) = Some(h).\nProof. \nintros l H. <FILL_ME>' 
test_out = "\n    + "

In [65]:
input_ids = tokenizer(test_in, return_tensors="pt")["input_ids"].to("cuda")
period_stopping.set_num_periods(input_ids)
stopping_list = StoppingCriteriaList([period_stopping])

In [89]:
model_output = model.generate(
    input_ids, 
    temperature=1,
    do_sample=True,
    max_new_tokens=200, 
    output_scores=True,
    return_dict_in_generate=True,
    stopping_criteria=stopping_list,
    ) 
single_output = model_output.sequences[0].to("cpu")
token_output = tokenizer.decode(single_output[input_ids.shape[1]:], skip_special_tokens=True)
token_output


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


'\ndestruct l as [| l0 l1].'

In [90]:
type(model_output)

transformers.generation.utils.SampleDecoderOnlyOutput

In [67]:
tokenizer.batch_decode(model_output.sequences)

['<s> Lemma exists_min: forall (l : (list nat)), \n    (l <> nil) -> exists h, min(l) = Some(h).\nProof. \nintros l H. \napply exists_min_nonempty in H.']

In [92]:
tokenizer.decode(model_output.sequences[0, input_ids.shape[1]:], skip_special_tokens=True)

'\ndestruct l as [| l0 l1].'

In [80]:
model_output.scores

(tensor([[   -inf,    -inf, 10.8047,  ...,    -inf,    -inf,    -inf]],
        device='cuda:0'),
 tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]], device='cuda:0'),
 tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]], device='cuda:0'),
 tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]], device='cuda:0'),
 tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]], device='cuda:0'),
 tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]], device='cuda:0'),
 tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]], device='cuda:0'),
 tensor([[  -inf,   -inf, 9.7266,  ...,   -inf,   -inf,   -inf]],
        device='cuda:0'),
 tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]], device='cuda:0'),
 tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]], device='cuda:0'),
 tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]], device='cuda:0'))

In [81]:

def get_sequence_score(input_sequence: torch.LongTensor,
                       whole_sequence: torch.LongTensor, 
                       scores: tuple[torch.FloatTensor],
                       stop_criteria: PeriodStoppingCriteria) -> float:
    assert len(scores) == int(whole_sequence.shape[0] - input_sequence.shape[0])
    sequence_score = 0
    start_idx = whole_sequence.shape[0] - len(scores)
    stop_criteria.set_num_periods(input_sequence[None, :])
    for i in range(len(scores)):
        index = whole_sequence[start_idx + i] 
        score_at_i = scores[i][0, index] - torch.logsumexp(scores[i][0], axis=0)
        sequence_score += (score_at_i)
        if stop_criteria(whole_sequence[None, :(start_idx + i + 1)], scores):
            break
    return sequence_score



In [86]:
get_sequence_score(
    input_ids[0], 
    model_output.sequences[0], 
    model_output.scores, period_stopping)

tensor(-5.2211, device='cuda:0')

In [36]:
model_output.scores[1].shape

torch.Size([1, 32016])

In [37]:
len(model_output.scores)

8

In [32]:
model_output

SampleDecoderOnlyOutput(sequences=tensor([[    1, 11894,  4864, 29918,  1195, 29901, 25345,   313, 29880,   584,
           313,  1761, 14033,  8243, 29871,    13,  1678,   313, 29880, 15271,
          4263, 29897,  1599,  4864,   298, 29892,  1375, 29898, 29880, 29897,
           353,  3834, 29898, 29882,   467,    13, 28116, 29889, 29871,    13,
           524,  1883,   301,   379, 29889, 29871,    13,  7854,  1247,   313,
          1195, 29898, 29880,  8106]], device='cuda:0'), scores=(tensor([[   -inf,    -inf, 10.8047,  ...,    -inf,    -inf,    -inf]],
       device='cuda:0'), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]], device='cuda:0'), tensor([[  -inf,   -inf, 7.7188,  ...,   -inf,   -inf,   -inf]],
       device='cuda:0'), tensor([[  -inf,   -inf, 8.1875,  ...,   -inf,   -inf,   -inf]],
       device='cuda:0'), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]], device='cuda:0'), tensor([[  -inf,   -inf, 7.5156,  ...,   -inf,   -inf,   -inf]],
       device='cuda:0')

In [31]:
model_output.scores

(tensor([[   -inf,    -inf, 10.8047,  ...,    -inf,    -inf,    -inf]],
        device='cuda:0'),
 tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]], device='cuda:0'),
 tensor([[  -inf,   -inf, 7.7188,  ...,   -inf,   -inf,   -inf]],
        device='cuda:0'),
 tensor([[  -inf,   -inf, 8.1875,  ...,   -inf,   -inf,   -inf]],
        device='cuda:0'),
 tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]], device='cuda:0'),
 tensor([[  -inf,   -inf, 7.5156,  ...,   -inf,   -inf,   -inf]],
        device='cuda:0'),
 tensor([[   -inf,    -inf, 10.0156,  ...,    -inf,    -inf,    -inf]],
        device='cuda:0'),
 tensor([[  -inf,   -inf, 9.2188,  ...,   -inf,   -inf,   -inf]],
        device='cuda:0'))

In [20]:
period_stopping.set_num_periods(input_ids)

In [8]:
period_stopping.num_input_periods

tensor(2, device='cuda:0')

In [19]:
tokenizer.batch_decode(model_output.sequences)

['<s> Lemma exists_min: forall (l : (list nat)), \n    (l <> nil) -> exists h, min(l) = Some(h).\nProof. \nintros l H. \ndestruct l as [|h t].</s>',
 "<s> Lemma exists_min: forall (l : (list nat)), \n    (l <> nil) -> exists h, min(l) = Some(h).\nProof. \nintros l H. \ndestruct l as [|x l'].</s>",
 '<s> Lemma exists_min: forall (l : (list nat)), \n    (l <> nil) -> exists h, min(l) = Some(h).\nProof. \nintros l H. \ndestruct l as [|x xs].</s>',
 '<s> Lemma exists_min: forall (l : (list nat)), \n    (l <> nil) -> exists h, min(l) = Some(h).\nProof. \nintros l H. \ndestruct l as [| h t].</s>',
 "<s> Lemma exists_min: forall (l : (list nat)), \n    (l <> nil) -> exists h, min(l) = Some(h).\nProof. \nintros l H. \ndestruct l as [| x l'].</s>"]

In [None]:
#collated_in = collate_input(test_in)
prompt = '''def remove_non_ascii(s: str) -> str:
    """ <FILL_ME>
    return result
'''

In [None]:
input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to("cuda")

In [None]:
output = model.generate(input_ids, max_new_tokens=200)
output = output[0].to("cpu")

In [None]:
only_output = tokenizer.decode(output[input_ids.shape[1]:], skip_special_tokens=True)

In [None]:
2 in output

In [None]:
tokenizer.eos_token_id

In [None]:
print(only_output)