In [1]:
import sys, os
os.environ["CUDA_VISIBLE_DEVICES"] = "6" 

from flask import Flask, request
from transformers import (LlamaForCausalLM, CodeLlamaTokenizer,
                          BitsAndBytesConfig, StoppingCriteriaList)
import torch

from tactic_gen.lm_example import LmExample 
from tactic_gen.train_codellama import (collate_input, CONF_NAME, load_config,
                                        get_tokenizer)
from model_deployment.serve_base_codellama import PeriodStoppingCriteria


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

quantization_config = BitsAndBytesConfig(load_in_4bit=True)

model_name = "codellama/CodeLlama-7b-hf"
model = LlamaForCausalLM.from_pretrained(
    model_name, quantization_config=quantization_config
)
tokenizer = CodeLlamaTokenizer.from_pretrained(model_name)

device = "cuda" 

Loading checkpoint shards: 100%|██████████| 2/2 [00:07<00:00,  3.97s/it]


In [3]:
period_stopping = PeriodStoppingCriteria.from_tokenizer(tokenizer)

In [4]:
reverse_tok_vocab = dict((v, k) for k, v in tokenizer.get_vocab().items())
[reverse_tok_vocab[i] for i in period_stopping.stop_tok_ids]

[').',
 '..',
 '...',
 '▁.',
 '."',
 '`.',
 '$.',
 '".',
 '».',
 '.,',
 '].',
 '}.',
 '.)',
 '▁...',
 '().',
 '").',
 "').",
 '._',
 '....',
 '.”',
 "'.",
 '}$.',
 '.:',
 ')$.',
 '.}',
 '▁`.',
 '_.',
 '.*',
 '.]',
 ".'",
 '.).',
 '/.',
 '▁..',
 './',
 '../',
 '.[',
 '.$',
 ',.',
 '}).',
 '.--',
 '.\\',
 ')).',
 '▁$.',
 '”.',
 '.;',
 '.~\\',
 '.-',
 '“.',
 '.):',
 '.),',
 '?.',
 '▁».',
 '*.',
 '........',
 '².',
 '▁".',
 '...)',
 '▁./',
 '(.',
 '/).',
 "▁$('.",
 "('.",
 '`).',
 '▁....',
 '.~',
 "'].",
 '()`.',
 '▁).',
 '.(',
 ')`.',
 '▁(.',
 '.“',
 ']).',
 "▁'.",
 '>.',
 '%.',
 '].[',
 '.»',
 '▁.=',
 '...,',
 '.’',
 '()).',
 '"].',
 '.__',
 '(".',
 '...]',
 '▁$(".',
 '.`',
 '..."',
 '.");',
 '.....',
 '▁%.',
 '$).',
 "▁'./",
 '(...',
 '.<',
 '.$$',
 '.",',
 ')..',
 '▁*.',
 '=.',
 '.</',
 '.@',
 '!.',
 '../../',
 '▁[...]',
 '.—',
 '".$',
 "'.$",
 ']$.',
 '▁}).',
 '.\r',
 '.")',
 '\\.',
 '})$.',
 '\\}$.',
 '.](',
 '.=',
 '!...',
 '................',
 ')».',
 '{.',
 '}}$.',
 '.*;',
 '.:\u2

In [10]:
test_in = 'Lemma exists_min: forall (l : (list nat)), \n    (l <> nil) -> exists h, min(l) = Some(h).\nProof. \nintros l H. <FILL_ME>' 
test_out = "\n    + "

In [11]:
input_ids = tokenizer(test_in, return_tensors="pt")["input_ids"].to("cuda")
period_stopping.set_num_periods(input_ids)
stopping_list = StoppingCriteriaList([period_stopping])

In [12]:
model_output = model.generate(
    input_ids, 
    temperature=1,
    do_sample=True,
    num_beams=5,
    num_return_sequences=5, 
    max_new_tokens=200, 
    stopping_criteria=stopping_list) 
single_output = model_output[0].to("cpu")
token_output = tokenizer.decode(single_output[input_ids.shape[1]:], skip_special_tokens=True)
token_output


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


'\ndestruct l as [|h t].\n'

In [20]:
period_stopping.set_num_periods(input_ids)

In [8]:
period_stopping.num_input_periods

tensor(2, device='cuda:0')

In [13]:
tokenizer.batch_decode(model_output)

['<s> Lemma exists_min: forall (l : (list nat)), \n    (l <> nil) -> exists h, min(l) = Some(h).\nProof. \nintros l H. \ndestruct l as [|h t].\n</s>',
 '<s> Lemma exists_min: forall (l : (list nat)), \n    (l <> nil) -> exists h, min(l) = Some(h).\nProof. \nintros l H. \ndestruct l as [|h t]. </s>',
 "<s> Lemma exists_min: forall (l : (list nat)), \n    (l <> nil) -> exists h, min(l) = Some(h).\nProof. \nintros l H. \ndestruct l as [|x l'].\n</s>",
 '<s> Lemma exists_min: forall (l : (list nat)), \n    (l <> nil) -> exists h, min(l) = Some(h).\nProof. \nintros l H. \ndestruct l as [|x xs].\n</s>',
 "<s> Lemma exists_min: forall (l : (list nat)), \n    (l <> nil) -> exists h, min(l) = Some(h).\nProof. \nintros l H. \ndestruct l as [|x l']. </s>"]

In [None]:
#collated_in = collate_input(test_in)
prompt = '''def remove_non_ascii(s: str) -> str:
    """ <FILL_ME>
    return result
'''

In [None]:
input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to("cuda")

In [None]:
output = model.generate(input_ids, max_new_tokens=200)
output = output[0].to("cpu")

In [None]:
only_output = tokenizer.decode(output[input_ids.shape[1]:], skip_special_tokens=True)

In [None]:
2 in output

In [None]:
tokenizer.eos_token_id

In [None]:
print(only_output)