In [2]:
# pip install bitsandbytes accelerate flash-attn

In [2]:
# Enable IPython's autoreload extension so edited local modules are re-imported
# before each cell runs (mode 2).
%load_ext autoreload
%autoreload 2

In [3]:
import bz2
import copy
import os
import pdb
import pickle
import sys
import time

import psycopg
import torch
from IPython.display import JSON
from tqdm import tqdm

sys.settrace(None)

In [4]:
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TextStreamer, StoppingCriteria, StoppingCriteriaList, DynamicCache, OffloadedCache #, CodeGenTokenizer
from transformers.generation.logits_process import LogitsProcessorList

In [5]:
# Hugging Face model id; used for both the tokenizer and the causal LM loaded later.
model_name = "microsoft/phi-4"

In [6]:
# Expected device type; compared against model.device.type once the model is loaded.
device = 'cuda'

In [7]:
# Connection to the PostgreSQL database that backs the trie index.
# The connection string is taken from the CTRIE_POSTGRES_DSN environment variable so
# credentials do not have to live in the notebook.
# NOTE(review): the literal fallback only keeps existing setups running; remove it once
# the environment variable is configured everywhere.
postgresql_connection = psycopg.connect(
    os.environ.get('CTRIE_POSTGRES_DSN', 'postgres://postgres:secret@10.0.0.118:5432/postgres'),
    autocommit=False,
)

In [8]:
# Load the tokenizer that belongs to model_name.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [9]:
# Root key handed to PostgresTrieIndex; a later cell asserts it is larger than
# every id in the tokenizer vocabulary.
rootkey = 150000

In [10]:
# Display the tokenizer's end-of-sequence token string.
tokenizer.eos_token

'<|im_end|>'

In [11]:
# Inspect the token ids for '<end> .'; the last id is the value assigned to
# end_of_triple in a later cell.
tokenizer.encode('<end> .')

[27, 408, 29, 662]

In [12]:
# NOTE(review): scratch value; no other cell visible in this notebook reads `ob`.
ob = {'ab':1}

In [13]:
# Token id that closes a triple: the last id of the encoded text '<end> .'
# (662 for the phi-4 tokenizer). Looked up from the tokenizer instead of being
# hard-coded so it stays correct if model_name changes.
end_of_triple = tokenizer.encode('<end> .')[-1]

In [14]:
# Probe of three specific token ids.
# NOTE(review): the recorded output shows tokens unrelated to this notebook, so these
# ids were presumably chosen for a different tokenizer - confirm or remove.
tokenizer.convert_ids_to_tokens([29871, 869, 29871])

['.amazonaws', 'ov', '.amazonaws']

In [15]:
# Sanity check: rootkey must be strictly larger than every id in the vocabulary
# (presumably so it can never collide with a real token id - confirm against ctrie).
assert rootkey > max(tokenizer.vocab.values())

In [16]:
# Show which token has id 0.
tokenizer.convert_ids_to_tokens([0])

['!']

In [17]:
# Fail early if no CUDA device is available.
assert torch.cuda.is_available()

In [18]:
# 4-bit NF4 quantization with double quantization; compute runs in bfloat16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype='bfloat16',
)

# Load the causal LM with the quantization settings above.
# Disabled options kept for reference: trust_remote_code=True,
# attn_implementation="flash_attention_2" / "flash_attention".
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    quantization_config=quantization_config,
)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [19]:
# Verify the loaded model ended up on the expected device type.
assert model.device.type == device

In [68]:
# Few-shot chat prompt: a system instruction followed by one worked example
# (user question, assistant reply made of "Fact:" lines and a final "Answer:").
prompt = [
    dict(
        role='system',
        content=(
            'You are a helpful question answering assistant that bases its answer on facts from a knowledge base.\n'
            '1) You receive an input question.\n'
            '3) You explicitly provide relevant facts, one per line starting with "Fact:".\n'
            '5) You provide a short concise answer.\n'
        ),
    ),
    dict(
        role='user',
        content='Which mountain is taller between Mont Blanc and Mount Rainier?\n',
    ),
    dict(
        role='assistant',
        content=(
            '\n'
            'Fact: <Mont Blanc> <elevation above sea level> <4,807.02±0.5 meters> .\n'
            'Fact: <Mount Rainier> <elevation above sea level> <4,389 meters> .\n'
            '\n'
            'Answer: Mont Blanc.\n'
        ),
    ),
]

In [69]:
# Show the raw few-shot prompt structure.
print(prompt)

[{'role': 'system', 'content': 'You are a helpful question answering assistant that bases its answer on facts from a knowledge base.\n1) You receive an input question.\n3) You explicitly provide relevant facts, one per line starting with "Fact:".\n5) You provide a short concise answer.\n'}, {'role': 'user', 'content': 'Which mountain is taller between Mont Blanc and Mount Rainier?\n'}, {'role': 'assistant', 'content': '\nFact: <Mont Blanc> <elevation above sea level> <4,807.02±0.5 meters> .\nFact: <Mount Rainier> <elevation above sea level> <4,389 meters> .\n\nAnswer: Mont Blanc.\n'}]


## Find switch pattern
The token ids found below may depend on the tokenizer, so re-check them whenever the model is changed.

In [70]:
# Tokenize a newline followed by "Fact:" to see which ids form the marker,
# then show the ids and their token strings.
switch_pattern = tokenizer('\nFact:').input_ids
print(switch_pattern)
tokenizer.convert_ids_to_tokens(switch_pattern)

[198, 17873, 25]


['Ċ', 'Fact', ':']

In [71]:
# Text streamer built on the tokenizer; passed to model.generate in the generation cell.
streamer = TextStreamer(tokenizer)

In [72]:
# Token ids of "Fact:" without the leading newline token ([17873, 25] for phi-4).
# Looked up from the tokenizer rather than hard-coded, so the pattern follows the
# tokenizer and the cell gives the same result however often it is re-run.
switch_pattern = tokenizer('\nFact:').input_ids[1:]

In [73]:
from ctrie import PostgresTrieIndex, ConstrainedLogitsProcessor, ConstrainedStateList, GetAnswer

In [74]:
# Id of the newline token: last id produced when encoding a single line break.
newline_token = tokenizer('\n').input_ids[-1]
newline_token

198

In [113]:
# Trie index stored in PostgreSQL (table 'ctriev2'), reached through the open connection.
myctrie = PostgresTrieIndex(
    rootkey=rootkey,
    postgresql_connection=postgresql_connection,
    switch_parameter=6,
    table_name='ctriev2',
)

In [114]:
# Numeric id of the tokenizer's end-of-sequence token.
eos_token = tokenizer.eos_token_id
eos_token

100265

In [145]:
# Candidate questions. Previously both were assigned to `question` back to back,
# so the first assignment was silently discarded; now the choice is explicit.
candidate_questions = [
    'Which city is the capital of the country where the Eiffel Tower is?\n',
    'Who was the quarterback of the team that won Super Bowl 50?\n',
]
# The last candidate is the one asked (same question the previous code ended up using).
question = {'role': 'user', 'content': candidate_questions[-1]}
# Render few-shot prompt + question as chat-template text, ending with the
# assistant generation prompt.
prompted_texts = [tokenizer.apply_chat_template(prompt + [question], tokenize=False, add_generation_prompt=True)]

In [146]:
# Show the fully rendered chat-template text that will be tokenized.
print(prompted_texts[0])

<|im_start|>system<|im_sep|>You are a helpful question answering assistant that bases its answer on facts from a knowledge base.
1) You receive an input question.
3) You explicitly provide relevant facts, one per line starting with "Fact:".
5) You provide a short concise answer.
<|im_end|><|im_start|>user<|im_sep|>Which mountain is taller between Mont Blanc and Mount Rainier?
<|im_end|><|im_start|>assistant<|im_sep|>
Fact: <Mont Blanc> <elevation above sea level> <4,807.02±0.5 meters> .
Fact: <Mount Rainier> <elevation above sea level> <4,389 meters> .

Answer: Mont Blanc.
<|im_end|><|im_start|>user<|im_sep|>Who was the quarterback of the team that won Super Bowl 50?
<|im_end|><|im_start|>assistant<|im_sep|>


In [147]:
# Show which token has id 13.
tokenizer.convert_ids_to_tokens(13)

'.'

In [148]:
# Use the newline token as the padding token, then display it.
tokenizer.pad_token = tokenizer.convert_ids_to_tokens(newline_token) # tokenizer.tokenize('\n')[0]
tokenizer.pad_token

'Ċ'

In [149]:
# Tokenize the prompt batch into PyTorch tensors (padded on the right) and move
# the encoded batch to the model's device.
inputs = tokenizer(
    prompted_texts,
    return_tensors='pt',
    padding=True,
    padding_side='right',
).to(model.device)

In [150]:
# Peek at the first 20 input ids of every sequence in the batch.
inputs.input_ids[:,:20]

tensor([[100264,   9125, 100266,   2675,    527,    264,  11190,   3488,  36864,
          18328,    430,  23963,   1202,   4320,    389,  13363,    505,    264,
           6677,   2385]], device='cuda:0')

In [151]:
# Logits processor that constrains generation using the trie index; it is given the
# "Fact:" switch pattern and the newline token as its end token.
constrained_processor = ConstrainedLogitsProcessor(
    index=myctrie,
    switch_pattern=switch_pattern,
    end_token=newline_token,
)  # , tokenizer=tokenizer)
logits_processor_list = LogitsProcessorList([constrained_processor])

In [152]:
# Encode a newline followed by "Answer:" and show the ids.
answer_with_newline = tokenizer.encode('\nAnswer:')
print(answer_with_newline)
# Keep only the ids of "Answer:" by dropping the leading newline token
# ([16533, 25] for phi-4); taken from the encoding above instead of hard-coded.
answer_tokens = answer_with_newline[1:]
answer_tokens

[198, 16533, 25]


[16533, 25]

In [153]:
# Inspect how an entity wrapped in angle brackets is tokenized.
tokenizer.encode(' <Paris>')

[366, 60704, 29]

In [154]:
# Show which token has id 366 (the first id of the ' <Paris>' encoding above).
tokenizer.convert_ids_to_tokens(366)

'Ġ<'

In [155]:
# Token id of the closing angle bracket: the last id of the encoded text ' <Paris>'
# (29 for phi-4). Looked up from the tokenizer instead of being hard-coded.
angular_parentheses_right = tokenizer.encode(' <Paris>')[-1]

In [156]:
# Display the numeric id of the end-of-sequence token.
tokenizer.eos_token_id

100265

In [157]:
# GetAnswer is constructed with the "Answer:" ids, the tokens that terminate an answer
# (newline or end-of-sequence) and the builtin `all`; it is then used as the only
# stopping criterion for generation.
answer_end_tokens = [newline_token, tokenizer.eos_token_id]
getanswer = GetAnswer(answer_tokens, answer_end_tokens, all)
stopping_criteria = StoppingCriteriaList([getanswer])

In [158]:
# Display the "Answer:" token ids again.
answer_tokens

[16533, 25]

In [159]:
# Show the (batch, prompt length) shape of the tokenized prompt.
inputs.input_ids.shape

torch.Size([1, 149])

In [160]:
#     pdb.Pdb().set_break('/opt/conda/lib/python3.11/site-packages/transformers/generation/utils.py', 3586)

In [163]:
num_beams = 1

# Fresh constraint state for this run, built from the beam count, the "Fact:" switch
# pattern and the newline token.
states = ConstrainedStateList(num_beams, switch_pattern, newline_token)

# Rebuild the logits processor around the new state.
# Uses `myctrie`, the trie index created earlier in this notebook; the previous
# `converter_index` is not defined anywhere in the notebook and raised NameError
# on a fresh kernel.
constrained_processor = ConstrainedLogitsProcessor(index=myctrie, end_token=newline_token, states=states)#, tokenizer=tokenizer)
logits_processor_list = LogitsProcessorList([
    constrained_processor
])

model.eval()
start = time.time()

with torch.no_grad():

    # Give the stopping criterion the prompt ids of the (single) input sequence.
    getanswer.set_prompt(inputs.input_ids[0])

    genargs = dict(
        **inputs,
        logits_processor=logits_processor_list,
        max_new_tokens=200,
        streamer = streamer,
        #do_sample = True,
        #top_k=3,
        num_beams=num_beams,
        num_return_sequences=1,
        #no_repeat_ngram_size=1,
        #remove_invalid_values=True,
        stopping_criteria=stopping_criteria,
        use_cache=True,
        #past_key_values=past_key_values,
        # NOTE(review): this reaches generate() as a keyword literally named `kwargs`;
        # presumably something downstream reads 'constrained_state' from it - confirm.
        kwargs = {'constrained_state': states}, # passing state
    )
    out = model.generate(**genargs)
    #pdb.runcall(model.generate, None, **genargs)
# Wall-clock time of the whole generation call.
print('Elapsed', time.time() - start)

<|im_start|>system<|im_sep|>You are a helpful question answering assistant that bases its answer on facts from a knowledge base.
1) You receive an input question.
3) You explicitly provide relevant facts, one per line starting with "Fact:".
5) You provide a short concise answer.<|im_end|><|im_start|>user<|im_sep|>Which mountain is taller between Mont Blanc and Mount Rainier?<|im_end|><|im_start|>assistant<|im_sep|>Fact: <Mont Blanc> <elevation above sea level> <4,807.02±0.5 meters> .
Fact: <Mount Rainier> <elevation above sea level> <4,389 meters> .

50?<|im_end|><|im_start|>assistant<|im_sep|>r<|im_sep|>Who was the quarterback of the team that won Super Bowl 
Fact:<The Denver Post> <headquarters location> <Denver> .
Fact:<Denver Broncos> <home city> <Denver> .
Fact:<Super Bowl 50> <winning team> <Denver Broncos> .
Fact:<Super Bowl 50> <date> <February 7, 2016> .
Fact:<Peyton Manning> <position> <quarterback> .
Fact:<Peyton Manning> <team during Super Bowl 50> <Denver Broncos> .

Answe

In [164]:
# Print only the newly generated part of each returned sequence, preceded by a
# separator line with the sum of its token ids and its length.
# The prompt length is computed once, and the tensor's own .sum() replaces Python's
# built-in sum(), which iterates over the tensor element by element.
prompt_length = inputs.input_ids.shape[1]
for i in range(out.shape[0]):
    generated = out[i][prompt_length:]
    print('-'*30, generated.sum(), len(generated))
    print(tokenizer.decode(generated))

------------------------------ tensor(1561558, device='cuda:0') 109

Fact:<The Denver Post> <headquarters location> <Denver> .
Fact:<Denver Broncos> <home city> <Denver> .
Fact:<Super Bowl 50> <winning team> <Denver Broncos> .
Fact:<Super Bowl 50> <date> <February 7, 2016> .
Fact:<Peyton Manning> <position> <quarterback> .
Fact:<Peyton Manning> <team during Super Bowl 50> <Denver Broncos> .

Answer: Peyton Manning.<|im_end|>


In [165]:
# Decode and print every returned sequence in full (prompt plus generated text).
for sequence in out:
    print(tokenizer.decode(sequence))

<|im_start|>system<|im_sep|>You are a helpful question answering assistant that bases its answer on facts from a knowledge base.
1) You receive an input question.
3) You explicitly provide relevant facts, one per line starting with "Fact:".
5) You provide a short concise answer.<|im_end|><|im_start|>user<|im_sep|>Which mountain is taller between Mont Blanc and Mount Rainier?<|im_end|><|im_start|>assistant<|im_sep|>Fact: <Mont Blanc> <elevation above sea level> <4,807.02±0.5 meters> .
Fact: <Mount Rainier> <elevation above sea level> <4,389 meters> .

Answer: Mont Blanc.<|im_end|><|im_start|>user<|im_sep|>Who was the quarterback of the team that won Super Bowl 50?<|im_end|><|im_start|>assistant<|im_sep|>
Fact:<The Denver Post> <headquarters location> <Denver> .
Fact:<Denver Broncos> <home city> <Denver> .
Fact:<Super Bowl 50> <winning team> <Denver Broncos> .
Fact:<Super Bowl 50> <date> <February 7, 2016> .
Fact:<Peyton Manning> <position> <quarterback> .
Fact:<Peyton Manning> <team dur

<|im_start|>system<|im_sep|>
You are a helpful question answering assistant that bases its answer on facts from a knowledge base.
1) You receive an input question.
3) You explicitly provide relevant facts, one per line starting with "Fact:".
5) You provide a short concise answer.
<|im_end|><|im_start|>user<|im_sep|>

Which mountain is taller between Mont Blanc and Mount Rainier?
<|im_end|><|im_start|>assistant<|im_sep|>

Fact: <Mont Blanc> <elevation above sea level> <4,807.02±0.5 meters> .
Fact: <Mount Rainier> <elevation above sea level> <4,389 meters> .

Answer: Mont Blanc.<|im_end|><|im_start|>user<|im_sep|>

Which city is the capital of the country where the Eiffel Tower is?
<|im_end|><|im_start|>assistant<|im_sep|>

Fact:<Eiffel Tower> <location> <Champ de Mars> .
Fact:<Champ de Mars> <city> <Paris> .
Fact:<Paris> <capital of> <France> .

Answer: Paris.<|im_end|>

Who was the quarterback of the team that won Super Bowl 50?
<|im_end|><|im_start|>assistant<|im_sep|>

Fact:<The Denver Post> <headquarters location> <Denver> .
Fact:<Denver Broncos> <home city> <Denver> .
Fact:<Super Bowl 50> <winning team> <Denver Broncos> .
Fact:<Super Bowl 50> <date> <February 7, 2016> .
Fact:<Peyton Manning> <position> <quarterback> .
Fact:<Peyton Manning> <team during Super Bowl 50> <Denver Broncos> .

Answer: Peyton Manning.<|im_end|>

In [108]:
# Ask the GetAnswer helper for the answer span of the first returned sequence; it
# returns a flag and the answer token ids, which are decoded for display.
stop, ans = getanswer.get_answer(out[0])
# TODO: is getanswer still needed?
stop, tokenizer.decode(list(ans)) # TODO trim and remove special tokens

(False, ' Paris.<|im_end|>')