In [1]:
# pip install bitsandbytes accelerate flash-attn

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import transformers
import bz2
import pickle

from tqdm import tqdm


import time
from IPython.display import JSON
import sys
sys.settrace(None)
import pdb

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TextStreamer #, CodeGenTokenizer
from transformers.generation.logits_process import LogitsProcessorList

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import psycopg

In [4]:
model_name = "microsoft/Phi-3-mini-128k-instruct"

In [5]:
device = 'cuda'

In [6]:
postgresql_connection = psycopg.connect('postgres://postgres:secret@10.0.0.118:5432/postgres', autocommit=False)

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [8]:
rootkey = 60000

In [9]:
assert rootkey > max(tokenizer.vocab.values())

In [10]:
tokenizer.convert_ids_to_tokens([0])

['<unk>']

In [11]:
assert torch.cuda.is_available()

In [12]:
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

model = AutoModelForCausalLM.from_pretrained(model_name,
                                             trust_remote_code=True,
                                             quantization_config=quantization_config,
                                             low_cpu_mem_usage=True,
                                             # attn_implementation="flash_attention_2"
                                            )

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.91s/it]


In [13]:
prompt = '''You are a question-answering system that reasons using structured data in the form of facts.
Given an input question, you generate a concise single answer based on knowledge facts.
Follow this format:

Question: The question to be answered.
Facts for the reasoning process: some facts containing entities, relationships, and values relevant to the question.
Long answer: the reasoning process you followed to reach the answer also based on the facts.
Answer: the concise answer.

Example:
Question: Is Mont Blanc taller than Mount Rainier?
Facts for the reasoning process:
fact: <Mont Blanc> <elevation above sea level> <4,807.02±0.5 metre> .
fact: <Mount Rainier> <elevation above sea level> <4,389 metre> .
Long answer: Basing on the evidence that the elevation above sea level of Mont Blanc (4,807.02±0.5 metres) is greater than the elevation above sea level of Mount Rainier (4,389 metres), Mont Blanc is taller than Mount Rainier.
Answer: Yes, Mont Blanc is taller than Mount Rainier.

As you can see in the example, triples generally start with information contained in the question and provide additional information.
Unfortunately, some of the retrieved facts may irrelevant. You should ignore these irrelevant triples.


Now, answer the following question:
Question: {}
Triples for the reasoning process:
fact:'''

In [14]:
print(prompt)

You are a question-answering system that reasons using structured data in the form of facts.
Given an input question, you generate a concise single answer based on knowledge facts.
Follow this format:

Question: The question to be answered.
Facts for the reasoning process: some facts containing entities, relationships, and values relevant to the question.
Long answer: the reasoning process you followed to reach the answer also based on the facts.
Answer: the concise answer.

Example:
Question: Is Mont Blanc taller than Mount Rainier?
Facts for the reasoning process:
fact: <Mont Blanc> <elevation above sea level> <4,807.02±0.5 metre> .
fact: <Mount Rainier> <elevation above sea level> <4,389 metre> .
Long answer: Basing on the evidence that the elevation above sea level of Mont Blanc (4,807.02±0.5 metres) is greater than the elevation above sea level of Mount Rainier (4,389 metres), Mont Blanc is taller than Mount Rainier.
Answer: Yes, Mont Blanc is taller than Mount Rainier.

As you ca

## Find switch pattern
may be tokenizer dependent

In [15]:
switch_pattern = tokenizer('''
fact:''').input_ids[2:]
print(switch_pattern)
tokenizer.convert_ids_to_tokens(switch_pattern)

[17028, 29901]


['fact', ':']

In [16]:
streamer = TextStreamer(tokenizer)

In [17]:
switch_pattern = [17028, 29901] # [3626, 552, 29901]

In [18]:
from ctrie import ModDisjunctiveTrie, CtrieLogitsProcessor, BeamAwareLogitsProcessor

In [19]:
myctrie = ModDisjunctiveTrie(rootkey=rootkey, postgresql_connection=postgresql_connection)

In [20]:
newline_token = tokenizer('''
''').input_ids[-1]
newline_token

13

In [21]:
eos_token = tokenizer.convert_tokens_to_ids(tokenizer.eos_token)

In [37]:
question = '''Which city is the capital of Germany?'''
prompted_text = prompt.format(question)

In [38]:
inputs = tokenizer(prompted_text, return_tensors='pt')
inputs = inputs.to(model.device)

In [39]:
lprocessor = CtrieLogitsProcessor(ctrie=myctrie, initial_state='constrained', switch_pattern=switch_pattern, end_token=newline_token)#, tokenizer=tokenizer)
logits_processor_list = LogitsProcessorList([
    lprocessor
])

In [40]:
num_beams = 1

def generateCtrieLogitsProcessor(ctrie_args, processor_args):
    while True:
        _ctrie = ModDisjunctiveTrie(**ctrie_args)
        yield CtrieLogitsProcessor(ctrie=_ctrie, **processor_args)
mygenerator = generateCtrieLogitsProcessor(dict(rootkey=rootkey, postgresql_connection=postgresql_connection),
                                           dict(initial_state='constrained', switch_pattern=switch_pattern, end_token=newline_token))
beamprocessor = BeamAwareLogitsProcessor(mygenerator)

logits_processor_list = LogitsProcessorList([
    beamprocessor
])

model.eval() 
with torch.no_grad():
    out = model.generate(
        input_ids = inputs.input_ids,
        output_scores=True,
        logits_processor=logits_processor_list,
        max_new_tokens=1000,
        streamer = streamer,
        #do_sample = True,
        #top_k=3,
        num_beams=num_beams,
        num_return_sequences=num_beams,
        #no_repeat_ngram_size=1,
        #remove_invalid_values=True,
    )


<Brussels You are a question-answering system that reasons using structured data in the form of facts.
Given an input question, you generate a concise single answer based on knowledge facts.
Follow this format:

Question: The question to be answered.
Facts for the reasoning process: some facts containing entities, relationships, and values relevant to the question.
Long answer: the reasoning process you followed to reach the answer also based on the facts.
Answer: the concise answer.

Example:
Question: Is Mont Blanc taller than Mount Rainier?
Facts for the reasoning process:
fact: <Mont Blanc> <elevation above sea level> <4,807.02±0.5 metre> .
fact: <Mount Rainier> <elevation above sea level> <4,389 metre> .
Long answer: Basing on the evidence that the elevation above sea level of Mont Blanc (4,807.02±0.5 metres) is greater than the elevation above sea level of Mount Rainier (4,389 metres), Mont Blanc is taller than Mount Rainier.
Answer: Yes, Mont Blanc is taller than Mount Rainier.


KeyboardInterrupt: 

In [26]:
for i in range(out.shape[0]):
    print('-'*30)
    print(tokenizer.decode(out[i][len(inputs.input_ids[0]):]))

------------------------------
<Berlin> <location> <German Chancellery in Berlin> .
fact: <Bonn> <capital> <Bonn> .
fact: <Berlin> <capital of> <Germany> .
fact: <Berlin> <capital of> <Germany> .
Long answer: Based on the fact that Berlin is the capital of Germany, and considering that the German Chancellery is located in Berlin, we can


In [41]:
for i in range(out.shape[0]):
    print('-'*30)
    print(tokenizer.decode(out[i]))

------------------------------
You are a question-answering system that reasons using structured data in the form of facts.
Given an input question, you generate a concise single answer based on knowledge facts.
Follow this format:

Question: The question to be answered.
Facts for the reasoning process: some facts containing entities, relationships, and values relevant to the question.
Long answer: the reasoning process you followed to reach the answer also based on the facts.
Answer: the concise answer.

Example:
Question: Is Mont Blanc taller than Mount Rainier?
Facts for the reasoning process:
fact: <Mont Blanc> <elevation above sea level> <4,807.02±0.5 metre> .
fact: <Mount Rainier> <elevation above sea level> <4,389 metre> .
Long answer: Basing on the evidence that the elevation above sea level of Mont Blanc (4,807.02±0.5 metres) is greater than the elevation above sea level of Mount Rainier (4,389 metres), Mont Blanc is taller than Mount Rainier.
Answer: Yes, Mont Blanc is taller

In [None]:
tokenizer.decode([8654])

In [226]:
tokenizer.decode([529,     1,  1533,     1,   529,   529,   529,   529,   529,   529,
          529,   529,   529,   529,   529,   529,   529,   529,   529,   529,
          529,   529,   529,   529,   529,   529,   529,   529,   529,   529,
          529,   529,   529,   529,   529,   529,   529,   529,   529,   529,
          529,   529,   529,   529,   529,   529,   529,   529,   529,   529,
          529,   529,   529,   529,   529,   529,   529,   529,   529,   529,
          529,   529,   529,   529,   529,   529,   529,   529,   529,   529,
          529,   529,   529,   529,   529,   529,   529,   529,   529,   529,
          829,   276,  2877,  6778,   529,  5349, 17443, 29958,   529,   513,
          347,  3748, 29958,   869])

'<<s> </<s> < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < <</reality>> <has characteristic> <indie game> .'

In [227]:
tokenizer.convert_ids_to_tokens(1)

'<s>'

In [224]:
out[0][len(inputs.input_ids[0])-1:]

tensor([29901,   529,     1,  1533,     1,   529,   529,   529,   529,   529,
          529,   529,   529,   529,   529,   529,   529,   529,   529,   529,
          529,   529,   529,   529,   529,   529,   529,   529,   529,   529,
          529,   529,   529,   529,   529,   529,   529,   529,   529,   529,
          529,   529,   529,   529,   529,   529,   529,   529,   529,   529,
          529,   529,   529,   529,   529,   529,   529,   529,   529,   529,
          529,   529,   529,   529,   529,   529,   529,   529,   529,   529,
          529,   529,   529,   529,   529,   529,   529,   529,   529,   529,
          529,   829,   276,  2877,  6778,   529,  5349, 17443, 29958,   529,
          513,   347,  3748, 29958,   869,    13,  3626,   552, 29901,   529,
         8654,   264, 13061,   278, 10470, 21869,   313, 29896, 29929, 29900,
        29900, 29905, 29884, 29906, 29900, 29896, 29941, 29906, 29900, 29900,
        29906, 15410,   529,  8758,   310, 29958,   529, 29886, 

In [202]:
# unconstrained
model.eval()
with torch.no_grad():
    # TODO put tqdm as a streamer
    out = model.generate(
        input_ids = inputs.input_ids,
        output_scores=True,
        #logits_processor=logits_processor_list,
        max_new_tokens=100,
        streamer = None,
        do_sample = True,
        top_k=3
    )


In [188]:
print(tokenizer.decode(out[0]))

You are a question-answering system that reasons using structured data in the form of triples.
Given an input question, you generate a concise a single answer based on knowledge triples.
Follow this format:

Question: The question to be answered.
Triples for the reasoning process: some triples containing entities, relationships, and values relevant to the question.
Long answer: the reasoning process you followed to reach the answer also based on the triples.
Answer: the concise answer.

Example:
Question: Is Mont Blanc taller than Mount Rainier?
Triples for the reasoning process:
triple: <Mont Blanc> <elevation above sea level> <4,807.02±0.5 metre> .
triple: <Mount Rainier> <elevation above sea level> <4,389 metre> .
Long answer: Basing on the evidence that the elevation above sea level of Mont Blanc (4,807.02±0.5 metres) is greater than the elevation above sea level of Mount Rainier (4,389 metres), Mont Blanc is taller than Mount Rainier.
Answer: Yes, Mont Blanc is taller than Mount R

In [161]:
switch_pattern

[3626, 552, 29901]

In [160]:
out[0][len(inputs.input_ids[0]):]

tensor([ 3626,   552, 29901,   529, 21140, 29887,  1974, 29958,   529,  5479,
        29958,   529, 15654, 29958,   869,    13,  3626,   552, 29901,   529,
        29940,  1979,  5252, 29958,   529, 11466,   292, 29958,   529, 21140,
        29887,  1974, 29958,   869,    13,  3626,   552, 29901,   529, 29931,
         1314,  1590, 18041, 29958,   529, 11466,   292, 29958,   529, 21140,
        29887,  1974, 29958,   869,    13,  3626,   552, 29901,   529, 29954,
          837,  1384, 29958,   529, 11466,   292, 29958,   529, 21140, 29887,
         1974, 29958,   869,    13,  3626,   552, 29901,   529, 29909,   504,
         2849, 29958,   529, 11466,   292, 29958,   529, 21140, 29887,  1974,
        29958,   869,    13,  3626,   552, 29901,   529, 29940,  1979,  5252],
       device='cuda:0')

In [61]:
tokenizer.decode(out[0][len(inputs.input_ids[0]):-1])

'<Belgium> <capital> <Brussels>\n<Brussels> <country> <Belgium>\n<Brussels> <continent> <Europe>\n\nAnswer: Countries close to Belgium include those in the same continent, Europe.\n\nFor the following question, provide a more challenging response:\nQuestion: Which countries share both a border with Belgium and a common language, French or German, while also being part'

In [74]:
tokenizer.decode([8602,529])

'Tri <'

In [70]:
tokenizer('''Triples for the reasoning process:
<Belgium> <capital> <Brussels>
<Brussels> <country> <Belgium>
<Brussels> <continent> <Europe>''')

{'input_ids': [8602, 2701, 363, 278, 24481, 1889, 29901, 13, 29966, 21140, 29887, 1974, 29958, 529, 5030, 2410, 29958, 529, 12432, 1558, 1379, 29958, 13, 29966, 12432, 1558, 1379, 29958, 529, 13509, 29958, 529, 21140, 29887, 1974, 29958, 13, 29966, 12432, 1558, 1379, 29958, 529, 1285, 8946, 29958, 529, 15654, 29958], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [76]:
tokenizer('''
<Belgium> <capital> <Brussels>
<Brussels> <country> <Belgium>
<Brussels> <continent> <Europe>''')

{'input_ids': [29871, 13, 29966, 21140, 29887, 1974, 29958, 529, 5030, 2410, 29958, 529, 12432, 1558, 1379, 29958, 13, 29966, 12432, 1558, 1379, 29958, 529, 13509, 29958, 529, 21140, 29887, 1974, 29958, 13, 29966, 12432, 1558, 1379, 29958, 529, 1285, 8946, 29958, 529, 15654, 29958], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

# Debug


In [173]:
def seq_endswith( seq1, seq2):
    if len(seq2) == 0:
        return False
    subseq1 = seq1[-len(seq2):]
    return subseq1 == seq2

In [170]:
list(range(10))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [172]:
seq_endswith(list(range(10)), [7,8,9])

[7, 8, 9]


True

In [115]:
ctrie.next_tokens([0, 29871, 529])

[8758, 2072, 735]

In [7]:
with open('ctrie_Phi-3-mini-128k-instruct.pickle', 'rb') as fd:
    ctrie_load = pickle.load(fd)

In [9]:
ctrie_load.next_tokens([])

[529, 18252, 6319, 3705, 1533, 3532, 5277, 15271, 20577, 0]

In [14]:
tokenizer.decode(ctrie_load.next_tokens([18252,]))

'DonToMPermTw=>41CPtAnydist>JustLSPABLOXAmLABKpeSchHMatFESHETrO'

In [12]:
tokenizer.decode(ctrie_load.next_tokens([]))

'< <! <? <- </ << <= <> <%<unk>'

# Da dove arrivano i non '<'???

In [116]:
import random
rand = True

In [141]:
seq = [529, 21140]
for i in range(100):
    next_tokens = ctrie_load.next_tokens(seq)

    # choice
    if next_tokens:
        if rand:
            chosen_token = random.choice(next_tokens)
        else:
            chosen_token = next_tokens[0]
        

        seq.append(chosen_token)
    else:
        assert ctrie_load.reached_leaf(seq)
        break

tokenizer.decode(seq)

'<Belouga> <country of registry> <Belize> .'

In [126]:
seq[:10]

[529, 21140, 346, 29905, 29884, 29900, 29906, 29896, 29929, 2034]

# Unicode

In [130]:
unicode_str = '<Belce\\u0219ti> <located in the administrative territorial entity> <Pogone\\u0219ti> .'

In [131]:
print(unicode_str)

<Belce\u0219ti> <located in the administrative territorial entity> <Pogone\u0219ti> .


In [7]:
tokenizer.decode([529,
 3112,
 7003,
 7392,
 833,
 29958,
 529,
 284,
 1397,
 8837,
 29958,
 529,
 29968,
 292,
 3129,
 310,
 12730,
 29958,
 869]
                )

'<Italo Balbo> <allegiance> <Kingdom of Italy> .'

In [9]:
tokenizer.convert_tokens_to_ids(tokenizer.tokenize('<Belgium> <motto> <Unity makes strength> .'))

[529,
 21140,
 29887,
 1974,
 29958,
 529,
 29885,
 9693,
 29958,
 529,
 2525,
 537,
 3732,
 9324,
 29958,
 869]

In [22]:
tokenizer.decode([60000, 529, 2146, 1324, 549, 29887, 2766, 29958, 529, 1491, 1990, 310, 29958, 529, 29886, 3322, 29958, 869])

'<sujeonggwa> <subclass of> <punch> .'

In [40]:
arr = [60000, 6319, 313, 29943, 6617, 5185, 264, 15410, 529, 689, 689, 310, 907, 1230, 664, 29958, 529, 12073, 3769, 29958, 869, 29958,]


In [41]:
tokenizer.decode(arr[1:])

'<? (Fragezeichen)> <formform of creative work> <studio album> .>'

In [21]:
arr[1:]

[18252, 10310, 1089, 29893, 2353, 29934, 29991, 29958, 529]

In [32]:
into = tokenizer("<Belgium>")['input_ids']

In [39]:
print(into.pop(0))
into

IndexError: pop from empty list

In [28]:
tokenizer("<Belgium> <topic's main Wikimedia portal> <Portal:Belgium> .")['input_ids']

[529,
 21140,
 29887,
 1974,
 29958,
 529,
 13010,
 29915,
 29879,
 1667,
 7494,
 25792,
 29958,
 529,
 2290,
 284,
 29901,
 21140,
 29887,
 1974,
 29958,
 869]

In [42]:
tokenizer('<happiness> <different from> <Felicità>')


{'input_ids': [529, 29882, 932, 3335, 29958, 529, 29881, 15622, 515, 29958, 529, 29943, 295, 293, 3943, 29958], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

## Unicode problems
<happiness> <described by source> <Encyclop\u00E6dia Britannica 11th edition> .

In [47]:
happiness_uniproblem_str = '<happiness> <described by source> <Encyclop\u00E6dia Britannica 11th edition> .'
happiness_uniproblem = [529, 29882, 932, 3335, 29958, 529, 2783, 23059, 491, 2752, 29958, 529, 2369, 8798, 4757, 29905, 29884, 29900, 29900, 29923, 29953, 15321, 18940, 29871, 29896, 29896, 386, 12203, 29958, 869]

In [48]:
print(happiness_uniproblem_str)

<happiness> <described by source> <Encyclopædia Britannica 11th edition> .


In [55]:
happiness_uniproblem_str.encode('latin1').decode('unicode_escape')

'<happiness> <described by source> <Encyclopædia Britannica 11th edition> .'

In [52]:
import codecs

In [53]:
codecs.decode(happiness_uniproblem_str, 'unicode_escape')

'<happiness> <described by source> <EncyclopÃ¦dia Britannica 11th edition> .'

In [44]:
tokenizer.decode(happiness_uniproblem)

'<happiness> <described by source> <Encyclop\\u00E6dia Britannica 11th edition> .'

In [45]:
tokenizer.convert_ids_to_tokens(happiness_uniproblem)

['▁<',
 'h',
 'app',
 'iness',
 '>',
 '▁<',
 'des',
 'cribed',
 '▁by',
 '▁source',
 '>',
 '▁<',
 'En',
 'cyc',
 'lop',
 '\\',
 'u',
 '0',
 '0',
 'E',
 '6',
 'dia',
 '▁Britannica',
 '▁',
 '1',
 '1',
 'th',
 '▁edition',
 '>',
 '▁.']

In [67]:
with open('/workspace/data/props.json') as fd:
    obj = json.load(fd)

In [60]:
import json
obj = json.loads(content)

In [63]:
for i,ob in enumerate(obj):
    if 'Russian Literature' in ob['description']:
        print(i, ob)
        break

10 {'datatype': 'external-id', 'id': 'P11322', 'label': '18th Century Russian Dictionary ID', 'description': 'identifier for a lexeme in the Словарь русского языка XVIII века (1984-1991) as hosted on the Fundamental Electronic Library of Russian Literature and Folklore', 'aliases': [], 'types': []}


In [68]:
obj[10]['description']

'identifier for a lexeme in the Словарь русского языка XVIII века (1984-1991) as hosted on the Fundamental Electronic Library of Russian Literature and Folklore'

## Verify titles conflicts

In [111]:
import pickle
with open('/workspace/data/wikidata_titles_mapping.pickle', 'rb') as fd:
    title_mappings = pickle.load(fd) 

In [112]:
list(title_mappings.keys())[0]

6199

In [113]:
inverted_mapping = {}

In [114]:
for k,v in title_mappings.items():
    if v not in inverted_mapping:
        inverted_mapping[v] = []
    inverted_mapping[v].append(k)

In [115]:
len(title_mappings)

5846104

In [116]:
for i,(k,v) in enumerate(inverted_mapping.items()):
    if len(v) > 0:
        print(i, (k,v))
        break
#found categories, portal, template. can I remove?

0 ('Anarchism', [6199])


In [117]:
stats = [len(v) for v in inverted_mapping.values()]

In [118]:
max(stats)

1

In [119]:
min(stats)

1

In [120]:
sum(stats) / len(stats)

1.0

In [121]:
import numpy as np

In [122]:
np.median(stats)

np.float64(1.0)

In [123]:
np.quantile(stats, 0.97), np.quantile(stats, 0.98)

(np.float64(1.0), np.float64(1.0))

In [124]:
len(stats), sum(l for l in stats if l > 1)

(5846104, 0)

In [125]:
ambiguous = {k:v for k,v in inverted_mapping.items() if len(v) > 1}

In [126]:
ambiguous

{}