In [59]:
import json
from itertools import chain

import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
from tqdm.auto import tqdm

from docuverse.utils.embeddings.ollama_embedding_function import OllamaSentenceTransformer
from docuverse.utils.jsonl_utils import get_nested_field

In [None]:
# Load the precomputed embeddings dump that later cells read via data['embeddings'].
# The context manager closes the file handle (the bare open() never did), and the
# explicit encoding keeps the read independent of the platform default.
with open("../../dd-out.json", encoding="utf-8") as f:
    data = json.load(f)

In [7]:
def normalize(list):
    """Scale every row of a 2-D collection of vectors to unit L2 norm.

    Args:
        list: Anything ``np.array`` can turn into a 2-D array, one vector per
            row. (The name shadows the builtin; it is kept so existing
            callers keep working.)

    Returns:
        np.ndarray: Array of the same shape whose rows are divided by their
        own Euclidean norm.
    """
    matrix = np.array(list)
    # keepdims=True leaves the norms as a column so the division broadcasts per row.
    row_norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    return matrix / row_norms



In [8]:
# Row-normalise the stored embeddings of both backends: `oll` comes from the
# 'ollama' entry, `st` from the 'pytorch' entry of data['embeddings'].
oll, st = (
    normalize(data['embeddings'][backend])
    for backend in ('ollama', 'pytorch')
)

In [17]:
# Pairwise similarity between row i of `oll` and row i of `st`;
# each entry is the 1x1 tensor returned by cos_sim.
res = [cos_sim(oll_vec, st[idx]) for idx, oll_vec in enumerate(oll)]

In [20]:
sorted(res)[:10]

[tensor([[-0.3437]], dtype=torch.float64),
 tensor([[-0.3162]], dtype=torch.float64),
 tensor([[-0.2997]], dtype=torch.float64),
 tensor([[-0.2814]], dtype=torch.float64),
 tensor([[-0.2721]], dtype=torch.float64),
 tensor([[-0.1298]], dtype=torch.float64),
 tensor([[-0.0215]], dtype=torch.float64),
 tensor([[0.1907]], dtype=torch.float64),
 tensor([[0.2177]], dtype=torch.float64),
 tensor([[0.2669]], dtype=torch.float64)]

In [22]:
np.argmin(res)

np.int64(24)

In [23]:
res[24]

tensor([[-0.3437]], dtype=torch.float64)

In [27]:
# Encoder built from docuverse's OllamaSentenceTransformer wrapper, configured with
# the model name "granite-r2"; run_comp() compares its output against stModel.
ollModel = OllamaSentenceTransformer(model_name="granite-r2")

In [29]:
# Reference encoder loaded through sentence-transformers, with dtype=torch.bfloat16
# forwarded via model_kwargs. `torch` is already imported in the notebook's import
# cell, so it is not re-imported here.
stModel = SentenceTransformer(model_name_or_path="ibm-granite/granite-embedding-english-r2", model_kwargs={"dtype": torch.bfloat16})

In [30]:
text="In astronomy, axial precession is a gravity-induced, slow, and continuous change in the orientation of an astronomical body's rotational axis. In particular, it can refer to the gradual shift in the orientation of Earth's axis of rotation, which, similar to a wobbling top, traces out a pair of cones joined at their apices in a cycle of approximately 26,000 years. The term \"precession\" typically refers only to this largest part of the motion; other changes in the alignment of Earth's axis -- nutation and polar motion -- are much smaller in magnitude."

In [147]:
def _as_float32(vec):
    """Return ``vec`` as float32 so both embeddings reach ``cos_sim`` with one dtype.

    Objects exposing ``astype`` (NumPy arrays) are cast with ``.astype`` and
    torch tensors with ``.to``. Anything else (e.g. a plain list of floats) is
    converted to a float32 tensor instead of being passed through uncast.
    """
    if hasattr(vec, 'astype'):  # NumPy array
        return vec.astype('float32')
    if torch.is_tensor(vec):
        return vec.to(torch.float32)
    return torch.as_tensor(vec, dtype=torch.float32)


def run_comp(text):
    """Embed ``text`` with both models and return their cosine similarity.

    Args:
        text: The string passed to ``ollModel.encode`` and ``stModel.encode``.

    Returns:
        float: Element ``[0][0]`` of ``cos_sim`` applied to the two embeddings
        after both have been cast to float32.
    """
    # Cast both sides to float32 so the similarity is computed on matching dtypes.
    v1 = _as_float32(ollModel.encode(text))
    v2 = _as_float32(stModel.encode(text))
    return float(cos_sim(v1, v2)[0][0])

In [148]:
run_comp(text)

0.9999679327011108

In [41]:
text1="The oligodynamic effect (from Greek oligos \"few\", and dynamis \"force\") is a biocidal effect of metals, especially heavy metals, that occurs even in low concentrations. The effect was discovered by Karl Wilhelm von Nägeli, although he did not identify the cause. Brass doorknobs and silverware both exhibit this effect to an extent."

In [42]:
run_comp(text1)

tensor([[0.9999]])

In [68]:
benchmark_file = "../../benchmark/nq_new/nq-dev-fixed.jsonl"

# Read the JSONL benchmark one record per line. The context manager closes the
# file, UTF-8 is explicit because the passages contain many non-Latin scripts,
# and blank lines (e.g. a trailing newline) are skipped instead of crashing json.loads.
with open(benchmark_file, encoding="utf-8") as fh:
    bb = [json.loads(line) for line in fh if line.strip()]

# get_nested_field(record, 'contexts[*].text') is treated as an iterable of
# passage texts per record (TODO confirm against docuverse); flatten them into
# one list of passages.
texts = list(
    chain.from_iterable(get_nested_field(record, 'contexts[*].text') for record in bb)
)

In [151]:
# Compare both encoders on (up to) the first 1000 passages. Iterating over a slice
# instead of indexing range(1000) avoids an IndexError when fewer passages were
# loaded. The name `res100` is kept because later cells refer to it.
res100 = np.array([run_comp(passage) for passage in tqdm(texts[:1000])])

  0%|          | 0/1000 [00:00<?, ?it/s]

In [150]:
res100

array([ 0.99995714,  0.99993467,  0.99995053,  0.99994773,  0.99993986,
        0.99995679,  0.99986058,  0.99995381,  0.9999333 ,  0.99996883,
        0.99996638,  0.4451288 ,  0.99991632,  0.99996555,  0.99996054,
        0.99997282,  0.99996179,  0.99992752,  0.99993724,  0.99988806,
        0.99995065,  0.99994528,  0.99993473,  0.99991757, -0.34318107,
        0.99994534,  0.99995559,  0.99992865,  0.99996847,  0.99987739,
        0.99995285,  0.99995077,  0.99993616,  0.9998858 ,  0.99996793,
        0.99993086,  0.99994761,  0.99995065,  0.99996859,  0.99996382,
        0.99996066,  0.99994344,  0.99996161,  0.99998254,  0.99997085,
        0.48569396,  0.99995476,  0.99993914,  0.99997675,  0.99994206,
        0.99990821,  0.99996877,  0.99994886,  0.21583235,  0.36815697,
        0.48784962,  0.99995935,  0.99994344,  0.99994099,  0.99997514,
        0.99997771,  0.99971235,  0.99991357,  0.9999696 ,  0.99996793,
        0.99986571,  0.99988735,  0.9999553 ,  0.99995756,  0.99

In [72]:
min(res100)

array([[-0.34318107]], dtype=float32)

In [73]:
np.argmin(res100)

np.int64(24)

In [75]:
texts[24]

'South Asian Association for Regional Cooperation (SAARC) (show) Bengali: দক্ষিণ এশীয় আঞ্চলিক সহযোগিতা সংস্থা Dari: اتحادیه همکاری\u200cهای منطقه\u200cای جنوب آسی Dzongkha: ༄ ལྷོ ཨེསིཨ ་ རེ ་ གིཨོནལ ་ ཅོཨོཔེརཏིཨོན ་ ཀོ ་ མི ་ ཏི ། Hindi: दक्षिण एशियाई क्षेत्रीय सहयोग संगठन Maldivian: ދެކުނު އޭޝިޔާގެ ސަރަޙައްދީ އެއްބާރުލުމުގެ ޖަމިއްޔާ Nepali: दक्षिण एशियाली क्षेत्रीय सहयोग सङ्गठन Pashto: د سویلي اسیا لپاره د سیمه ایزی همکارۍ ټولنه Sinhalese: දකුණු ආසියාතික කලාපීය සහයෝගිතා සංවිධානය Tamil: தெற்காசிய நாடுகளின் பிராந்தியக் கூட்டமைப்பு Urdu: جنوبی ایشیائی علاقائی تعاون کی تنظیم \u202c Logo Member states Observer states Headquarters Kathmandu Official languages English Demonym South Asian Member states 8 members (show) Afghanistan Bangladesh Bhutan India Maldives Nepal Pakistan Sri Lanka 9 observers (show) Australia China EU Iran Japan Mauritius Myanmar South Korea United States Leaders Secretary-General Amjad B. Hussain Establishment 8 December 1985 Area Total 5,099,611 km (1,968,971 sq mi)

In [79]:
toks = stModel.tokenizer(texts[24])

In [80]:
tt = stModel.tokenizer.decode(toks['input_ids'])

'[CLS]South Asian Association for Regional Cooperation (SAARC) (show) Bengali: দক্ষিণ এশীয় আঞ্চলিক সহযোগিতা সংস্থা Dari: اتحادیه همکاری\u200cهای منطقه\u200cای جنوب آسی Dzongkha: ༄ ལྷོ ཨེསིཨ ་ རེ ་ གིཨོནལ ་ ཅོཨོཔེརཏིཨོན ་ ཀོ ་ མི ་ ཏི ། Hindi: दक्षिण एशियाई क्षेत्रीय सहयोग संगठन Maldivian: ދެކުނު އޭޝިޔާގެ ސަރަޙައްދީ އެއްބާރުލުމުގެ ޖަމިއްޔާ Nepali: दक्षिण एशियाली क्षेत्रीय सहयोग सङ्गठन Pashto: د سویلي اسیا لپاره د سیمه ایزی همکارۍ ټولنه Sinhalese: දකුණු ආසියාතික කලාපීය සහයෝගිතා සංවිධානය Tamil: தெற்காசிய நாடுகளின் பிராந்தியக் கூட்டமைப்பு Urdu: جنوبی ایشیائی علاقائی تعاون کی تنظیم \u202c Logo Member states Observer states Headquarters Kathmandu Official languages English Demonym South Asian Member states 8 members (show) Afghanistan Bangladesh Bhutan India Maldives Nepal Pakistan Sri Lanka 9 observers (show) Australia China EU Iran Japan Mauritius Myanmar South Korea United States Leaders Secretary-General Amjad B. Hussain Establishment 8 December 1985 Area Total 5,099,611 km (1,968,971 s

In [89]:
run_comp("  ")

tensor([[0.9307]])

In [90]:
toks = stModel.tokenizer("  ")

In [97]:
toks=stModel.tokenizer(texts[24])

In [99]:
len(toks['input_ids'])

954

In [174]:
def toklen(text, return_tokens=False):
    """Tokenize ``text`` with ``stModel.tokenizer`` and report its token ids.

    Args:
        text: Input handed to the tokenizer.
        return_tokens: When true, return the ``input_ids`` themselves;
            otherwise return how many there are.

    Returns:
        The ``input_ids`` value if ``return_tokens`` is true, else its length.
    """
    ids = stModel.tokenizer(text)['input_ids']
    if return_tokens:
        return ids
    return len(ids)

In [117]:
toklen(texts[24][:400])

427

In [121]:
run_comp(texts[24][400:500])

tensor([[0.9871]])

In [125]:
run_comp("'سیمه ایزی همکارۍ ټولنه")

tensor([[0.9999]])

In [138]:
run_comp(texts[24][:500])

tensor([[-0.3001]])

In [152]:
# (index, similarity) pairs ordered from the lowest similarity upwards.
sres1000 = sorted(enumerate(res100), key=lambda pair: pair[1])

In [154]:
sres1000[:10]

[(24, np.float64(-0.3431810736656189)),
 (593, np.float64(-0.3207738697528839)),
 (634, np.float64(-0.3013763129711151)),
 (340, np.float64(-0.28002917766571045)),
 (318, np.float64(-0.2735687792301178)),
 (609, np.float64(-0.12954631447792053)),
 (374, np.float64(-0.021807696670293808)),
 (182, np.float64(0.19155991077423096)),
 (53, np.float64(0.21583235263824463)),
 (238, np.float64(0.2670695185661316))]

In [164]:
texts[634][:3000]

'No. overall No. in season Title Directed by Written by Original air date Prod. code U.S. viewers (millions) 70 "The Flash Reborn" Glen Winter Story by: Andrew Kreisberg Teleplay by: Todd Helbing & Eric Wallace October 10, 2017 (2017-10-10) T27. 13401 2.84 Iris has been aiding Team Flash in Central City for six months, but refuses to grieve Barry. A flying samurai with superpowers appears in Central City, threatening to destroy the city if the real Flash does not face him. Cisco reveals he has formulated a way to bring back Barry without destabilizing the Speed Force and tracks down Caitlin for help, but is forbidden by Iris to bring Barry back. Against Iris\' orders, Team Flash successfully returns Barry, who rambles random statements and continually writes symbols on the walls. Wally engages the samurai, but is defeated. Cisco deciphers Barry\'s writings and finds an apparently meaningless sentence. In an attempt to recover Barry\'s memories, Iris gives herself up to the samurai. The

In [179]:
run_comp(" ")

0.9896807670593262