In [1]:
from langchain_core.documents.base import Document
from nltk.tokenize.punkt import PunktSentenceTokenizer
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from glob import glob
from tqdm import tqdm
import numpy as np

def read_sentences(mddfile: str):
    """Split a text/Markdown file into sentences.

    The file is decoded as UTF-8 and empty lines are dropped. A Punkt
    tokenizer is then built from the remaining text and used to segment that
    same text.

    Args:
        mddfile: Path of the file to read.

    Returns:
        list[str]: The sentences in document order. Newlines that fall inside
        a sentence are replaced by single spaces.
    """
    # Explicit encoding: the platform default (e.g. cp1252 on Windows) would
    # mis-decode or reject the non-ASCII punctuation found in the corpus.
    with open(mddfile, encoding='utf-8') as handle:
        lines = [line for line in handle.read().split('\n') if len(line) > 0]
    # Build the cleaned text once; it is both the tokenizer's training text
    # and the text that gets segmented.
    text = '\n'.join(lines)
    sentence_tokenizer = PunktSentenceTokenizer(text)
    return [
        sentence.replace('\n', ' ')
        for sentence in sentence_tokenizer.sentences_from_text(text)
    ]

def read_docs(source):
    """Wrap every sentence of a file in a ``Document``.

    Args:
        source: Path of the file; it is handed to ``read_sentences``.

    Returns:
        A list with one ``Document`` per sentence, in order. Each document's
        ``metadata`` holds the file path under ``'source'`` and the sentence's
        zero-based position under ``'sentence_idx'``.
    """
    return [
        Document(text, metadata={'source': source, 'sentence_idx': position})
        for position, text in enumerate(read_sentences(source))
    ]

def update_db(mddpath: str, dbpath: str, model_name: str = "joe32140/ModernBERT-base-msmarco"):
    """Embed the sentences of every ``*.md`` file in a directory into Chroma.

    Args:
        mddpath: Directory searched (non-recursively) for ``*.md`` files.
        dbpath: Directory handed to Chroma as ``persist_directory``.
        model_name: Hugging Face model used to compute the embeddings.

    Returns:
        The ``Chroma`` instance after the sentences of all files were added.
    """
    hf = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': False},
    )
    db = Chroma(persist_directory=dbpath, embedding_function=hf)
    # sorted(): glob() returns paths in arbitrary order; a fixed order makes
    # the progress bar and the insertion order reproducible.
    for path in tqdm(sorted(glob(f'{mddpath}/*.md'))):
        docs = read_docs(path)
        if not docs:
            # A file without any sentence would send an empty batch to the store.
            continue
        # Deterministic ids (file path + sentence index) so that re-running
        # this function overwrites existing entries instead of inserting the
        # same sentences again under fresh random ids.
        ids = [
            f"{doc.metadata['source']}:{doc.metadata['sentence_idx']}"
            for doc in docs
        ]
        db.add_documents(docs, ids=ids)
    return db

In [9]:
# Index the Markdown files found in ../md into the Chroma store persisted at 'bib'.
db = update_db(mddpath='../md', dbpath='bib')

100%|███████████████████████████████████████████| 11/11 [06:19<00:00, 34.49s/it]


In [11]:
# The question is used for retrieval here and is inserted into the LLM prompt later.
query = 'What are the sensory modalities that contribute to self-motion perception, gait, and balance function?'
# Retrieve the 30 closest sentences together with their relevance scores.
results = db.similarity_search_with_relevance_scores(query, k=30)

  results = db.similarity_search_with_relevance_scores(


In [None]:
# Element 0 of each result carries the sentence text (`page_content`);
# glue all retrieved sentences into one space-separated string.
context = ' '.join(hit[0].page_content for hit in results)
context

In [34]:
import os

from huggingface_hub import login

# Never commit an access token: read it from the HF_TOKEN environment variable.
# When the variable is unset, token=None makes login() ask for it interactively.
# NOTE(security): a literal token used to be hard-coded in this cell; treat it
# as leaked and revoke it in the Hugging Face account settings.
login(token=os.environ.get('HF_TOKEN'))

In [None]:
import transformers
import torch

# Checkpoint used for answer generation.
model_id = "meta-llama/Meta-Llama-3-8B"

# Text-generation pipeline with bfloat16 weights; device_map="auto" leaves
# device placement to the library.
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)

# The prompt embeds the retrieved sentences (`context`) and the question
# (`query`) between plain-text delimiters.
out = pipeline(
    f"""
Answer the user query based on the following information:

--- INFO BEGIN
{context}
--- INFO END

--- QUERY BEGIN
{query}
--- QUERY END

--- RESPONSE BEGIN
"""
)

In [None]:
# Show the raw pipeline return value. Keep this a bare expression: IPython
# stores a bare last expression in `_` / `Out[n]`, which print()/display()
# would not do.
out

In [18]:
# `out` is already bound by the text-generation cell. Do not rebind it from
# IPython's `_` (the most recently displayed value): that silently picks up an
# unrelated object whenever some other cell was displayed last.
assert out, "`out` is empty - re-run the text-generation cell"

In [24]:
# print() instead of a bare expression so the newlines inside the generated
# text are rendered rather than shown as escaped '\n'.
generated_text = out[0]['generated_text']
print(generated_text)


Answer the user query based on the following information:

--- INFO BEGIN
For instance, vestibular and visual signals are integrated for perception of selfmotion. In such instances, the vestibular system, primarily involved in regulating balance and coordination during self‐motion, also contributes to multisensory integration, providing information signaling an unresolved conflict between vision (“I see motion”) and proprioception (“I feel I am not moving”), which often results in motion sickness (Bertolini & Straumann, 2016). Furthermore, these impaired visual cues of self-motion are overweighted when integrated with largely intact vestibular cues, leading to suboptimal multisensory integration. Self-motion perception relies primarily on vestibular and visual (optic flow) cues (Dichgans and Brandt, 1978; Warren and Hannon, 1988; Fushiki et al., 2005; Gu et al., 2007; Fetsch et al., 2009, 2010; Butler et al., 2010; Zaidel et al., 2015), as well as other somatosensory cues, such as pro