In [1]:
from pathlib import Path
from biokb.llm import AIRLLM
from biokb.embedding import AIREmbedding, AIRPubmedSearch
from biokb.helpers import get_generation_config
from biokb.utils import get_file_names, create_documents_from_text_files, get_logger

2024-10-13 11:23:13,723 - air-llm - INFO - Starting AIR llm...


In [2]:
from biokb.settings import MODEL_NAME, EMBEDDING_MODEL_NAME, DB_DIR, DATA_DIR

In [3]:
logger = get_logger(debug=True)
logger.info("Starting the AIR Agent")

2024-10-13 11:23:15,189 - air-rag - INFO - Starting the AIR Agent


In [4]:
embedding_llm = AIREmbedding(
    model_name=EMBEDDING_MODEL_NAME
)

In [5]:
if Path(DB_DIR).exists():
    docstore = AIRPubmedSearch.load(
        DB_DIR, 
        embedding_llm=embedding_llm,
    )
else:
    files = get_file_names(DATA_DIR)
    documents = create_documents_from_text_files(files)
    print(f"Number of documents: {len(documents)}")
    docstore = AIRPubmedSearch(
        documents=documents,
        embedding_llm=embedding_llm,
    )
    docstore.build(DB_DIR)

In [6]:
generation_config = get_generation_config()

llm = AIRLLM(
    model_name=MODEL_NAME,
    generation_config=generation_config,
    logger=logger
)

2024-10-13 11:23:18,417 - air-rag - INFO - Loading model: meta-llama/Meta-Llama-3.1-8B-Instruct
2024-10-13 11:23:18,418 - air-rag - INFO - Loading model in 4-bit mode
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

2024-10-13 11:23:27,787 - air-rag - INFO - Model loaded: meta-llama/Meta-Llama-3.1-8B-Instruct


In [7]:
docstore.search(
    llm = llm,
    query="cancer",
)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


('Melatonin sensitizes hepatocellular carcinoma cells to chemotherapy through long non-coding RNA RAD51-AS1-mediated suppression of DNA repair. \n\nThe final answer is Melatonin sensitizes hepatocellular carcinoma cells to chemotherapy through long non-coding RNA RAD51-AS1-mediated suppression of DNA repair..</p>\n<p>2023-02-06</p> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div> </div>',
 [Document(page_content='1. Cancers (Basel). 2018 Sep 10;10(9):320. doi: 10.3390/cancers10090320.\n\nMelatonin Sensiti

In [8]:
docstore.search(
    llm = llm,
    query="p53 and cancer",
)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


('The p53 pathway plays a crucial role in regulating the biological behavior and determining the fate of tumor cells in non-small cell lung cancers (NSCLCs). Mutations of p53, a reduced p14 alternate reading frame expression, a reduced herpesvirus-associated ubiquitin-specific protease expression, or a reduced p33 inhibitor of growth gene1b expression are common in NSCLCs. The balance of expression of p53 target genes, such as p21, Bax, and PUMA, regulates the biological behavior and determines the fate of tumor cells. Many studies on cancer gene therapy using these molecules associated with the p53 pathway have been performed to develop new strategies for treating NSCLC patients. Therefore, the establishment of a comprehensive and simple evaluation protocol for the p53 pathway is required for clinical use. Additionally, the p53 tumor suppressor gene has been shown to regulate expression of some metastasis-related genes, including epidermal growth factor receptor, matrix metalloprotein

: 