In [1]:
import bs4
from langchain_ollama import OllamaLLM
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import WebBaseLoader

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [12]:
llm = OllamaLLM(model = "deepseek-r1:7b", temperature=0)
embeding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

In [13]:
web_paths = ['https://medlineplus.gov/ency/article/000033.htm',
             'https://pmc.ncbi.nlm.nih.gov/articles/PMC10503338/',
             'https://www.ncbi.nlm.nih.gov/books/NBK537235/',
             'https://medicaljournalssweden.se/actadv/article/view/11592/19144',
             'https://emedicine.medscape.com/article/769067-overview?form=fpf',
             'https://www.mayoclinic.org/first-aid/first-aid-insect-bites/basics/art-20056593',
             'https://www.medicalnewstoday.com/articles/174229#reactions',
             'https://www.aafp.org/pubs/afp/issues/2022/0800/arthropod-bites-stings.html',
             'https://wwwnc.cdc.gov/travel/page/avoid-bug-bites']

loader = WebBaseLoader(
    web_paths=web_paths,
    bs_kwargs = dict(
        parse_only = bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
documents = loader.load()

In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000,
                                              chunk_overlap=200,
                                             add_start_index=True)

all_splits = text_splitter.split_documents(documents=documents)

In [5]:
vector_store = FAISS.from_documents(all_splits, embeding)

In [14]:
from langchain_core.prompts import ChatPromptTemplate
template = '''
# Role
You are an medical AI assistant focused on Question-Answering (QA) tasks within a Retrieval-Augmented Generation (RAG) system.
Your primary goal is to provide precise answers based on the given context or chat history.

# Instruction
Provide a concise, logical answer by organizing the selected content into coherent paragraphs with a natural flow. 
Avoid merely listing information. Include key numerical values, technical terms, jargon, and names. 
DO NOT use any outside knowledge or information that is not in the given material.

# Constraint
- Review the provided context thoroughly and extract key details related to the question.
- Craft a precise answer based on the relevant information.
- Keep the answer concise but logical/natural/in-depth.
- If the retrieved context does not contain relevant information or no context is available, respond with: 'I can't find the answer to that question in the context.'

**Source** (Optional)
- Cite the source of the information as a file name with a page number or URL, omitting the source if it cannot be identified.
- (list more if there are multiple sources)

# Question
<question>
{question}
</question>

# Context
<retrieved context>
{context}
</retrieved context>
# Answer
'''

prompt = ChatPromptTemplate.from_template(template)



In [15]:
from langchain.globals import set_debug , set_verbose

set_debug(False)
set_verbose(True)

In [16]:
from langchain_core.output_parsers import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough

retriver = vector_store.as_retriever()
parser = StrOutputParser()

chain = (
    {"question":RunnablePassthrough(), "context" : retriver  }
    | prompt
    | llm
    | parser
)

response = chain.invoke("What are Lice?")
print(response)

<think>
Okay, so I need to figure out what lice are based on the given context. Let me start by looking at the retrieved context provided.

The context is from a document with ID '1e5ae834-ee2e-4c82-b775-1392cbddb737' and it's from the NCBI Bookshelf, specifically page number 0. The copyright year isn't specified here, but it mentions that the book is distributed under CC BY-NC-ND 4.0, which allows distribution for non-commercial use with proper attribution.

The content itself doesn't directly talk about lice. It's just a statement about the source and distribution terms of a book. There's no information provided on what lice are, their characteristics, or any related details.

Since there's no relevant information in the context about lice, I can't extract any key facts from it. Therefore, according to the instructions, if the context doesn't contain the necessary information, I should respond that I can't find the answer based on the given material.
</think>

I can't find the answer

In [None]:
from typing_extensions import TypedDict, List
from langchain_core.documents import Document

class State(TypedDict)
question : str
context : List[Document]
