In [44]:
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
# from langchain.embeddings import OpenAIEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
import openai 
from dotenv import load_dotenv
import os
import shutil
import boto3
from langchain.embeddings import HuggingFaceInstructEmbeddings

In [45]:
# Load variables from a .env file into the process environment so that the
# os.environ[...] lookups in the following cells can find them.
load_dotenv()

True

In [46]:
# Each os.environ[...] lookup below raises KeyError if the variable is not set.
api= os.environ['OPENAI_API_KEY']
# Copy the project-specific ACCESS / SECRET variables into the AWS_* names.
# NOTE(review): presumably so AWS SDK calls can pick them up from the
# environment — confirm this is still needed given the explicit keys passed
# when the bedrock client is created.
os.environ['AWS_ACCESS_KEY_ID'] = os.environ['ACCESS']
os.environ['AWS_SECRET_ACCESS_KEY'] = os.environ['SECRET']
os.environ['AWS_DEFAULT_REGION'] = 'us-east-1'
# NOTE(review): `api` above is read from OPENAI_API_KEY, but the key actually
# installed on the openai module comes from OPENAI_API_KEY_SERVICE — confirm
# which variable is intended. `api` is not used in the cells visible here.
openai.api_key= os.environ['OPENAI_API_KEY_SERVICE']

In [47]:
# Directory of the persisted Chroma vector store (used as persist_directory).
# NOTE(review): the recorded ingestion output reports "chromaPathology", not
# this path, so that run appears to predate this value — confirm the store at
# this path was built with the current embedding function.
CHROMA_PATH = "chromaPathology-Copy1"
# Directory handed to PyPDFDirectoryLoader in load_documents().
DATA_PATH = "data/books"

In [48]:
# Bedrock runtime client, authenticated with the ACCESS / SECRET environment
# variables. It is shared by the embedding function and the LLM below.
bedrock = boto3.client(
    "bedrock-runtime",
    aws_access_key_id=os.environ["ACCESS"],
    aws_secret_access_key=os.environ["SECRET"],
)

In [49]:
from langchain_community.embeddings.bedrock import BedrockEmbeddings
from langchain_huggingface.embeddings.huggingface import HuggingFaceEmbeddings


def get_embedding_function():
    """Return the embedding model used for both indexing and querying.

    Builds a BedrockEmbeddings instance on the module-level ``bedrock`` client
    with the "amazon.titan-embed-text-v2:0" model.

    Earlier experiments (kept here for reference only) used
    HuggingFaceEmbeddings("sentence-transformers/all-mpnet-base-v2") and
    OllamaEmbeddings("nomic-embed-text").
    """
    return BedrockEmbeddings(
        client=bedrock,
        model_id="amazon.titan-embed-text-v2:0",
    )

In [9]:
def save_to_chroma(chunks: list[Document]):
    """Rebuild the Chroma store at CHROMA_PATH from ``chunks``.

    Any existing directory at CHROMA_PATH is deleted, then a new store is
    created from the chunks with get_embedding_function() and persisted to
    that same directory.

    Args:
        chunks: Document chunks to embed and store. Must be non-empty.

    Raises:
        ValueError: If ``chunks`` is empty. The check runs before anything is
            deleted, so an existing store is never wiped without a replacement.
    """
    if not chunks:
        raise ValueError(
            "No chunks to save; leaving the existing Chroma store untouched."
        )

    # Clear out the database first.
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    # Create a new DB from the documents.
    db = Chroma.from_documents(
        chunks, get_embedding_function(), persist_directory=CHROMA_PATH
    )
    # NOTE(review): the recorded run of this pipeline emitted a deprecation
    # warning; recent Chroma releases may persist automatically, making this
    # call redundant — confirm against the installed version before removing.
    db.persist()
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")

In [13]:
def split_text(
    documents: list[Document],
    chunk_size: int = 1000,
    chunk_overlap: int = 350,
    preview_index: int = 10,
):
    """Split loaded documents into overlapping character-based chunks.

    Args:
        documents: Documents to split.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Number of characters shared between adjacent chunks.
        preview_index: Index of one chunk to print as a sanity check. The
            preview is skipped when no chunk exists at that index.

    Returns:
        The chunks produced by RecursiveCharacterTextSplitter.split_documents.
        ``add_start_index=True`` is set, and the recorded output shows a
        ``start_index`` entry in each chunk's metadata.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    # The original indexed chunks[10] unconditionally, which raises IndexError
    # for small inputs; only preview when that chunk actually exists.
    if 0 <= preview_index < len(chunks):
        document = chunks[preview_index]
        print(document.page_content)
        print(document.metadata)
    else:
        print(f"No chunk at index {preview_index} to preview.")

    return chunks

In [14]:
def load_documents():
    """Load the PDFs under DATA_PATH and return the resulting documents.

    Delegates to PyPDFDirectoryLoader; the recorded run shows one document per
    PDF page (metadata carries ``source`` and ``page``).
    """
    return PyPDFDirectoryLoader(DATA_PATH).load()

In [16]:
def generate_data_store():
    """Run the ingestion pipeline end to end: load, split, then store.

    Prints a progress line after loading and after splitting; the final step
    hands the chunks to save_to_chroma().
    """
    loaded_docs = load_documents()
    print("documents loaded successfully")

    doc_chunks = split_text(loaded_docs)
    print("document splitting successfull")

    save_to_chroma(doc_chunks)

In [17]:
# Run the ingestion pipeline: load the PDFs, split them into chunks and
# rebuild the Chroma store at CHROMA_PATH (any existing store is deleted).
generate_data_store()

documents loaded successfully
Split 1504 documents into 3791 chunks.
Harvard Medical School; 
Director, Cardiac Pathology and Executive Vice Chairman, Department of Pathology, Brigham and Women's Hospital, Boston, MA Blood Vessels;  The Heart
Klaus Sellheyer MD Assistant Professor of Pathology, 
Thomas Jefferson University; Attending Dermatopathologist, Jefferson Medical College, Philadelphia, PA The Skin
Arlene H. Sharpe MD, PhD 
Professor of Pathology, 
Harvard Medical School; Chief, Immunology Research Division, Department of Pathology, Brigham and Women's Hospital, Boston, MA Infectious Diseases
Robb E. Wilentz MD Voluntary Faculty, 
Department of Dermatology, University of Miami School of Medicine; Laboratory Director, Division of Pathology, Skin and Cancer Associates, Miami, FL The Pancreas
IX
{'source': 'data\\books\\Robbins-Pathologic2005.pdf', 'page': 8, 'start_index': 0}
document splitting successfull
Saved 3791 chunks to chromaPathology.


  warn_deprecated(


# testing the response

In [50]:
# Open the persisted store for querying, using the same embedding function
# that the ingestion step uses.
db = Chroma(
    persist_directory=CHROMA_PATH,
    embedding_function=get_embedding_function(),
)

In [51]:
# Question used to exercise retrieval and, later, the LLM prompt.
query_text= "what is chemotherapy?"
# Fetch the 5 best-matching chunks; each entry is a (Document, relevance score)
# pair, as the recorded output below shows.
results = db.similarity_search_with_relevance_scores(query_text, k=5)



In [52]:
# Display the raw (Document, score) pairs for inspection.
results

[(Document(metadata={'page': 305, 'source': 'data\\books\\Robbins-Pathologic2005.pdf', 'start_index': 747}, page_content='emerged: cancer mortality for both men and women in the United States declined during the last decade of the 20th century.[4] Thus, there has been progress, but the problem is still \noverwhelming. The discussion that follows deals with both benign tumors and cancers; the latter receive more attention. The focus is on the basic morphologic and biologic properties of \ntumors and on the present understanding of the molecular basis of carcinogenesis. We also discuss the interactions of the tumor with the host and the host response to tumors. Although the discussion of therapy is beyond the scope of this chapter, there are now dramatic improvements in therapeutic responses and 5-year survival rates with many forms of malignancy, notably the leukemias and lymphomas. A greater proportion of cancers is being cured or arrested today than ever before.\nDefinitions'),
  -0.0

In [53]:
# Report a miss when nothing was retrieved or the top hit scores below 0.7.
if not results or results[0][1] < 0.7:
    print("Unable to find matching results.")

Unable to find matching results.


In [54]:
# Prompt skeleton for the retrieval-augmented answer. The {context} and
# {question} placeholders are filled in by the prompt-template cells below.
PROMPT_TEMPLATE= """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

In [55]:
from langchain_core.prompts import ChatPromptTemplate

# Stitch the retrieved passages into one context string, separated by a
# horizontal rule, and render the chat prompt so it can be inspected.
context_text = "\n\n---\n\n".join(
    passage.page_content for passage, _relevance in results
)
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(question=query_text, context=context_text)

print(prompt)

Human: 
Answer the question based only on the following context:

emerged: cancer mortality for both men and women in the United States declined during the last decade of the 20th century.[4] Thus, there has been progress, but the problem is still 
overwhelming. The discussion that follows deals with both benign tumors and cancers; the latter receive more attention. The focus is on the basic morphologic and biologic properties of 
tumors and on the present understanding of the molecular basis of carcinogenesis. We also discuss the interactions of the tumor with the host and the host response to tumors. Although the discussion of therapy is beyond the scope of this chapter, there are now dramatic improvements in therapeutic responses and 5-year survival rates with many forms of malignancy, notably the leukemias and lymphomas. A greater proportion of cancers is being cured or arrested today than ever before.
Definitions

---

overwhelming. The discussion that follows deals with both beni

In [61]:
from langchain.llms import Bedrock

# LLM wrapper around the "amazon.titan-text-express-v1" model, sharing the
# `bedrock` runtime client created earlier in the notebook.
llm = Bedrock(
    model_id="amazon.titan-text-express-v1",
    client=bedrock,
    # Generation settings forwarded as model_kwargs.
    # NOTE(review): key names look like the Titan text request schema —
    # confirm against the Bedrock model documentation.
    model_kwargs={
        "maxTokenCount": 1000,
        "stopSequences": [],
        "temperature": 0.7,
        "topP": 1,
    }
)

In [62]:
from langchain.prompts import PromptTemplate

# Plain-text prompt template for the LLM chain, built from PROMPT_TEMPLATE.
# This rebinds `prompt`, which previously held the formatted chat prompt string.
prompt = PromptTemplate(
    template=PROMPT_TEMPLATE,
    input_variables=["context", "question"],
)

In [63]:
from langchain.chains import LLMChain

# Bind the prompt template to the Bedrock LLM so both can be run in one call.
chain = LLMChain(llm=llm, prompt=prompt)

In [65]:
# Run the chain with the retrieved context and the user question.
# Chain.run is deprecated in current LangChain releases; invoke() returns a
# dict, and the generated text sits under the chain's output key.
response = chain.invoke(
    {"context": context_text, "question": query_text}
)[chain.output_key]
response

"Chemotherapy is a cancer treatment that uses drugs to stop the growth of cancer cells, either by killing the cells or by stopping them from dividing. Chemotherapy can be given as a single drug, or it can be given in combination with other drugs.\nThere are several types of chemotherapy drugs, including alkylating agents, anthracyclines, taxanes, and platinum-based drugs. Alkylating agents are drugs that damage DNA, which can kill cancer cells. Anthracyclines are drugs that kill cancer cells by stopping them from dividing. Taxanes are drugs that help stop the growth of cancer cells by blocking the formation of new blood vessels. Platinum-based drugs are drugs that kill cancer cells by damaging their DNA.\n\nChemotherapy can be given through several methods, including intravenous (IV) infusion, oral administration, and topical application. The type of chemotherapy drug used, the dose, and the frequency of treatment are all determined by the type and stage of cancer, as well as the indiv

In [38]:
import os
def generateReply(
    prompt,
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    max_tokens=500,
    timeout: float = 60.0,
) -> str:
    """Send ``prompt`` as a single user message to a hosted chat model.

    Args:
        prompt: Text of the user message.
        model: Model identifier passed to huggingface_hub's InferenceClient.
        max_tokens: Upper bound on generated tokens for the reply.
        timeout: Seconds to wait for the inference server. The original
            passed no timeout, and the recorded call to this helper ended in
            a KeyboardInterrupt.

    Returns:
        The content of the first choice's message in the chat completion.

    Raises:
        KeyError: If the HF_TOKEN environment variable is not set.
    """
    # Imported inside the function, as in the original cell.
    from huggingface_hub import InferenceClient

    client = InferenceClient(
        model=model,
        token=os.environ["HF_TOKEN"],
        timeout=timeout,
    )

    response = client.chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_tokens,
    )
    return response.choices[0].message.content

In [39]:
# Smoke-test the hosted chat model with a trivial prompt.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt.
generateReply(prompt="how are you?")

  from .autonotebook import tqdm as notebook_tqdm


KeyboardInterrupt: 

In [37]:
# NOTE(review): this cell's execution count (37) predates the chain cell (65),
# so the output recorded here comes from an earlier `response` value — re-run
# or remove this cell.
response

'related diseases rests on defining and enforcing safe exposure levels, developing new technologies to reduce industrial exposures, and identifying less toxic substitutes for industrial and\nchemical agents. These strategies require a basic understanding of biochemical and molecular mechanisms of toxicity.\nMECHANISMS OF TOXICITY\nToxicology is the scientific discipline that studies the detection, effects, and mechanisms of action of poisons and toxic chemicals. Toxicity is a relative phenomenon that depends on the inherent structure and properties of a chemical and on its dose. Dose-response curves are typically generated in laboratory animals exposed to various amounts of a chemical. A typical'