*Author:* **Feichen Shen**  
*Email:* shenfeichen1102@gmail.com 
*Date:* March 31, 2025

In [1]:
## Data Loader
from langchain_community.document_loaders import UnstructuredPDFLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

class DataReader:
    """Load a PDF and break its text into overlapping chunks.

    Args:
        chunk_size: Maximum chunk size, forwarded unchanged to
            ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Overlap between consecutive chunks, forwarded unchanged
            to ``RecursiveCharacterTextSplitter``.
    """

    def __init__(self, chunk_size=500, chunk_overlap=100):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def load_and_split(self, pdf_path):
        """Load and split a PDF into text chunks.

        Args:
            pdf_path: Path of the PDF handed to ``UnstructuredPDFLoader``
                (loaded with ``strategy="hi_res"``).

        Returns:
            The list of chunk documents produced by
            ``RecursiveCharacterTextSplitter.split_documents``.

        Side effects:
            Prints ``doc <number of chunks>`` to stdout.
        """
        loader = UnstructuredPDFLoader(pdf_path, strategy="hi_res")
        documents = loader.load()

        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )
        split_docs = splitter.split_documents(documents)
        print('doc', len(split_docs))

        # The chunks are returned exactly as the splitter produced them. An earlier
        # per-chunk loop only re-assigned each chunk's text to itself and read an
        # unused "page" value, so it was dropped.
        return split_docs

In [2]:
## clinical knowledge embeddings
import pickle
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize
from langchain.embeddings.base import Embeddings
from langchain.schema import Document

class ClinicalKGEmbedding(Embeddings):
    """Embeddings adapter that represents text by the knowledge-graph nodes it mentions.

    A text's vector is the normalised mean of the pre-trained embeddings of every
    node whose lower-cased name occurs as a substring of the lower-cased text.

    Args:
        mapping_csv_path: CSV read with ``pandas.read_csv``; must provide the
            columns ``node_name`` and ``global_graph_index`` (used as the row
            index into the embedding matrix).
        embedding_pkl_path: Pickle file whose payload exposes ``.numpy()`` and
            yields a 2-D array -- presumably a torch tensor of shape
            (num_nodes, dim); TODO confirm.
    """

    def __init__(self, mapping_csv_path, embedding_pkl_path):
        self.node_map = pd.read_csv(mapping_csv_path)

        # SECURITY: pickle.load can execute arbitrary code embedded in the file.
        # Only point this at embedding files from a trusted source.
        # The context manager closes the file handle, which the previous
        # `pickle.load(open(...))` form left open.
        with open(embedding_pkl_path, "rb") as fh:
            self.embedding_matrix = pickle.load(fh).numpy()

        # fast name lookup: lower-cased node name -> row of the embedding matrix.
        # If two nodes share a lower-cased name, the later CSV row wins.
        self.name_to_idx = {
            row['node_name'].lower(): row['global_graph_index']
            for _, row in self.node_map.iterrows()
        }
        self.dim = self.embedding_matrix.shape[1]

    def _embed_text(self, text: str):
        """Return the normalised mean embedding of all node names found in ``text``.

        Matching is a plain case-insensitive substring test, so every node name is
        checked against the text on each call.
        NOTE(review): short node names can match inside longer, unrelated words --
        confirm this is acceptable for the node vocabulary in use.
        """
        text = text.lower()
        matched = [self.embedding_matrix[idx]
                   for name, idx in self.name_to_idx.items()
                   if name in text]

        if matched:
            avg_vec = np.mean(matched, axis=0)
        else:
            # No known node mentioned: fall back to an all-zero vector of the
            # embedding dimension.
            # NOTE(review): such texts are indistinguishable from each other in the
            # vector store -- confirm this is the intended fallback.
            avg_vec = np.zeros(self.dim)

        # sklearn's normalize works on 2-D input, hence the reshape to one row.
        return normalize(avg_vec.reshape(1, -1))[0]

    def embed_documents(self, texts):
        """Embed each item of ``texts``; items may be strings or ``Document`` objects."""
        return [self._embed_text(doc.page_content if isinstance(doc, Document) else doc) for doc in texts]

    def embed_query(self, text):
        """Embed a single query string with the same procedure used for documents."""
        return self._embed_text(text)

In [5]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import LlamaCpp
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from difflib import SequenceMatcher
import os
from langchain.vectorstores import FAISS

# Select "tesseract" as the OCR agent through the OCR_AGENT environment variable.
# Presumably read by the `unstructured` PDF pipeline behind UnstructuredPDFLoader;
# TODO confirm the value format expected by the installed version.
# Set in this setup cell so it is in place before the PDF-loading cell runs.
os.environ["OCR_AGENT"] = "tesseract"

In [8]:
def find_best_sentence(answer, source_docs):
    """Find the sentence in ``source_docs`` that is most similar to ``answer``.

    Each document's ``page_content`` is split on ". " and every piece is compared,
    case-insensitively, with the answer using ``difflib.SequenceMatcher.ratio``.

    Args:
        answer: The generated answer text (a ``str``).
        source_docs: Iterable of objects exposing ``page_content`` (str) and
            ``metadata``.

    Returns:
        A tuple ``(best_match, best_meta, best_score)``: the stripped best-matching
        sentence, the ``metadata`` of the document it came from, and its similarity
        ratio. When nothing scores above 0 the result is ``("", {}, 0)``.
    """
    best_match = ""
    best_score = 0
    best_meta = {}

    # SequenceMatcher caches its analysis of the *second* sequence, so the answer
    # is installed once as seq2 and only seq1 is swapped for each sentence. This
    # yields the same ratios as building a new matcher per sentence.
    matcher = SequenceMatcher(None)
    matcher.set_seq2(answer.lower())

    for doc in source_docs:
        for sent in doc.page_content.split(". "):
            matcher.set_seq1(sent.lower())
            score = matcher.ratio()
            # Strict ">" keeps the first sentence seen among equally good matches.
            if score > best_score:
                best_match = sent.strip()
                best_meta = doc.metadata
                best_score = score
    return best_match, best_meta, best_score

def shorten_sentence(text, max_words=30):
    """Keep only the first ``max_words`` whitespace-separated words of ``text``.

    The kept words are re-joined with single spaces, and "..." is appended when
    the text contains more than ``max_words`` words.
    """
    tokens = text.split()
    clipped = " ".join(tokens[:max_words])
    if len(tokens) > max_words:
        clipped += "..."
    return clipped

# Source PDF, given relative to the notebook's working directory.
pdf_path = "../data/HER2_Paper.pdf"
# Larger chunks and overlap than DataReader's defaults (500 / 100).
processor = DataReader(chunk_size=1000, chunk_overlap=200)
# `chunks` is the notebook-level list of split documents indexed by later cells.
chunks = processor.load_and_split(pdf_path)

# Sanity check: report the chunk count and show the first chunk's text.
print(f"Loaded {len(chunks)} chunks from the PDF.")
print(chunks[0].page_content)  # Example output; raises IndexError if no chunks were produced


doc 54
Loaded 54 chunks from the PDF.
ResearchGate

See discussions, stats, and author profiles for this publication at: https://www.researchgate.net/publication/19364043

Slamon DJ, Clark GM, Wong SG, Levin WJ, Ullrich A, McGuire WLHuman breast cancer: correlation of relapse and survival with amplification of the HER-2/neu oncogene. Science (Wash DC...

Article in Science - February 1987 DOI: 10.1126/science.3798106 - Source: PubMed

CITATIONS

READS

9,978

29,134

6 authors, including:

Gary Clark

Gary Clark Statistical Consulting, LLC

Steven G Wong University of California, Los Angeles

364 PUBLICATIONS 69,847 CITATIONS

24 PUBLICATIONS 24,981 CITATIONS

SEE PROFILE

SEE PROFILE

Wendy Levin

Fate Therapeutics, Inc.

25 PUBLICATIONS 17,968 CITATIONS

SEE PROFILE

All content following this page was uploaded by Gary Clark on 23 December 2013.

The user has requested enhancement of the downloaded file.

BIOM 255 (Leffert) — Discussion Feb. 1, 2007



scover.bio-rad.com

Science AVA

In [9]:
# Embedding backend selection: exactly one `embedding_model` assignment should be active.

# Option A -- MiniLM sentence-transformer. Uncomment the next line to use it.
## embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Option B -- Bio_ClinicalBERT. Uncomment the next line to use it.
## embedding_model = HuggingFaceEmbeddings(model_name="emilyalsentzer/Bio_ClinicalBERT")

# Option C (active) -- pre-trained clinical knowledge-graph embeddings, loaded from a
# node-name mapping CSV and a pickled embedding matrix via ClinicalKGEmbedding.
embedding_model = ClinicalKGEmbedding(
        mapping_csv_path="../clinical_KGEmb/new_node_map_df.csv",
        embedding_pkl_path="../clinical_KGEmb/full_h_embed_hms.pkl"
    )

In [10]:
# Vector DB: embed every chunk with `embedding_model` and index the vectors in FAISS.
db = FAISS.from_documents(chunks, embedding_model)
# Retriever that returns the 4 nearest chunks for each query.
retriever = db.as_retriever(search_kwargs={"k": 4})
print("Retriever:", retriever)

# Local GGUF model run through llama.cpp.
llm = LlamaCpp(
    # Alternative model, kept for reference:
    #model_path="models/llama-2-7b.Q4_K_M.gguf",
    model_path="../models/Nous-Hermes-2-Mistral-7B-DPO.Q4_K_M.gguf",
    # Context window size. NOTE(review): the 4 retrieved chunks (up to 1000
    # characters each), the prompt text and the generated answer presumably all
    # have to fit in it -- confirm before raising k or chunk_size.
    n_ctx=2048,
    # Low sampling temperature, presumably to keep answers close to deterministic.
    temperature=0.1,
    # Upper bound on the number of generated tokens per answer.
    max_tokens=512
)

Retriever: tags=['FAISS', 'ClinicalKGEmbedding'] vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x000002AE5F9ACC50> search_kwargs={'k': 4}


llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from ../models/Nous-Hermes-2-Mistral-7B-DPO.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = models
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention

In [11]:
# Prompt used by the QA chain. It declares two template variables:
#   {context}  -- presumably filled by the chain with the retrieved chunk text
#   {question} -- the user's query
# The model is asked to quote, in brackets, the exact sentence it relied on.
# The template text is sent to the model verbatim, so edits change model behaviour.
custom_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are a helpful assistant answering questions about HER2 from a scientific paper.
Use the provided context to answer the question. 
Include the exact sentence used in your answer in brackets.
Context:
{context}

Question: {question}
Answer:"""
)


In [12]:
# Retrieval-augmented QA chain: `retriever` supplies chunks, `llm` writes the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    # "stuff" chain type: presumably places all retrieved chunks into one prompt.
    chain_type="stuff",
    # Replace the chain's default prompt with the HER2-specific template.
    chain_type_kwargs={"prompt": custom_prompt},
    # Ask the chain to also return the retrieved chunks alongside the answer;
    # the query cell reads them from result["source_documents"].
    return_source_documents=True
)

In [13]:
query = "What is the role of HER2 in breast cancer?"
# The result is indexed below by "result" (the answer text) and
# "source_documents" (the retrieved chunks).
result = qa_chain.invoke(query)

# Locate the retrieved sentence closest to the answer and a shortened form of it.
# NOTE(review): best_sentence, meta, score and short_snippet are not used in this
# cell -- confirm a later cell consumes them, otherwise they are dead code.
best_sentence, meta, score = find_best_sentence(result["result"], result["source_documents"])
short_snippet = shorten_sentence(best_sentence)

print("Answer:\n")
print(result["result"])

# List every retrieved chunk as supporting evidence, numbered from 1.
print("\n Source Evidence from Paper:")
print('len of souce_doc', len(result["source_documents"]))
for i, doc in enumerate(result["source_documents"], start=1):
    # NOTE(review): `page` is read but never printed in this loop.
    page = doc.metadata.get("page", "?")
    # Show only the first 400 characters of each chunk.
    snippet = doc.page_content[:400].strip()
    print(f'{i}. " {snippet}..."')
    print()

llama_perf_context_print:        load time =  173374.34 ms
llama_perf_context_print: prompt eval time =  173373.76 ms /   877 tokens (  197.69 ms per token,     5.06 tokens per second)
llama_perf_context_print:        eval time =   11096.97 ms /    41 runs   (  270.66 ms per token,     3.69 tokens per second)
llama_perf_context_print:       total time =  184634.51 ms /   918 tokens


Answer:

 The text suggests that HER2 may play a role in the biologic behavior and/or pathogenesis of human breast cancer. It is also associated with disease relapse and overall patient survival.

 Source Evidence from Paper:
len of souce_doc 4
1. " prognostic factors, including hormonal-receptor status, in lymph node-positive disease. These data indicate that this gene may play a role in the biologic behavior and/or pathogenesis o! of human bi breast cancer...."

2. " Recently, a novel transforming gene was identified as a result of transfection studies with DNA from chemically induced rat neu- roglioblastomas (20). This gene, called new, was shown to be related to, but distinct from, the c-erbB proto-oncogene (21). By means of v-erbB and human EGFR as probes to screen human genomic and complementary DNA (cDNA) libraries, two other groups indepen- dently isolat..."

3. " Of 103 tumors evaluated in the initial survey, there was essentially no correlation between gene amplification and 