In [17]:
import os
import glob
from uuid import uuid4
from dotenv import load_dotenv
# from sentence_transformers import SentenceTransformer

from pinecone import Pinecone, ServerlessSpec

from langchain import hub
# from langchain_core.documents import Document
# from langchain_core.prompts import PromptTemplate
from langchain_pinecone import PineconeVectorStore
# from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_experimental.text_splitter import SemanticChunker

# from pymilvus import connections, Collection, FieldSchema, CollectionSchema, DataType

load_dotenv()

True

In [3]:
os.environ['HF_TOKEN']=os.getenv("HF_TOKEN")
os.environ['GOOGLE_API_KEY']=os.getenv("GOOGLE_API_KEY")
os.environ['PINECONE_API_KEY']=os.getenv("PINECONE_API_KEY")

In [6]:
# Step 1: Load all PDFs from directory
pdf_dir = "./data/"
pdf_files = glob.glob(os.path.join(pdf_dir, "*.pdf"))

all_docs = []
for pdf_file in pdf_files:
    loader = PyPDFLoader(pdf_file)
    all_docs.extend(loader.load())

print(f"Loaded {len(all_docs)} page documents (should cover >200 pages).")

Loaded 247 page documents (should cover >200 pages).


In [7]:
# Step 2: Semantic Chunking
embedding_model = GoogleGenerativeAIEmbeddings(model='models/embedding-001')
chunker = SemanticChunker(embedding_model, breakpoint_threshold_type="standard_deviation", min_chunk_size=100)
semantic_chunks = chunker.create_documents([doc.page_content for doc in all_docs])

print(f"Total semantic chunks: {len(semantic_chunks)}")

Total semantic chunks: 318


In [8]:
texts = [chunk.page_content for chunk in semantic_chunks]
embeddings = embedding_model.embed_documents(texts)
len(embeddings[0])

768

In [9]:
pc=Pinecone()

index_name="agenticbatch2-assignment"

if not pc.has_index(index_name):
    pc.create_index(
    name=index_name,
    dimension=768,
    metric="cosine",
    spec=ServerlessSpec(cloud="aws",region="us-east-1")    
)
    
#loading the index
index=pc.Index(index_name)
vector_store=PineconeVectorStore(index=index,embedding=embedding_model)

In [None]:
# INDEX_DIM = 768  # Must match your embedding size

# # Flat index
# pc.create_index(
#     name="rag-flat",
#     dimension=INDEX_DIM,
#     metric="cosine",
#     spec=ServerlessSpec(cloud="gcp", region="us-east1"),
#     pod_type="s1",     # or appropriate for your plan
#     index_type="POD",  # POD is Pinecone's Flat index
# )

# # HNSW index
# pc.create_index(
#     name="rag-hnsw",
#     dimension=INDEX_DIM,
#     metric="cosine",
#     spec=ServerlessSpec(cloud="gcp", region="us-east1"),
#     pod_type="s1",
#     index_type="HNSW"
# )

# # IVF_PQ index (Pinecone's version of IVF)
# pc.create_index(
#     name="rag-ivfpq",
#     dimension=INDEX_DIM,
#     metric="cosine",
#     spec=ServerlessSpec(cloud="gcp", region="us-east1"),
#     pod_type="s1",
#     index_type="IVF_PQ"
# )

In [10]:
# Each item: (id, vector, metadata)
items = [
    (str(uuid4()), emb, {"text": text}) for emb, text in zip(embeddings, texts)
]

batch_size = 10
for i in range(0, len(items), batch_size):
    index.upsert(vectors=items[i:i+batch_size])

In [25]:
def retrieve(query, top_k=10, threshold=0.7):
    query_emb = embedding_model.embed_query(query)
    # Query Pinecone
    result = index.query(
        vector=query_emb,
        top_k=top_k,
        include_metadata=True,
        filter=None,  # Optionally filter on metadata
        include_values=True
    )
    # Filter by score threshold
    matches = [m for m in result['matches'] if m['score'] >= threshold]
    return matches, query_emb


In [27]:
import time

test_query = "How Radiology Report Generation is achieved using transfer learning?"

start = time.time()
matches, _ = retrieve(test_query)
elapsed = time.time() - start

print(f"Retrieval time: {elapsed:.4f}s")
for m in matches:
    print(f"Score: {m['score']:.2f}, Text: {m['metadata']['text'][:200]}...")


Retrieval time: 0.3317s
Score: 0.80, Text: 21, 2022. 2, 10, 14,
16, 18, 19
[31] Z. W ang et al. , “ A Medical Semantic-Assisted Transformer for Radio-
graphic Report Generation. ”, Med. Image Comput. Assist Interv ., V ol
13433, pp. 655–664 , ...
Score: 0.79, Text: 22 IEEE REVIEWS IN BIOMEDICAL ENGINEERING, VOL. XX, NO. XX, X XXX 2023
[33] C. Shang et al., “MA TNet: Exploiting Multi-Modal Features for Radiol-
ogy Report Generation, ” IEEE Signal Process. Lett. ,...
Score: 0.78, Text: Informatics in Medicine Unlocked 24 (2021) 100557
Available online 26 March 2021
2352-9148/© 2021 The Authors. Published by Elsevier Ltd....
Score: 0.78, Text: . 8
2.2 Training Dataset . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ....
Score: 0.78, Text: Mohsan et al. , “V ision Transformer and Language Model Based
Radiology Report Generation, ” IEEE Access , vol. 11, pp. 1814-1824,
2023. 8, 9, 10, 16, 18
[96] Xian Wu et al. . “DeltaNet: Conditional M...
Score: 0.7

In [28]:
# Reranking

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def mmr(query_emb, doc_embs, texts, top_k=5, lambda_param=0.7):
    doc_embs = np.array(doc_embs)
    query_emb = np.array(query_emb).reshape(1, -1)
    sim = cosine_similarity(doc_embs, query_emb).flatten()
    selected, candidates = [], list(range(len(texts)))
    while len(selected) < top_k and candidates:
        if not selected:
            idx = int(np.argmax(sim))
        else:
            sim_to_query = sim[candidates]
            sim_to_selected = np.max(cosine_similarity(doc_embs[candidates], doc_embs[selected]), axis=1)
            mmr_scores = lambda_param * sim_to_query - (1 - lambda_param) * sim_to_selected
            idx = candidates[int(np.argmax(mmr_scores))]
        selected.append(idx)
        candidates.remove(idx)
    return [texts[i] for i in selected]


In [29]:
# 1. Get matches and query embedding
matches, query_emb = retrieve("How Radiology Report Generation is achieved using transfer learning?", top_k=10)

# 2. Prepare for MMR
doc_embs = [m['values'] for m in matches]
texts = [m['metadata']['text'] for m in matches]

# 3. Rerank
reranked_mmr = mmr(query_emb, doc_embs, texts, top_k=5)
print("Top reranked doc:", reranked_mmr[0][:300])

Top reranked doc: 21, 2022. 2, 10, 14,
16, 18, 19
[31] Z. W ang et al. , “ A Medical Semantic-Assisted Transformer for Radio-
graphic Report Generation. ”, Med. Image Comput. Assist Interv ., V ol
13433, pp. 655–664 , 2022. 2, 8, 10, 12, 13, 14, 16, 19
[32] H. Nguyen et al., “ Automated Generation of Accurate & Fluen


In [30]:
from langchain.prompts import PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI

prompt_template = """You are a research assistant. Use the context below to answer the question concisely.
Context:
{context}
Question: {question}
Answer:"""
prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

context = "\n\n".join(reranked_mmr[:3])
question = test_query

llm = ChatGoogleGenerativeAI(model='gemini-1.5-flash')
final_prompt = prompt.format(context=context, question=question)
llm_response = llm.invoke(final_prompt)
print("LLM output:\n", llm_response)


LLM output:
 content="The provided text focuses on papers using transformer models and doesn't detail how transfer learning is specifically applied to radiology report generation.  More information is needed to answer the question." additional_kwargs={} response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'gemini-1.5-flash', 'safety_ratings': []} id='run--4405bbdd-d9cf-4fa8-a4da-748a7afbf41a-0' usage_metadata={'input_tokens': 364, 'output_tokens': 36, 'total_tokens': 400, 'input_token_details': {'cache_read': 0}}


# LangChain Retrieval Pipeline

In [12]:
retriever = vector_store.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"score_threshold": 0.7}  # Hyperparameter: tune as needed
)

In [15]:
model = ChatGoogleGenerativeAI(model='gemini-1.5-flash')

prompt = PromptTemplate(
    template="""You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
        Question: {question} 
        Context: {context} 
        Answer:""",
    input_variables=['context', 'question']
)

In [16]:
# ---- 3. DOC FORMATTER ----

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [18]:
# ---- 4. RAG CHAIN ----

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

In [23]:
# ---- 5. INTERACTIVE QUERY FUNCTION ----

def run_rag_pipeline(question):
    print(f"\n=== User Query ===\n{question}\n")

    # Retrieve context documents (for transparency and debug)
    retrieved_docs = retriever.invoke(question)
    print(f"Retrieved {len(retrieved_docs)} context documents:")
    for i, doc in enumerate(retrieved_docs, 1):
        preview = doc.page_content[:300] + ("..." if len(doc.page_content) > 200 else "")
        print(f"{i}. {preview}  [source: {doc.metadata.get('source','')}]")

    print("\nGenerating LLM Answer...\n")
    answer = rag_chain.invoke(question)
    print("=== LLM Answer ===\n", answer)
    return answer

In [24]:
run_rag_pipeline("How Radiology Report Generation is achieved using transfer learning?")


=== User Query ===
How Radiology Report Generation is achieved using transfer learning?

Retrieved 4 context documents:
1. 21, 2022. 2, 10, 14,
16, 18, 19
[31] Z. W ang et al. , “ A Medical Semantic-Assisted Transformer for Radio-
graphic Report Generation. ”, Med. Image Comput. Assist Interv ., V ol
13433, pp. 655–664 , 2022. 2, 8, 10, 12, 13, 14, 16, 19
[32] H. Nguyen et al., “ Automated Generation of Accurate & Fluen...  [source: ]
2. 22 IEEE REVIEWS IN BIOMEDICAL ENGINEERING, VOL. XX, NO. XX, X XXX 2023
[33] C. Shang et al., “MA TNet: Exploiting Multi-Modal Features for Radiol-
ogy Report Generation, ” IEEE Signal Process. Lett. , vol. 29, pp. 2692–
2696, 2022. 2, 8, 9, 10, 13, 14, 15, 16, 18, 19
[34] F . Dalla Serra et al., “Mu...  [source: ]
3. Informatics in Medicine Unlocked 24 (2021) 100557
Available online 26 March 2021
2352-9148/© 2021 The Authors. Published by Elsevier Ltd.  [source: ]
4. . 8
2.2 Training Dataset . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 

'Transfer learning in radiology report generation uses pre-trained models (e.g., transformers)  trained on large datasets of text and images.  These models are then fine-tuned on a radiology-specific dataset to generate reports.  This approach leverages the knowledge learned from the general data to improve performance on the smaller, specialized dataset.'