In [9]:
import os
import streamlit as st
import pickle
import time
from dotenv import load_dotenv
import langchain
from langchain import LLMChain
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS

from langchain.chains.summarize import load_summarize_chain
from langchain.document_loaders import PyPDFLoader
from langchain import OpenAI, PromptTemplate
import glob

import google.generativeai as genai
from google.generativeai import GenerativeModel
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI

# Upgrade typing_extensions to fix ImportError
%pip install --upgrade typing_extensions

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Pull variables from a local .env file (if one exists) into the process environment.
load_dotenv()
# Fail fast when the key is absent. The previous fallback exported the placeholder
# string 'your-key-if-not-using-env' as GOOGLE_API_KEY, so a missing key was silently
# replaced by a bogus credential instead of being reported here.
if not os.getenv('GOOGLE_API_KEY'):
    raise EnvironmentError(
        "GOOGLE_API_KEY is not set. Export it in your shell or add it to a .env file "
        "before running this notebook."
    )

In [3]:
# Initialise the google.generativeai SDK. No api_key argument is passed here;
# presumably the SDK falls back to the GOOGLE_API_KEY environment variable — TODO confirm.
genai.configure()

In [5]:
# Read the Gemini key from the environment into a local variable.
# NOTE(review): none of the cells shown in this notebook read `api_key` afterwards.
api_key = os.getenv('GOOGLE_API_KEY')

In [7]:
# Gemini model identifier passed to every ChatGoogleGenerativeAI instance in this notebook.
MODEL="gemini-2.0-flash-lite"

In [48]:
# Module-level chat model (temperature 0.5).
# NOTE(review): summarize() and custom_summary() build their own instances instead of
# using this one, and a later top-level cell rebinds `llm`.
llm = ChatGoogleGenerativeAI(model=MODEL, temperature=0.5)

In [16]:
# PDF summarised throughout the notebook.
# NOTE(review): this is an absolute path on one machine's E: drive; the notebook will not
# run elsewhere until it is pointed at a local copy of the paper.
file_path = r'E:\Shardul\Research-Paper-Summarizer\papers\Envisioning_Medclip_A_Deep_Dive_into_Explainability_for_Medical_Vision-Language_Models.pdf'

In [52]:
def summarize(file_path, MODEL):
    """Summarise a PDF with a ``map_reduce`` summarisation chain.

    Args:
        file_path: Path of the PDF handed to ``PyPDFLoader``.
        MODEL: Model name passed to ``ChatGoogleGenerativeAI``.

    Returns:
        Whatever ``chain.invoke`` returns, unchanged. In the run recorded in this
        notebook that is a dict holding ``input_documents`` and ``output_text``.
    """
    # Load the PDF and split it into document chunks in one step.
    pages = PyPDFLoader(file_path).load_and_split()
    chat_model = ChatGoogleGenerativeAI(model=MODEL, temperature=0.5)
    summarize_chain = load_summarize_chain(chat_model, chain_type='map_reduce')
    return summarize_chain.invoke(pages)

In [53]:
# Run the map_reduce summariser on the sample paper.
summ = summarize(file_path, MODEL)
# Prints the full result dict, i.e. every input document as well as the summary text.
print(summ)

{'input_documents': [Document(metadata={'source': 'E:\\Shardul\\Research-Paper-Summarizer\\papers\\Envisioning_Medclip_A_Deep_Dive_into_Explainability_for_Medical_Vision-Language_Models.pdf', 'page': 0}, page_content='ENVISIONING MEDCLIP: A DEEP DIVE INTO EXPLAINABILITY FOR MEDICAL\nVISION-LANGUAGE MODELS\nAnees Ur Rehman Hashmi1, Dwarikanath Mahapatra2, Mohammad Yaqub1\n1Mohamed bin Zayed University of Artificial Intelligence,\n2Inception Institute of Artificial Intelligence\nAbu Dhabi, UAE\nABSTRACT\nExplaining Deep Learning models is becoming increasingly\nimportant in the face of daily emerging multimodal models,\nparticularly in safety-critical domains like medical imaging.\nHowever, the lack of detailed investigations into the perfor-\nmance of explainability methods on these models is widening\nthe gap between their development and safe deployment. In\nthis work, we analyze the performance of various explainable\nAI methods on a vision-language model, MedCLIP, to demys-\ntify it

In [54]:
# The summary text itself is stored under the 'output_text' key of the result.
summ['output_text']

"This paper investigates the explainability of MedCLIP, a vision-language model used in medical imaging. It analyzes existing XAI methods, finding them insufficient for VLMs, and proposes a new method that leverages text-image interaction within MedCLIP's embedding space to generate more plausible and informative explanations. The research uses the MIMIC-CXR dataset and finds that standard XAI methods struggle to highlight relevant image areas. The proposed method, which incorporates text prompts, offers improved insights into MedCLIP's decision-making process and its potential for medical applications. The paper also references related research in AI, computer vision, and relevant software/databases."

In [None]:
# Earlier version of custom_summary (map_reduce summarisation with a custom prompt),
# kept commented out for reference only.
# NOTE(review): with this definition disabled, a fresh Restart & Run All reaches the
# custom_summary(...) call in the next cell before any custom_summary is defined.
# def custom_summary(file_path, MODEL, custom_prompt):
#     loader = PyPDFLoader(file_path)
#     docs = loader.load_and_split()
#     prompt_template = custom_prompt + """
#     Answer the following question based only on the provided context, do not use any external information.:

#     <context>
#     {text}
#     </context>
    
#     SUMMARY:"""
#     PROMPT = PromptTemplate(template=prompt_template, input_variables=['text'])
#     llm = ChatGoogleGenerativeAI(model=MODEL, temperature=0.5)
#     chain = load_summarize_chain(llm, chain_type='map_reduce', map_prompt=PROMPT, combine_prompt=PROMPT)
#     summary = chain({"input_documents": docs},return_only_outputs=True)["output_text"]
    
#     return summary

In [59]:
# Alternative prompt (disabled): structured summary of the whole paper.
# CUSTOM_PROMPT = "Write a concise summary of the following paper with this structure: Problem being solved, Approach, Main results and Main Discussion Points. Give output in markdown format."
# Active prompt: summary restricted to the paper's methodology.
CUSTOM_PROMPT = "Write a concise summary of the methodology used in following paper. Give output in markdown format."
# custom_summary is called here with three arguments only (no vectorDB_path).
custom_summaries = custom_summary(file_path, MODEL, custom_prompt=CUSTOM_PROMPT)

In [57]:
# Show the markdown summary returned by custom_summary.
print(custom_summaries)

The paper's methodology focuses on improving the explainability of the MedCLIP vision-language model. It involves:

1.  **Evaluating Existing XAI Methods:** Applying and assessing the performance of established XAI techniques like Gradient backpropagation, Occlusion method, Integrated-Gradients, and Grad-Shapley to generate explainability maps for MedCLIP.
2.  **Proposing a Novel Approach:** Developing a new XAI method that leverages the model's internal structure:
    *   Applying an XAI method (Mxai) to the image embeddings generated by MedCLIP's vision encoder to create image explainability maps (Fi map).
    *   Encoding text prompts using the text encoder to generate text embeddings.
    *   Calculating a dot product between the image explainability maps and the text embeddings to highlight image pixels that influence model confidence.
3.  **Dataset and Inputs:** Utilizing the MIMIC-CXR dataset with text prompts (sentences) and class labels as text inputs.
4.  **Analysis:** Genera

The paper's methodology focuses on improving the explainability of the MedCLIP vision-language model. It involves:

1.  **Evaluating Existing XAI Methods:** Applying and assessing the performance of established XAI techniques like Gradient backpropagation, Occlusion method, Integrated-Gradients, and Grad-Shapley to generate explainability maps for MedCLIP.
2.  **Proposing a Novel Approach:** Developing a new XAI method that leverages the model's internal structure:
    *   Applying an XAI method (Mxai) to the image embeddings generated by MedCLIP's vision encoder to create image explainability maps (Fi map).
    *   Encoding text prompts using the text encoder to generate text embeddings.
    *   Calculating a dot product between the image explainability maps and the text embeddings to highlight image pixels that influence model confidence.
3.  **Dataset and Inputs:** Utilizing the MIMIC-CXR dataset with text prompts (sentences) and class labels as text inputs.
4.  **Analysis:** Generating feature activation maps to visualize how MedCLIP focuses on specific image regions based on text input, and how its focus changes with different prompts.

The paper's methodology focuses on explaining the Vision-Language Model (VLM) MedCLIP using explainable AI (XAI) techniques. It involves:

1.  **Generating Explainability Maps:** Applying XAI methods like Gradient backpropagation (GB), Occlusion, Integrated-Gradients (IG), and Grad-Shapley (GS) to the image embeddings produced by MedCLIP's vision encoder.
2.  **Analyzing Text Influence:** Utilizing both class labels and text prompts as input to the model and comparing the resulting activation maps to understand how the model's focus changes based on the text input.
3.  **Creating a RAxA weighted average:** Calculating a dot product between the image explainability maps and text embeddings and creating a RAxA weighted average of the maps using the dot product results.
4.  **Visualizing and Evaluating:** Highlighting image pixels based on the input text and class labels using chest X-ray images from the MIMIC-CXR dataset to demonstrate the model's ability to link text and image regions.

*   **Problem being solved:** The paper addresses the lack of interpretability in Vision-Language Models (VLMs), particularly MedCLIP, when applied to medical image analysis. This lack of understanding hinders trust and safe deployment of these models in critical applications.

*   **Approach:** The authors propose a novel method to generate feature activation maps for MedCLIP. This method leverages both text prompts (sentences) and class labels as text inputs to analyze how the model focuses on different image regions. The approach aims to overcome the limitations of existing XAI methods by providing more accurate and clinically relevant explanations.

*   **Main results:** The proposed method generates focused and accurate activation maps that:
    *   Highlight the most important image regions, avoiding false positives, and aligning with clinical diagnostic procedures.
    *   Demonstrate how MedCLIP's focus shifts based on the input text prompt, indicating its ability to comprehend text and identify relevant image pixels.
    *   Show variations in activation maps across different class labels.

*   **Main Discussion Points:**
    *   The proposed method overcomes the limitations of conventional XAI methods for VLMs in medical imaging.
    *   It effectively explains the functioning of the MedCLIP VLM and the combined effect of image and text inputs.
    *   The explainability maps can help understand performance discrepancies in deep learning models.
    *   The method is flexible and can be adapted to other VLMs.
    *   The importance of making VLMs explainable to increase trustworthiness and facilitate practical use.

In [65]:
def custom_summary(file_path, MODEL, custom_prompt, vectorDB_path='faiss_store'):
    """Answer ``custom_prompt`` about a PDF using retrieval over a FAISS index.

    The PDF is split into chunks, the chunks are embedded and indexed with FAISS,
    the index is saved to ``vectorDB_path``, and a ``RetrievalQAWithSourcesChain``
    answers the prompt from the retrieved chunks.

    Args:
        file_path: Path of the PDF handed to ``PyPDFLoader``.
        MODEL: Chat model name passed to ``ChatGoogleGenerativeAI``.
        custom_prompt: Instruction describing the summary wanted; it is sent to the
            chain as the question.
        vectorDB_path: Directory the FAISS index is saved to. Defaults to
            ``'faiss_store'`` so three-argument calls keep working.

    Returns:
        str: The text stored under the chain result's ``answer`` key.

    Raises:
        ValueError: If no document chunks could be extracted from the PDF.
    """
    docs = PyPDFLoader(file_path).load_and_split()
    if not docs:
        # An empty chunk list cannot be indexed; report it instead of failing inside FAISS.
        raise ValueError(f"No text could be extracted from {file_path!r}")

    # The retrieval chain supplies the retrieved context through its own prompts, so
    # only the instruction is sent as the question (no unfilled {text} placeholder).
    question = (
        custom_prompt
        + "\nAnswer based only on the provided context, do not use any external information."
    )

    llm = ChatGoogleGenerativeAI(model=MODEL, temperature=0.5)
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

    # Index the chunks of *this* PDF and persist the index. The in-memory store is
    # used directly, so nothing has to be unpickled back from disk and a stale index
    # left at vectorDB_path by another paper can never be queried by mistake.
    vector_store = FAISS.from_documents(docs, embeddings)
    vector_store.save_local(vectorDB_path)

    chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=vector_store.as_retriever())
    # This chain is keyed by "question" and returns its text under "answer".
    result = chain.invoke({"question": question}, return_only_outputs=True)
    return result["answer"]

In [85]:
# Alternative prompt (disabled): structured summary of the whole paper.
# CUSTOM_PROMPT = "Write a concise summary of the following paper with this structure: Problem being solved, Approach, Main results and Main Discussion Points. Give output in markdown format."
# Active prompt: detailed summary of the methodology only.
CUSTOM_PROMPT = "Write a detailed summary of the methodology used in following paper. Give output in markdown format."
# Directory name used for the saved FAISS index.
vectorDB_path = 'faiss_store'
# Function-based call is disabled; the next cell runs the same steps at top level.
# custom_summaries = custom_summary(file_path, MODEL, custom_prompt=CUSTOM_PROMPT, vectorDB_path=vectorDB_path)

In [86]:
# Retrieval-based summary, run step by step at top level:
# load + split the PDF, index the chunks with FAISS, then ask the chain for the summary.
loader = PyPDFLoader(file_path)
docs = loader.load_and_split()
prompt_template = CUSTOM_PROMPT + """
Answer the following question based only on the provided context, do not use any external information.:

<context>
{text}
</context>
"""
# NOTE: PROMPT is not passed to the chain below; it is kept only so the name stays defined.
PROMPT = PromptTemplate(template=prompt_template, input_variables=['text'])
# The chain inserts the retrieved context through its own prompts. Sending the raw
# template as the question would hand the model (and the retriever's similarity search)
# a literal, unfilled "{text}" placeholder, so a clean instruction is sent instead.
question = CUSTOM_PROMPT + "\nAnswer based only on the provided context, do not use any external information."
llm = ChatGoogleGenerativeAI(model=MODEL, temperature=0.5)
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
vector_store = FAISS.from_documents(docs, embeddings)
vector_store.save_local(vectorDB_path)
# SECURITY: load_local unpickles data from disk. That is acceptable here only because the
# index was written by the line above; never point this at an index from an untrusted source.
vectorstore = FAISS.load_local(vectorDB_path, embeddings, allow_dangerous_deserialization=True)  # Load the FAISS index
chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=vectorstore.as_retriever())
# invoke() matches the call style used in summarize(); the answer text is under "answer".
result = chain.invoke({"question": question}, return_only_outputs=True)

In [87]:
# Show the text stored under the retrieval chain result's "answer" key.
print(result["answer"])

Here's a summary of the methodology used in the paper, formatted in Markdown:

**Methodology Summary**

The paper focuses on explaining the inner workings of the Vision-Language Model (VLM) MedCLIP, specifically for chest X-ray (CXR) classification, using Explainable AI (XAI) methods. The core approach involves applying these XAI techniques to the embedding space of the VLM, rather than the final output. The methodology is designed to be generalizable to other VLMs.

**Key Steps and Components:**

1.  **XAI Methods:** The paper utilizes four XAI methods:
    *   Gradient Backpropagation (GB)
    *   Occlusion Method
    *   Integrated-Gradients (IG)
    *   Grad-Shapley (GS)

2.  **Image Embedding Explainability:** An XAI method (Mxai) is applied to the image embeddings generated by the vision-encoder (Venc) of the MedCLIP model to generate an explainability map (Fi map ∈ RAxA) for each embedding dimension (i).

    *   Formula: Fi map = Mxai(model = Venc, target= i), where i is the in

Here's a summary of the methodology used in the paper, formatted in Markdown:

**Methodology Summary**

The paper focuses on explaining the inner workings of the Vision-Language Model (VLM) MedCLIP, specifically for chest X-ray (CXR) classification, using Explainable AI (XAI) methods. The core approach involves applying these XAI techniques to the embedding space of the VLM, rather than the final output. The methodology is designed to be generalizable to other VLMs.

**Key Steps and Components:**

1.  **XAI Methods:** The paper utilizes four XAI methods:
    *   Gradient Backpropagation (GB)
    *   Occlusion Method
    *   Integrated-Gradients (IG)
    *   Grad-Shapley (GS)

2.  **Image Embedding Explainability:** An XAI method (Mxai) is applied to the image embeddings generated by the vision-encoder (Venc) of the MedCLIP model to generate an explainability map (Fi map ∈ RAxA) for each embedding dimension (i).

    *   Formula: Fi map = Mxai(model = Venc, target= i), where i is the index of the image embedding Ip ∈ R1x512.

3.  **Text Embedding Generation:** Text input (Xtxt) is encoded through the text encoder (Tend) to generate text embeddings (Tp ∈ R1xM). These embeddings are then scaled by the learned temperature parameter of the VLM.

4.  **Application to MedCLIP:** The XAI methods are applied to MedCLIP to generate explainability maps for a given image.

5.  **Input:** The method uses both images and text prompts (sentences describing various lesions) and class labels.

6.  **Output:** The method generates feature activation maps to explain the MedCLIP model, highlighting the most important image pixels. The highlighted pixel locations align with clinical diagnostic procedures. The explainability maps visualize the difference in activation maps for different text input forms. The approach avoids false positives. The method uses image and text embedding fusion approach.
