<a href="https://colab.research.google.com/github/selva-mani-007/gen-ai/blob/main/Rag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.20-py3-none-any.whl.metadata (2.4 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB

In [4]:
!pip install Wikipedia

Collecting Wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: Wikipedia
  Building wheel for Wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for Wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11678 sha256=392dd1b70e2baa85e5dc18259603f19c8b2f3eecadd64aea9e75582f799b91a5
  Stored in directory: /root/.cache/pip/wheels/8f/ab/cb/45ccc40522d3a1c41e1d2ad53b8f33a62f394011ec38cd71c6
Successfully built Wikipedia
Installing collected packages: Wikipedia
Successfully installed Wikipedia-1.4.0


In [9]:
!pip install wikipedia-api


Collecting wikipedia-api
  Downloading wikipedia_api-0.8.1.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wikipedia-api
  Building wheel for wikipedia-api (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia-api: filename=Wikipedia_API-0.8.1-py3-none-any.whl size=15383 sha256=b88410a391e2c07bd3773e7bd0ab19ef40baae3ee519de275669cd11c0ec6dbf
  Stored in directory: /root/.cache/pip/wheels/0b/0f/39/e8214ec038ccd5aeb8c82b957289f2f3ab2251febeae5c2860
Successfully built wikipedia-api
Installing collected packages: wikipedia-api
Successfully installed wikipedia-api-0.8.1


In [13]:
!pip install faiss-cpu


Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m41.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [1]:
import os
from getpass import getpass
from langchain_community.document_loaders import WikipediaLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI  # Using OpenRouter
from langchain.chains import RetrievalQA

# Set OpenRouter API Key securely
os.environ["OPENROUTER_API_KEY"] = getpass("Enter your OpenRouter API Key: ")

# Check API Key
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
if not OPENROUTER_API_KEY:
    raise ValueError("OpenRouter API Key is missing. Set it as an environment variable.")

# Initialize Wikipedia Loader
def load_wikipedia_data(query):
    loader = WikipediaLoader(query=query, lang='en')
    return loader.load()

# Initialize embeddings model
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Load Wikipedia data and create FAISS index
def build_faiss_index(topic):
    documents = load_wikipedia_data(topic)
    if not documents:
        raise ValueError(f"No Wikipedia data found for {topic}")

    texts = [doc.page_content for doc in documents]
    vector_db = FAISS.from_texts(texts, embedding_model)
    return vector_db

# Load OpenRouter GPT model
llm = ChatOpenAI(
    model_name="openai/gpt-3.5-turbo",
    openai_api_key=OPENROUTER_API_KEY,
    openai_api_base="https://openrouter.ai/api/v1"
)

# Create Retrieval QA Chain
def create_rag_chain(vector_db):
    retriever = vector_db.as_retriever(search_kwargs={"k": 3})  # Fetch top 3 relevant docs
    return RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

if __name__ == '__main__':
    topic = input("Enter a topic to fetch Wikipedia data: ").strip()
    if not topic:
        print("Error: Topic cannot be empty.")
        exit()

    try:
        vector_db = build_faiss_index(topic)
        qa_chain = create_rag_chain(vector_db)

        while True:
            user_query = input(f"Ask a question about {topic} (or type 'exit' to quit): ").strip()
            if user_query.lower() == 'exit':
                print("Goodbye!")
                break
            response = qa_chain.run(user_query)
            print("\nResponse:", response)
    except Exception as e:
        print("Error:", str(e))


Enter your OpenRouter API Key: ··········


  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  llm = ChatOpenAI(


Enter a topic to fetch Wikipedia data: virat kohli
Ask a question about virat kohli (or type 'exit' to quit): wife name


  response = qa_chain.run(user_query)



Response: Virat Kohli's wife is actress Anushka Sharma.
Ask a question about virat kohli (or type 'exit' to quit): no of international hundreds

Response: I don't have specific information about the number of international hundreds scored by Rohit Sharma.
Ask a question about virat kohli (or type 'exit' to quit): ipl franchise name

Response: The franchise names mentioned in the context are Kolkata Knight Riders (KKR) and Royal Challengers Bengaluru (RCB).
Ask a question about virat kohli (or type 'exit' to quit): exit
Goodbye!


In [3]:
!pip install pymupdf


Collecting pymupdf
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m35.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.25.5


In [4]:

import os
import fitz  # PyMuPDF for PDF processing
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI  # Using OpenRouter
from langchain.chains import RetrievalQA
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Set OpenRouter API Key (ensure it's set in your environment)
OPENROUTER_API_KEY = "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"

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = "\n".join([page.get_text("text") for page in doc])
    return text

# Load PDF and create FAISS index
def build_faiss_index(pdf_path):
    text = extract_text_from_pdf(pdf_path)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = text_splitter.split_text(text)

    documents = [Document(page_content=t) for t in texts]

    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vector_db = FAISS.from_documents(documents, embedding_model)
    return vector_db

# Load OpenRouter GPT model
llm = ChatOpenAI(
    model_name="openai/gpt-3.5-turbo",
    openai_api_key=OPENROUTER_API_KEY,
    openai_api_base="https://openrouter.ai/api/v1"
)

# Create Retrieval QA Chain
def create_rag_chain(vector_db):
    retriever = vector_db.as_retriever()
    return RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

if __name__ == '__main__':
    pdf_path = input("Enter the path to your PDF file: ")
    vector_db = build_faiss_index(pdf_path)
    qa_chain = create_rag_chain(vector_db)

    while True:
        user_query = input("Ask a question from the PDF (or type 'exit' to quit): ")
        if user_query.lower() == 'exit':
            break
        response = qa_chain.run(user_query)
        print("Response:", response)

Enter the path to your PDF file: /content/Ch.01_Introduction_ to_computers.pdf
Ask a question from the PDF (or type 'exit' to quit): what is computer
Response: A computer is an electronic device that operates under the control of instructions stored in its memory. It can accept data as input, process the data according to specified rules, produce information as output, and store information for future use. Computers consist of hardware (physical components like monitor, keyboard, storage devices) and software (programs and instructions). They can be classified based on size and power into categories like personal computers, workstations, minicomputers, mainframes, and supercomputers. Computers are versatile and can perform various tasks but can only do what they have been programmed to do.
Ask a question from the PDF (or type 'exit' to quit): exit


In [7]:

import requests
import fitz  # PyMuPDF for PDF handling (if needed)
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain.chat_models import ChatOpenAI  # LLaMA/Mistral via OpenRouter
from langchain.chains import RetrievalQA
from langchain.schema import Document

# PubMed API Constants
PUBMED_API_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
PUBMED_FETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

# Initialize embeddings model
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# OpenRouter API Key for LLaMA/Mistral
OPENROUTER_API_KEY = "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"

# Initialize LLM (LLaMA/Mistral)
llm = ChatOpenAI(
    model_name="mistralai/mistral-7b-instruct",  # You can change to "meta-llama/llama-2-7b-chat"
    openai_api_key=OPENROUTER_API_KEY,
    openai_api_base="https://openrouter.ai/api/v1"
)

def fetch_pubmed_articles(query, max_results=5):
    """ Fetch PubMed article abstracts based on query. """
    params = {
        "db": "pubmed",
        "term": query,
        "retmode": "json",
        "retmax": max_results
    }
    response = requests.get(PUBMED_API_URL, params=params)
    article_ids = response.json().get("esearchresult", {}).get("idlist", [])

    if not article_ids:
        return []

    # Fetch article details
    fetch_params = {
        "db": "pubmed",
        "id": ",".join(article_ids),
        "retmode": "text",
        "rettype": "abstract"
    }
    response = requests.get(PUBMED_FETCH_URL, params=fetch_params)
    return response.text.split("\n\n")  # Split articles

def build_faiss_index(articles):
    """ Process articles into FAISS vector store. """
    text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    text_chunks = [chunk for article in articles for chunk in text_splitter.split_text(article)]

    documents = [Document(page_content=chunk) for chunk in text_chunks]
    vector_db = FAISS.from_documents(documents, embedding_model)
    return vector_db

def create_rag_chain(vector_db):
    """ Create Retrieval-Augmented Generation (RAG) Chain. """
    retriever = vector_db.as_retriever()
    return RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

if __name__ == '__main__':
    user_query = input("Enter a medical topic (e.g., 'Diabetes treatment', 'COVID-19 vaccines'): ")
    print(f"Fetching articles related to: {user_query}")

    articles = fetch_pubmed_articles(user_query)

    if not articles:
        print("No relevant articles found on PubMed.")
    else:
        print("Building vector database...")
        vector_db = build_faiss_index(articles)
        qa_chain = create_rag_chain(vector_db)

        print("System ready! Ask health-related questions.")
        while True:
            question = input("Ask a health question (or type 'exit' to quit): ")
            if question.lower() == 'exit':
                break
            response = qa_chain.run(question)
            print("Response:", response)

Enter a medical topic (e.g., 'Diabetes treatment', 'COVID-19 vaccines'): insomnia
Fetching articles related to: insomnia
Building vector database...
System ready! Ask health-related questions.
Ask a health question (or type 'exit' to quit): how to prevent it
Response:  The provided context does not provide information on how to prevent a specific disease or condition. It simply lists the authors, their affiliations, and the publication details of two different research articles. To find information on how to prevent a disease or condition, you would need to consult the research articles themselves or other authoritative sources such as the Centers for Disease Control and Prevention (CDC) or the World Health Organization (WHO).
Ask a health question (or type 'exit' to quit): then why you are here get lost
Response:  Based on the provided context, it appears that the titles are for academic articles related to the psychological status of pregnant women in Nairobi County, Kenya during the

In [1]:
!pip install openai-whisper




In [3]:
!pip install git+https://github.com/openai/CLIP.git


Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-0birup1_
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-0birup1_
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy (from clip==1.0)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369490 sha256=ac959db59f5d0850e0ece8597c1f39643cb71a9cfd3de87be6377678dda5c441
  Stored in directory: /tmp/pip-ephem-wheel-cache-ycx_yzcm/wheels/3f/7c/a4/9b490845988bf7a4d

In [4]:
import os
import torch
import faiss
import numpy as np
import whisper
import clip
from PIL import Image
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI  # OpenRouter LLM
from langchain.chains import RetrievalQA
from langchain.schema import Document
from transformers import CLIPProcessor, CLIPModel

# Set API Key for OpenRouter
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")  # Ensure this is set

# Load Models
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, clip_preprocess = clip.load("ViT-B/32", device=device)  # CLIP for images
whisper_model = whisper.load_model("small")  # Whisper for audio transcription
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Initialize OpenRouter LLM
llm = ChatOpenAI(
    model_name="mistralai/mistral-7b-instruct",
    openai_api_key=OPENROUTER_API_KEY,
    openai_api_base="https://openrouter.ai/api/v1"
)

# ========== TEXT PROCESSING ==========
def embed_text(text):
    """ Convert text into a vector embedding using HuggingFace. """
    return embedding_model.embed_query(text)

# ========== IMAGE PROCESSING ==========
def embed_image(image_path):
    """ Convert image into a vector embedding using CLIP. """
    image = Image.open(image_path)
    image_input = clip_preprocess(image).unsqueeze(0).to(device)
    with torch.no_grad():
        image_features = clip_model.encode_image(image_input)
    return image_features.cpu().numpy().flatten()

# ========== AUDIO PROCESSING ==========
def transcribe_audio(audio_path):
    """ Transcribe audio using Whisper and then embed it. """
    transcription = whisper_model.transcribe(audio_path)["text"]
    return embed_text(transcription)

# ========== FAISS DATABASE ==========
vector_dim = 512  # CLIP & MiniLM use 512D embeddings
faiss_index = faiss.IndexFlatL2(vector_dim)  # Create FAISS index
vector_store = {}

def add_to_faiss(identifier, vector):
    """ Add embeddings to FAISS with an identifier. """
    global faiss_index, vector_store
    faiss_index.add(np.array([vector]).astype("float32"))
    vector_store[len(vector_store)] = identifier  # Track index position

def retrieve_similar(query_vector, top_k=3):
    """ Retrieve top-k similar items from FAISS. """
    _, indices = faiss_index.search(np.array([query_vector]).astype("float32"), top_k)
    return [vector_store[idx] for idx in indices[0]]

# ========== DATA INGESTION ==========
def ingest_data(data_path, data_type):
    """ Ingests text, image, or audio into FAISS. """
    if data_type == "text":
        vector = embed_text(open(data_path, "r").read())
    elif data_type == "image":
        vector = embed_image(data_path)
    elif data_type == "audio":
        vector = transcribe_audio(data_path)
    else:
        raise ValueError("Invalid data type. Choose from 'text', 'image', 'audio'.")

    add_to_faiss(data_path, vector)
    print(f"✅ {data_type.capitalize()} data added to FAISS: {data_path}")

# ========== QUERY SYSTEM ==========
def multimodal_query(query, query_type):
    """ Handles text, image, and audio queries for retrieval and LLM response. """
    if query_type == "text":
        query_vector = embed_text(query)
    elif query_type == "image":
        query_vector = embed_image(query)
    elif query_type == "audio":
        query_vector = transcribe_audio(query)
    else:
        raise ValueError("Invalid query type. Use 'text', 'image', or 'audio'.")

    # Retrieve relevant documents
    results = retrieve_similar(query_vector)

    # Format retrieved data for LLM
    context = "\n".join([f"Relevant Data: {open(res, 'r').read() if res.endswith('.txt') else res}" for res in results])
    prompt = f"Based on the retrieved data, answer the following:\nQuery: {query}\nContext: {context}"

    # Generate final response
    response = llm.invoke(prompt)
    return response.content

# ========== DEMO ==========
if __name__ == "__main__":
    print("🚀 Multimodal RAG System Initialized!")

    # Ingest Sample Data (Text, Image, and Audio)
    ingest_data("sample.txt", "text")  # Add text data
    ingest_data("sample.jpg", "image")  # Add image data
    ingest_data("sample.mp3", "audio")  # Add audio data

    while True:
        user_query = input("\nEnter a query (or type 'exit' to quit): ")
        if user_query.lower() == "exit":
            break

        query_type = input("What type of query is this? (text/image/audio): ").strip().lower()
        response = multimodal_query(user_query, query_type)
        print("\n🧠 AI Response:", response)


100%|███████████████████████████████████████| 338M/338M [00:06<00:00, 55.9MiB/s]
100%|███████████████████████████████████████| 461M/461M [00:09<00:00, 51.6MiB/s]
  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  llm = ChatOpenAI(


ValidationError: 1 validation error for ChatOpenAI
  Value error, Did not find openai_api_key, please add an environment variable `OPENAI_API_KEY` which contains it, or pass `openai_api_key` as a named parameter. [type=value_error, input_value={'model_name': 'mistralai...ne, 'http_client': None}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/value_error