In [1]:
! pip install -q sentence-transformers langchain-community langchain-core pypdf langchain-chroma langchain-google-genai rank_bm25

# Keyword Search

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import string

In [3]:
# Sample documents
documents = [
    "This is a list which containig sample documents.",
    "Keywords are important for keyword-based search.",
    "Document analysis involves extracting keywords.",
    "Keyword-based search relies on sparse embeddings."
]

In [4]:
query = "Keyword-based search"
query

'Keyword-based search'

In [5]:
def preprocess_text(text):
  text = text.lower()
  text = text.translate(str.maketrans('', '', string.punctuation))
  return text

In [6]:
preprocessed_query = preprocess_text(query)

In [7]:
preprocessed_documents = [preprocess_text(x) for x in documents]
preprocessed_documents

['this is a list which containig sample documents',
 'keywords are important for keywordbased search',
 'document analysis involves extracting keywords',
 'keywordbased search relies on sparse embeddings']

In [8]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(preprocessed_documents)

In [9]:
X.toarray()

array([[0.        , 0.        , 0.37796447, 0.        , 0.37796447,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.37796447, 0.        , 0.        , 0.37796447, 0.        ,
        0.        , 0.37796447, 0.        , 0.        , 0.37796447,
        0.37796447],
       [0.        , 0.4533864 , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.4533864 , 0.4533864 , 0.        ,
        0.        , 0.35745504, 0.35745504, 0.        , 0.        ,
        0.        , 0.        , 0.35745504, 0.        , 0.        ,
        0.        ],
       [0.46516193, 0.        , 0.        , 0.46516193, 0.        ,
        0.        , 0.46516193, 0.        , 0.        , 0.46516193,
        0.        , 0.        , 0.36673901, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.43671931, 0.        , 0.        , 0.       

In [10]:
X.toarray()[0]

array([0.        , 0.        , 0.37796447, 0.        , 0.37796447,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.37796447, 0.        , 0.        , 0.37796447, 0.        ,
       0.        , 0.37796447, 0.        , 0.        , 0.37796447,
       0.37796447])

In [11]:
query_embedding = tfidf.transform([preprocessed_query])
query_embedding.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.70710678, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.70710678, 0.        , 0.        ,
        0.        ]])

In [12]:
similarity_scores = cosine_similarity(query_embedding, X)
similarity_scores

array([[0.        , 0.50551777, 0.        , 0.48693426]])

In [13]:
ranked_indices = np.argsort(similarity_scores[0])[::-1]
ranked_indices

array([1, 3, 2, 0])

In [14]:
for i, doc in enumerate(ranked_indices):
  print(f"Rank {i+1}: {documents[doc]}")

Rank 1: Keywords are important for keyword-based search.
Rank 2: Keyword-based search relies on sparse embeddings.
Rank 3: Document analysis involves extracting keywords.
Rank 4: This is a list which containig sample documents.


# Vector Search

In [15]:
from sentence_transformers import SentenceTransformer

In [16]:
embedding = SentenceTransformer("all-mpnet-base-v2")
embedding

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [17]:
document_embeddings = embedding.encode(preprocessed_documents)
document_embeddings

array([[ 1.8650264e-02, -3.9068390e-02,  8.9845817e-05, ...,
        -1.4877621e-02, -5.5974912e-02,  1.4441775e-02],
       [ 6.5535419e-02, -3.3263426e-02, -3.7593879e-02, ...,
         3.2554816e-02, -6.5969929e-02, -1.8465200e-02],
       [ 5.7807725e-02, -1.8644137e-02, -2.6945801e-02, ...,
        -1.3054643e-02, -8.0498278e-02,  1.2007855e-02],
       [ 3.5619844e-02,  9.6558640e-03,  8.0874981e-03, ...,
         3.3195820e-02, -1.2137770e-01, -1.4678548e-02]], dtype=float32)

In [18]:
query_embedding = embedding.encode(preprocessed_query)
query_embedding

array([ 4.37562093e-02, -1.60320532e-02, -3.05199698e-02, -2.00518221e-02,
       -6.29770458e-02, -6.01423122e-02, -5.82171511e-03,  4.76126932e-02,
       -2.31703036e-02, -3.12311221e-02, -2.21245121e-02, -2.15400029e-02,
       -4.94112484e-02,  6.25988692e-02, -5.50539121e-02, -2.93574836e-02,
        2.98771318e-02,  3.06584258e-02, -1.62722673e-02,  1.34539548e-02,
        6.65953849e-03,  1.59052685e-02,  3.94686870e-03, -9.34478827e-03,
        5.12912497e-02,  5.04843742e-02, -6.12117574e-02, -1.04595674e-02,
        7.52847223e-03, -8.11539590e-03, -1.61834080e-02,  1.27498442e-02,
        2.21845531e-03,  1.41245294e-02,  1.21696098e-06, -2.51499508e-02,
       -6.21999316e-02, -6.32416559e-05, -8.71455879e-04,  4.46435660e-02,
        4.04462852e-02,  4.78317887e-02,  4.70352285e-02, -9.93653946e-03,
       -6.34716777e-03,  3.29303264e-04,  1.89788621e-02, -6.22508989e-04,
        3.73017415e-02,  7.09498301e-04,  1.44601930e-02, -5.98637238e-02,
       -7.31391609e-02, -

In [19]:
similarity_scores = cosine_similarity([query_embedding], document_embeddings)
similarity_scores

array([[0.22118078, 0.83885694, 0.5133637 , 0.646263  ]], dtype=float32)

In [20]:
ranked_indices = np.argsort(similarity_scores[0])[::-1]
ranked_indices

array([1, 3, 2, 0])

In [21]:
for i, doc in enumerate(ranked_indices):
  print(f"Rank {i+1}: {documents[doc]}")

Rank 1: Keywords are important for keyword-based search.
Rank 2: Keyword-based search relies on sparse embeddings.
Rank 3: Document analysis involves extracting keywords.
Rank 4: This is a list which containig sample documents.


# Hybrid Search

In [22]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_chroma import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.chains import RetrievalQA
from google.colab import userdata
import os

In [23]:
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')

os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY

In [24]:
doc = PyPDFLoader("transformers.pdf").load()
doc

[Document(metadata={'source': 'transformers.pdf', 'page': 0, 'page_label': '1'}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗ †\nUniversity of Toronto\naidan@cs.toronto.edu\nŁukasz Kaiser∗\nGoogle Brain\nlukaszkaiser@google.com\nIllia Polosukhin∗ ‡\nillia.polosukhin@gmail.com\nAbstract\nThe dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks that include an encoder and a decoder. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network 

In [25]:
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

In [26]:
chunks = splitter.split_documents(doc)
chunks

[Document(metadata={'source': 'transformers.pdf', 'page': 0, 'page_label': '1'}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗ †\nUniversity of Toronto\naidan@cs.toronto.edu\nŁukasz Kaiser∗\nGoogle Brain\nlukaszkaiser@google.com\nIllia Polosukhin∗ ‡\nillia.polosukhin@gmail.com\nAbstract\nThe dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks that include an encoder and a decoder. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network 

In [27]:
embedding = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")

In [28]:
vectordb = Chroma.from_documents(documents=chunks, embedding=embedding)

In [29]:
vector_retriever = vectordb.as_retriever(search_kwargs={"k": 3})

In [30]:
keyword_retriever = BM25Retriever.from_documents(chunks)
keyword_retriever.k = 3

In [31]:
ensemble_retriver = EnsembleRetriever(retrievers=[vector_retriever,keyword_retriever], weights=[0.3, 0.7])

In [32]:
model = ChatGoogleGenerativeAI(model="gemini-2.0-flash-exp")

In [33]:
normal_chain = RetrievalQA.from_chain_type(llm=model, chain_type="stuff", retriever=vector_retriever)

In [34]:
hybrid_chain = RetrievalQA.from_chain_type(llm=model, chain_type="stuff", retriever=ensemble_retriver)

In [35]:
response1 = normal_chain.invoke('Attention')
response1

{'query': 'Attention',
 'result': 'Attention mechanisms are used to compute the weight assigned to each value, based on a compatibility function of the query with the corresponding key. The two most common attention functions are additive attention and dot-product (multiplicative) attention. Scaled Dot-Product Attention is a specific type of attention mechanism.'}

In [36]:
response2 = hybrid_chain.invoke('Attention')
response2

{'query': 'Attention',
 'result': "Based on the provided context, here's a breakdown of attention mechanisms:\n\n**What is Attention?**\n\n*   An attention function maps a query and a set of key-value pairs to an output.\n*   The query, keys, values, and output are all vectors.\n*   The output is a weighted sum of the values.\n*   The weight assigned to each value is determined by a compatibility function between the query and its corresponding key.\n\n**Types of Attention Discussed:**\n\n1.  **Scaled Dot-Product Attention:**\n    *   This is the specific type of attention used in the Transformer model.\n    *   It calculates the dot products of the query with all keys.\n    *   Each dot product is divided by the square root of the dimension of the keys (√dk).\n    *   A softmax function is applied to get the weights for the values.\n    *   The output is calculated as: Attention(Q, K, V) = softmax(QKT / √dk)V, where Q, K, and V are matrices of queries, keys, and values, respectively.\