In [1]:
documents = [
    "This is a list which containig sample documents.",
    "Keywords are important for keyword-based search.",
    "Document analysis involves extracting keywords.",
    "Keyword-based search relies on sparse embeddings."
]

In [2]:
query="keyword-based search"

In [3]:
import re
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    return text

In [4]:
preprocess_documents=[preprocess_text(doc) for doc in documents]

In [5]:
preprocess_documents

['this is a list which containig sample documents',
 'keywords are important for keywordbased search',
 'document analysis involves extracting keywords',
 'keywordbased search relies on sparse embeddings']

In [6]:
preprocess_query=preprocess_text(query)

In [7]:
preprocess_query

'keywordbased search'

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [9]:
vector=TfidfVectorizer()

In [10]:
doc_embedding=vector.fit_transform(preprocess_documents)

In [11]:
doc_embedding.toarray()

array([[0.        , 0.        , 0.37796447, 0.        , 0.37796447,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.37796447, 0.        , 0.        , 0.37796447, 0.        ,
        0.        , 0.37796447, 0.        , 0.        , 0.37796447,
        0.37796447],
       [0.        , 0.4533864 , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.4533864 , 0.4533864 , 0.        ,
        0.        , 0.35745504, 0.35745504, 0.        , 0.        ,
        0.        , 0.        , 0.35745504, 0.        , 0.        ,
        0.        ],
       [0.46516193, 0.        , 0.        , 0.46516193, 0.        ,
        0.        , 0.46516193, 0.        , 0.        , 0.46516193,
        0.        , 0.        , 0.36673901, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.43671931, 0.        , 0.        , 0.       

In [12]:
doc_embedding.toarray()[0]

array([0.        , 0.        , 0.37796447, 0.        , 0.37796447,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.37796447, 0.        , 0.        , 0.37796447, 0.        ,
       0.        , 0.37796447, 0.        , 0.        , 0.37796447,
       0.37796447])

In [13]:
query_embedding=vector.transform([preprocess_query])

In [14]:
query_embedding.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.70710678, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.70710678, 0.        , 0.        ,
        0.        ]])

In [15]:
similarities=cosine_similarity(query_embedding,doc_embedding)

In [16]:
similarities

array([[0.        , 0.50551777, 0.        , 0.48693426]])

In [17]:
np.argsort(similarities)

array([[0, 2, 3, 1]])

In [18]:
rank_ind=np.argsort(similarities)[0][::-1]
rank_ind

array([1, 3, 2, 0])

In [19]:
rank_doc=[documents[i] for i in rank_ind]
rank_doc

['Keywords are important for keyword-based search.',
 'Keyword-based search relies on sparse embeddings.',
 'Document analysis involves extracting keywords.',
 'This is a list which containig sample documents.']

In [20]:
!pip install -qU pypdf langchain_community

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/309.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.7/309.7 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m85.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/45.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.2/45.2 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/50.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [10]:
from langchain_community.document_loaders import PyPDFLoader

In [11]:
filepath="/content/CS_DSAI_Roadmap.pdf"

In [12]:
docs=PyPDFLoader(filepath).load()

In [13]:
docs[0]

Document(metadata={'producer': 'Skia/PDF m102', 'creator': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Joplin/2.9.17 Chrome/102.0.5005.167 Electron/19.0.10 Safari/537.36', 'creationdate': '2023-05-19T07:15:42+00:00', 'moddate': '2023-05-19T07:15:42+00:00', 'source': '/content/CS_DSAI_Roadmap.pdf', 'total_pages': 7, 'page': 0, 'page_label': '1'}, page_content="CS_DSAI_Roadmap\nAbout\nThe sheet has been as a checkpoint/roadmap for students wanting to pursue CSE and DSAI. If you have any queries or want elaboration on any point, please contact me.\nContact Details - Name : Vaibhav Arora Phone : +91-8168401709 Email : vaibhavarora@iitbhilai.ac.in Note - You may also contact me for arrangement of course materials taught at IIT Bhilai.\nSDE targeted Tasks (CSE)\nSDE stands for Software development intern. It requires you to be good ith DSA and development. It is very important that CSE folks focus on their core subjects for this but there are plently of courses an

In [14]:
docs[0].metadata['total_pages']

7

In [15]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [16]:
text_spliter=RecursiveCharacterTextSplitter(chunk_size=200,chunk_overlap=40)

In [17]:
chunks=text_spliter.create_documents([docs[0].page_content])

In [18]:
chunks

[Document(metadata={}, page_content='CS_DSAI_Roadmap\nAbout\nThe sheet has been as a checkpoint/roadmap for students wanting to pursue CSE and DSAI. If you have any queries or want elaboration on any point, please contact me.'),
 Document(metadata={}, page_content='Contact Details - Name : Vaibhav Arora Phone : +91-8168401709 Email : vaibhavarora@iitbhilai.ac.in Note - You may also contact me for arrangement of course materials taught at IIT Bhilai.'),
 Document(metadata={}, page_content='SDE targeted Tasks (CSE)'),
 Document(metadata={}, page_content='SDE stands for Software development intern. It requires you to be good ith DSA and development. It is very important that CSE folks focus on their core subjects for this but there are plently of'),
 Document(metadata={}, page_content='for this but there are plently of courses and Problemsets online to learn from. Given 2 months of vacation, I would recommend the following'),
 Document(metadata={}, page_content="tasks -:\nStrivers' DSA Sh

In [23]:
!pip install -U sentence_transformers



In [19]:
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

In [33]:
from google.colab import userdata

In [34]:
HF_TOKEN=userdata.get("hf")

In [None]:
embeddings = HuggingFaceEmbeddings( model_name="BAAI/bge-base-en-v1.5")

In [None]:
em_q=embeddings.embed_query("hello")
em_q[:5]

In [37]:
!pip install -qU "langchain-chroma>=0.1.2"

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.5/19.5 MB[0m [31m74.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.2/284.2 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m65.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.6/101.6 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.5/16.5 MB[0m [31m78.7 MB/s[0m eta [36m0:00:0

In [None]:
from langchain_chroma import Chroma

vector_store = Chroma.from_documents(chunks,embeddings)

In [39]:
vectorstore_retreiver = vector_store.as_retriever(search_kwargs={"k": 3})

In [40]:
res=vectorstore_retreiver.invoke("what is SDE")

In [41]:
res

[Document(id='5869d7eb-d59e-4df1-a0e7-a6df4e856000', metadata={}, page_content='SDE stands for Software development intern. It requires you to be good ith DSA and development. It is very important that CSE folks focus on their core subjects for this but there are plently of'),
 Document(id='5882b142-4539-441b-8827-5f66f9b5c90c', metadata={}, page_content='SDE targeted Tasks (CSE)'),
 Document(id='d860503b-acd5-4a8a-b8bb-460a65e7a5f8', metadata={}, page_content="tasks -:\nStrivers' DSA Sheet - https://takeuforward.org/strivers-a2z-dsa-course/strivers-a2z-dsa-course-\nsheet-2/\nStrivers SDE Sheet (if you have completed Networking, DBMS etc.) -")]

In [42]:
!pip install -qU rank_bm25

In [43]:
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever

In [44]:
keyword_retriver=BM25Retriever.from_documents(chunks)

In [46]:
keyword_retriver.k=3

In [47]:
keyword_retriver.invoke("what is SDE")

[Document(metadata={}, page_content='SDE targeted Tasks (CSE)'),
 Document(metadata={}, page_content='SDE stands for Software development intern. It requires you to be good ith DSA and development. It is very important that CSE folks focus on their core subjects for this but there are plently of'),
 Document(metadata={}, page_content="tasks -:\nStrivers' DSA Sheet - https://takeuforward.org/strivers-a2z-dsa-course/strivers-a2z-dsa-course-\nsheet-2/\nStrivers SDE Sheet (if you have completed Networking, DBMS etc.) -")]

In [54]:
ensemble_retriever = EnsembleRetriever(retrievers=[vectorstore_retreiver,keyword_retriver],weights=[0.5, 0.5])

In [55]:
ensemble_retriever.invoke("what is SDE")

[Document(id='5869d7eb-d59e-4df1-a0e7-a6df4e856000', metadata={}, page_content='SDE stands for Software development intern. It requires you to be good ith DSA and development. It is very important that CSE folks focus on their core subjects for this but there are plently of'),
 Document(id='5882b142-4539-441b-8827-5f66f9b5c90c', metadata={}, page_content='SDE targeted Tasks (CSE)'),
 Document(id='d860503b-acd5-4a8a-b8bb-460a65e7a5f8', metadata={}, page_content="tasks -:\nStrivers' DSA Sheet - https://takeuforward.org/strivers-a2z-dsa-course/strivers-a2z-dsa-course-\nsheet-2/\nStrivers SDE Sheet (if you have completed Networking, DBMS etc.) -")]

In [56]:
model_name = "HuggingFaceH4/zephyr-7b-beta"

In [71]:
!pip install -U bitsandbytes

Collecting bitsandbytes
  Using cached bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting torch<3,>=2.2 (from bitsandbytes)
  Downloading torch-2.7.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting numpy>=1.17 (from bitsandbytes)
  Downloading numpy-2.3.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting filelock (from torch<3,>=2.2->bitsandbytes)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting typing-extensions>=4.10.0 (from torch<3,>=2.2->bitsandbytes)
  Downloading typing_extensions-4.14.1-py3-none-any.whl.metadata (3.0 kB)
Collecting sympy>=1.13.3 (from torch<3,>=2.2->bitsandbytes)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch<3,>=2.2->bitsandbytes)
  Downloading networkx-3.5-py3-none-any.whl.metad

In [59]:
!pip install -qU accelerate

In [60]:
import torch
from transformers import ( AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline, )
from langchain_huggingface.llms import HuggingFacePipeline

In [68]:
# function for loading 4-bit quantized model
def load_quantized_model(model_name: str):
    """
    model_name: Name or path of the model to be loaded.
    return: Loaded quantized model.
    """
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        quantization_config=bnb_config,
    )
    return model

In [62]:
# initializing tokenizer
def initialize_tokenizer(model_name: str):
    """
    model_name: Name or path of the model for tokenizer initialization.
    return: Initialized tokenizer.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name, return_token_type_ids=False)
    tokenizer.bos_token_id = 1  # Set beginning of sentence token id
    return tokenizer

In [69]:
tokenizer = initialize_tokenizer(model_name)

In [None]:
model = load_quantized_model(model_name)

In [None]:
pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    use_cache=True,
    device_map="auto",
    max_length=2048,
    do_sample=True,
    top_k=5,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

In [None]:
llm = HuggingFacePipeline(pipeline=pipeline)

In [None]:
from langchain.chains import RetrievalQA


In [None]:
normal_chain = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=vectorstore_retreiver
)

In [None]:
hybrid_chain = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=ensemble_retriever
)

In [None]:
response1 = normal_chain.invoke("What is Abstractive Question Answering?")
response1

In [None]:
response2 = hybrid_chain.invoke("What is Abstractive Question Answering?")
response2