In [None]:
#This notebook implements a RAG pipeline for querying the Hugging Face documentation using LangChain and Hugging Face models.
#It is based on the tutorial at https://colab.research.google.com/github/huggingface/cookbook/blob/main/notebooks/en/advanced_rag.ipynb#scrollTo=Abn4gw5A9-9O
#The steps are as follows:
#1. Load the dataset
#2. Chunk the dataset
#3. Create a vector store of the dataset
#4. Create an embedding model for initial retrieval
#5. Create a reranker
#6. Create a chat model
#7. Combine all of them into a RAG pipeline

In [None]:
!pip install -q torch transformers datasets accelerate bitsandbytes langchain sentence-transformers faiss-gpu openpyxl pacmap

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m817.7/817.7 kB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━

In [None]:
from tqdm.notebook import tqdm
import pandas as pd
from typing import Optional, List, Tuple
from datasets import Dataset
import matplotlib.pyplot as plt
# Never truncate long cell text when pandas displays a DataFrame.
pd.set_option('display.max_colwidth', None)

In [None]:
# Step 1: download the "train" split of the m-ric/huggingface_doc dataset from the Hub.
from datasets import load_dataset
dataset = load_dataset(path="m-ric/huggingface_doc", split="train")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/22.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2647 [00:00<?, ? examples/s]

In [None]:
# Step 2: wrap every dataset row in a LangChain Document so the text splitter can
# consume it; the row's 'source' field is carried along as metadata.
from langchain.docstore.document import Document
raw_data = [
    Document(
        page_content=record['text'],
        metadata={'source': record['source']},
    )
    for record in dataset
]

In [None]:
#3. Define a function to chunk the documents using recursive character and tokenizer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer
def split_documents(chunk_size, raw_data, separators, model_name, is_separator_regex=False):
    """Split documents into token-bounded chunks and drop duplicate chunks.

    Args:
        chunk_size: Maximum chunk length, measured with the tokenizer of
            ``model_name``. Consecutive chunks overlap by
            ``int(chunk_size / 10)``.
        raw_data: Iterable of LangChain ``Document`` objects to split.
        separators: Separator strings handed to the recursive splitter.
        model_name: Hugging Face model id whose tokenizer measures chunk length.
        is_separator_regex: Forwarded to the splitter. Leave at ``False`` (the
            splitter's own default, i.e. the previous behaviour) to match the
            separators literally; pass ``True`` when ``separators`` holds
            regular-expression patterns.

    Returns:
        A list of chunk ``Document`` objects in their original order, keeping
        only the first chunk for each distinct ``page_content``.
    """
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained(model_name),
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        separators=separators,
        is_separator_regex=is_separator_regex,
    )

    # Grow the list in place: `split_docs = split_docs + ...` re-copied every
    # chunk collected so far on each iteration.
    split_docs = []
    for doc in raw_data:
        split_docs.extend(text_splitter.split_documents([doc]))

    # Keep only the first chunk seen for each distinct text.
    seen_texts = set()
    unique_docs = []
    for doc in split_docs:
        if doc.page_content not in seen_texts:
            seen_texts.add(doc.page_content)
            unique_docs.append(doc)
    return unique_docs

In [None]:
# Candidate split points handed to the splitter: markdown headings, code fences,
# horizontal rules, blank lines, line breaks, spaces and finally the empty string.
# NOTE(review): several entries look like regex patterns (`#{1,6}`, `---+`), but
# split_documents builds the splitter without is_separator_regex=True, so they
# are presumably matched literally -- confirm whether regex matching was intended.
separators = [
    "\n#{1,6} ", "```\n", "\n\\*\\*\\*+\n", "\n---+\n", "\n___+\n",
    "\n\n", "\n", " ", "",
]
# The embedding model's tokenizer is also used to measure chunk length.
embedding_name = 'thenlper/gte-small'
split_docs = split_documents(
    chunk_size=512,
    raw_data=raw_data,
    separators=separators,
    model_name=embedding_name,
)

In [None]:
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy

# Embed the chunks on the GPU with the same model whose tokenizer sized them;
# embeddings are normalized and the FAISS index uses the cosine distance strategy.
embedding_model = HuggingFaceEmbeddings(
    model_name=embedding_name,
    model_kwargs=dict(device='cuda'),
    encode_kwargs=dict(normalize_embeddings=True),
)
v = FAISS.from_documents(
    split_docs,
    embedding_model,
    distance_strategy=DistanceStrategy.COSINE,
)

In [None]:
# Smoke-test the retriever: fetch the nearest chunks for a sample query and print them.
query = 'How to use sentence transformers'
# Not consumed by any cell visible here; kept in case later cells rely on it.
embedded_query = embedding_model.embed_query(query)
retrieved_docs = v.similarity_search(query = query, k = 30)
# Iterate over what was actually returned instead of a fixed range(30): the
# store may hand back fewer than k hits, which would raise an IndexError.
for doc in retrieved_docs:
  print('FROM THE DOCUMENT')
  print(doc.page_content)
  print(doc.metadata['source'])
  print('->->->->')

FROM THE DOCUMENT
Second, Sentence Transformers is an accessible entry-point to many important ML concepts that you can branch off into. For example, you can use it to learn about [clustering](https://www.sbert.net/examples/applications/clustering/README.html), [model distillation](https://www.sbert.net/examples/training/distillation/README.html), and even launch into text-to-image work with [CLIP](https://www.sbert.net/examples/applications/image-search/README.html). In fact, Sentence Transformers is so versatile that it’s skyrocketed to almost 8,000 stars on GitHub, with [more than 3,000 projects and packages depending on it](https://github.com/UKPLab/sentence-transformers/network/dependents?dependent_type=REPOSITORY&package_id=UGFja2FnZS00ODgyNDAwNzQ%3D). On top of the official docs, there’s an abundance of community-created content (look for some links at the end of this post 👀), and the library’s ubiquity has made it [popular in research](https://twitter.com/NimaBoscarino/status/1

In [None]:
!pip -q install ragatouille

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/41.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m41.0/41.8 kB[0m [31m1.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.8/41.8 kB[0m [31m867.1 kB/s[0m eta [36m0:00:00[0m
[?25h[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m81.9/86.7 kB[0m [31m2.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.7/86.7 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m34.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB

In [None]:
from transformers import pipeline, BitsAndBytesConfig, AutoModelForCausalLM
import torch

# Chat model: Zephyr-7B-beta loaded with 4-bit NF4 quantization and bfloat16 compute.
model_name = 'HuggingFaceH4/zephyr-7b-beta'
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
)
tokeniser = AutoTokenizer.from_pretrained(model_name)

# Text-generation pipeline that returns only the newly generated text
# (return_full_text=False), capped at 500 new tokens, sampling at temperature 0.1.
pipe = pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokeniser,
    do_sample=True,
    temperature=0.1,
    repetition_penalty=1,
    return_full_text=False,
    max_new_tokens=500,
)

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

In [None]:
# Prompt skeleton for the RAG step. {context} and {question} are filled in later
# with str.format. The messages are assembled from adjacent string literals
# instead of triple-quoted blocks so that source-code indentation and stray
# newlines do not leak into the prompt, and the context is put on its own lines,
# clearly separated from the question.
chat_prompt = [
    {
        'role': 'system',
        'content': (
            'Use the information in the context below to answer the question, also below, '
            'comprehensively and correctly. Respond only to the question asked. '
            'Give a source when you can. '
            'If you cannot give an answer, it is better that you tell it.'
        ),
    },
    {
        'role': 'user',
        'content': (
            'Context:\n{context}\n---\n'
            'The question you need to answer is\n'
            'Question: {question}'
        ),
    },
]
# Render the messages with the chat model's own template, as a string
# (tokenize=False), ending with the assistant generation prompt.
rag_prompt_template = tokeniser.apply_chat_template(chat_prompt, tokenize = False, add_generation_prompt = True)
print(rag_prompt_template)

<|system|>
 Use the information in the context below to answer the question, also below, comprehensively and
                correctly. Respond only to the question asked. Give a source when you can. If you cannot give an answer, it is better that you tell it. 
    </s>
<|user|>
 Context {context} The question you need to answer is Question : {question}
     </s>
<|assistant|>



In [None]:
# Load the 'colbert-ir/colbertv2.0' checkpoint through RAGatouille; the cells
# below call its rerank() method on the passages returned by the vector store.
from ragatouille import RAGPretrainedModel
reranker = RAGPretrainedModel.from_pretrained('colbert-ir/colbertv2.0')

artifact.metadata:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/405 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# Rerank the raw text of the retrieved chunks against the query, keep the best 5
# and print each reranker result.
page_content = list(map(lambda retrieved: retrieved.page_content, retrieved_docs))
relevant_docs = reranker.rerank(query, page_content, k=5)
for doc in relevant_docs:
  print('FROM THE DOCUMENT', doc, '->->->->', sep='\n')

100%|██████████| 1/1 [00:00<00:00,  2.48it/s]

FROM THE DOCUMENT
{'content': 'Second, Sentence Transformers is an accessible entry-point to many important ML concepts that you can branch off into. For example, you can use it to learn about [clustering](https://www.sbert.net/examples/applications/clustering/README.html), [model distillation](https://www.sbert.net/examples/training/distillation/README.html), and even launch into text-to-image work with [CLIP](https://www.sbert.net/examples/applications/image-search/README.html). In fact, Sentence Transformers is so versatile that it’s skyrocketed to almost 8,000 stars on GitHub, with [more than 3,000 projects and packages depending on it](https://github.com/UKPLab/sentence-transformers/network/dependents?dependent_type=REPOSITORY&package_id=UGFja2FnZS00ODgyNDAwNzQ%3D). On top of the official docs, there’s an abundance of community-created content (look for some links at the end of this post 👀), and the library’s ubiquity has made it [popular in research](https://twitter.com/NimaBosca




In [None]:
def RAG(pipe, v, reranker, query, first_k, second_k, prompt_template=None):
    """Answer ``query`` by retrieving, reranking and then generating.

    Args:
        pipe: Callable taking a prompt string and returning a list whose first
            item is a dict with a 'generated_text' entry.
        v: Vector store exposing ``similarity_search(query=..., k=...)`` and
            returning objects with a ``page_content`` attribute.
        reranker: Object exposing ``rerank(query, documents, k=...)`` and
            returning dicts with a 'content' entry.
        query: The user question.
        first_k: Number of passages to fetch from the vector store.
        second_k: Number of passages to keep after reranking.
        prompt_template: Format string with ``{context}`` and ``{question}``
            placeholders. Defaults to the notebook-level ``rag_prompt_template``.

    Returns:
        A tuple ``(answer, reranked_docs)``: the generated text and the
        reranker results that were used as context.
    """
    if prompt_template is None:
        # Fall back to the template built earlier in the notebook.
        prompt_template = rag_prompt_template

    retrieved = v.similarity_search(query = query, k = first_k)
    passages = [doc.page_content for doc in retrieved]

    if passages:
        # Never ask the reranker for more passages than were retrieved.
        reranked_docs = reranker.rerank(query, passages, k = min(second_k, len(passages)))
    else:
        # Nothing retrieved: skip reranking and generate with an empty context.
        reranked_docs = []

    # Put a delimiter after each header and between documents; previously the
    # header, the text and the next header were glued together
    # ("Document 0<text>Document 1<text>...").
    context = '\n\n'.join(
        f"Document {i}:\n{doc['content']}" for i, doc in enumerate(reranked_docs)
    )
    prompt = prompt_template.format(question = query, context = context)
    answer = pipe(prompt)[0]['generated_text']
    return answer, reranked_docs
# Run the full pipeline on the sample query: retrieve 30 chunks, keep the 5 best
# after reranking, then print the generated answer.
answer, reranked_docs = RAG(
    pipe,
    v,
    reranker,
    query,
    first_k=30,
    second_k=5,
)
print(answer)

100%|██████████| 1/1 [00:00<00:00,  2.64it/s]


Sentence Transformers is a library that allows for the computation of dense vector representations, also known as embeddings, for sentences, paragraphs, and images. These embeddings enable applications such as semantic search, clustering, and retrieval by placing similar text close together in a vector space. The library, called `sentence-transformers`, can be found on the Hugging Face Hub, where over 500 pre-trained models are available for various tasks, including feature extraction and sentence similarity. These models come with features such as automatically generated model cards, metadata tags, and interactive widgets for easy exploration. The library works by passing input text through a pre-trained Transformer model, such as DistilBERT, and then applying a pooling function to generate a single fixed-length embedding for the entire text. This process can be easily implemented in Python using the `sentence-transformers` library, and pre-trained models can be loaded and used direct