In [8]:
!pip install transformers
!pip install langchain_community
!pip install ctransformers
!pip install faiss-cpu
!pip install torch
!pip install numpy
!pip install sentence_transformers


Collecting faiss-cpu
  Downloading faiss_cpu-1.13.0-cp39-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.7 kB)
Downloading faiss_cpu-1.13.0-cp39-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.6/23.6 MB[0m [31m82.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.13.0


In [9]:
from langchain_community.llms.ctransformers import CTransformers
from transformers import AutoTokenizer, AutoModel
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_community.docstore import InMemoryDocstore
import faiss
import torch
import numpy as np

In [10]:
# Load Models
llm = CTransformers(
    model="TheBloke/Llama-2-7b-GGML",
    model_type="llama"
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

llama-2-7b.ggmlv3.q2_K.bin:   0%|          | 0.00/2.87G [00:00<?, ?B/s]

In [11]:
# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [15]:
# Load model
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

def embed_texts(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
    return embeddings

In [16]:
#Get Embeddings Dimentions
embeddings_exmple = embed_texts("Hello. How are you?")
embeding_dim = embeddings_exmple.shape[1]

In [17]:
#initialize FAISS index
index = faiss.IndexFlatL2(embeding_dim)

In [18]:
# Initialize InMemoryDocstore
docstore = InMemoryDocstore()

In [19]:
#Create an index-to-document mapping
index_to_docstore_id = {}

In [20]:
#create the fiass vector store
vector_store = FAISS(embedding_function=embed_texts, index=index, docstore=docstore, index_to_docstore_id=index_to_docstore_id)



In [21]:
#prepare documents
documents = [
    Document(page_content="RAG (Retrieval-Augmented Generation) is a method that combines a language model with an external database or documents, so the model can fetch relevant information before generating an answer."),
    Document(page_content="RAG is commonly used in chatbots, question-answering systems, and search-based AI apps, because it reduces hallucination and improves reliability."),
    Document(page_content="This approach helps the model produce more accurate, updated, and factual responses, especially when the needed information is not inside the model itself."),
]


In [22]:
#embed documents and add to the vector store
texts = [doc.page_content for doc in documents]
embeddings = get_embeddings(texts)

for i, embedding in enumerate(embeddings):
    index.add(np.array([embedding], dtype=np.float32))
    index_to_docstore_id[i] = documents[i].page_content


In [24]:
def simple_retriever(query):
  query_embedding = embed_texts([query])
  D, I = index.search(query_embedding, k=1)
  return index_to_docstore_id[I[0][0]] if len(I) > 0 and I[0][0] in index_to_docstore_id else None

In [28]:
#Create the RAG Chain
class SimpleRetrieverlQA:
    def __init__(self, llm, retriever):
        self.llm = llm
        self.retriever = retriever

    def run(self, query):
        return self.retriever(query)
        response = self.llm(f"Context: {context}\nQuestion: {query}")
        return response

qa_chain = SimpleRetrieverlQA(llm = llm, retriever = simple_retriever)


In [31]:
#Questions
questions = "What is RAG?"

In [32]:
#Get Answers
answers = qa_chain.run(questions)
print(answers)

RAG is commonly used in chatbots, question-answering systems, and search-based AI apps, because it reduces hallucination and improves reliability.
