In [1]:
pip install datasets transformers faiss-cpu sentence-transformers

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloadin

In [2]:
import faiss
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from sentence_transformers import SentenceTransformer
from datasets import load_dataset

In [3]:
device = torch.device('cpu')

In [4]:
embedder = SentenceTransformer('all-MiniLM-L6-v2')
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
gpt_model = GPT2LMHeadModel.from_pretrained(model_name).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [5]:
print("Loading Wikipedia dataset...")
wiki_dataset = load_dataset('wikipedia', '20220301.en', split='train[:1%]')  # Using only 1% for demonstration

documents = [article['text'] for article in wiki_dataset]

def truncate_document(doc, max_length=1000):
    return doc[:max_length]

# Truncate all documents to a max length
documents = [truncate_document(doc) for doc in documents]

Loading Wikipedia dataset...


Downloading builder script:   0%|          | 0.00/36.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

The repository for wikipedia contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/wikipedia.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0/41 [00:00<?, ?files/s]

Generating train split:   0%|          | 0/6458670 [00:00<?, ? examples/s]

In [6]:
def create_faiss_index(docs):
    print("Creating embeddings...")

    embeddings = embedder.encode(docs, convert_to_tensor=True, device=device)
    embeddings = embeddings.numpy()

    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, embeddings

index, doc_embeddings = create_faiss_index(documents)

Creating embeddings...


In [7]:
def retrieve_documents(query, index, documents, k=2):
    query_embedding = embedder.encode(query, convert_to_tensor=True, device=device)
    query_embedding = query_embedding.numpy()

    distances, indices = index.search(query_embedding.reshape(1, -1), k)
    retrieved_docs = [documents[i] for i in indices[0]]
    return retrieved_docs

In [12]:
def generate_response(query, retrieved_docs, max_input_length=1024, max_new_tokens=150):
    context = " ".join(retrieved_docs)
    input_text = f"Context: {context}\nUser: {query}\nBot:"
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

    if input_ids.shape[1] > max_input_length:
        input_ids = input_ids[:, -max_input_length:]

    output = gpt_model.generate(
        input_ids,
        max_new_tokens=max_new_tokens,
        temperature=0.7,
        top_k=50,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
        num_return_sequences=1
    )

    # Decoding the generated text
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response

In [13]:
def rag_chatbot(query):
    retrieved_docs = retrieve_documents(query, index, documents)
    response = generate_response(query, retrieved_docs)
    return response

In [16]:
user_query = "Tell me about Fyodor Doestoevsky."
bot_response = rag_chatbot(user_query)
print("Bot:", bot_response)

Bot: Context: Fyodor Mikhailovich Dostoevsky (, ; ; 11 November 18219 February 1881), sometimes transliterated as Dostoyevsky, was a Russian novelist, short story writer, essayist, and journalist. Dostoevsky's literary works explore the human condition in the troubled political, social, and spiritual atmospheres of 19th-century Russia, and engage with a variety of philosophical and religious themes. His most acclaimed novels include Crime and Punishment (1866), The Idiot (1869), Demons (1872), and The Brothers Karamazov (1880). His 1864 novella Notes from Underground is considered to be one of the first works of existentialist literature. Numerous literary critics rate him as one of the greatest novelists in all of world literature, as many of his works are considered highly influential masterpieces.

Born in Moscow in 1821, Dostoevsky was introduced to literature at an early age through fairy tales and legends, and through books by Russian and foreign authors. His mother died in 1837 