In [1]:
import datasets

ds = datasets.load_dataset("m-ric/huggingface_doc", split="train")
documents = [doc["text"] for doc in ds]

In [2]:
print(f"Loaded {len(documents)} documents.")

Loaded 2647 documents.


In [3]:
def chunk_examples(examples):
    chunks = []
    for text in examples['text']:  # replace 'text' with your dataset's text field name
        # Split text into 200-character chunks
        chunks += [text[i:i + 200] for i in range(0, len(text), 200)]
    return {'chunks': chunks}


In [4]:
chunked_ds = ds.map(chunk_examples, batched=True, remove_columns=ds.column_names)

In [5]:
print(chunked_ds[:10])

{'chunks': [' Create an Endpoint\n\nAfter your first login, you will be directed to the [Endpoint creation page](https://ui.endpoints.huggingface.co/new). As an example, this guide will go through the steps to deplo', 'y [distilbert-base-uncased-finetuned-sst-2-english](https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english) for text classification. \n\n## 1. Enter the Hugging Face Repository ID and yo', 'ur desired endpoint name:\n\n<img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-documentation/main/assets/1_repository.png" alt="select repository" />\n\n## 2. Select your Cloud Provider', ' and region. Initially, only AWS will be available as a Cloud Provider with the `us-east-1` and `eu-west-1` regions. We will add Azure soon, and if you need to test Endpoints with other Cloud Provider', 's or regions, please let us know.\n\n<img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-documentation/main/assets/1_region.png" alt="select regio

In [6]:
print(ds.column_names)

['text', 'source']


In [7]:
texts = chunked_ds['chunks']

In [8]:
from sentence_transformers import SentenceTransformer
embedder = SentenceTransformer('all-MiniLM-L6-v2')  # lightweight and effective

In [9]:
embeddings = embedder.encode(texts, batch_size=32, show_progress_bar=True)

Batches:   0%|          | 0/3383 [00:00<?, ?it/s]

In [10]:
import faiss
import numpy as np

dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)  # L2 distance metric

index.add(np.array(embeddings))
print(f"Indexed {index.ntotal} vectors.")

Indexed 108237 vectors.


In [11]:
faiss.write_index(index, "rag_index.faiss")

import pickle
with open("rag_chunks.pkl", "wb") as f:
    pickle.dump(texts, f)

In [13]:
query = "Tell me about dog agility"

query_embedding = embedder.encode([query])

D, I = index.search(np.array(query_embedding), k=3)  # retrieve top 3 chunks

retrieved_chunks = [texts[i] for i in I[0]]

In [14]:
import ollama

context = "\n".join(retrieved_chunks)
prompt = f"Context: {context}\nQuestion: {query}\nAnswer:"

messages = [{'role': 'user', 'content': prompt}]

response = ollama.chat(model='hf.co/bartowski/Llama-3.2-1B-Instruct-GGUF:latest', messages=messages)

print(response['message']['content'])


Dog agility training is a fun and rewarding activity that can help improve your dog's physical fitness, obedience, and overall well-being.

In dog agility, participants navigate an obstacle course designed for dogs, typically consisting of tunnels, jumps, weave poles, and other challenges. The goal is to run as quickly and safely as possible through the course while earning points based on accuracy and speed.

Some benefits of dog agility training include:

* Improved cardiovascular health and overall fitness
* Increased confidence and strength in your dog
* Enhanced problem-solving skills and focus
* Better communication between you and your dog

To get started with Huggy, follow these steps:

1. **Introduce yourself to Huggy**: Tell the AI system about your dog's preferences, goals, and any allergies or sensitivities.
2. **Set up the course**: Choose a dog agility course that suits your dog's abilities and experience level. You can find many pre-made courses online or create your own