import os
import re
from typing import List
import tiktoken
import chromadb
from sentence_transformers import SentenceTransformer

CHUNK_SIZE = 100  # default chunk threshold, in tokens as measured by count_tokens()
CHUNK_OVERLAP = 50  # default overlap, in whitespace-separated words (not tokens)

# Tokenizer that count_tokens() uses to measure text length.
encoding = tiktoken.get_encoding("cl100k_base")

def count_tokens(text: str) -> int:
    """Return how many tokens the module-level ``encoding`` yields for ``text``."""
    token_ids = encoding.encode(text)
    return len(token_ids)

def split_text_to_chunks(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Words are accumulated until the joined text reaches ``chunk_size`` tokens
    (as measured by ``count_tokens``); that text becomes a chunk and the last
    ``overlap`` words are carried over as the start of the next chunk.

    Note that the two limits use different units: ``chunk_size`` is counted in
    tokens, while ``overlap`` is counted in words.

    Args:
        text: The text to split. Any run of whitespace separates words, and
            chunks are re-joined with single spaces.
        chunk_size: Token count at which the current chunk is closed.
        overlap: Number of trailing words repeated at the start of the next
            chunk. ``0`` disables overlap.

    Returns:
        The chunks in document order. Empty or whitespace-only text yields an
        empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is
            negative.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive number of tokens")
    if overlap < 0:
        raise ValueError("overlap must be a non-negative number of words")

    chunks: List[str] = []
    current_chunk: List[str] = []
    # True while current_chunk holds at least one word that has not yet been
    # emitted in a chunk (i.e. it is more than carried-over overlap).
    has_new_words = False

    for word in text.split():
        current_chunk.append(word)
        has_new_words = True
        # The whole joined chunk is re-tokenized for every word, so the
        # threshold is checked against the real token count of the chunk text.
        current_text = " ".join(current_chunk)
        if count_tokens(current_text) >= chunk_size:
            chunks.append(current_text)
            # ``lst[-0:]`` is the whole list, so zero overlap must be handled
            # explicitly or the buffer would never be reset.
            current_chunk = current_chunk[-overlap:] if overlap else []
            has_new_words = False

    # Emit the remainder only if it adds something; a buffer holding nothing
    # but overlap would merely duplicate the tail of the previous chunk.
    if has_new_words:
        chunks.append(" ".join(current_chunk))

    return chunks

def load_and_chunk(path: str) -> List[str]:
    """Read the UTF-8 text file at ``path`` and return it split into chunks.

    The file content is passed to ``split_text_to_chunks`` with that
    function's default chunk size and overlap.
    """
    with open(path, mode="r", encoding="utf-8") as source_file:
        return split_text_to_chunks(source_file.read())

    
# Build the chunk list from the guide text.
chunks = load_and_chunk("french_tour_guide.txt")

# Persistent client: the collection is stored on disk under ./chroma_db and is
# therefore still present (with its previous contents) on the next run.
client = chromadb.PersistentClient(path="./chroma_db")
collection = client.get_or_create_collection(name="france_tourism")

# Embed every chunk; the same model must be used later to embed queries.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(chunks).tolist() if chunks else []

# The ids are deterministic ("doc_0", "doc_1", ...), so re-running this script
# against the persisted collection reuses ids that already exist. ``upsert``
# overwrites those entries with the current text and embeddings, whereas
# ``add`` would reject or ignore them and leave stale content behind.
# NOTE(review): if the source text shrinks, entries with higher ids from an
# earlier run are not removed here.
if chunks:  # nothing to store for an empty source file
    collection.upsert(
        documents=chunks,
        embeddings=embeddings,
        ids=[f"doc_{i}" for i in range(len(chunks))],
    )

# In [None]:
# Embed the question with embedding_model so it can be compared to stored vectors.
query = "Where can i get a cofee in paris?"
encoded_queries = embedding_model.encode([query]).tolist()
query_embedding = encoded_queries[0]

# Request up to five results for the single query embedding.
results = collection.query(n_results=5, query_embeddings=[query_embedding])

# Only one query was sent, so its hits are the first entry of "documents".
matched_documents = results["documents"][0]
for doc in matched_documents:
    # Each document is followed by one blank line.
    print(doc, end="\n\n")