In [None]:
# Embedding with nomic-embed-text and retieve with llm

import os
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings
from langchain.chains import RetrievalQA
from langchain_ollama import OllamaLLM

# --------------------
# Setup
# --------------------
os.environ.setdefault("no_proxy", "127.0.0.1,localhost")
os.environ.setdefault("HTTPX_NO_PROXY", "127.0.0.1,localhost")

persist_dir = "nomic-emb"
embeddings = OllamaEmbeddings(model="nomic-embed-text")

# Load documents only if we need to create embeddings
if not os.path.exists(persist_dir) or not os.listdir(persist_dir):
    print("⚡ No existing DB found. Creating new embeddings...")

    # Load and split text
    loader = TextLoader("facts.txt")
    docs = loader.load()
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=200,
        chunk_overlap=50
    )
    docs = text_splitter.split_documents(docs)

    # Create Chroma DB and persist
    db = Chroma.from_documents(
        documents=docs,
        embedding=embeddings,
        persist_directory=persist_dir
    )
else:
    print("✅ Found existing DB. Loading without re-embedding...")
    db = Chroma(
        persist_directory=persist_dir,
        embedding_function=embeddings
    )

# --------------------
# Run a test query
# --------------------
query = "What is an interesting fact about the English language?"

# Create retriever
retriever = db.as_retriever(search_kwargs={"k": 3})

OLLAMA_URL = os.getenv("OLLAMA_URL", "http://127.0.0.1:11434")
model_name = "llama3.2:3b"
chat = OllamaLLM(model=model_name, base_url=OLLAMA_URL, timeout=30)

chain = RetrievalQA.from_chain_type(
    llm=chat,
    retriever=retriever,
    chain_type="stuff"
)

result = chain.invoke(query)
print(result)

✅ Found existing DB. Loading without re-embedding...
{'query': 'What is an interesting fact about the English language?', 'result': 'One interesting fact about the English language from the provided context is related to the word "dreamt".'}


In [1]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [2]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100
)

loader = PyPDFLoader("spice.pdf")
docs = loader.load_and_split(text_splitter)

print(docs)

[Document(metadata={'source': 'spice.pdf', 'page': 0, 'page_label': '1'}, page_content='Spices\nSpices at a central market in Agadir,\nMorocco\nA group of Indian herbs and spices in\nbowls\nSpices of Saúde flea market, São\nPaulo, Brazil\nSpice\nA spice is a seed, fruit, root, bark, or other plant substance\nprimarily used for flavoring or coloring food. Spices are\ndistinguished from herbs, which are the leaves, flowers, or stems of\nplants used for flavoring or as a garnish. Spices are sometimes used\nin medicine, religious rituals, cosmetics, or perfume production.'), Document(metadata={'source': 'spice.pdf', 'page': 0, 'page_label': '1'}, page_content='in medicine, religious rituals, cosmetics, or perfume production.\nFor example, vanilla is commonly used as an ingredient in\nfragrance manufacturing.[1]\nA spice may be available in several forms: fresh, whole-dried, or\npre-ground dried. Generally, spices are dried. Spices may be\nground into a powder for convenience. A whole dried