In [1]:
import hashlib

from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain_community.vectorstores import Chroma
from langchain_ollama import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


#### Reading required documents with PDF and Word Loaders

In [None]:
# Source documents for the knowledge base; both paths are relative to the
# notebook's working directory.
pdf_path = "Crypto Report 2025.pdf"
word_path = "Crypto Performance 2025 Q4.docx"

# One loader per file format; nothing is read until .load() is called on them.
pdf_loader = PyPDFLoader(pdf_path)
word_loader = Docx2txtLoader(word_path)

In [4]:
# Load each source separately, then concatenate into one corpus
# (PDF documents first, Word documents after).
pdf_docs = pdf_loader.load()
word_docs = word_loader.load()
docs = pdf_docs + word_docs

#### Split text into chunks

In [6]:
# chunk_overlap lets consecutive chunks share some text, so context is
# preserved across chunk boundaries.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)
chunks = text_splitter.split_documents(docs)

#### Create Vector Store

In [8]:
# Give every chunk a deterministic ID derived from its source, its position in
# `chunks`, and its text. Without explicit IDs the store generates random ones,
# so every re-run of this cell would append another copy of each chunk to the
# persisted ./chroma_db collection and the retriever would start returning
# duplicates. With stable IDs a re-run upserts the same records instead.
# The position is part of the hash so two chunks with identical text still get
# distinct IDs within one batch.
chunk_ids = [
    hashlib.sha256(
        f"{chunk.metadata.get('source', '')}|{position}|{chunk.page_content}".encode("utf-8")
    ).hexdigest()
    for position, chunk in enumerate(chunks)
]

# Embed the chunks with the local Ollama model and persist them to disk.
vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=OllamaEmbeddings(model="nomic-embed-text"),
    ids=chunk_ids,
    persist_directory="./chroma_db",
)

#### Define the Retriever

In [11]:
# Ask the vector store for the 3 best-matching chunks per query.
search_options = {"k": 3}
retriever = vector_db.as_retriever(search_kwargs=search_options)

In [12]:
# Sanity check: run a sample query and display the retrieved chunks.
sample_query = "Q3 reports"
retriever.invoke(sample_query)

[Document(metadata={'author': '', 'creator': 'Microsoft® PowerPoint® for Microsoft 365', 'creationdate': '2025-10-16T14:05:42+08:00', 'total_pages': 53, 'producer': 'Microsoft® PowerPoint® for Microsoft 365', 'page': 15, 'moddate': '2025-10-16T14:05:42+08:00', 'title': '2025 Q3 Crypto Industry Report | CoinGecko', 'source': 'Crypto Report 2025.pdf', 'page_label': '16'}, page_content=', including JPY (\n -\n 3.1%), GBP (\n -\n 2.2%), and EUR (\n -\n 0.6%). \nConversely, \n Crude Oil (\n -\n 5.3%) extended its decline  \nsince the start \nof the year, reflecting persistent supply\n -\n side surpluses despite \nglobal demand outlooks.\nPrice Return\n15\n-10.0%\n-5.0%\n0.0%\n5.0%\n10.0%\n15.0%\n20.0%\nJul-25 Aug-25 Sep-25\nBTC NASDAQ CRUDE OIL GOLD TLT DXY JPY GBP'),
 Document(metadata={'title': '2025 Q3 Crypto Industry Report | CoinGecko', 'source': 'Crypto Report 2025.pdf', 'page_label': '12', 'producer': 'Microsoft® PowerPoint® for Microsoft 365', 'total_pages': 53, 'author': '', 'creat

This setup will be used in the Streamlit app to retrieve the relevant chunks.