In [2]:
## imports ##
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader,DirectoryLoader
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from langchain_community.vectorstores import Chroma
from typing import List
import numpy as np

In [9]:
## document loaders ##
loader = DirectoryLoader(
    path='data',
    glob="*.txt",
    loader_cls=TextLoader,
    loader_kwargs={'encoding':'utf-8'}
)
documents = loader.load()
print(f"Number of documents loaded : {len(documents)}")
for index,doc in enumerate(documents):
    print(f"Metadata of the document : {doc.metadata}")
    print(f"Document content:\n{doc.page_content[0:100]}")
    print("-------------------------------")

Number of documents loaded : 3
Metadata of the document : {'source': 'data\\chatgpt_conversational_ai.txt'}
Document content:
ChatGPT and Conversational AI

ChatGPT is one of the most widely known conversational AI models deve
-------------------------------
Metadata of the document : {'source': 'data\\google_gemini_multimodal_ai.txt'}
Document content:
Google Gemini and the Rise of Multimodal AI

Google Gemini represents a new generation of large-scal
-------------------------------
Metadata of the document : {'source': 'data\\machine_learning_fundamentals.txt'}
Document content:
Machine Learning Fundamentals

Machine Learning (ML) is a branch of artificial intelligence focused 
-------------------------------


In [14]:
## document splitters ##
text_splitter = RecursiveCharacterTextSplitter(

    chunk_size = 500, ##max size of one chunk
    chunk_overlap = 50, ##chunk overlap character count
    length_function=len,
    separators=[" "]
)
chunks = text_splitter.split_documents(documents)
print(f"Number of chunks created : {len(chunks)}")
print(f"Content in chunk : {chunks[6].page_content[0:100]}")
print(f"Metadata in a chunk:{chunks[6].metadata}")


Number of chunks created : 12
Content in chunk : architecture but extends it with multimodal embeddings. These embeddings allow the model to represen
Metadata in a chunk:{'source': 'data\\google_gemini_multimodal_ai.txt'}


In [18]:
### chromadb vector store ###
persist_dir = "./chroma_db"
embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2') ##embedding model
vector_store = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    persist_directory=persist_dir,
    collection_name='rag_collection'
)
print(f"Vector store created with {vector_store._collection.count()} vectors")


Vector store created with 12 vectors


In [32]:
## testing the similarity search##

query = 'What capabilities does Google gemini have?'
sim_chunks = vector_store.similarity_search(query,k=3)
for index,chnk in enumerate(sim_chunks):
    print(f"chunk number-{index+1}")
    print(f"chunk metadata:\n{chnk.metadata}")
    print(f"Data in chunk:\n{chnk.page_content}")
    print("----------------------------")

chunk number-1
chunk metadata:
{'source': 'data\\google_gemini_multimodal_ai.txt'}
Data in chunk:
to systems capable of general intelligence. Gemini’s release positions Google as a key competitor to OpenAI’s GPT-4 and Anthropic’s Claude models. As multimodal AI continues to mature, we can expect breakthroughs in robotics, education, and accessibility—where machines can truly “see,” “hear,” and “understand” the world like humans do.
----------------------------
chunk number-2
chunk metadata:
{'source': 'data\\google_gemini_multimodal_ai.txt'}
Data in chunk:
traditional LLMs that are text-only, Gemini can interpret visual inputs, perform cross-modal reasoning, and produce outputs that combine modalities. This makes it ideal for use cases like visual question answering, document analysis, or creative generation. It also integrates with Google Search, Workspace, and YouTube, enabling intelligent summarization, content generation, and recommendation systems.

Technically, Gemini builds upon

In [None]:
from dot