In [None]:
!pip install -q langchain langchain-core langchain-community langchain-text-splitters
!pip install -q langchain-google-genai langchain-huggingface
!pip install -q faiss-cpu langchain-chroma chromadb
!pip install -q pypdf


In [2]:
import os

In [4]:
from google.colab import userdata
# Fetch the Gemini key from Colab's `userdata` store so that the key itself
# is never written into the notebook.
GEMINI_API_KEY = userdata.get("GEMINI_API_KEY")

In [5]:
# Export the key as GOOGLE_API_KEY for the rest of the process.
# NOTE(review): presumably this is what lets the embeddings client further
# down be constructed without an explicit key -- confirm.
os.environ["GOOGLE_API_KEY"] = GEMINI_API_KEY


In [6]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.output_parsers import StrOutputParser
# Gemini chat model used as the generator at the end of the RAG chain.
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    temperature=0.3,
    google_api_key=GEMINI_API_KEY,
)

# **a) Document Loader**

In [None]:
from langchain_community.document_loaders import TextLoader, PyPDFLoader

In [16]:
# Plain-text source, decoded as UTF-8.
loader = TextLoader(file_path="sample_file.txt", encoding="utf-8")
docs = loader.load()

# PDF source.
pdf_loader = PyPDFLoader(file_path="sample_file.pdf")
pdf_docs = pdf_loader.load()

# Sanity check: document count, metadata of the first document and, for the
# text file, the first 100 characters of its content.
print(f"{len(docs)} {docs[0].metadata} {docs[0].page_content[:100]}")
print(f"{len(pdf_docs)} {pdf_docs[0].metadata}")


1 {'source': 'sample_file.txt'} # Ideanote

> Ideanote is the world's leading innovation and idea management software for enterprise
2 {'producer': 'LibreOffice 24.8.1.2 (X86_64) / LibreOffice Community', 'creator': 'Writer', 'creationdate': '2025-09-04T16:52:16+00:00', 'source': 'sample_file.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1'}


# **b) Text Splitter**

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [17]:
# Split the text document into chunks of up to 500 characters with a
# 100-character overlap; add_start_index stores each chunk's offset in its
# metadata under 'start_index'.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
    add_start_index=True,
)
splits = splitter.split_documents(docs)

# Sanity check: chunk count, first chunk's metadata and its first 120 characters.
print(f"{len(splits)} {splits[0].metadata} {splits[0].page_content[:120]}")


14 {'source': 'sample_file.txt', 'start_index': 0} # Ideanote

> Ideanote is the world's leading innovation and idea management software for enterprise. 

Ideanote is how 


# **c) Embeddings + Vector Database**

In [29]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
# Embedding model used both to build the FAISS index and to reload it later.
emb = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
)


In [30]:
from langchain_community.vectorstores import FAISS

# Embed every chunk into a FAISS index, then persist it to the local
# "faiss_index" folder so it can be reloaded without re-embedding.
faiss_store = FAISS.from_documents(documents=splits, embedding=emb)
faiss_store.save_local(folder_path="faiss_index")

# **d) Retriever**

In [28]:
# Similarity retriever returning the 3 closest chunks for a query.
retriever = faiss_store.as_retriever(
    search_type="similarity",
    search_kwargs=dict(k=3),
)

query = "What are the main ideas in the text?"
results = retriever.invoke(query)

# Show each hit's metadata and the first 120 characters of its text.
for r in results:
    print(f"{r.metadata} {r.page_content[:120]}")


{'source': 'sample_file.txt', 'start_index': 3927} - [Free Innovation Tools](https://ideanote.io/tool): Useful tools for innovation.
- [Ideanote Blog](https://www.ideanote
{'source': 'sample_file.txt', 'start_index': 3494} ## Innovation Resources & Thought Leadership
- [Innovation-Led Growth Flywheel](https://www.ideanote.io/innovation-led-g
{'source': 'sample_file.txt', 'start_index': 2816} ## Doc and Search
- [Help Docs](https://ideanote.io/help)
- [Intro and Signup](https://ideanote.io/help/intro-to-ideanot


# **e) RAG (Retriever + LLM)**

In [31]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.runnables import RunnablePassthrough


# Reload the index that this notebook saved to "faiss_index".
# SECURITY: load_local deserializes pickled data, which is why the
# allow_dangerous_deserialization flag is required. That is acceptable here
# only because the folder was written by this notebook; never enable it for
# an index obtained from an untrusted source.
faiss_store = FAISS.load_local("faiss_index", emb, allow_dangerous_deserialization=True)
retriever = faiss_store.as_retriever(search_type="similarity", search_kwargs={"k": 3})

prompt = ChatPromptTemplate.from_template(
    "Answer the following question using only the given context.\n\n"
    "Context:\n{context}\n\nQuestion: {question}\n\nAnswer:"
)


def format_docs(retrieved_docs):
    """Join the text of the retrieved documents into one context string.

    Without this step the prompt's {context} slot would be filled with the
    repr of a list of Document objects (metadata, escaped newlines) rather
    than the readable chunk text.

    Args:
        retrieved_docs: iterable of objects exposing a ``page_content`` string.

    Returns:
        The chunk texts separated by blank lines ("" for no documents).
    """
    return "\n\n".join(doc.page_content for doc in retrieved_docs)


# Build the RAG pipeline using runnables:
#   question -> (retrieve + format context, pass question through)
#            -> prompt -> LLM -> plain string
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()   # unwrap the chat message into plain text
)

# Run
result = rag_chain.invoke("Tell me the main idea of the text file")
print(result)


Ideanote is innovation and idea management software for enterprises.  It helps companies manage innovation by engaging employees and collecting, developing, and managing ideas through a central platform.
