In [None]:
# Install the third-party packages this notebook imports into the kernel's environment.
# NOTE(review): none of these installs are version-pinned, so a fresh runtime may
# resolve different releases than the ones this notebook was last run with.

# LangChain text splitters and community document loaders.
%pip install --quiet --upgrade langchain-text-splitters langchain-community
# Mistral chat model integration (ChatMistralAI).
%pip install langchain_mistralai
# Chroma vector store integration.
%pip install -qU langchain-chroma
# Groq chat model integration (ChatGroq).
%pip install langchain-groq
# Hugging Face embeddings integration (HuggingFaceEmbeddings).
%pip install langchain-huggingface
# PDF -> Markdown conversion (pymupdf4llm.to_markdown).
%pip install pymupdf pymupdf4llm
# Markdown extra for `unstructured` plus NLTK — presumably required by
# UnstructuredMarkdownLoader; confirm before trimming.
%pip install "unstructured[md]" nltk



In [None]:
# Import Colab's Google Drive helper so Drive files can be accessed from this runtime.
from google.colab import drive

# Mount Drive at /content/drive. This will prompt for authorization; if Drive is
# already mounted the call just reports that (see the recorded output below).
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Sanity check: list the HP folder on the mounted Drive to confirm the source files exist.
# NOTE(review): this path spells the folder "My Drive" while the loading cell uses
# "MyDrive" — presumably both resolve to the same location on Colab; confirm.
!ls "/content/drive/My Drive/HP"

HarryPotter.md	HarryPotter.pdf


LOADING DOCUMENT

In [None]:
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_core.documents import Document
import pymupdf4llm
import pathlib

# Locations of the source PDF and of the Markdown file derived from it.
# Each path is defined once so the conversion, the write and the loader cannot drift apart.
pdf_path = '/content/drive/MyDrive/HP/HarryPotter.pdf'
markdown_path = '/content/drive/MyDrive/HP/HarryPotter.md'

# Convert the PDF to Markdown text and write it next to the PDF so the loader can read it.
# The intermediate string gets its own name: `docs` is reserved for the loaded
# Document list that later cells consume, and is never bound to a plain string.
markdown_text = pymupdf4llm.to_markdown(pdf_path)
pathlib.Path(markdown_path).write_bytes(markdown_text.encode("utf-8"))

# Load the Markdown file back as LangChain Document objects.
loader = UnstructuredMarkdownLoader(markdown_path)
docs = loader.load()

# Validate BEFORE indexing: an empty or multi-document result now fails with a
# descriptive assertion instead of an IndexError on docs[0].
assert len(docs) == 1, f"Expected exactly one document, got {len(docs)}"

readme_content = docs[0].page_content

# Preview the beginning of the text and report its overall size.
print(readme_content[:250])
print(f"Total characters: {len(readme_content)}")

Processing /content/drive/MyDrive/HP/HarryPotter.pdf...
Harry Potter and the Sorcerer's Stone

CHAPTER ONE

THE BOY WHO LIVED

Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you'd expect to be involved i
Total characters: 439865


SPLITTING DOCUMENT

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter settings, collected in one place:
#   chunk_size      - maximum characters per chunk
#   chunk_overlap   - characters shared between neighbouring chunks
#   add_start_index - record each chunk's offset in the original document
splitter_options = dict(
    chunk_size=800,
    chunk_overlap=200,
    add_start_index=True,
)
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Break the loaded document(s) into overlapping chunks for embedding.
all_splits = text_splitter.split_documents(docs)

print(f"Split Document into {len(all_splits)} sub-documents.")

Split Document into 750 sub-documents.


SETTING MISTRAL LLM

In [None]:
import getpass
import os

from langchain_mistralai import ChatMistralAI

# Prompt for the key only when the environment does not already hold a non-empty
# value; getpass keeps the typed key out of the notebook output.
if not os.getenv("MISTRAL_API_KEY"):
    os.environ["MISTRAL_API_KEY"] = getpass.getpass("Enter API key for Mistral AI: ")

# Mistral chat model: temperature 0, streaming enabled, key passed explicitly.
model = ChatMistralAI(
    mistral_api_key=os.environ["MISTRAL_API_KEY"],
    model="mistral-small-latest",
    streaming=True,
    temperature=0,
)

Enter API key for Mistral AI: ··········


SETTING GROQ LLM

In [None]:
import getpass
import os

from langchain_groq import ChatGroq

# Prompt for the key only when the environment does not already hold a non-empty
# value; getpass keeps the typed key out of the notebook output.
if not os.getenv("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass.getpass("Enter API key for Groq: ")

# Groq-hosted chat model: temperature 0, streaming enabled.
# No key is passed here — presumably ChatGroq picks up GROQ_API_KEY from the environment.
llm = ChatGroq(
    streaming=True,
    temperature=0,
    model="llama-3.3-70b-versatile",
)

HUGGINGFACE EMBEDDING MODEL

In [None]:
import getpass
import os

from langchain_huggingface import HuggingFaceEmbeddings

# Sentence-transformers checkpoint used to embed the chunks and the queries.
model_name = "sentence-transformers/all-mpnet-base-v2"

# Options for loading the model (CPU device) and for encoding (no normalization).
model_kwargs = dict(device='cpu')
encode_kwargs = dict(normalize_embeddings=False)

# Embedding function handed to the vector store.
embeddings = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


INITIALIZING VECTORSTORE

In [None]:
from langchain_chroma import Chroma
import time

# Number of chunks embedded and inserted per add_documents call.
BATCH_SIZE = 10

# Chroma store that embeds documents with the HuggingFace embedding function.
vector_store = Chroma(embedding_function=embeddings)

# Accumulate the ids returned by every batch. Previously `document_ids` was
# overwritten on each iteration, so it only ever held the last batch's ids and
# was undefined (NameError at the final print) when `all_splits` was empty.
document_ids = []
for start in range(0, len(all_splits), BATCH_SIZE):
    batch = all_splits[start : start + BATCH_SIZE]
    document_ids.extend(vector_store.add_documents(documents=batch))
    print(f"Added documents {start}-{start + len(batch)}")

# Show the ids of the first three indexed chunks as a quick sanity check.
print(document_ids[:3])

Added documents 0-10
Added documents 10-20
Added documents 20-30
Added documents 30-40
Added documents 40-50
Added documents 50-60
Added documents 60-70
Added documents 70-80
Added documents 80-90
Added documents 90-100
Added documents 100-110
Added documents 110-120
Added documents 120-130
Added documents 130-140
Added documents 140-150
Added documents 150-160
Added documents 160-170
Added documents 170-180
Added documents 180-190
Added documents 190-200
Added documents 200-210
Added documents 210-220
Added documents 220-230
Added documents 230-240
Added documents 240-250
Added documents 250-260
Added documents 260-270
Added documents 270-280
Added documents 280-290
Added documents 290-300
Added documents 300-310
Added documents 310-320
Added documents 320-330
Added documents 330-340
Added documents 340-350
Added documents 350-360
Added documents 360-370
Added documents 370-380
Added documents 380-390
Added documents 390-400
Added documents 400-410
Added documents 410-420
Added docume

RETRIEVAL & GENERATION

In [None]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt text: instructions, then the retrieved context, then the user's question.
template = """
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences and keep the answer concise. Output the answer as bullet points, line by line.
{context}
Question: {question}
Helpful Answer:
"""

# Build the prompt -> LLM pipeline up front; it does not depend on the question.
prompt = ChatPromptTemplate.from_template(template)
chain = prompt | llm

# Read the question interactively.
question = input("Ask a question: ")

# Fetch the three chunks most similar to the question and join their text,
# one chunk per line break, to form the context.
retrieved_docs = vector_store.similarity_search(question, k=3)
context = "\n".join(doc.page_content for doc in retrieved_docs)

# Run the chain and keep only the text content of the returned message.
answer_message = chain.invoke({"context": context, "question": question})
response = answer_message.content
print("Response: ", response)

Ask a question: Who is Harry Poter?
Response:  * Harry Potter is a wizard who grew up in a family of non-magic people, known as Muggles, with his Uncle Vernon and Aunt Petunia.
* He is a student who is about to be sorted into a house at a magic school, and is known for being "The Harry Potter" with a special talent and a thirst to prove himself.
* The context suggests that Harry is famous in the wizarding world, possibly because something about him stopped a powerful wizard, referred to as "he", from carrying on with his powers.
