In [2]:
!pip install langchain chromadb faiss-cpu tiktoken langchain_google_genai langchain_community wikipedia

Collecting chromadb
  Downloading chromadb-1.3.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting langchain_google_genai
  Downloading langchain_google_genai-3.0.0-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain_community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x8

In [15]:
from langchain_community.retrievers import WikipediaRetriever

In [16]:
retriever1 = WikipediaRetriever(top_k_results=2, lang="en")

In [17]:
query1 = "geopolitical history of India and Pakistan from the perspective of China"
docs1 = retriever1.invoke(query1)

In [18]:
for i, doc1 in enumerate(docs1):
  print(f"Result {i+1}")
  print(f"content: {doc1.page_content}")

Result 1
content: The Islamic Republic of Pakistan emerged as an independent country through the partition of India in August 1947 and was admitted as a United Nations member state in September 1947. It is currently the second-most populous country within the Muslim world, and is also the only Muslim-majority country openly in possession of nuclear weapons. The country shares land borders with India, Iran, Afghanistan, and China.
The country has extensive trade relations with the European Union and with several countries globally. As of 2025, Pakistan does not recognize Israel and its ties with India remain frozen since 2019.
From a geopolitical perspective, Pakistan's location is strategically important as it is situated at the crossroads of major maritime and land transit routes between the Middle East and South Asia, while also serving as a bridge between the Arabian Sea and the energy-rich regions of Central Asia. Since the partition of India, the Kashmir conflict has defined the I

In [7]:
from langchain_community.vectorstores import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_core.documents import Document

In [20]:
documents = [
    Document(page_content="The quick brown fox jumps over the lazy dog."),
    Document(page_content="Artificial intelligence is transforming various industries."),
    Document(page_content="The sun rises in the east and sets in the west."),
    Document(page_content="Water is essential for all known forms of life."),
    Document(page_content="The internet has revolutionized communication."),
    Document(page_content="Learning new things can be a rewarding experience."),
    Document(page_content="Climate change is a pressing global issue."),
    Document(page_content="Reading a good book can transport you to another world."),
    Document(page_content="The history of the world is vast and complex."),
    Document(page_content="Technology continues to advance at a rapid pace.")
]

In [21]:
from google.colab import userdata
GOOGLE_API_KEY = userdata.get('GEMINI_API_KEY')
embed = GoogleGenerativeAIEmbeddings(model="gemini-embedding-001", google_api_key=GOOGLE_API_KEY)
vectorstore = Chroma.from_documents(documents = documents,
                                    embedding = embed,
                                    collection_name="my_collection"
                                    )

In [24]:
retriever2 = vectorstore.as_retriever(search_kwargs = {"k": 2})


In [25]:
query2 = "Where does the sun rise?"
results2 = retriever2.invoke(query2)

In [26]:
for j, docs2 in enumerate(results2):
  print(f"Result {j+1}")
  print(f"content: {docs2.page_content}")

Result 1
content: The sun rises in the east and sets in the west.
Result 2
content: The sun rises in the east and sets in the west.


In [27]:
print(vectorstore.similarity_search(query2))

[Document(metadata={}, page_content='The sun rises in the east and sets in the west.'), Document(metadata={}, page_content='The sun rises in the east and sets in the west.'), Document(metadata={}, page_content='The history of the world is vast and complex.'), Document(metadata={}, page_content='The history of the world is vast and complex.')]
