In [18]:
# !pip install -qU langchain-google-genai
# !pip install chromadb

In [19]:
from google.colab import userdata
# Look up the value stored under the name 'google_api_key' through Colab's userdata helper.
# The key is later handed to both the chat model and the embedding function.
google_api_key = userdata.get('google_api_key')


In [20]:
import os
from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate, ChatPromptTemplate
import chromadb
from chromadb.utils import embedding_functions
from langchain.schema import HumanMessage, SystemMessage, AIMessage
import pandas as pd

In [21]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be read by path.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
# NOTE(review): `drive` was already imported in the mount cell; this import is redundant.
from google.colab import drive
# Folder on the mounted Drive that is passed to the document loader.
directory_path = '/content/drive/MyDrive/news/'

In [23]:
# Model identifier passed to ChatGoogleGenerativeAI when the chat client is created.
model_name = "models/gemini-2.0-flash-lite-preview"

In [24]:
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()
# Prefer the key obtained from the Colab secret store; if that value is empty,
# fall back to GOOGLE_API_KEY from the environment so load_dotenv() has an effect.
# (The previous `google_api_key = google_api_key` was a no-op.)
google_api_key = google_api_key or os.getenv("GOOGLE_API_KEY")
# temperature=0 is passed so the chat client is configured for the least random sampling.
model = ChatGoogleGenerativeAI(model=model_name, temperature=0, google_api_key=google_api_key)


In [25]:
# Chroma embedding function backed by Google's generative AI embeddings.
# It is attached to the collection below and also called directly to embed chunks.
embedding_func = embedding_functions.GoogleGenerativeAiEmbeddingFunction(
    api_key=google_api_key,
    model_name="models/embedding-001"
)

In [26]:
# Persistent Chroma client; the path is relative, so storage is created
# relative to the current working directory.
chroma_client = chromadb.PersistentClient(path="chroma_persistent_storage")
collection_name = "document_qa_collection"
# Reuse the collection if it already exists, otherwise create it. The embedding
# function is attached so that text queries are embedded by the collection itself.
collection = chroma_client.get_or_create_collection(
    name=collection_name, embedding_function = embedding_func
)

In [27]:
# Smoke test of the chat model: a two-turn conversation where the second
# request carries the whole history so the follow-up can be resolved in context.
messages = [
    SystemMessage(content="You are a helpful assistant."),
    HumanMessage(content="What is human life expectancy in the United States?"),
]

# First turn
resp = model.invoke(messages)
print(resp.content)

# Record the assistant's answer, then queue the follow-up question
messages.extend(
    [
        AIMessage(content=resp.content),
        HumanMessage(content="What about in Japan?"),
    ]
)

# Second turn, sent with the full message history
follow_up = model.invoke(messages)
print(follow_up.content)

The average life expectancy in the United States is around 77 years. However, this number can vary depending on factors like sex, race, and socioeconomic status.
Japan has one of the highest life expectancies in the world. The average life expectancy in Japan is around 84-85 years.


In [28]:
def load_documents_from_directory(directory_path):
    """Read every ``.txt`` file located directly inside ``directory_path``.

    Args:
        directory_path: Path of the folder to scan (sub-folders are not searched).

    Returns:
        A list of ``{"id": <filename>, "text": <file contents>}`` dicts,
        ordered by filename.
    """
    print("==== Loading documents from directory ====")
    documents = []
    # os.listdir() returns entries in arbitrary order; sort them so the document
    # order (and anything that indexes into the list) is reproducible across runs.
    for filename in sorted(os.listdir(directory_path)):
        full_path = os.path.join(directory_path, filename)
        # Skip non-text entries and sub-directories that merely end in ".txt".
        if not filename.endswith(".txt") or not os.path.isfile(full_path):
            continue
        with open(full_path, "r", encoding="utf-8") as file:
            documents.append({"id": filename, "text": file.read()})
    return documents


In [29]:
# Alternative local folder (disabled); the Drive path defined earlier is used instead.
# directory_path = "./news_articles"
# Load all .txt files from the folder into a list of {"id", "text"} dicts.
documents = load_documents_from_directory(directory_path)

==== Loading documents from directory ====


In [30]:
# for doc in documents:
#     doc['text'] = doc['text'].replace("\n"," ")
#     doc['text'] = doc['text'].replace("  "," ")

In [31]:
documents[0]

{'id': '05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt',
 'text': 'Signaling that investments in the supply chain sector remain robust, Pando, a startup developing fulfillment management technologies, today announced that it raised $30 million in a Series B round, bringing its total raised to $45 million.\n\nIron Pillar and Uncorrelated Ventures led the round, with participation from existing investors Nexus Venture Partners, Chiratae Ventures and Next47. CEO and founder Nitin Jayakrishnan says that the new capital will be put toward expanding Pando’s global sales, marketing and delivery capabilities.\n\n“We will not expand into new industries or adjacent product areas,” he told TechCrunch in an email interview. “Great talent is the foundation of the business — we will continue to augment our teams at all levels of the organization. Pando is also open to exploring strategic partnerships and acquisitions with this round of funding.”\n\nPando was co-launched by Jaya

In [32]:
# Tabular view of the loaded documents (columns come from the dict keys) for a quick look.
documents_df = pd.DataFrame(documents)
documents_df.head(2)

Unnamed: 0,id,text
0,05-03-ai-powered-supply-chain-startup-pando-la...,Signaling that investments in the supply chain...
1,05-03-ai-replace-tv-writers-strike.txt,"In the must-watch final season of “Succession,..."


In [33]:
from spacy.lang.en import English

# Build an English pipeline and add the "sentencizer" component so that `.sents`
# is available when documents are split into sentences further down.
nlp = English()
# add_pipe returns the component, which is why the cell echoes a Sentencizer object.
nlp.add_pipe("sentencizer")


<spacy.pipeline.sentencizer.Sentencizer at 0x7e0ff5951c50>

In [34]:
from tqdm import tqdm


for item in tqdm(documents):
    # Turn newlines into spaces, then collapse the doubled spaces this produces
    flattened = item['text'].replace("\n", " ")
    item['text'] = flattened.replace("  ", " ")
    # Split into sentences and keep each one as a plain string
    sentence_spans = nlp(item["text"]).sents
    item["sentences"] = list(map(str, sentence_spans))
    item["count_of_sentences"] = len(item["sentences"])


100%|██████████| 21/21 [00:00<00:00, 56.92it/s]


In [35]:
import pprint
pprint.pprint(documents[0])

{'count_of_sentences': 35,
 'id': '05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt',
 'sentences': ['Signaling that investments in the supply chain sector remain '
               'robust, Pando, a startup developing fulfillment management '
               'technologies, today announced that it raised $30 million in a '
               'Series B round, bringing its total raised to $45 million.',
               'Iron Pillar and Uncorrelated Ventures led the round, with '
               'participation from existing investors Nexus Venture Partners, '
               'Chiratae Ventures and Next47.',
               'CEO and founder Nitin Jayakrishnan says that the new capital '
               'will be put toward expanding Pando’s global sales, marketing '
               'and delivery capabilities. “',
               'We will not expand into new industries or adjacent product '
               'areas,” he told TechCrunch in an email interview. “',
               'Great tale

In [36]:

# Number of sentences grouped into one chunk (used as the slice size when chunking).
num_sentence_chunk_size = 10

def split_list(input_list: list, slice_size: int) -> list[list[str]]:
    """Cut ``input_list`` into consecutive pieces of at most ``slice_size`` items.

    The final piece holds whatever is left over and may be shorter; an empty
    input yields an empty list.
    """
    pieces = []
    for start in range(0, len(input_list), slice_size):
        stop = start + slice_size
        pieces.append(input_list[start:stop])
    return pieces

chunked_documents = []

# Keys of the source document that are not copied onto each chunk
_skipped_keys = ['text', 'sentences', 'chunks']

for item in tqdm(documents):
    # Normalise whitespace, then (re)compute the sentence list for this document
    cleaned_text = item['text'].replace("\n", " ")
    item['text'] = cleaned_text.replace("  ", " ")
    item["sentences"] = list(map(str, nlp(item["text"]).sents))
    item["count_of_sentences"] = len(item["sentences"])

    # Group the sentences into fixed-size runs
    sentence_chunks = split_list(item["sentences"], num_sentence_chunk_size)
    num_chunks = len(sentence_chunks)

    # Emit one record per run of sentences
    for chunk_idx, chunk_sentences in enumerate(sentence_chunks):
        chunk_text = " ".join(chunk_sentences)

        chunk_doc = {}
        chunk_doc["chunk_text"] = chunk_text          # chunk content
        chunk_doc["chunk_count"] = num_chunks         # chunks produced from this document
        chunk_doc["chunk_char_count"] = len(chunk_text)
        chunk_doc["chunk_token_count"] = len(chunk_text) / 4  # rough estimate: 4 chars per token

        # Carry the remaining document-level fields over, prefixed with "original_"
        for key, value in item.items():
            if key in _skipped_keys or key in chunk_doc:
                continue
            chunk_doc[f"original_{key}"] = value

        chunked_documents.append(chunk_doc)

# chunked_documents now holds one dict per chunk
print(f"Created {len(chunked_documents)} chunks from {len(documents)} original documents")

100%|██████████| 21/21 [00:00<00:00, 92.30it/s] 

Created 131 chunks from 21 original documents





In [37]:
pprint.pprint(chunked_documents[0])

{'chunk_char_count': 1660,
 'chunk_count': 4,
 'chunk_text': 'Signaling that investments in the supply chain sector remain '
               'robust, Pando, a startup developing fulfillment management '
               'technologies, today announced that it raised $30 million in a '
               'Series B round, bringing its total raised to $45 million. Iron '
               'Pillar and Uncorrelated Ventures led the round, with '
               'participation from existing investors Nexus Venture Partners, '
               'Chiratae Ventures and Next47. CEO and founder Nitin '
               'Jayakrishnan says that the new capital will be put toward '
               'expanding Pando’s global sales, marketing and delivery '
               'capabilities. “ We will not expand into new industries or '
               'adjacent product areas,” he told TechCrunch in an email '
               'interview. “ Great talent is the foundation of the business — '
               'we will continue to a

In [38]:
max(item['chunk_token_count'] for item in chunked_documents)

580.0

In [39]:
# pprint.pprint(chunked_documents[0])

In [40]:
def get_google_embedding(text):
    """Embed a single piece of text with the notebook's ``embedding_func``.

    Args:
        text: The string to embed.

    Returns:
        The embedding vector as a plain Python list.
    """
    # The embedding function is callable on a batch of texts, so wrap the input
    # in a one-item list and unwrap the single result.
    embeddings = embedding_func([text])
    print("==== Generating embeddings... ====")
    vector = embeddings[0]
    # The vector may be a NumPy array or already a plain list, depending on the
    # chromadb release; handle both instead of assuming `.tolist()` exists.
    return vector.tolist() if hasattr(vector, "tolist") else list(vector)

# Attach an embedding vector to every chunk (one embedding call per chunk)
for doc in chunked_documents:
    print("==== Generating embeddings... ====")
    chunk_embedding = get_google_embedding(doc["chunk_text"])
    doc["embedding"] = chunk_embedding

==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embe

In [41]:
# Store every chunk under its own id. Using the bare filename as the id made all
# chunks of one file share an id, so each upsert overwrote the previous chunk and
# only the last chunk per document survived in the collection.
# NOTE: entries written earlier under the bare filename id are not removed by this loop.
for chunk_number, doc in enumerate(chunked_documents):
    print("==== Inserting chunks into db;;; ====")
    collection.upsert(
        # Position in chunked_documents is unique, so filename + position is a unique id
        ids=[f"{doc['original_id']}_chunk{chunk_number}"],
        documents=[doc["chunk_text"]],
        embeddings=[doc["embedding"]]
    )

==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserti

In [42]:
def query_documents(question, n_results=2):
    """Fetch the stored chunks that best match ``question``.

    Args:
        question: Query text; it is handed to the collection as ``query_texts``.
        n_results: Number of results requested from the collection.

    Returns:
        A flat list of the document strings from all returned result lists.
    """
    results = collection.query(query_texts=question, n_results=n_results)

    # results["documents"] is a list of lists; flatten it into a single list
    relevant_chunks = []
    for sublist in results["documents"]:
        relevant_chunks.extend(sublist)

    print("==== Returning relevant chunks ====")
    return relevant_chunks

In [43]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.schema import HumanMessage, SystemMessage

def generate_response(question, relevant_chunks):
    """Answer ``question`` with the chat model, grounded in ``relevant_chunks``.

    Args:
        question: The user's question (a string).
        relevant_chunks: Retrieved text chunks; they are joined with blank
            lines and embedded in the system prompt as context.

    Returns:
        The ``content`` of the model's reply.
    """
    instructions = (
        "You are an assistant for question-answering tasks. Use the following pieces of "
        "retrieved context to answer the question. If you don't know the answer, say that you "
        "don't know. Use three sentences maximum and keep the answer concise."
    )
    context = "\n\n".join(relevant_chunks)
    prompt = "".join(
        (instructions, "\n\nContext:\n", context, "\n\nQuestion:\n", question)
    )

    # The system message carries instructions + context + question; the question
    # is also sent again on its own as the human turn.
    system_turn = SystemMessage(content=prompt)
    human_turn = HumanMessage(content=question)

    reply = model.invoke([system_turn, human_turn])
    return reply.content

In [47]:
# End-to-end example: retrieve the closest chunks for the question, then let the
# model answer using only that retrieved context.
question = "tell me about Politics"
relevant_chunks = query_documents(question)  # default n_results=2
answer = generate_response(question, relevant_chunks)

print(answer)

==== Returning relevant chunks ====
I am sorry, but the provided context does not contain any information about politics. Therefore, I cannot answer your question.
