In [34]:
from langchain.chat_models import init_chat_model

llm = init_chat_model("gemini-2.5-flash", model_provider="google_genai")

In [36]:
#cleaning chat
import re

chat_file = "WhatsApp Chat with Mansimar Singh Bhatia/WhatsApp Chat with Mansimar Singh Bhatia.txt"

pattern = re.compile(r"^(\d{1,2}/\d{1,2}/\d{2}), (\d{2}:\d{2}) - ([^:]+): (.+)$")

messages = []

with open(chat_file, encoding="utf-8") as f:
    for line in f:
        match = pattern.match(line.strip())
        if match:
            date, time, sender, message = match.groups()
            if message not in ('null',
                               'Missed video call',
                               'Missed voice call',
                               'Missed group video call',
                               'Missed group voice call'):
                messages.append({
                    "date": date,
                    "time": time,
                    "sender": sender,
                    "message": message
                })

In [38]:
from datetime import datetime

formatted_messages = []
for msg in messages:
    # convert 24h → 12h (am/pm)
    time_24 = datetime.strptime(msg['time'], "%H:%M")
    time_12 = time_24.strftime("%I:%M %p").lstrip("0")  # remove leading 0
    
    line = f"{msg['date']}, {time_12} - {msg['sender']}: {msg['message']}"
    formatted_messages.append(line)

cleantext = "\n".join(formatted_messages)

with open('cleaned_chat.txt', 'w', encoding="utf-8") as out:
    out.write(cleantext)

In [39]:
from langchain_core.documents import Document

docs = [
    Document(
        page_content=m["message"],
        metadata={
            "date": m["date"],
            "time": m["time"],
            "sender": m["sender"]
        }
    )
    for m in messages
]

In [40]:
from langchain_huggingface import HuggingFaceEmbeddings

from langchain_community.vectorstores import FAISS

model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)


vectorstore = FAISS.from_documents(docs, embeddings)
vectorstore.save_local("faiss_whatsapp_index")

In [None]:
retriever = vectorstore.as_retriever()

In [44]:
summary_prompt = 'Summarise this conversation: ' + cleantext
llm.invoke(summary_prompt)

AIMessage(content='This is a long, casual, and informal conversation between two close friends, Mansimar Singh Bhatia and Sanskar, spanning nearly two years (October 2023 to August 2025).\n\nThe main themes of their communication include:\n\n1.  **Frequent Attempts to Connect:** Mansimar often initiates contact, asking Sanskar to call, join Discord, or just check if he\'s awake, sometimes playfully using expletives like "bhai," "oye," "randi," or "bitxh." Sanskar frequently responds by saying he\'s busy (in meetings, studying for an interview, or simply out).\n2.  **Media Sharing:** Both friends frequently share YouTube links (shorts, channels), images, and videos.\n3.  **Logistics & Travel:** They discuss train travel (including an incident with a "crazy uncle" on the wrong seat and finding available seats), Mansimar\'s travel plans to Noida, and general coordination for meeting up.\n4.  **Work & Help:** Sanskar seeks Mansimar\'s help with a work-related document/topics, and later men

In [45]:
from langchain.chains import RetrievalQA

chain = RetrievalQA.from_chain_type(llm = llm, 
                                    chain_type = "stuff",
                                    retriever = retriever,
                                    return_source_documents = True
                                    )

query = "summarize this conversation"


chain.invoke({"query": query, "context": cleantext})

{'query': 'summarize this conversation',
 'context': "10/14/23, 9:20 PM - Mansimar Singh Bhatia: Bhai\n10/14/23, 9:21 PM - Mansimar Singh Bhatia: Bhai\n10/14/23, 9:21 PM - Mansimar Singh Bhatia: Bahut zaroori kaam hai\n3/10/24, 5:15 AM - Mansimar Singh Bhatia: IMG-20240310-WA0000.jpg (file attached)\n4/16/24, 9:51 PM - Sanskar: https://youtu.be/Q8PdffUfoF0?si=PTmBLtSNlHLd7GEW\n4/16/24, 9:53 PM - Mansimar Singh Bhatia: yeasssap\n4/16/24, 9:54 PM - Mansimar Singh Bhatia: not available noooooo\n4/16/24, 9:54 PM - Sanskar: Why not available?? Vpn laga le\n4/16/24, 10:46 PM - Mansimar Singh Bhatia: oh\n10/2/24, 12:21 AM - Mansimar Singh Bhatia: IMG-20241002-WA0000.jpg (file attached)\n1/29/25, 9:43 PM - Sanskar: PNR-2129038747\n1/30/25, 8:03 AM - Sanskar: Bhai uncle to crazy nikla samne wala\n1/30/25, 8:03 AM - Sanskar: Imposter mc\n1/30/25, 8:07 AM - Mansimar Singh Bhatia: kyun\n1/30/25, 8:07 AM - Sanskar: Abe galat seat pe baitha tha na\n1/30/25, 8:07 AM - Mansimar Singh Bhatia: lmao\n1/3

In [55]:
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA, LLMRouterChain, MultiPromptChain
from langchain.chains.summarize import load_summarize_chain


qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True
)

summary_chain = load_summarize_chain(llm, chain_type="map_reduce")

qa_prompt_info = {
    "name": "qa_chain",
    "description": "Good for answering specific questions about who said what, dates, details in the chat.",
    "chain": qa_chain,
}

summary_prompt_info = {
    "name": "summary_chain",
    "description": "Good for summarizing the entire conversation or giving overviews.",
    "chain": summary_chain,
}

destination_chains = {
    "qa_chain": qa_chain,
    "summary_chain": summary_chain,
}

default_chain = summary_chain

router_chain = LLMRouterChain.from_prompts(
    llm=llm,
    prompt_infos=[qa_prompt_info, summary_prompt_info],
    default_chain=default_chain,
    verbose=True
)

AttributeError: from_prompts

In [66]:
from langchain.prompts import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser


qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are an expert on answering specific questions about who said what, dates, details in the chat."),
        ("human", "{input}"),
    ]
)
summary_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are an expert on summarizing the entire conversation or giving overviews."),
        ("human", "{input}"),
    ]
)

# Construct the chains we will route to. These format the input query
# into the respective prompt, run it through a chat model, and cast
# the result to a string.
qa_chain = qa_prompt | llm | StrOutputParser()
summary_chain = summary_prompt | llm | StrOutputParser()

# Define the chain that selects which branch to route to.
route_system = """
Given a raw text input to a language model, select the model prompt best suited for the input. You will be given the names of the available prompts and a description of what the prompt is best suited for.

<< CANDIDATE PROMPTS >>

qa_prompt: prompt seeking answer for specific questions about who said what, dates, details in the chat.
summary_prompt: prompt seeking summary, overview or analyzing the entire conversation.

<< INPUT >>
{input}

<< OUTPUT >>
Return a JSON object with the key "destination" and the value being either "qa_prompt" or "summary_prompt".
"""

route_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", route_system),
        ("human", "{input}"),
    ]
)

chain = route_prompt | llm

In [70]:
chain.invoke("who are they talking about")

AIMessage(content='```json\n{\n  "destination": "qa_prompt"\n}\n```', additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'gemini-2.5-flash', 'safety_ratings': []}, id='run--08fecfb2-0443-4dba-b7b4-e1b2d4ca2638-0', usage_metadata={'input_tokens': 139, 'output_tokens': 99, 'total_tokens': 238, 'input_token_details': {'cache_read': 0}, 'output_token_details': {'reasoning': 81}})