In [1]:
from langchain.chat_models import init_chat_model

# Shared chat model handle; the QA, summary and router chains all pipe into `llm`.
llm = init_chat_model(model="gemini-2.5-flash", model_provider="google_genai")

In [63]:
# Cleaning chat: parse the WhatsApp export into a list of message dicts
# ({"date", "time", "sender", "message"}), dropping placeholder/call-log rows.
import re

chat_file = "WhatsApp Chat with Mansimar Singh Bhatia/WhatsApp Chat with Mansimar Singh Bhatia.txt"

# One complete chat line: "<m/d/yy>, <HH:MM> - <sender>: <message>".
pattern = re.compile(r"^(\d{1,2}/\d{1,2}/\d{2}), (\d{2}:\d{2}) - ([^:]+): (.+)$")

# Just the timestamp prefix. A line that has this prefix but fails `pattern`
# is a log entry without a "sender: text" part; it is neither a message nor a
# continuation of one.
entry_start = re.compile(r"^\d{1,2}/\d{1,2}/\d{2}, \d{2}:\d{2} - ")

# Message bodies that carry no conversational content.
skipped_bodies = (
    'null',
    'Missed video call',
    'Missed voice call',
    'Missed group video call',
    'Missed group voice call',
)

messages = []
# The dict most recently appended to `messages`, or None when the previous
# entry was skipped; continuation lines are appended to it.
current = None

with open(chat_file, encoding="utf-8") as f:
    for line in f:
        text = line.strip()
        match = pattern.match(text)
        if match:
            date, time, sender, message = match.groups()
            if message in skipped_bodies:
                # Filtered entry: make sure its continuation lines (if any)
                # are not attached to the message before it.
                current = None
                continue
            current = {
                "date": date,
                "time": time,
                "sender": sender,
                "message": message
            }
            messages.append(current)
        elif entry_start.match(text):
            # Timestamped line with no "sender: text" part -- not a message.
            current = None
        elif text and current is not None:
            # No timestamp prefix: this is a further line of a multi-line
            # message. Previously such lines were silently dropped, which
            # truncated every multi-line message to its first line.
            current["message"] += "\n" + text
messages

[{'date': '10/14/23',
  'time': '21:20',
  'sender': 'Mansimar Singh Bhatia',
  'message': 'Bhai'},
 {'date': '10/14/23',
  'time': '21:21',
  'sender': 'Mansimar Singh Bhatia',
  'message': 'Bhai'},
 {'date': '10/14/23',
  'time': '21:21',
  'sender': 'Mansimar Singh Bhatia',
  'message': 'Bahut zaroori kaam hai'},
 {'date': '3/10/24',
  'time': '05:15',
  'sender': 'Mansimar Singh Bhatia',
  'message': 'IMG-20240310-WA0000.jpg (file attached)'},
 {'date': '4/16/24',
  'time': '21:51',
  'sender': 'Sanskar',
  'message': 'https://youtu.be/Q8PdffUfoF0?si=PTmBLtSNlHLd7GEW'},
 {'date': '4/16/24',
  'time': '21:53',
  'sender': 'Mansimar Singh Bhatia',
  'message': 'yeasssap'},
 {'date': '4/16/24',
  'time': '21:54',
  'sender': 'Mansimar Singh Bhatia',
  'message': 'not available noooooo'},
 {'date': '4/16/24',
  'time': '21:54',
  'sender': 'Sanskar',
  'message': 'Why not available?? Vpn laga le'},
 {'date': '4/16/24',
  'time': '22:46',
  'sender': 'Mansimar Singh Bhatia',
  'message'

In [64]:
# Inspection view of the cleaned chat: rebuilds a transcript with 12-hour
# timestamps and writes it to cleaned_chat.txt so the result can be eyeballed.
# NOTE(review): this cell was originally marked "do not use cleantext or
# formatted_messages in the code", yet `cleantext` is read later by
# `full_context_retrieval` as the summary chain's context -- confirm which
# of the two is intended.

from datetime import datetime


def _as_transcript_line(entry):
    """Render one message dict as '<date>, <12-hour time> - <sender>: <message>'."""
    parsed = datetime.strptime(entry['time'], "%H:%M")
    # %I zero-pads the hour, so strip that padding.
    clock = parsed.strftime("%I:%M %p").lstrip("0")
    return f"{entry['date']}, {clock} - {entry['sender']}: {entry['message']}"


formatted_messages = [_as_transcript_line(entry) for entry in messages]
cleantext = "\n".join(formatted_messages)

with open('cleaned_chat.txt', 'w', encoding="utf-8") as out:
    out.write(cleantext)

In [None]:
from langchain_core.documents import Document

# Wrap each cleaned message in a Document: the message text becomes
# page_content, and date/time/sender are carried as metadata.
docs = []
for chat_msg in messages:
    meta = {key: chat_msg[key] for key in ("date", "time", "sender")}
    docs.append(Document(page_content=chat_msg["message"], metadata=meta))
docs

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

from langchain_community.vectorstores import FAISS

# Sentence-embedding model used to vectorise each message; configured for the
# CPU device with embedding normalisation switched off.
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = dict(device='cpu')
encode_kwargs = dict(normalize_embeddings=False)
embeddings = HuggingFaceEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)

# Build the FAISS index from the in-memory docs and save a copy to disk.
vectorstore = FAISS.from_documents(docs, embeddings)
vectorstore.save_local("faiss_whatsapp_index")

# This retriever is backed by the index built above in this session.
retriever = vectorstore.as_retriever()

# TODO: try the alternative where the retriever is created from the saved index file.

In [98]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

# Fan the raw question out into the two keys the prompts expect:
# "context" (output of the retriever) and "input" (the question, unchanged).
retrieval = RunnableParallel(context=retriever, input=RunnablePassthrough())

# Same two keys, but "context" is always the whole cleaned transcript.
full_context_retrieval = RunnableParallel(
    context=lambda _question: cleantext,
    input=RunnablePassthrough(),
)

In [115]:
from langchain.prompts import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser


def _system_plus_question(system_text):
    """Build a two-message chat prompt: `system_text`, then the user's {input}."""
    return ChatPromptTemplate.from_messages(
        [("system", system_text), ("human", "{input}")]
    )


# Prompt for pointed questions (who said what, dates, details).
qa_prompt = _system_plus_question(
    "You are an expert on answering specific questions about who said what, dates, details in the chat. "
    "Use the following context to answer questions.\n\n{context}"
)

# Prompt for whole-conversation summaries and overviews.
summary_prompt = _system_plus_question(
    "You are an expert on summarizing the entire conversation or giving overviews. "
    "Use the following context to answer questions.\n\n{context}"
)

# QA answers from retrieved snippets; summary answers from the full transcript.
qa_chain = retrieval | qa_prompt | llm
summary_chain = full_context_retrieval | summary_prompt | llm

# Instructions for the router: it must reply with the name of one of the two prompts.
route_system = """
Given a raw text input to a language model, select the model prompt best suited for the input. You will be given the names of the available prompts and a description of what the prompt is best suited for.

<< CANDIDATE PROMPTS >>

qa_prompt: prompt seeking answer for specific questions about who said what, dates, details in the chat.
summary_prompt: prompt seeking summary, overview or analyzing the entire conversation.

<< INPUT >>
{input}

<< OUTPUT >>
Return only one word and the value being either "qa_prompt" or "summary_prompt" depending on which is the most suitable.
"""

route_prompt = _system_plus_question(route_system)

router_chain = route_prompt | llm

In [116]:
from langchain_core.runnables import RunnableLambda, RunnableBranch

from langchain_core.messages import AIMessage


def _route(question):
    """Classify the question once and hand it to the matching chain.

    The router LLM is invoked a single time per question. Evaluating the
    router inside each branch condition costs a second LLM call for every
    summary question and lets the two calls disagree with each other.

    Returns the chain to run (a RunnableLambda invokes a returned runnable
    with the same input), or an AIMessage fallback so that callers can read
    `.content` / `.usage_metadata` regardless of which path was taken.
    """
    label = router_chain.invoke(question).content.lower()
    # Same precedence as before: "qa_prompt" is checked first.
    if "qa_prompt" in label:
        return qa_chain
    if "summary_prompt" in label:
        return summary_chain
    return AIMessage(content="Sorry, I don't know how to handle that.")


master_chain = RunnableLambda(_route)

In [121]:
# Specific-detail question; print the answer text, then the token usage.
question = "does sanskar go to gym"
result = master_chain.invoke(question)
print(result.content, result.usage_metadata, sep="\n")

Based on the provided context, there is no information about whether Sanskar goes to the gym.
{'input_tokens': 368, 'output_tokens': 221, 'total_tokens': 589, 'input_token_details': {'cache_read': 0}, 'output_token_details': {'reasoning': 202}}


In [122]:
# Whole-conversation question; print the answer text, then the token usage.
question = "what is the main emotions in the conversation"
result = master_chain.invoke(question)
print(result.content, result.usage_metadata, sep="\n")

The main emotions conveyed in this conversation are:

1.  **Strong Friendship & Camaraderie:** This is the most dominant emotion. Despite long gaps between messages, the users communicate with a very casual, often teasing, and affectionate tone (e.g., "randi," "bitxh," "loda mera," "love you ❤️"). They share links, check in on each other, and discuss personal matters.
2.  **Support & Empathy:** When Sanskar expresses struggling with his "mental state," Mansimar immediately offers strong emotional support, emphasizing the role of friendship in sharing sadness ("friendships are for the equal distribution of sadness," "hum saath mein sad baithenge"). Mansimar also offers encouragement for Sanskar's interview.
3.  **Playfulness & Teasing:** A significant portion of their interaction involves lighthearted insults, jokes, and banter ("uncle to crazy nikla," "Imposter mc," "u liar," "party boi," "ganda aadmi toh nhi hai tu").
4.  **Frustration (often mild and humorous):** There are instances 