In [1]:
from langchain.chat_models import init_chat_model

llm = init_chat_model("gemini-2.5-flash", model_provider="google_genai")

In [2]:
#cleaning chat
import re

chat_file = "WhatsApp Chat with Saurabh Sushmita Somya/WhatsApp Chat with Saurabh Sushmita Somya.txt"

pattern = re.compile(r"^(\d{1,2}/\d{1,2}/\d{2}), (\d{2}:\d{2}) - ([^:]+): (.+)$")

messages = []

with open(chat_file, encoding="utf-8") as f:
    for line in f:
        match = pattern.match(line.strip())
        if match:
            date, time, sender, message = match.groups()
            if message not in ('null',
                               'Missed video call',
                               'Missed voice call',
                               'Missed group video call',
                               'Missed group voice call'):
                messages.append({
                    "date": date,
                    "time": time,
                    "sender": sender,
                    "message": message
                })
messages

[{'date': '12/31/18',
  'time': '22:59',
  'sender': 'Saurabh Sushmita Somya',
  'message': 'Hey! We are just an hour away from 2019 so I just wanted the first one to wish you'},
 {'date': '12/31/18',
  'time': '22:59',
  'sender': 'Saurabh Sushmita Somya',
  'message': 'Happy birthday'},
 {'date': '12/31/18',
  'time': '22:59',
  'sender': 'Sanskar',
  'message': 'Silly mistake'},
 {'date': '12/31/18',
  'time': '22:59',
  'sender': 'Sanskar',
  'message': 'Koi baat nahi'},
 {'date': '12/31/18',
  'time': '22:59',
  'sender': 'Sanskar',
  'message': 'Common mistake hai'},
 {'date': '12/31/18',
  'time': '22:59',
  'sender': 'Sanskar',
  'message': 'Happy New year'},
 {'date': '12/31/18',
  'time': '22:59',
  'sender': 'Saurabh Sushmita Somya',
  'message': '11 March right'},
 {'date': '12/31/18',
  'time': '23:00',
  'sender': 'Saurabh Sushmita Somya',
  'message': "I know I was going was a birthday wish it's not a mistake"},
 {'date': '12/31/18',
  'time': '23:00',
  'sender': 'Sansk

In [3]:
from datetime import datetime

formatted_messages = []
for msg in messages:
    # convert 24h → 12h (am/pm)
    time_24 = datetime.strptime(msg['time'], "%H:%M")
    time_12 = time_24.strftime("%I:%M %p").lstrip("0")  # remove leading 0
    
    line = f"{msg['date']}, {time_12} - {msg['sender']}: {msg['message']}"
    formatted_messages.append(line)

cleantext = "\n".join(formatted_messages)

with open('cleaned_chat.txt', 'w', encoding="utf-8") as out:
    out.write(cleantext)

In [4]:
from langchain_core.documents import Document

docs = [
    Document(
        page_content=m["message"],
        metadata={
            "date": m["date"],
            "time": m["time"],
            "sender": m["sender"]
        }
    )
    for m in messages
]

In [5]:
from langchain_huggingface import HuggingFaceEmbeddings

from langchain_community.vectorstores import FAISS

model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)


vectorstore = FAISS.from_documents(docs, embeddings)
vectorstore.save_local("faiss_whatsapp_index")

# this retriever is reading the docs from this program
retriever = vectorstore.as_retriever()

# try another method where the retriever is taking docs from the saved file

  from .autonotebook import tqdm as notebook_tqdm


OSError: The paging file is too small for this operation to complete. (os error 1455)

In [None]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

retrieval  = RunnableParallel(
    {"context": retriever, "input": RunnablePassthrough()}
)

full_context_retrieval = RunnableParallel(
    {"context": lambda _: cleantext, "input": RunnablePassthrough()}
)

In [None]:
from langchain.prompts import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser


qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are an expert on answering specific questions about who said what, dates, details in the chat. Use the following context to answer questions.\n\n{context}"),
        ("human", "{input}"),
    ]
)
summary_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are an expert on summarizing the entire conversation or giving overviews. Use the following context to answer questions.\n\n{context}"),
        ("human", "{input}"),
    ]
)

qa_chain = retrieval | qa_prompt | llm

summary_chain = full_context_retrieval | summary_prompt | llm

route_system = """
Given a raw text input to a language model, select the model prompt best suited for the input. You will be given the names of the available prompts and a description of what the prompt is best suited for.

<< CANDIDATE PROMPTS >>

qa_prompt: prompt seeking answer for specific questions about who said what, dates, details in the chat.
summary_prompt: prompt seeking summary, overview or analyzing the entire conversation.

<< INPUT >>
{input}

<< OUTPUT >>
Return only one word and the value being either "qa_prompt" or "summary_prompt" depending on which is the most suitable.
"""

route_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", route_system),
        ("human", "{input}"),
    ]
)

router_chain = route_prompt | llm

In [None]:
from langchain_core.runnables import RunnableLambda, RunnableBranch

master_chain = RunnableBranch(
    (lambda x: "qa_prompt" in router_chain.invoke(x).content.lower(), qa_chain),
    (lambda x: "summary_prompt" in router_chain.invoke(x).content.lower(), summary_chain),
    RunnableLambda(lambda x: "Sorry, I don't know how to handle that.")  # default
)

In [None]:
result = master_chain.invoke("give me a summary of the conversation")
print (result.content)
print (result.usage_metadata)

Based on the provided chat context, there is no information about Sanskar going to the gym.
{'input_tokens': 364, 'output_tokens': 330, 'total_tokens': 694, 'input_token_details': {'cache_read': 0}, 'output_token_details': {'reasoning': 311}}
