In [1]:
!pip install -U requests==2.32.5 --quiet
!pip install -U "langgraph>=0.2.26"  --quiet
!pip install langchain langchain-community sentence-transformers faiss-cpu transformers accelerate torch --quiet
!python -m pip show langgraph


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/64.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.7/64.7 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires requests==2.32.4, but you have requests 2.32.5 which is incompatible.[0m[31m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m156.8/156.8 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.2/46.2 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.2/471.2 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.8/56.8 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m


In [10]:
import os

os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = "NotesRAGApp"
os.environ["LANGCHAIN_API_KEY"] = "<<API_KEY>>"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"


In [11]:
from langsmith import Client

client = Client()
print("Connected to LangSmith ✅, current project:", os.environ.get("LANGCHAIN_PROJECT"))


Connected to LangSmith ✅, current project: NotesRAGApp


In [12]:
from typing import TypedDict
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langgraph.graph import StateGraph
from langgraph.checkpoint.memory import MemorySaver
from datetime import datetime
import pandas as pd

notes_db = [
    {"id": 1, "tags": ["AI", "LangGraph"], "text": "LangGraph lets you build agent workflows as graphs with memory and control flow."},
    {"id": 2, "tags": ["AI"], "text": "Retrieval-Augmented Generation (RAG) connects LLMs to external knowledge to reduce hallucinations."},
    {"id": 3, "tags": ["Python"], "text": "FastAPI is a fast, modern framework for building web APIs in Python."},
    {"id": 4, "tags": ["AI", "Embeddings"], "text": "Sentence-Transformers provide high-quality text embeddings for semantic search."}
]




In [13]:
# Initialize embedding model wrapper for LangChain
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

texts = [n["text"] for n in notes_db]
metas = [{"tags": n["tags"], "id": n["id"]} for n in notes_db]

# Build FAISS vector store (LangChain handles embeddings internally)
vectorstore = FAISS.from_texts(texts, embedding=embedding_model, metadatas=metas)
retriever = vectorstore.as_retriever(search_kwargs={"k": 2})

In [14]:
from langgraph.graph import StateGraph
from typing import TypedDict

class RAGState(TypedDict):
    query: str
    context: str
    answer: str

builder = StateGraph(state_schema=RAGState)

def retriever_node(state):
    docs = retriever.invoke(state["query"])
    state["context"] = "\n".join([d.page_content for d in docs])
    return state

def summarizer_node(state):
    prompt = f"Answer concisely using only this context:\n{state['context']}\n\nQuestion: {state['query']}\nAnswer:"
    response = llm.invoke(prompt)
    state["answer"] = response
    return state

builder.add_node("retriever", retriever_node)
builder.add_node("summarizer", summarizer_node)
builder.add_edge("retriever", "summarizer")
builder.set_entry_point("retriever")
builder.set_finish_point("summarizer")

checkpointer = MemorySaver()

app = builder.compile(checkpointer=checkpointer)


In [15]:
from datetime import datetime
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import json

semantic_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

metrics = []

def run_query(query, session_id="default"):
  """Run a quey through RAG graph with session memory + metrics tracking"""
  config = {"configurable":{"thread_id": session_id}}
  state = app.invoke({"query": query, "context": "", "answer": ""}, config=config)

  # Print human-readable response
  print(f"\nSession: {session_id}")
  print(f"Query: {query}")
  print(f"Answer: {state['answer'][:250]}\n")

  # completeness = round(min(1.0, len(state["context"].split()) / (len(query.split()) * 5)), 2)
  # faithfulness = 1.0 if any(w in state["answer"].lower() for w in query.lower().split()[:2]) else 0.8
  # mismatch = 1-faithfulness

  query_emb = semantic_model.encode(query, convert_to_tensor=True)
  answer_emb = semantic_model.encode(state["answer"], convert_to_tensor=True)
  context_emb = semantic_model.encode(state["context"], convert_to_tensor=True)

  # Faithfulness → how close the answer is to the retrieved context
  faithfulness = float(util.cos_sim(answer_emb, context_emb))
  # Retrieval relevance → how close the retrieved context is to the user’s query
  retrieval_relevance = float(util.cos_sim(query_emb, context_emb))
  # Completeness → estimated by context length ratio
  completeness = round(min(1.0, len(state["context"].split()) / (len(query.split()) * 5)), 2)

  print(f"Completeness: {completeness}")
  print(f"Faithfulness: {faithfulness}")
  print(f"Retrieval Relevence: {retrieval_relevance}")

  metrics.append({
      "session": session_id,
      "query": query,
      "completeness": completeness,
      "faithfulness": faithfulness,
      "Retrieval Relevence": retrieval_relevance,
      "timestamp": datetime.now().isoformat()
  })

  print(json.dumps(metrics))

  return state


In [16]:
from langchain_community.llms import HuggingFacePipeline

flan = pipeline("text2text-generation", model="google/flan-t5-base", device_map="auto", max_new_tokens=150)
llm = HuggingFacePipeline(pipeline=flan)


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cpu
  llm = HuggingFacePipeline(pipeline=flan)


In [17]:
run_query("What is LangGraph?", "user001")
run_query("How does RAG reduce hallucinations?", "user001")
run_query("Explain FastAPI briefly", "user001")



Session: user001
Query: What is LangGraph?
Answer: LangGraph lets you build agent workflows as graphs with memory and control flow. Sentence-Transformers provide high-quality text embeddings for semantic search

Completeness: 1.0
Faithfulness: 0.996911883354187
Retrieval Relevence: 0.4405069947242737
[{"session": "user001", "query": "What is LangGraph?", "completeness": 1.0, "faithfulness": 0.996911883354187, "Retrieval Relevence": 0.4405069947242737, "timestamp": "2025-11-08T17:14:44.546735"}]

Session: user001
Query: How does RAG reduce hallucinations?
Answer: connects LLMs to external knowledge

Completeness: 0.96
Faithfulness: 0.445770263671875
Retrieval Relevence: 0.44643956422805786
[{"session": "user001", "query": "What is LangGraph?", "completeness": 1.0, "faithfulness": 0.996911883354187, "Retrieval Relevence": 0.4405069947242737, "timestamp": "2025-11-08T17:14:44.546735"}, {"session": "user001", "query": "How does RAG reduce hallucinations?", "completeness": 0.96, "faithfuln

{'query': 'Explain FastAPI briefly',
 'context': 'FastAPI is a fast, modern framework for building web APIs in Python.\nLangGraph lets you build agent workflows as graphs with memory and control flow.',
 'answer': 'FastAPI is a fast, modern framework for building web APIs in Python. LangGraph lets you build agent workflows as graphs with memory and control flow.'}