In [1]:
# !pip install langgraph langchain langchain_openai chromadb

In [None]:

# imports
import os

from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [None]:
# set environment variables
os.environ["OPENAI_API_KEY"] = # enter your openai api key here

Ingest the data, a markdown file with information on work experience, etc.

Process the markdown document, which includes details such as work experience, by segmenting it at markdown header points to create each chunk. This ensures that each segment maintains its integrity, encapsulating the relevant data within.

In [None]:
# ingesting data
markdown_path = "source.md"
# read the markdown file and return the full document as a string
with open(markdown_path, "r") as file:
    full_markdown_document = file.read()

# split the data into chunks based on the markdown heading
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on, strip_headers=False)
chunked_documents = markdown_splitter.split_text(full_markdown_document)

# create a vector store
embeddings_model = OpenAIEmbeddings()
db = Chroma.from_documents(chunked_documents, embeddings_model)

# create retriever
retriever = db.as_retriever()

In [None]:
llm = ChatOpenAI(model="gpt-4-0125-preview", temperature=0)

Create the rag chain

In [None]:
rag_prompt = """You are an AI  assistant. Your main task is to answer questions people may have about Sajal.
Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: {question}
Context: {context}
Answer:
"""

rag_prompt_template = ChatPromptTemplate.from_template(rag_prompt)
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | rag_prompt_template
    | llm
    | StrOutputParser()
)

Examples of good results

In [None]:
rag_chain.invoke("When did Sajal graduate from University of Melbourne?")

'Sajal graduated from the University of Melbourne with a Master of Information Technology, majoring in Computing, in August 2016.'

In [None]:
rag_chain.invoke("What did Sajal do at Unscrambl?")

'At Unscrambl, Sajal was a key member of the NLP Engineering team, where he helped enhance the natural language understanding of their business analytics platform, focusing on advancing Named Entity Recognition (NER), intent recognition, and ANNOY model functionalities. He developed the Natural Language to SQL system data preparation pipeline using NLTK and spaCy, significantly reducing manual effort and boosting system efficiency. Additionally, Sajal collaborated in designing and developing NLP-driven chatbot products and led the deployment of these solutions for clients across Asia, impacting over 100,000 monthly users.'

Examples of subpar results

In [None]:
# incorrect result
rag_chain.invoke("How many countries has sajal worked in?")

"The provided documents do not specify the exact number of countries Sajal has worked in. However, his education and mentoring activities suggest he has connections to Australia and India, and possibly interacts with international students globally through his role as a mentor at Udacity. Without more specific information on his professional work locations, it's not possible to give a precise count of countries he has worked in."

In [None]:
# check what documents were retrieved from the vector db
retriever.get_relevant_documents("How many countries has sajal worked in?")

[Document(page_content='# Sajal Sharma  \n## Contact Info  \n+65 9077-9093 | contact@sajalsharma.com | [LinkedIn](linkedin.com/in/sajals) | [GitHub](github.com/sajal2692)', metadata={'Header 1': 'Sajal Sharma', 'Header 2': 'Contact Info'}),
 Document(page_content='## Languages  \n- Hindi (Native or Bilingual)\n- English (Native or Bilingual)\n- German (Elementary)', metadata={'Header 1': 'Sajal Sharma', 'Header 2': 'Languages'}),
 Document(page_content='## Activities  \n- Mentor & Project Reviewer, Udacity: Coached 100+ international students enrolled in Data Science courses. Recognised as an elite mentor in 2021 with A+ mentor performance grade based on student feedback scores.\n- Mentor, STEM Industry Mentoring Programme, The University of Melbourne: Jul 2020 - Present\n- Creator, Data Science Portfolio: Github repo with 900+ stars showcasing various classical Data Science projects.', metadata={'Header 1': 'Sajal Sharma', 'Header 2': 'Activities'}),
 Document(page_content='## Educati

Since there are no chunks that can directly answer the given question, the similarity search struggles to find relevant information.

Another example of a similar case:

In [None]:
# incorrect / incomplete result
rag_chain.invoke("list all the positions that sajal has held throughout his career")

'Throughout his career, Sajal has held the following positions:\n1. Mentor & Project Reviewer at Udacity\n2. Mentor at the STEM Industry Mentoring Programme, The University of Melbourne\n3. Creator of a Data Science Portfolio on GitHub\n4. Senior AI Engineer at Splore, a Temasek-backed AI startup (contracted via Unscrambl), Singapore'

Building a Corrective RAG workflow using LangGraph

1. Grade retrieved documents based on the question
2. If no relevant documents found, then pass in the whole source document as the context.

In [None]:
# Defining the state class which holds data related to the current state
from typing import Dict, TypedDict

class GraphState(TypedDict):
    """
    Represents the state of our graph.

    Attributes:
        keys: A dictionary where each key is a string.
    """
    keys: Dict[str, any]

Defining the nodes of the graph

In [None]:
def retrieve_documents(state):
  """Node to retrieve documents, by using the query from the state"""
  print("---RETRIEVE DOCUMENTS---") # print statements to track flow
  state_dict = state["keys"]
  question = state_dict["question"]
  documents = retriever.get_relevant_documents(question)
  return {"keys": {"question": question, "documents": documents}}

In [None]:
generation_answer_chain = rag_prompt_template | llm | StrOutputParser()
def generate_with_retrieved_documents(state):
  """Node to generate answer using retrieved documents"""
  print("---GENERATE USING RETRIEVED DOCUMENTS---")
  state_dict = state["keys"]
  question = state_dict["question"]
  documents = state_dict["documents"]
  answer = generation_answer_chain.invoke({"question": question, "context": documents})
  return {"keys": {"question": question, "response": answer}}

In [None]:
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.output_parsers.openai_tools import PydanticToolsParser
from langchain_core.utils.function_calling import convert_to_openai_tool

grader_prompt = """
You are a grader assessing relevance of a retrieved document to a user question. \n
Retrieved document: \n\n {context} \n\n
User Question: {question} \n
When assessing the relevance of a retrieved document to a user question, consider whether the document can provide a complete answer to the question posed. A document is considered relevant only if it contains all the necessary information to fully answer the user's inquiry without requiring additional context or assumptions.
Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question.
Do not return anything other than a 'yes' or 'no'.
"""

grader_prompt_template = PromptTemplate(template=grader_prompt, input_variables=["context", "question"])

# pydantic class for grade, to be used with openai function calling
class grade(BaseModel):
    """Binary score for relevance check."""
    binary_score: str = Field(description="Relevance score 'yes' or 'no'")

grade_tool_openai = convert_to_openai_tool(grade)

llm_with_grader_tool = llm.bind(
    tools=[grade_tool_openai],
    tool_choice={"type": "function", "function": {"name": "grade"}}
)

tool_parser = PydanticToolsParser(tools=[grade])

grader_chain = grader_prompt_template | llm_with_grader_tool | tool_parser

def grade_documents(state):
  """Node to grade documents, filter out irrelevant documents and assess whether need to run generation on whole document"""
  print("---GRADE DOCUMENTS---")
  state_dict = state["keys"]
  question = state_dict["question"]
  documents = state_dict["documents"]

  filtered_documents = []
  run_with_all_data = False
  for doc in documents:
    score = grader_chain.invoke({"context": documents, "question": question})
    grade = score[0].binary_score
    if grade == "yes":
      print("---GRADE: FOUND RELEVANT DOCUMENT---")
      filtered_documents.append(doc)
  if not filtered_documents:
    print("---GRADE: DID NOT FIND ANY RELEVANT DOCUMENTS")
    run_with_all_data = True

  return {
      "keys": {
          "documents": filtered_documents,
          "question": question,
          "run_with_all_data": run_with_all_data
          }
      }

In [None]:
def generate_answer_using_all_data(state):
  """Node to generate the answer using the complete document"""
  print("---GENERATING ANSWER USING ALL DATA")
  state_dict = state["keys"]
  question = state_dict["question"]
  answer = generation_answer_chain.invoke({"question": question, "context": full_markdown_document})
  return {"keys": {"question": question, "response": answer}}

Define the conditional edge

In [None]:
def decide_to_use_all_data(state):
  """Conditional edge that decides the next node to run"""
  state_dict = state["keys"]
  run_with_all_data = state_dict["run_with_all_data"]

  if run_with_all_data:
      return "generate_answer_using_all_data"
  else:
      return "rag"

Defining the graph

In [None]:
from langgraph.graph import END, StateGraph

class GraphState(TypedDict):
    """
    Represents the state of our graph.

    Attributes:
        keys: A dictionary where each key is a string.
    """
    keys: Dict[str, any]


def compile_graph():
  workflow = StateGraph(GraphState)
  ### define the nodes
  workflow.add_node("retrieve", retrieve_documents)
  workflow.add_node("grade_documents", grade_documents)
  workflow.add_node("generate_answer_with_retrieved_documents", generate_with_retrieved_documents)
  workflow.add_node("generate_answer_using_all_data", generate_answer_using_all_data)
  ### build the graph
  workflow.set_entry_point("retrieve")
  workflow.add_edge("retrieve", "grade_documents")
  workflow.add_conditional_edges(
      "grade_documents",
      decide_to_use_all_data,
      {
          "rag": "generate_answer_with_retrieved_documents",
          "generate_answer_using_all_data": "generate_answer_using_all_data",
      }
  )
  workflow.add_edge("generate_answer_with_retrieved_documents", END)
  workflow.add_edge("generate_answer_using_all_data", END)
  ### compile the graph
  app = workflow.compile()
  return app

Compiling the graph

In [None]:
app = compile_graph()
def response_from_graph(question):
  """Returns the response from the graph"""
  return app.invoke({"keys": {"question": question}})["keys"]["response"]

Trying on the same example as previously

In [None]:
# testing out the flow with crag
print(response_from_graph("How many countries has sajal worked in?"))

---RETRIEVE DOCUMENTS---
---GRADE DOCUMENTS---
---GRADE: DID NOT FIND ANY RELEVANT DOCUMENTS
---GENERATING ANSWER USING ALL DATA
Sajal has worked in at least three countries: Singapore, the Philippines, and India. His work in Singapore is mentioned with OneByZero and Splore, a Temasek-backed AI startup. Additionally, he developed a proof of concept for a major bank in the Philippines and was a key member of Unscrambl's NLP Engineering team in India.


In [None]:
print(response_from_graph("list all the positions that sajal has held throughout his career"))

---RETRIEVE DOCUMENTS---
---GRADE DOCUMENTS---
---GRADE: DID NOT FIND ANY RELEVANT DOCUMENTS
---GENERATING ANSWER USING ALL DATA
Throughout his career, Sajal has held the following positions:
1. Lead AI Engineer at OneByZero (contracted via Unscrambl), Singapore.
2. Senior AI Engineer at Splore, a Temasek-backed AI startup (contracted via Unscrambl), Singapore.
3. Senior Machine Learning Engineer at Unscrambl, India.
4. Machine Learning Engineer at Unscrambl, India.


The graph is able to handle cases where the retrieved chunks can be used to answer the question.

In [None]:
print(response_from_graph("Has sajal created any popular github repositories?"))

---RETRIEVE DOCUMENTS---
---GRADE DOCUMENTS---
---GRADE: FOUND RELEVANT DOCUMENT---
---GRADE: FOUND RELEVANT DOCUMENT---
---GRADE: FOUND RELEVANT DOCUMENT---
---GRADE: FOUND RELEVANT DOCUMENT---
---GENERATE USING RETRIEVED DOCUMENTS---
Yes, Sajal has created a popular GitHub repository. His Data Science Portfolio on GitHub has garnered over 900 stars, showcasing various classical Data Science projects. This indicates a significant level of recognition and appreciation from the GitHub community.
