In [1]:
#llm model=qwen
#embedding model=all-MiniLM-L6-v2

!pip install langchain langchain-huggingface langchain_community langgraph langchain-text-splitters sentence-transformers

from langchain_huggingface import ChatHuggingFace, HuggingFacePipeline

import bs4
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict

from langchain_community.embeddings import HuggingFaceEmbeddings


from langchain_core.vectorstores import InMemoryVectorStore



Collecting langchain-huggingface
  Downloading langchain_huggingface-0.3.1-py3-none-any.whl.metadata (996 bytes)
Collecting langchain_community
  Downloading langchain_community-0.3.29-py3-none-any.whl.metadata (2.9 kB)
Collecting langgraph
  Downloading langgraph-0.6.6-py3-none-any.whl.metadata (6.8 kB)
Collecting langchain-core<1.0.0,>=0.3.72 (from langchain)
  Downloading langchain_core-0.3.75-py3-none-any.whl.metadata (5.7 kB)
Collecting requests<3,>=2 (from langchain)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7,>=0.6.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting langgraph-checkpoint<3.0.0,>=2.1.0 (from langgraph)
  Downloading langgraph_checkpoint-2.1.1-py3-none-any.whl.metadata (4.2 kB)
Collecting langgraph-prebuilt<0.7.0,>=0.6.0 (from langgraph)
  Downloading langgraph_prebuilt-0.6.4-py3-none-any.whl.metadata (4.5 kB)
Collecting langgraph-sdk<0.3.0,>=0.2.2 (from 



In [2]:


# Load and chunk contents of the blog
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/#task-decomposition",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
docs=loader.load()

# Split into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
all_splits = text_splitter.split_documents(docs)

# ✅ Embedding model wrapped for LangChain
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# ✅ Vector store
vector_store = InMemoryVectorStore(embedding=embedding_model)
vector_store.add_documents(all_splits)

#llm model
llm=HuggingFacePipeline.from_model_id(
    #model_id="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    model_id="Qwen/Qwen2.5-1.5B-Instruct",
    device_map="auto",
    task="text-generation",
)

model=ChatHuggingFace(llm=llm)

prompt = hub.pull("rlm/rag-prompt")


# Define state for application
class State(TypedDict):
    question: str
    context: List[Document]
    answer: str


# Define application steps
def retrieve(state: State):
    retrieved_docs = vector_store.similarity_search(state["question"])
    return {"context": retrieved_docs}


def generate(state: State):
    docs_content = "\n\n".join(doc.page_content for doc in state["context"])
    messages = prompt.invoke({"question": state["question"], "context": docs_content})
    response = model.invoke(messages)
    return {"answer": response.content}


# Compile application
graph_builder = StateGraph(State).add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph_builder.add_edge("retrieve","generate")
graph = graph_builder.compile()

#test
response = graph.invoke({"question": "Why is task decomposition important for autonomous agents?"})
print(response["answer"])

  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

Device set to use cuda:0


<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: Why is task decomposition important for autonomous agents? 
Context: LLM Powered Autonomous Agents
    
Date: June 23, 2023  |  Estimated Reading Time: 31 min  |  Author: Lilian Weng


Building agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.
Agent System Overview#
In a LLM-powered autonomous agent system, LLM functions as the agent’s brain, 

In [25]:
import csv

# List of Q&A dictionaries
qa_pairs = [
    ("What is the main concept described in the blog 'LLM Powered Autonomous Agents' by Lilian Weng?",
     "It explores building autonomous agents using large language models (LLMs) as the core controller, enabling them to act as general problem solvers."),

    ("What are the three key components of an LLM-powered autonomous agent system?",
     "Planning, Memory, and Tool Use."),

    ("What does the Planning component involve in an autonomous agent?",
     "It involves breaking tasks into subgoals and engaging in reflection to refine actions over time."),

    ("How does Task Decomposition contribute to effective planning?",
     "It allows the agent to split complex tasks into manageable subgoals for efficient handling and execution."),

    ("What prompting technique is commonly used for task decomposition?",
     "Chain of Thought (CoT), which instructs the model to think step by step."),

    ("Why is self-reflection important for autonomous agents?",
     "It lets agents critique their past actions, learn from mistakes, and improve future performance."),

    ("What types of memory are used in these agents?",
     "Short-term memory (via in-context learning) and long-term memory (via external vector stores)."),

    ("How does short-term memory function for agents?",
     "It uses the model’s in-context learning abilities to maintain and process immediate information."),

    ("What is the role of long-term memory in autonomous agents?",
     "It enables retention and retrieval of vast information over extended periods via external vector stores."),

    ("Why do agents use external tools?",
     "Because LLMs may lack up-to-date information or execution capability, so tools via APIs provide access to current data, execution environments, and proprietary sources."),

    ("Name some real-world proof-of-concept examples mentioned in the blog.",
     "AutoGPT, GPT‑Engineer, and BabyAGI."),

    ("How does CoT prompting improve performance?",
     "By transparently modeling the chain of reasoning and breaking tasks into simpler, sequential steps—enhancing the model’s interpretability and output quality."),

    ("What is the Tree of Thoughts approach?",
     "It extends CoT by exploring multiple reasoning paths at each step, forming a tree of potential solution paths."),

    ("How does Tree of Thoughts evaluate multiple branches?",
     "It can use breadth-first or depth-first search, and utilize classifiers or majority votes to select the most promising path."),

    ("In what way can task decomposition be conducted using external planners?",
     "The agent may translate the task into PDDL, use a classical planner, and then convert the plan back into natural language."),

    ("What does self-reflection look like in agent behavior?",
     "Agents reuse self-awareness mechanisms, such as ReAct, to interleave reasoning with action and learn from past mistakes."),

    ("According to the blog, what is the overarching benefit of LLM-powered autonomous agents?",
     "They offer a general problem-solving framework—beyond writing or coding—with potential to automate complex workflows."),

    ("What core limitations do many agent frameworks currently face?",
     "They often only support parts of the needed capabilities (e.g., decomposition, memory, tool use), lack consistency, and are hard to customize or tune."),

    ("Why is long-short term memory integration crucial in language agents?",
     "Because agents must interact over time—remembering past interactions (short-term) and retaining information across sessions (long-term)."),

    ("What value does external tool access add to LLM agents?",
     "It equips agents with abilities beyond language—like web navigation, real-time data access, and executing code.")
]


# Write to CSV
import pandas as pd

# Convert to DataFrame
df = pd.DataFrame(qa_pairs, columns=["Question", "Answer"])

# Save to CSV
df.to_csv("llm_agent_qa_pairs.csv", index=False)



In [26]:
#load your qna csv
import pandas as pd

qna_df = pd.read_csv("llm_agent_qa_pairs.csv")
print(qna_df.head())


                                            Question  \
0  What is the main concept described in the blog...   
1  What are the three key components of an LLM-po...   
2  What does the Planning component involve in an...   
3  How does Task Decomposition contribute to effe...   
4  What prompting technique is commonly used for ...   

                                              Answer  
0  It explores building autonomous agents using l...  
1                    Planning, Memory, and Tool Use.  
2  It involves breaking tasks into subgoals and e...  
3  It allows the agent to split complex tasks int...  
4  Chain of Thought (CoT), which instructs the mo...  


# Task
Benchmark the RAG model using the uploaded Q&A file.

## Define evaluation function

### Subtask:
Create a function that takes a question as input, invokes the RAG model to get an answer, and compares it to the expected answer from the Q&A dataset.


**Reasoning**:
I need to define a Python function that takes a question, invokes the RAG model, and returns the generated answer.



In [27]:
def get_rag_answer(question: str) -> str:
    """
    Invokes the RAG model with a given question and returns the generated answer.

    Args:
        question: The question to pass to the RAG model.

    Returns:
        The answer generated by the RAG model.
    """
    response = graph.invoke({"question": question})
    return response["answer"]

## Evaluate rag model

### Subtask:
Iterate through the questions in the Q&A dataset, apply the evaluation function to each question, and store the results.


**Reasoning**:
Initialize an empty list to store evaluation results and iterate through the qna_df to evaluate each question using the previously defined `get_rag_answer` function.



In [28]:
# Define application steps
# def retrieve(state: State):
#     retrieved_docs = vector_store.similarity_search(state["question"])
#     return {"context": retrieved_docs}


# def generate(state: State):
#     docs_content = "\n\n".join(doc.page_content for doc in state["context"])
#     messages = prompt.invoke({"question": state["question"], "context": docs_content})
#     response = llm.invoke(messages)
#     return {"answer": response} # Corrected to return the string directly


# # Compile application
# graph_builder = StateGraph(State).add_sequence([retrieve, generate])
# graph_builder.add_edge(START, "retrieve")
# graph_builder.add_edge("retrieve","generate")
# graph = graph_builder.compile()

evaluation_results = []

for index, row in qna_df.iterrows():
    question = row['Question']
    expected_answer = row['Answer']
    generated_answer = get_rag_answer(question)
    print(f"{index+1} Qn done")

    evaluation_results.append({
        'question': question,
        'expected_answer': expected_answer,
        'generated_answer': generated_answer
    })

#print(evaluation_results)

1 Qn done
2 Qn done
3 Qn done
4 Qn done
5 Qn done
6 Qn done
7 Qn done
8 Qn done
9 Qn done
10 Qn done
11 Qn done
12 Qn done
13 Qn done
14 Qn done
15 Qn done
16 Qn done
17 Qn done
18 Qn done
19 Qn done
20 Qn done


## Analyze results

### Subtask:
Calculate and display metrics such as accuracy, precision, or recall based on the comparison of generated and expected answers.


**Reasoning**:
Iterate through the evaluation results, compare generated and expected answers, calculate accuracy, and print the results.



In [36]:
from sklearn.metrics.pairwise import cosine_similarity

def is_answer_correct(generated_answer: str, expected_answer: str, embedding_model) -> bool:
    """
    Compares the generated answer with the expected answer using semantic similarity.

    Args:
        generated_answer: The answer generated by the RAG model.
        expected_answer: The expected answer from the Q&A dataset.
        embedding_model: The embedding model to use for calculating semantic similarity.

    Returns:
        True if the answers are semantically similar, False otherwise.
    """
    # Convert to lowercase for case-insensitive comparison
    generated_answer_lower = generated_answer.lower()
    expected_answer_lower = expected_answer.lower()

    # Get embeddings for the generated and expected answers
    generated_embedding = embedding_model.embed_query(generated_answer_lower)
    expected_embedding = embedding_model.embed_query(expected_answer_lower)

    # Calculate cosine similarity between the embeddings
    similarity_score = cosine_similarity([generated_embedding], [expected_embedding])[0][0]

    # Define a similarity threshold (you may need to tune this value)
    similarity_threshold = 0.5

    return similarity_score >= similarity_threshold

total_questions = len(evaluation_results)
correct_answers = 0

for result in evaluation_results:
    question = result['question']
    expected_answer = result['expected_answer']
    generated_answer = result['generated_answer']

    # Pass the embedding_model to the function
    if is_answer_correct(generated_answer, expected_answer, embedding_model):
        correct_answers += 1
       # print(f"Question: {question}\nExpected: {expected_answer}\nGenerated: {generated_answer}\nResult: Correct\n")
    #else:
       # print(f"Question: {question}\nExpected: {expected_answer}\nGenerated: {generated_answer}\nResult: Incorrect\n")


accuracy = (correct_answers / total_questions) * 100 if total_questions > 0 else 0

print(f"Total questions: {total_questions}")
print(f"Correctly answered: {correct_answers}")
print(f"Accuracy: {accuracy:.2f}%")

Total questions: 20
Correctly answered: 11
Accuracy: 55.00%


## Summarize findings

### Subtask:
Present a summary of the benchmarking results and insights into the RAG model's performance.


**Reasoning**:
Summarize the benchmarking results, discuss potential reasons for the performance, and suggest areas for improvement based on the previous analysis.



In [23]:
print("--- RAG Model Benchmarking Summary ---")
print(f"Total questions evaluated: {total_questions}")
print(f"Number of correctly answered questions: {correct_answers}")
print(f"Overall accuracy: {accuracy:.2f}%\n")

print("--- Analysis and Insights ---")
print(f"The RAG model achieved an accuracy of {accuracy:.2f}% on the provided Q&A dataset using semantic similarity.")
print("Semantic similarity provides a more nuanced evaluation than a simple substring check,")
print("accounting for variations in phrasing while capturing the core meaning.")
print("Further analysis of incorrectly answered questions can help identify areas for improvement in the RAG pipeline,")
print("such as prompt engineering, document chunking strategies, or the choice of language and embedding models.")
print("Manual evaluation of a subset of results is still recommended to gain deeper insights into the quality of generated answers.")

print("\n--- Suggested Areas for Improvement ---")
print("1. Refine the semantic similarity threshold: Experiment with different threshold values to find the best balance for your specific use case.")
print("2. Manual evaluation: Conduct a manual review of a subset of generated answers, especially those marked as incorrect by the semantic similarity metric, to understand the types of errors.")
print("3. Fine-tune the RAG model: Experiment with different prompting strategies, chunk sizes for document splitting, or different base language models and embedding models to potentially improve generation quality and semantic similarity.")
print("4. Expand the Q&A dataset: A larger and more diverse dataset would provide a more robust evaluation of the model's performance across various types of questions and phrasings.")

--- RAG Model Benchmarking Summary ---
Total questions evaluated: 20
Number of correctly answered questions: 11
Overall accuracy: 55.00%

--- Analysis and Insights ---
The RAG model achieved an accuracy of 55.00% on the provided Q&A dataset using semantic similarity.
Semantic similarity provides a more nuanced evaluation than a simple substring check,
accounting for variations in phrasing while capturing the core meaning.
Further analysis of incorrectly answered questions can help identify areas for improvement in the RAG pipeline,
such as prompt engineering, document chunking strategies, or the choice of language and embedding models.
Manual evaluation of a subset of results is still recommended to gain deeper insights into the quality of generated answers.

--- Suggested Areas for Improvement ---
1. Refine the semantic similarity threshold: Experiment with different threshold values to find the best balance for your specific use case.
2. Manual evaluation: Conduct a manual review of 