

## Installing Dependencies

This section installs all necessary dependencies for the notebook.

In [2]:
!pip install autogen-agentchat autogen-ext[openai] -q
!pip install groq -q
!pip install arxiv -q
!pip install PyPDF2 -q
!pip install pymupdf -q
!pip install transformers torch -q
!pip install numpy -q
!pip install nltk -q
!pip install rouge -q
!pip install bert_score -q
!pip install textstat -q
!pip install spacy -q
!pip install torch -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/75.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.9/75.9 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/83.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.9/83.9 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m47.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/234.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m234.2/234.2 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [3

## Importing Required Libraries

This section imports all necessary modules, categorized based on their functionality.

In [3]:
from autogen_agentchat.agents import AssistantAgent, UserProxyAgent
from autogen_agentchat.conditions import TextMentionTermination, MaxMessageTermination
from autogen_agentchat.teams import RoundRobinGroupChat, SelectorGroupChat
from autogen_agentchat.ui import Console
from autogen_ext.models.openai import OpenAIChatCompletionClient
from autogen_core.tools import FunctionTool

import arxiv
import fitz

import nltk
from nltk.tokenize import sent_tokenize
from transformers import pipeline, GPT2LMHeadModel, GPT2Tokenizer
import spacy

import rouge
import bert_score
import textstat
from nltk.translate.bleu_score import sentence_bleu

import torch
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


## Setting Up API Keys
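Before proceeding, provide your Groq API key. It is read with `getpass`, so it is not echoed in the notebook, and it is then exported as the `GROQ_API_KEY` environment variable for the model client used later.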

In [37]:
import os
from getpass import getpass

# Read the key without echoing it; strip whitespace that often sneaks in when pasting.
tokenGROQ = getpass('Enter GROQ_API_KEY here: ').strip()
if not tokenGROQ:
    # Fail here rather than with an opaque authentication error on the first model call.
    raise ValueError("GROQ_API_KEY must not be empty.")
os.environ["GROQ_API_KEY"] = tokenGROQ

Enter GROQ_API_KEY here: ··········


## Automatic Research Paper Summarization

This section of the code downloads academic papers from arXiv, extracts their text, breaks it into manageable chunks, and generates a concise summary using Facebook's BART Large CNN model.

In [7]:
# Sentence-tokenizer data used by sent_tokenize / word_tokenize.
for nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(nltk_resource)

# Summarization pipeline shared by summarize_chunks().
summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")

def extract_text_from_pdf(pdf_path):
    """Extract the body text of a PDF, dropping the trailing reference list.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        str: The text of every page (each followed by a newline), cut just
        before the last line that consists solely of a "References" heading
        (optionally numbered, e.g. "7. References" or "VII. References").
        If no such heading exists, the full text is returned.
    """
    # The context manager closes the document even if text extraction fails.
    with fitz.open(pdf_path) as doc:
        text = "".join(page.get_text() + "\n" for page in doc)

    # Only a standalone heading line counts. Splitting on the first occurrence
    # of the word "references" anywhere would truncate the paper at any
    # in-body mention such as "see the references in Section 2".
    heading = re.compile(
        r'^[ \t]*(?:(?:\d+|[ivxlc]+)\.?[ \t]+)?references[ \t]*$',
        re.IGNORECASE | re.MULTILINE,
    )
    headings = list(heading.finditer(text))
    if headings:
        # Use the last heading so a table-of-contents entry does not cut the body.
        text = text[:headings[-1].start()]

    return text

def chunk_text(text, chunk_size=1024):
    """Group the sentences of ``text`` into chunks of roughly ``chunk_size`` characters.

    Sentences are never split: a sentence is appended to the current chunk while
    the combined length stays below ``chunk_size``; otherwise the current chunk
    is closed and the sentence starts a new one. A single sentence longer than
    ``chunk_size`` therefore becomes its own (oversized) chunk.

    Args:
        text: The text to split.
        chunk_size: Character budget per chunk.

    Returns:
        list[str]: Non-empty, whitespace-stripped chunks in document order.
    """
    chunks = []
    current_chunk = ""

    for sentence in sent_tokenize(text):
        if len(current_chunk) + len(sentence) < chunk_size:
            current_chunk += " " + sentence
        else:
            # current_chunk is still empty when the very first sentence already
            # exceeds the budget; never emit an empty chunk in that case.
            if current_chunk.strip():
                chunks.append(current_chunk.strip())
            current_chunk = sentence

    if current_chunk.strip():
        chunks.append(current_chunk.strip())

    return chunks

def summarize_chunks(chunks):
    """Summarize every chunk with the module-level ``summarizer`` and join the results.

    Args:
        chunks: Iterable of text chunks.

    Returns:
        str: The per-chunk summaries, in order, separated by single spaces.
    """
    return " ".join(
        summarizer(piece, max_length=150, min_length=50, do_sample=False)[0]['summary_text']
        for piece in chunks
    )

def download_and_summarize_papers(query, num_papers=1, sort_by=None):
    """Download arXiv papers matching ``query`` and summarize each of them.

    Each PDF is saved in the current working directory, its text is extracted,
    chunked, and summarized with the helper functions defined in this notebook.

    Args:
        query: arXiv search query.
        num_papers: Maximum number of search results to process.
        sort_by: An ``arxiv.SortCriterion`` value. Defaults to
            ``arxiv.SortCriterion.SubmittedDate`` (newest first). Pass
            ``arxiv.SortCriterion.Relevance`` when topical match matters more
            than recency.

    Returns:
        list[dict]: One ``{"id": <short arXiv id>, "summary": <text>}`` entry per paper.
    """
    # Resolved here rather than in the signature so the default stays a plain None.
    if sort_by is None:
        sort_by = arxiv.SortCriterion.SubmittedDate

    client = arxiv.Client()
    search = arxiv.Search(
        query=query,
        max_results=num_papers,
        sort_by=sort_by
    )

    paper_summaries = []

    for result in client.results(search):
        paper_id = result.get_short_id()
        # Old-style identifiers (e.g. "hep-th/9901001v1") contain a slash,
        # which is not valid inside a file name.
        filename = f"{paper_id.replace('/', '_')}.pdf"
        result.download_pdf(filename=filename)
        print(f"Downloaded: {filename}")

        text = extract_text_from_pdf(filename)
        chunks = chunk_text(text)
        summary = summarize_chunks(chunks)

        paper_summaries.append({"id": paper_id, "summary": summary})

    return paper_summaries

# Fetch and summarize one paper for the multi-agent topic. `papers_MA` is read
# later in the notebook by the `get_summary` tool.
papers_MA = download_and_summarize_papers("Multi-agent LLM systems", num_papers=1)
# papers_PE = download_and_summarize_papers("Prompt Engineering", num_papers=1)
# Print the id and summary of every processed paper.
for paper in papers_MA:
    print(f"Paper ID: {paper['id']}, Summary: {paper['summary']}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
Device set to use cpu


Downloaded: 2503.10630v1.pdf
Paper ID: 2503.10630v1, Summary: In this paper, we propose a general framework for univer-sal zero-shot goal-oriented navigation. We propose a uniform graph representation to unify different goals. We also convert the observation of agent into an online maintained scene graph. We preserve most structural infor-                mation compared with pure text. We are able to leverage GLM for explicit graph-based reasoning. Extensive experiments on several benchmarks show that our UniGoal achieves state-of-the-art zero-shot performance on three studied navigation tasks. Goal-oriented navigation is a fundamental problem in robotic tasks. It requires the agent to navigate to a specified goal in an unknown environment. State-of-the-art zero-shot goal-oriented Navigation meth-phthalods are typically specialized for each goal type. UniGoal enables zero-shotinference on three studied navigation tasks. It achieves leading performance on multiple benchmarks. These sub-

In [8]:
# Show the summary text of the first processed paper.
print(papers_MA[0].get("summary"))

In this paper, we propose a general framework for univer-sal zero-shot goal-oriented navigation. We propose a uniform graph representation to unify different goals. We also convert the observation of agent into an online maintained scene graph. We preserve most structural infor-                mation compared with pure text. We are able to leverage GLM for explicit graph-based reasoning. Extensive experiments on several benchmarks show that our UniGoal achieves state-of-the-art zero-shot performance on three studied navigation tasks. Goal-oriented navigation is a fundamental problem in robotic tasks. It requires the agent to navigate to a specified goal in an unknown environment. State-of-the-art zero-shot goal-oriented Navigation meth-phthalods are typically specialized for each goal type. UniGoal enables zero-shotinference on three studied navigation tasks. It achieves leading performance on multiple benchmarks. These sub-tasks are also known as Object-goalNavigation (ON) and Instanc


## Autonomous Multi-Agent System for Generating Literature Reviews

This script implements an AI-powered multi-agent system for generating and refining
literature reviews based on research summaries. It utilizes the `autogen_agentchat`
framework to coordinate different AI agents in a structured workflow.



In [29]:
from autogen_agentchat.agents import AssistantAgent
from autogen_agentchat.base import Handoff
from autogen_agentchat.conditions import HandoffTermination, TextMentionTermination
from autogen_agentchat.teams import RoundRobinGroupChat
from autogen_agentchat.ui import Console
from autogen_ext.models.openai import OpenAIChatCompletionClient
from autogen_agentchat.conditions import MaxMessageTermination
from autogen_core.tools import FunctionTool
from autogen_agentchat.teams import SelectorGroupChat
from typing import List

# Termination conditions: a message mentioning "TERMINATE", or a cap of 7 messages.
text_termination = TextMentionTermination("TERMINATE")
max_messages_termination = MaxMessageTermination(max_messages=7)
# Combined with `|`; presumably the run stops as soon as either condition
# triggers — confirm against the autogen_agentchat conditions documentation.
combined_termination = max_messages_termination | text_termination

# OpenAI-compatible client pointed at Groq's endpoint, authenticated with the
# GROQ_API_KEY environment variable set earlier in the notebook.
custom_model_client = OpenAIChatCompletionClient(
    # model="llama-3.1-8b-instant",
    model="llama-3.3-70b-versatile",
    base_url="https://api.groq.com/openai/v1",
    api_key=os.environ["GROQ_API_KEY"],
    # Capabilities are declared explicitly (family is "unknown").
    model_info={
        "vision": False,
        "function_calling": True,
        "json_output": False,
        "family": "unknown",
    },
)

def get_summary() -> str:
    """Return the summary of the first paper stored in ``papers_MA``.

    Raises:
        ValueError: If ``papers_MA`` is empty, i.e. no paper was summarized.
    """
    if not papers_MA:
        raise ValueError("No paper summaries are available; run the summarization cell first.")
    return papers_MA[0].get("summary")

# The tool is exposed to the model under the wrapped function's name, `get_summary`.
get_summary_tool = FunctionTool(
    get_summary,
    description="Retrieves the summary of the research."
)

# The prompt must name the tool exactly as the model sees it (`get_summary`),
# not the Python variable that holds the FunctionTool object.
report_agent = AssistantAgent(
    name="Report_Agent",
    model_client=custom_model_client,
    tools=[get_summary_tool],
    description="Generates a high-quality literature review based on the provided summary.",
    system_message="""
    You are the Report Agent. Your task is to synthesize the provided summary into a high-quality literature review.
    Use the `get_summary` tool to retrieve the summary and write a well-structured review.
    """
)

# Coordinator agent: it has no tools and only steers the workflow between the
# Report_Agent and the user, as described in its system message.
planning_agent = AssistantAgent(
    name="PlanningAgent",
    description="An agent responsible for planning and coordinating the workflow of the research team.",
    model_client=custom_model_client,
    system_message="""
    You are the Planning Agent. Your responsibilities are:
    1. Initiate the review process by asking the `Report_Agent` to generate a literature review.
    2. Present the review to the user and collect their feedback.
    3. If the user provides feedback, ask the `Report_Agent` to revise the review.
    4. Repeat the process until the user is satisfied or terminates the workflow.

    Follow these steps:
    1. Start by asking the `Report_Agent` to generate a literature review using its tool.
    2. Once the review is generated, present it to the user and ask for feedback.
    3. If the user provides feedback, pass it to the `Report_Agent` to revise the review.
    4. Repeat steps 2-3.

    Your role is to ensure the workflow runs smoothly and that the user's feedback is incorporated into the review.
    """
)

# Human-in-the-loop participant: its messages are read with the built-in input().
user_proxy = UserProxyAgent(
    name="user_proxy",
    input_func=input
)


# Group chat in which the model client is prompted (via selector_prompt) to
# name the role that should produce the next message.
# NOTE(review): allow_repeated_speaker=True presumably lets the same agent be
# selected for consecutive turns — confirm against the SelectorGroupChat docs.
team = SelectorGroupChat(
    [planning_agent, report_agent, user_proxy],
    termination_condition=combined_termination,
    model_client=custom_model_client,
    allow_repeated_speaker=True,
    selector_prompt=(
        "Available roles:\n{roles}\nTheir job descriptions:\n{participants}\n"
        "Current conversation history:\n{history}\n"
        "Please select the most appropriate role for the next message, and only return the role name."
    ),
)

# Run the team on the task and stream the conversation through Console.
# Top-level `await` works here because the cell runs inside a notebook kernel.
# `response.messages` is indexed by later cells to pick out the generated review.
response = await Console(
    team.run_stream(
        task="Write a literature review",
    )
)

---------- user ----------
Write a literature review
---------- Report_Agent ----------
[FunctionCall(id='call_ysff', arguments='{}', name='get_summary')]
---------- Report_Agent ----------
[FunctionExecutionResult(content='In this paper, we propose a general framework for univer-sal zero-shot goal-oriented navigation. We propose a uniform graph representation to unify different goals. We also convert the observation of agent into an online maintained scene graph. We preserve most structural infor- \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0mation compared with pure text. We are able to leverage GLM for explicit graph-based reasoning. Extensive experiments on several benchmarks show that our UniGoal achieves state-of-the-art zero-shot performance on three studied navigation tasks. Goal-oriented navigation is a fundamental problem in robotic tasks. It requires the agent to navigate to a specified goal in an unknown environment. State-of-the-art zero-shot goal-oriented Navigation meth-phthal

In [26]:
# NOTE(review): presumably clears the team's conversation state so the next run
# starts fresh — confirm against the autogen_agentchat team documentation.
await team.reset()

In [34]:
# Show the message holding the generated review.
# NOTE(review): the hard-coded index -4 depends on how this particular
# conversation unfolded; re-check it after every new run.
print(response.messages[-4].content)

Here is a literature review based on the provided summary:

The problem of universal zero-shot goal-oriented navigation has been addressed in recent research. This task involves designing a general method that can navigate to different goals in an unknown environment without requiring any training or fine-tuning. Existing methods have typically been specialized for each goal type, but a recent approach, called UniGoal, has achieved state-of-the-art performance on three studied navigation tasks: object-goal navigation, instance-image-goal navigation, and text-goal navigation.

UniGoal uses a uniform graph representation for both 3D scenes and goals, which allows it to uniformly represent different types of goals and scenes. The method constructs an online 3D scene graph along with the movement of the agent and conducts graph matching between the scene graph and goal graph to determine whether a goal is observed. If not, the agent needs to infer the relationship between objects from the 

In [35]:
# Keep the generated review for the evaluation section below.
# NOTE(review): index -4 is specific to the recorded run; verify that it still
# points at the review text before evaluating.
generated_text = response.messages[-4].content

## Text Evaluation Metrics for AI-Generated Content

This script evaluates the quality of AI-generated text against a reference text
using multiple NLP-based metrics, including:

1. **ROUGE Score** - Measures text overlap between reference and generated text.
2. **BLEU Score** - Measures n-gram precision of the generated text against the reference (a metric originally designed for machine translation).
3. **BERTScore** - Uses contextual embeddings to assess semantic similarity.
4. **Cosine Similarity** - Computes text similarity using TF-IDF vectorization.
5. **Readability Score** - Estimates ease of comprehension (Flesch Reading Ease).
6. **Perplexity** - Measures how well GPT-2 predicts the generated text (lower is better).
7. **Citation Overlap** - Fraction of the reference's named entities (people, organizations, and works of art, as tagged by spaCy) that also appear in the generated text.


In [36]:
# spaCy pipeline used by extract_cited_entities() for named-entity recognition.
# NOTE(review): requires the "en_core_web_sm" model to be installed in the environment.
nlp = spacy.load("en_core_web_sm")

# GPT-2 tokenizer and language model used by compute_perplexity().
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")

def compute_rouge_scores(reference, generated):
    """Score ``generated`` (hypothesis) against ``reference`` with the ``rouge`` package.

    Returns:
        Whatever ``rouge.Rouge().get_scores`` returns for the pair.
    """
    return rouge.Rouge().get_scores(generated, reference)

def compute_bleu(reference, generated):
    """Compute sentence-level BLEU of ``generated`` against a single ``reference``.

    Both texts are word-tokenized with NLTK; the reference is wrapped in a list
    because ``sentence_bleu`` expects a list of reference token lists.
    """
    references = [nltk.word_tokenize(reference)]
    candidate = nltk.word_tokenize(generated)
    bleu = sentence_bleu(references, candidate)
    return bleu

def compute_bert_score(reference, generated):
    """Compute BERTScore for one generated/reference pair.

    Returns:
        dict: ``precision``, ``recall`` and ``f1`` as plain Python numbers
        (the mean of each returned score tensor).
    """
    precision, recall, f1 = bert_score.score([generated], [reference], lang="en")
    labelled = (('precision', precision), ('recall', recall), ('f1', f1))
    return {label: values.mean().item() for label, values in labelled}

def compute_readability(text):
    """Return the Flesch Reading Ease score of ``text`` as computed by ``textstat``."""
    reading_ease = textstat.flesch_reading_ease(text)
    return reading_ease

def compute_perplexity(text):
    """Compute the GPT-2 perplexity of ``text``.

    The token sequence is scored in consecutive, non-overlapping windows no
    longer than the model's position limit (``gpt2_model.config.n_positions``),
    so long texts no longer overflow the model. The per-window mean losses are
    combined, weighted by the number of predicted tokens in each window.

    Note: tokens at the start of every window after the first are predicted
    with no preceding context, so the value for long texts is a slightly
    pessimistic estimate.

    Args:
        text: The text to score.

    Returns:
        float: ``exp`` of the average negative log-likelihood per predicted
        token, or ``nan`` when the text has fewer than two tokens (nothing can
        be predicted).
    """
    tokens = gpt2_tokenizer.encode(text, return_tensors="pt")
    seq_len = tokens.size(1)
    if seq_len < 2:
        return float("nan")

    window_size = gpt2_model.config.n_positions
    total_nll = 0.0
    total_targets = 0

    with torch.no_grad():
        for start in range(0, seq_len, window_size):
            window = tokens[:, start:start + window_size]
            # With labels == inputs, the loss is averaged over len(window) - 1 shifted targets.
            n_targets = window.size(1) - 1
            if n_targets < 1:
                continue  # a trailing single-token window has nothing to predict
            loss = gpt2_model(window, labels=window).loss
            total_nll += loss.item() * n_targets
            total_targets += n_targets

    return torch.exp(torch.tensor(total_nll / total_targets)).item()

def compute_cosine_similarity(reference, generated):
    """Return the cosine similarity between the TF-IDF vectors of the two texts.

    The vectorizer is fitted on exactly these two documents, so row 0 is the
    reference and row 1 the generated text.
    """
    vectors = TfidfVectorizer().fit_transform([reference, generated])
    return cosine_similarity(vectors[0], vectors[1])[0][0]

def extract_cited_entities(text):
    """Collect the distinct PERSON, ORG and WORK_OF_ART entity strings spaCy finds in ``text``.

    Returns:
        set[str]: The entity texts.
    """
    wanted_labels = ('PERSON', 'ORG', 'WORK_OF_ART')
    return {entity.text for entity in nlp(text).ents if entity.label_ in wanted_labels}

def evaluate_reviews(reference_text, generated_text):
    """Evaluate ``generated_text`` against ``reference_text`` with all metrics of this notebook.

    Returns:
        dict: Keys ``rouge``, ``bleu``, ``bert_score``, ``cosine_similarity``,
        ``readability``, ``perplexity`` (the last two computed on the generated
        text only) and ``citation_overlap`` — the share of the reference's
        entities that also occur in the generated text (0 when the reference
        has none).
    """
    scores = {
        'rouge': compute_rouge_scores(reference_text, generated_text),
        'bleu': compute_bleu(reference_text, generated_text),
        'bert_score': compute_bert_score(reference_text, generated_text),
        'cosine_similarity': compute_cosine_similarity(reference_text, generated_text),
        'readability': compute_readability(generated_text),
        'perplexity': compute_perplexity(generated_text),
    }

    ref_entities = extract_cited_entities(reference_text)
    gen_entities = extract_cited_entities(generated_text)
    shared_entities = ref_entities & gen_entities
    # max(1, ...) guards against division by zero for an entity-free reference.
    scores['citation_overlap'] = len(shared_entities) / max(1, len(ref_entities))

    return scores

reference_text = """
Goal-oriented navigation is a fundamental problem in robotics and AI, requiring agents to navigate to specified targets within unknown environments. Over the years, numerous approaches have been proposed to tackle this problem, often focusing on different types of goals, such as object categories, instance images, and text descriptions. This review highlights key contributions and limitations in existing zero-shot navigation methodologies while contextualizing the advancements introduced by UniGoal.

Zero-shot Navigation

Traditional supervised navigation methods often require extensive training in simulation environments, which limits their adaptability to real-world scenarios (Chaplot et al., 2020; Wijmans et al., 2019). Zero-shot navigation approaches aim to eliminate this dependency by leveraging external knowledge, often from large-scale vision-language models. Methods such as CoW (Gadre et al., 2023) and ESC (Zhou et al., 2023) have demonstrated the potential of language models in goal-oriented navigation but remain specialized for individual sub-tasks like object-goal navigation (ON) or instance-image-goal navigation (IIN).

SG-Nav (Yin et al., 2024) introduced the concept of 3D scene graphs to guide zero-shot navigation through structured reasoning. However, this method remains specific to ON and does not generalize across different goal types. Similarly, Mod-IIN (Krantz et al., 2023) employs feature-matching techniques for IIN, but its reliance on explicit image comparisons limits its applicability to text-based or object category-based navigation.

Universal Goal-oriented Navigation

A major challenge in zero-shot navigation is the development of universal frameworks that can generalize across different goal modalities. Recent efforts, such as PSL (Sun et al., 2024) and GOAT (Chang et al., 2023), attempt to unify goal representations by leveraging shared embedding spaces. PSL employs CLIP-based embeddings for goal encoding but requires reinforcement learning for policy optimization, restricting its adaptability. GOAT, on the other hand, trains a universal global policy to handle multiple navigation tasks but remains dependent on pre-training, thereby limiting its zero-shot generalization capabilities.

Graph-based Scene Representation

Graph-based representations have emerged as a promising approach for scene understanding and navigation. Works such as SayPlan (Rana et al., 2023) and OVSG (Chang et al., 2023) utilize open-vocabulary scene graphs for task planning and object grounding. SG-Nav (Yin et al., 2024) further builds an online hierarchical scene graph to guide goal inference via chain-of-thought prompting. However, these approaches often suffer from task-specific constraints and lack a unified structure to accommodate various goal types.

UniGoal: A Unified Zero-shot Navigation Framework

UniGoal addresses the aforementioned limitations by introducing a uniform graph-based representation that integrates different goal modalities—object categories, instance images, and text descriptions—into a single inference pipeline. Unlike prior methods that rely on separate inference mechanisms for each goal type, UniGoal converts agent observations into an online-maintained scene graph and applies graph-matching techniques to dynamically infer goal locations.

The key innovations of UniGoal include:

Unified Graph Representation – Combining scene graphs and goal graphs to enable structured reasoning across different goal modalities.

Multi-stage Exploration Strategy – Employing iterative subgraph searching, coordinate projection, and anchor pair alignment to progressively refine goal localization.

Blacklist Mechanism – Enhancing robustness by preventing redundant exploration in cases of failed matches.

Extensive evaluations on benchmarks such as Matterport3D, HM3D, and RoboTHOR demonstrate that UniGoal surpasses state-of-the-art zero-shot navigation methods while maintaining a single, training-free model. Its superior performance across ON, IIN, and text-goal navigation (TN) establishes it as a leading framework for universal zero-shot goal-oriented navigation.

Conclusion

Existing zero-shot navigation methods exhibit strong performance but remain specialized for specific goal types. The introduction of graph-based reasoning and unified goal representation in UniGoal marks a significant step toward general-purpose navigation solutions. Future research may explore further optimizations in graph-matching algorithms, real-world deployment, and integration with embodied AI systems to enhance adaptability and efficiency.


"""


# Score the agent-generated review against the reference review defined above
# and print the resulting metrics dictionary.
results = evaluate_reviews(reference_text, generated_text)
print(results)


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


{'rouge': [{'rouge-1': {'r': 0.23837209302325582, 'p': 0.422680412371134, 'f': 0.3048327091433231}, 'rouge-2': {'r': 0.0570902394106814, 'p': 0.09657320872274143, 'f': 0.07175925458936179}, 'rouge-l': {'r': 0.21220930232558138, 'p': 0.37628865979381443, 'f': 0.2713754600726912}}], 'bleu': 0.035074640831564884, 'bert_score': {'precision': 0.8487093448638916, 'recall': 0.832493007183075, 'f1': 0.8405229449272156}, 'cosine_similarity': 0.5287159245702142, 'readability': 23.66, 'perplexity': 24.03458023071289, 'citation_overlap': 0.20833333333333334}
