<a href="https://colab.research.google.com/github/rreimche/genai-exam/blob/main/SLRev.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prerequisites:
- Google Colab
- Access to a Google Colab Secret named "HF_API_TOKEN" containing a Hugging Face API token with read access.
- Access to a Google Colab Secret named "GROQ_KEY" containing a Groq API key.

In [1]:
#@title Settings
from google.colab import userdata

download_dir = "papers"  # name of the directory the papers under review are downloaded to
huggingface_apikey = userdata.get('HF_API_TOKEN')  # Hugging Face API token, read from the Colab secret "HF_API_TOKEN"
groq_key = userdata.get('GROQ_KEY')  # Groq API key, read from the Colab secret "GROQ_KEY"


In [2]:
#@title Declare bibtex references

# BibTeX entries of the papers to review. The download code reads the
# "title" and "eprint" (arXiv ID) fields of each parsed entry.
bibtext_references = """

@misc{rasheed2024codeporilargescaleautonomoussoftware,
      title={CodePori: Large-Scale System for Autonomous Software Development Using Multi-Agent Technology},
      author={Zeeshan Rasheed and Malik Abdul Sami and Kai-Kristian Kemell and Muhammad Waseem and Mika Saari and Kari Systä and Pekka Abrahamsson},
      year={2024},
      eprint={2402.01411},
      archivePrefix={arXiv},
      primaryClass={cs.SE},
      url={https://arxiv.org/abs/2402.01411},
},
@misc{hong2024metagptmetaprogrammingmultiagent,
      title={MetaGPT: Meta Programming for A Multi-Agent Collaborative Framework},
      author={Sirui Hong and Mingchen Zhuge and Jiaqi Chen and Xiawu Zheng and Yuheng Cheng and Ceyao Zhang and Jinlin Wang and Zili Wang and Steven Ka Shing Yau and Zijuan Lin and Liyang Zhou and Chenyu Ran and Lingfeng Xiao and Chenglin Wu and Jürgen Schmidhuber},
      year={2024},
      eprint={2308.00352},
      archivePrefix={arXiv},
      primaryClass={cs.AI},
      url={https://arxiv.org/abs/2308.00352},
},


@misc{qian2024chatdevcommunicativeagentssoftware,
      title={ChatDev: Communicative Agents for Software Development},
      author={Chen Qian and Wei Liu and Hongzhang Liu and Nuo Chen and Yufan Dang and Jiahao Li and Cheng Yang and Weize Chen and Yusheng Su and Xin Cong and Juyuan Xu and Dahai Li and Zhiyuan Liu and Maosong Sun},
      year={2024},
      eprint={2307.07924},
      archivePrefix={arXiv},
      primaryClass={cs.SE},
      url={https://arxiv.org/abs/2307.07924},
},

@misc{zhang2024empoweringagilebasedgenerativesoftware,
      title={Empowering Agile-Based Generative Software Development through Human-AI Teamwork},
      author={Sai Zhang and Zhenchang Xing and Ronghui Guo and Fangzhou Xu and Lei Chen and Zhaoyuan Zhang and Xiaowang Zhang and Zhiyong Feng and Zhiqiang Zhuang},
      year={2024},
      eprint={2407.15568},
      archivePrefix={arXiv},
      primaryClass={cs.SE},
      url={https://arxiv.org/abs/2407.15568},
},

@misc{nguyen2024agilecoderdynamiccollaborativeagents,
      title={AgileCoder: Dynamic Collaborative Agents for Software Development based on Agile Methodology},
      author={Minh Huynh Nguyen and Thang Phan Chau and Phong X. Nguyen and Nghi D. Q. Bui},
      year={2024},
      eprint={2406.11912},
      archivePrefix={arXiv},
      primaryClass={cs.SE},
      url={https://arxiv.org/abs/2406.11912},
},


"""

In [21]:
#@title Install dependencies
!pip install -q autogen arxiv scholarly crossrefapi beautifulsoup4 requests cloudscraper pymupdf nltk autogen-agentchat autogen-ext[openai] groq;
!pip install -q --pre bibtexparser;

In [4]:
#@title Download papers
from autogen import UserProxyAgent, config_list_from_json
import arxiv
import os
import datetime
from bibtexparser.bparser import BibTexParser
from bibtexparser.bibdatabase import BibDatabase

def download_papers(paper_info_list: list, download_dir: str) -> dict:
    """
    Download every paper that carries an arXiv ``eprint`` ID into ``download_dir``.

    Args:
        paper_info_list: Paper info dictionaries (e.g. parsed BibTeX entries);
            the keys "title" and "eprint" are read.
        download_dir: Directory the PDF files are written to. It is not
            created by this function.

    Returns:
        A dictionary mapping each paper title to
        ``{"success": bool, "filepath": str | None}``. Failures are printed
        and recorded with ``success=False``; they are not raised.
    """
    results = {}
    # Created on first use so entries without an eprint never touch arXiv.
    client = None

    for paper_info in paper_info_list:
        title = paper_info.get("title", "")
        results[title] = {"success": False, "filepath": None}

        # 1. Try arXiv (using eprint ID)
        eprint_id = paper_info.get("eprint")
        if eprint_id:
            print(f"{datetime.datetime.now()} - Trying arXiv for: {title} (arXiv ID: {eprint_id})")
            try:
                if client is None:
                    client = arxiv.Client()
                search = arxiv.Search(id_list=[eprint_id], max_results=1)
                # Search.results() is deprecated in the arxiv package; results
                # are fetched through a Client instead.
                result = next(client.results(search), None)
                if result:
                    print(f"  Found on arXiv: {result.title}")
                    # A path separator inside the title would otherwise point
                    # into a sub-directory that does not exist.
                    safe_title = title.replace(" ", "_").replace(os.sep, "_")
                    filepath = os.path.join(download_dir, f"{safe_title}.pdf")
                    result.download_pdf(filename=filepath)
                    results[title]["success"] = True
                    results[title]["filepath"] = filepath
                    print(f"  Download successful: {filepath}")
                else:
                    print(f"  No arXiv result found for ID {eprint_id}.")

            except Exception as e:
                # Best effort: one failing paper must not stop the remaining downloads.
                print(f"arXiv download failed for: {title}: {e}")

        # Check if the download was NOT successful
        if not results[title]["success"]:
            print(f"{datetime.datetime.now()} - Download failed or not attempted for: {title}")

    return results

def parse_bibtex_entries(bibtex_refs: str) -> list:
    """
    Parse a BibTeX string and return its entries.

    Any error raised while parsing is printed and an empty list is returned
    instead.

    NOTE(review): this relies on the ``BibTexParser().parse(...)`` API of
    bibtexparser 1.x - confirm the installed version provides it.
    """
    entries = []
    try:
        entries = BibTexParser().parse(bibtex_refs).entries
    except Exception as parse_error:
        print(f"Error parsing BibTeX string: {parse_error}")
    return entries

def maybe_create_dir(donwload_dir: str) -> None:
    """
    Create the directory ``donwload_dir`` (and missing parents) if needed.

    Args:
        donwload_dir: Path of the directory to create. The misspelled
            parameter name is kept so that keyword callers keep working.

    Raises:
        SystemExit: With exit code 1 when the directory cannot be created.
    """
    try:
        os.makedirs(donwload_dir, exist_ok=True)
    except OSError as e:
        print(f"Error creating directory '{donwload_dir}': {e}")
        # Raised directly instead of calling the interactive `exit` helper.
        raise SystemExit(1) from e
    # Report the path that was passed in, not the module-level download_dir.
    print(f"Directory '{donwload_dir}' created successfully.")

# --- Main Execution ---

# 1. Load BibTeX entries

papers_to_download = parse_bibtex_entries(bibtext_references)

# 2. Download papers (if entries were loaded)
# NOTE: the leading `False and` switches this branch off. Nothing is
# downloaded here, `download_results` is never assigned, and the `else`
# branch prints "No papers to download." even when entries were parsed.
if False and papers_to_download:
    download_dir = "papers"
    maybe_create_dir(download_dir)
    download_results = download_papers(papers_to_download, download_dir)

    # Count successful downloads
    successful_downloads = sum(1 for paper_data in download_results.values() if paper_data["success"])
    total_papers = len(papers_to_download)

    print(f"Downloaded {successful_downloads} of {total_papers} papers, done.")  # Summary message

else:
    print("No papers to download.")

No papers to download.


In [22]:
#@title Parse PDFs to text
import fitz  # PyMuPDF
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt_tab')

def parse_papers(download_results: dict, paper_info_list: list) -> dict:
    """
    Extract the text of every successfully downloaded PDF, keyed by arXiv eprint.

    Args:
        download_results: Mapping of title -> {"success", "filepath"} as
            returned by download_papers.
        paper_info_list: The paper info dictionaries (from BibTeX) the
            downloads were started from.

    Returns:
        A dictionary of the form
        {"<eprint>": {"text": "<extracted text>", "title": "<paper title>"}, ...}.
        Papers that were not downloaded, have no matching info entry, have no
        eprint, or fail to parse are left out.
    """
    # Index the paper info by title so each download can be matched back to it.
    info_by_title = {info.get('title'): info for info in paper_info_list}
    extracted = {}

    for title, outcome in download_results.items():
        # Failed downloads have nothing to parse.
        if not outcome["success"]:
            continue

        pdf_path = outcome["filepath"]

        info = info_by_title.get(title)
        if not info:
            print(f"Error: Could not find original paper info for title: {title}")
            continue

        eprint_id = info.get("eprint")
        if not eprint_id:
            print(f"Error: No eprint ID found for title: {title}")
            continue

        try:
            with fitz.open(pdf_path) as doc:
                page_texts = [page.get_text() for page in doc]
                extracted[eprint_id] = {"text": "".join(page_texts), "title": title}
            print(f"Successfully parsed: {title} (arXiv ID: {eprint_id})")
        except Exception as e:
            # A paper that cannot be parsed is reported and left out of the result.
            print(f"Error parsing {title} ({pdf_path}): {e}")

    return extracted

# Example usage: print a short report (title, length, first characters,
# token count) for every parsed paper.
# NOTE: the parse_papers call is commented out, so parsed_texts is an empty
# dict and the loop below prints nothing.
parsed_texts = {} # parse_papers(download_results, papers_to_download)
print()
for eprint, data in parsed_texts.items():
    print(f"---- arXiv eprint: {eprint} ----")
    print(f"Title: {data['title']}")
    print(f"Length: {len(data['text'])}")
    print(f"First 100 characters with newlines stripped:")
    print(data['text'][:100].replace('\n', ' '))
    print(f"Token count: {len(word_tokenize(data['text']))}")
    print(f"---- end {eprint} ----")
    print()




[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [23]:
#@title Preparation for agents: model client connections
from autogen_agentchat.agents import AssistantAgent
from autogen_agentchat.messages import TextMessage
from autogen_core.tools import FunctionTool
from autogen_core.models import UserMessage
#from autogen_agentchat.ui import Console
from autogen_core import CancellationToken
from autogen_ext.models.openai import OpenAIChatCompletionClient

# INSTANTIATE MODEL CLIENTS

# This client will be used for paper summarization.
# It targets the Hugging Face inference router and authenticates with huggingface_apikey.
model_client = OpenAIChatCompletionClient(
    model="mistralai/Mistral-7B-Instruct-v0.3",
    base_url="https://router.huggingface.co/hf-inference/v1",
    api_key=huggingface_apikey,
    model_info={
        "vision": False,
        "function_calling": False,  # declared without tool/function calling
        "json_output": False,
        "family": "unknown",
    },
)

# A second inference endpoint (Groq, authenticated with groq_key) is needed
# for the downloader agent, because the Hugging Face endpoint cannot serve
# reflection on tool use in a form the autogen agent understands.
downloader_model_client = OpenAIChatCompletionClient(
    model="llama3-70b-8192",
    base_url="https://api.groq.com/openai/v1",
    api_key=groq_key,
    model_info={
        "vision": False,
        "function_calling": True,  # declared with tool/function calling
        "json_output": False,
        "family": "unknown",
    },
)


# CHECK CONNECTION
# Send the same trivial prompt to both clients and print each answer.
# The top-level `await` relies on the notebook's running event loop.

messages = [
    UserMessage(content="What is the capital of France?", source="user"),
]
response = await model_client.create(messages=messages)
response_downloader = await downloader_model_client.create(messages=messages)

print(response.content)
print(response_downloader.content)

The capital of France is Paris. However, it's important to note that Paris is not a constituent country, but rather a city, and France is a sovereign state. The seat of the government is located in Paris, while French law technically considers France to be an indivisible, indissoluble, secular, democratic and social Republic.
The capital of France is Paris.


In [14]:
#@title Downloader agent
import arxiv
import os
import datetime
import json
import sys

# HELPER AND TOOL FUNCTIONS

def maybe_create_dir(donwload_dir: str) -> None:
    """
        A helper function to create the downloads directory
        (and any missing parents) if it is not yet present.

        :param donwload_dir: path of the directory to create. The misspelled
            parameter name is kept so that keyword callers keep working.
        :raises RuntimeError: if the directory cannot be created; the
            original OSError is chained as the cause.
    """

    try:
        os.makedirs(donwload_dir, exist_ok=True)
    except OSError as e:
        # Report the path that was passed in, not the module-level download_dir.
        raise RuntimeError(f"Error creating directory '{donwload_dir}': {e}") from e

def download_paper(eprint_id: str) -> str:
    """
        Receives an eprint_id of a paper from arxiv, downloads the specified paper
        and saves it in the downloads directory (the module-level ``download_dir``).
        The file name is derived from the paper title reported by arXiv.

        :param eprint_id: string, the arxiv eprint_id.
        :return: string with filepath of the downloaded paper.
        :raises ValueError: if eprint_id is empty or blank.
        :raises RuntimeError: if arXiv has no paper with that ID, or the
            lookup or the download fails. The message starts with "ERROR:".
    """

    # Validate with an explicit exception: assert statements are stripped under -O.
    if not eprint_id or not eprint_id.strip():
        raise ValueError("eprint_id cannot be empty")
    eprint_id = eprint_id.strip()

    try:
        client = arxiv.Client()
        search = arxiv.Search(id_list=[eprint_id], max_results=1)
        # The default keeps an empty result set from surfacing as StopIteration.
        arxiv_result = next(client.results(search), None)
    except Exception as e:
        raise RuntimeError(f"ERROR: arXiv download failed for: {eprint_id}: {e}") from e

    # Raised outside the try block so the message is not wrapped a second time.
    if arxiv_result is None:
        raise RuntimeError(f"ERROR: No arXiv result found for ID {eprint_id}.")

    maybe_create_dir(download_dir)
    # A path separator inside the title would otherwise point into a
    # sub-directory that does not exist.
    safe_title = arxiv_result.title.replace(" ", "_").replace(os.sep, "_")
    filepath = os.path.join(download_dir, f"{safe_title}.pdf")

    try:
        arxiv_result.download_pdf(filename=filepath)
    except Exception as e:
        raise RuntimeError(f"ERROR: arXiv download failed for: {eprint_id}: {e}") from e

    return filepath


# DOWNLOADER AGENT WITH TOOL

# Wrap download_paper so the agent can call it as a tool.
downloader_tool = FunctionTool(download_paper, description="Given a string denoting an eprint_id of a page on arxiv, downloads a paper")

# The agent runs on downloader_model_client and has the download tool attached.
# The system message fixes the reply format: the file path on success, an
# "ERROR: ..." line on failure.
# NOTE(review): reflect_on_tool_use=True presumably makes the agent produce its
# reply from the tool result with another model call - confirm in the autogen docs.
downloader_agent = AssistantAgent(
    name="downloader",
    model_client=downloader_model_client,
    tools=[downloader_tool],
    reflect_on_tool_use=True,
    system_message="""
        Use downloader tool to download specified paper(s) from arxiv.
        Here is an example of how you must respond in success case:
          'mypapers/veryinterstingpaper.pdf'
        If download was not successful, respond like this:
          'ERROR: Download for paper with eprint id XXX failed for the following reason: REASON'.
      """,
)


# CHECK IF DOWNLOAD WORKS

async def downloader_run() -> None:
    """Ask the downloader agent for one known paper and print the agent's reply."""
    request = TextMessage(content="Download the paper with eprint_id 2402.01411", source="user")
    reply = await downloader_agent.on_messages([request], cancellation_token=CancellationToken())

    # Only the text of the final chat message is shown.
    print(reply.chat_message.content)



# Use asyncio.run(downloader_run()) when running in a script.
await downloader_run()

The tool has successfully downloaded the paper with eprint_id 2402.01411, and the PDF file is now available at `papers/CodePori:_Large-Scale_System_for_Autonomous_Software_Development_Using_Multi-Agent_Technology.pdf`.

You can access the paper by opening this file. The title of the paper is "CodePori: Large-Scale System for Autonomous Software Development Using Multi-Agent Technology".


In [27]:
#@title Parser agent
import fitz  # PyMuPDF
#import nltk
#from nltk.tokenize import word_tokenize

#nltk.download('punkt_tab')

def parse_pdf(filepath: str) -> str:
    """
        Extracts the plain text of a PDF file.

        :param filepath: string, path of the PDF file to read.
        :return: string with the text of all pages, concatenated in page order.
        :raises RuntimeError: if the file cannot be opened or read. The message
            starts with "ERROR:" and the original exception is chained as the cause.
    """
    try:
        with fitz.open(filepath) as doc:
            # Join once instead of growing a string page by page.
            return "".join(page.get_text() for page in doc)

    except Exception as e:
        # Chain the cause so the underlying traceback is not lost.
        raise RuntimeError(f"ERROR: Error parsing {filepath}: {e}") from e


# Wrap parse_pdf so the agent can call it as a tool.
parser_tool = FunctionTool(
    parse_pdf,
    description="A tool that takes a filepath of a pdf file, parses the file and returns the text"
  )

# The parser agent runs on downloader_model_client and is instructed to
# return the parsed text as its reply.
# NOTE(review): passing a whole paper through the model can exceed the
# endpoint's request-size limit (the recorded run of this notebook ends with a
# 413 "Request too large" error for this model) - consider calling parse_pdf
# directly instead.
parser_agent = AssistantAgent(
    name="parser",
    model_client=downloader_model_client,
    system_message="""
        For a given filepath of a scientific paper,
        you use the provided tool, which parses the paper and returns text.
        Then you return the text in your response.
      """,
    tools=[parser_tool]
)

# CHECK IF PARSING WORKS

test_paper_text = ""  # we'll need this to test the summarizer


async def parser_run() -> None:
    """
    Ask the parser agent to parse the previously downloaded test paper.

    The agent's reply is stored in the module-level ``test_paper_text`` and
    printed.
    """
    # Without this declaration the assignment below would only create a local
    # variable and the module-level test_paper_text would stay empty.
    global test_paper_text

    response = await parser_agent.on_messages(
        [TextMessage(content="Parse this paper: papers/CodePori:_Large-Scale_System_for_Autonomous_Software_Development_Using_Multi-Agent_Technology.pdf", source="user")],
        cancellation_token=CancellationToken(),
    )

    test_paper_text = response.chat_message.content
    print(test_paper_text)



# Use asyncio.run(parser_run()) when running in a script.
await parser_run()




In [28]:
#@title Summarizer agent


# The summarizer agent has no tools and runs on model_client; its system
# message asks for a summary of the given paper text in at most 500 words.
summarizer_agent = AssistantAgent(
    name="summarizer",
    model_client=model_client,
    system_message="""
        You are a qualified and experience reviewer and critic of scientific papers.
        You will be given a paper text and you will summarize it as your response.
        You must fit the summarization in 500 words.
    """
)


# CHECK IF SUMMARIZING WORKS

async def summarizer_run() -> None:
    response = await summarizer.on_messages(
        [TextMessage(content=test_paper_text, source="user")],
        cancellation_token=CancellationToken(),
    )


    #print(response.inner_messages)
    #print(response.chat_message)
    print(response.chat_message.content)



# Use asyncio.run(assistant_run()) when running in a script.
await parser_run()


APIStatusError: Error code: 413 - {'error': {'message': 'Request too large for model `llama3-70b-8192` in organization `org_01jjek2v3wfwzrcnqevn2ry6gt` service tier `on_demand` on tokens per minute (TPM): Limit 6000, Requested 22759, please reduce your message size and try again. Visit https://console.groq.com/docs/rate-limits for more information.', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}