<a href="https://colab.research.google.com/github/shakirjameel/ZS-LATO-Hackathon/blob/main/LATO_Hackathon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Nothing Fancy Here!**

In [1]:
%%capture --no-stderr
%pip install -U --quiet langchain-community tiktoken langchain-openai langchainhub chromadb langchain langgraph langchain-text-splitters
%pip install --upgrade --quiet faiss-cpu
%pip install --upgrade --quiet pypdf
%pip install --upgrade --quiet rapidocr-onnxruntime
%pip install langchain_chroma langchain_openai
%pip install arxiv
%pip install autogen
%pip install openai

In [2]:
import json
import os
import re
from typing import Annotated

import arxiv
import autogen
import openai
from autogen import AssistantAgent, UserProxyAgent
from autogen import ConversableAgent
from langchain_community.document_loaders import PyPDFLoader
from openai import OpenAI



**Add your OpenAI API key in this cell**

In [None]:
# Prefer the OPENAI_API_KEY environment variable so the secret never has to be
# saved inside the notebook; otherwise replace the placeholder below by hand.
api_key = os.environ.get("OPENAI_API_KEY", "<ENTER KEY HERE>")
# Model/key entry handed to the LLM-backed agents defined further down.
llm_config = {"model": "gpt-4", "api_key": api_key}
# assistant = AssistantAgent("assistant", llm_config=llm_config)

**Everything related to fetching research papers from arXiv**

In [7]:
def summarize_page(page_content, page_number, article_title):
    """
    Ask OpenAI to isolate the abstract from a single page of an arXiv article.

    Parameters:
    - page_content (str): The text content of the page holding the abstract.
    - page_number (int): The page number, used in the prompt and the progress log.
    - article_title (str): The title of the arXiv article.

    Returns:
    - str: The text of the model's reply (``choices[0].message.content``).

    Notes:
    - The API key is read from the module-level ``api_key`` variable; it is not
      a parameter of this function.
    """
    print(f"Extracting abstract in page {page_number} of article {article_title}")
    client = OpenAI(api_key=api_key)

    # Adjacent string literals are joined verbatim, so every fragment carries
    # its own closing punctuation and space; without them the prompt read
    # "...the abstract of this articleHere is the content...".
    prompt = (
        f"This is page {page_number} from an article titled '{article_title}' from arXiv. "
        f"This page of the paper contains the abstract and maybe more content. "
        f"Please isolate the abstract of this article. "
        f"Here is the content of page {page_number}:\n\n{page_content}\n\n"
    )

    response = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model="gpt-4",
    )

    # Only the first choice is used; its message text is the extracted abstract
    return response.choices[0].message.content

def remove_invalid_characters(input_string, pattern):
    """Delete every character of ``input_string`` that ``pattern`` does not allow.

    ``pattern`` is spliced into a negated character class (``[^...]``), so it
    is interpreted as the *contents* of a regex character class, for example
    ``'a-zA-Z0-9'``. Every match of that negated class is removed.
    """
    disallowed = re.compile('[^' + pattern + ']')
    return disallowed.sub('', input_string)

def get_title_from_url(url):
    """Look up the title of an arXiv paper from its URL.

    Parameters:
    - url (str): A URL whose last path segment is the arXiv ID, e.g. the
      ``source`` metadata that ``extract_text_from_docs`` passes in.

    Returns:
    - str: The title reported by the arXiv API, or the extracted arXiv ID
      itself when the API yields no result for it.
    """
    # The ID is the last path segment; tolerate a trailing slash or ".pdf" suffix
    arxiv_id = url.rstrip('/').split('/')[-1]
    if arxiv_id.lower().endswith('.pdf'):
        arxiv_id = arxiv_id[:-len('.pdf')]

    client = arxiv.Client()
    search = arxiv.Search(id_list=[arxiv_id])

    # An ID search yields at most one paper; None means nothing was found
    paper = next(client.results(search), None)
    if paper is None:
        # Fall back to the ID so callers still receive a usable string
        # instead of crashing on ``None.title``
        return arxiv_id
    return paper.title

def extract_text_from_docs(docs_list):
    """Build one ``{"article_name", "article_content"}`` record per loaded paper.

    For each paper (a list of page Documents) the title is looked up from the
    first page's ``source`` metadata, and the first page whose text mentions
    "abstract" (case-insensitive) is sent to ``summarize_page``.

    Parameters:
    - docs_list (list): One list of page Documents per paper.

    Returns:
    - list[dict]: One dict per non-empty paper with keys ``article_name`` and
      ``article_content``; the content is an empty string when no page
      mentions "abstract". Papers with no pages are skipped.
    """
    # NOTE(review): spliced into ``[^...]`` by remove_invalid_characters this
    # becomes ``[^^[a-zA-Z0-9_-]+$]``, which requires a literal "]" after the
    # end-of-string anchor and therefore never matches -- both calls below
    # return their input unchanged. Kept as is on purpose: a working filter
    # with this character set would strip spaces and punctuation from titles
    # and summaries. Decide on the intended character set before changing it.
    allowed = '^[a-zA-Z0-9_-]+$'

    records = []
    for docs in docs_list:
        if not docs:
            # A PDF that produced no pages has no metadata to look up; skip it
            continue

        name = get_title_from_url(docs[0].metadata['source'])
        name = remove_invalid_characters(name, allowed)

        full_summary = ""
        for doc in docs:
            if 'abstract' not in doc.page_content.lower():
                continue
            # Only the first page mentioning the abstract is summarized
            full_summary = summarize_page(doc.page_content, doc.metadata['page'], name) + "\n"
            break

        full_summary = remove_invalid_characters(full_summary, allowed)
        records.append({"article_name": name, "article_content": full_summary})
    return records

def load_pdfs(urls):
    """Load every PDF URL and reduce the result to per-article records.

    Each URL is loaded with ``PyPDFLoader(url, extract_images=True)``; the
    per-paper page lists are then handed to ``extract_text_from_docs``, whose
    return value is returned unchanged.
    """
    print("Loading arXiv Research Papers ...")
    pages_per_paper = [PyPDFLoader(link, extract_images=True).load() for link in urls]
    return extract_text_from_docs(pages_per_paper)

# [{"article_name":"name", "article_content": "content"}]
def search_papers(query, max_results=2):
    """Search arXiv and return records for the top matches as a JSON string.

    Parameters:
    - query (str): Search query passed to ``arxiv.Search``.
    - max_results (int): Maximum number of search results to request.
      Defaults to 2.

    Returns:
    - str: JSON encoding of the list returned by ``load_pdfs``.
    """
    # Construct the default API client.
    client = arxiv.Client()
    print("Searching arXiV Papers ...")
    # Results are ranked by relevance to the query, not by publication date
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.Relevance,
    )
    # Keyed by title, so two hits sharing a title collapse to a single PDF URL
    pdf_url_by_title = {paper.title: paper.pdf_url for paper in client.results(search)}
    docs = load_pdfs(pdf_url_by_title.values())
    print("Ready for summarization!")
    return json.dumps(docs)

In [8]:
# docs = search_papers("llms for cooking")
# print(docs)

**[WIP] This still needs work**

In [16]:
# Competency statements for the "associate" role. The text is interpolated
# verbatim into the article_fetcher system message as extra context for
# summaries, so it is prompt content rather than documentation.
associate = """
Statement:Has skills to participate in the problem solving process for structured problems
Additional Statements:
Understands how different problem solving approaches could provide insight
Creates and executes work in a careful and comprehensive way (code, test scripts, plans, SOPs; Runbook, etc.)
Statement:Able to implement technical specifications developed by others
Additional Statements:
Understands business requirements and specifications and implements them per technical/functional specifications or creates requirements documentation, test cases or scenarios
Correctly understands and implements technical requirements, and
application architectures that were
designed by others
Under supervision, can develop artifacts such as wireframe diagrams, demo sites,click-through demos, etc
Statement:Uses existing tools and group knowledge
Additional Statements:
Learns about tools, reusable components, and infrastructure to deliver results on the project
Understands and uses best practices and standards (business process, design, QA, test, application support)
Understands which tools and processes to use to execute role and resolve common subject area issues and problems that arise
Statement:Develops an understanding of the client organization and industry
Additional Statements:
Learns about client’s products, markets, competitors and key issues through interaction with the project team and additional research
Learns and applies knowledge about client’s products, markets,competitors and key issues through interaction with the project team and additional research
Develops an understanding of key trends in the client’s industry
Can work effectively with databases, SQL, and other core infrastructure elements, as relevant
Statement:Demonstrates the aptitude to develop and apply expertise
Additional Statements:
Demonstrates adaptability to evolve expertise by learning new areas of technologies, domain & delivery methodology
Contributes ideas to improve or develop new techniques or tools
Demonstrates intellectual curiosity in approach to executing analyses
Applies rigorous design and coding practices on projects
"""

# Competency statements for the "consultant" role. The text is interpolated
# verbatim into the article_fetcher system message as extra context for
# summaries, so it is prompt content rather than documentation.
consultant = """
Statement:Gains credibility through knowledge of topic area and confident presentation of content
Additional Statements:
Presents a compelling viewpoint to clients/team, supported by input from the project manager or Principal
Interacts comfortably and effectively with the project contact and individuals supporting him/her
Keeps the audience focused on the objective and minimizes divergences
Statement:Shapes and delivers well structured and compelling written materials to support project, team and client communication
Additional Statements:
Writes client-facing or client-directed communication to appropriate audience that emphasizes key messages and links project work to relevant client business issues
Structures written communication to ensure clear direction regarding team member roles, action items, key milestones, methodology decisions, etc.
Provides guidance and coaching to team members regarding appropriate written communication objectives / message, audience, channel, tone, etc.
Helps define and document what good written deliverables look like
Develops documentation/diagrams to systematically convey system architecture to clients (where relevant)
Develops well-written effective & visually compelling storyboards and materials (e.g. executive summaries, deck that convey appropriate structure, articulate key points, and guide team in execution of research, analysis, synthesis)
Structures communication effectively in a logical manner following best practice guidelines (e.g., Pyramid Principle)
Communicates complex information in a meaningful, easy to understand way
Synthesizes project findings into key insights /recommendations
Provides appropriate backup materials to support points and recommendations
Statement:Solves structured problems
Additional Statements:
Conceptualizes the client or technical issue and works variously with the solution architect to develop approach; makes good testing choices to cover scenarios and functionality
Leverages previous ZS projects
Is comfortable with ambiguity
Commits to identifying solutions and does not make premature judgments about what will and will not work
Statement: Applies problem solving frameworks
Additional Statements:
Works within frameworks that the client is comfortable with
Has the business, operational and technical skills/credibility to specify solution or approach for the team
Has technical skills required to lead the project team in domain area
Knows when and where to prioritize efforts in design, model, testing or infrastructure for greatest value
Statement: Has a broad understanding of technology trends, key issues and challenges in the project
Additional Statements:
Understands how trends impact client organization(s)
Probes the client or information to obtain a full understanding of issues
Could have a issue-based technology discussion with a client, if prepared
Statement: Demonstrates expertise in a way that enhances project results
Additional Statements:
Demonstrates adaptability to evolve expertise by learning new areas of technologies
Is an expert in at least few technology/ tool areas and can resolve technofunctional issues
Advises and influences project design, solutions. Technology/ tools
Improves skills and knowledge of team members in area of technology expertise
Come up with technology solutions that are in-line with organization technology road map
"""

**Registering custom function to the agent**

In [None]:
def fetch_papers(
    query: Annotated[str, "Query string to fetch papers."]
) -> str:
    """Tool entry point: forward ``query`` to ``search_papers`` and return its JSON string."""
    return search_papers(query)

**Agent for fetching information and summarization**

In [17]:
# LLM-backed agent that builds the fetch_papers query from the user's request
# and summarizes the returned articles for the requested role.
article_fetcher = ConversableAgent(
    name="article_fetcher",
    # Adjacent string literals are joined verbatim, so each sentence carries
    # its own trailing space; without it the prompt fused sentences together
    # ("...a consultant.If you cant find...").
    system_message="You are a helpful AI assistant. "
    "Roles can be of the type associate, and a consultant. "
    "If you cant find any of the roles in the question, ask the user for the role and mention that if the user has already shared the role, highlight that you can only summarize articles for role mentioned in this prompt. "
    "You can help with fetching articles by creating the query based on user input and passing it to the registered function fetch papers. "
    f"If the role is associate use this as an additional context for your summary: {associate}"
    f"If the role is consultant use this as an additional context for your summary: {consultant}"
    "Return the article_content as a detailed summary"
    # "Return 'TERMINATE' when the task is done."
    ,
    human_input_mode="ALWAYS",
    llm_config={"config_list": [llm_config]},
)
# Advertise fetch_papers to the model as a callable tool; the executing side
# is registered on the user proxy agent.
article_fetcher.register_for_llm(name="fetch_papers", description="arXiV paper fetcher")(fetch_papers)


**User Proxy Agent**

In [None]:
# Stand-in for the human user: it has no LLM of its own, talks to the
# assistant agent, and is the side that actually runs tool calls.
user_proxy = ConversableAgent(
    name="User",
    human_input_mode="NEVER",
    llm_config=False,
    # Stop once a message with non-None content contains "TERMINATE"
    is_termination_msg=lambda msg: (
        "TERMINATE" in msg["content"] if msg.get("content") is not None else False
    ),
)
# Make this agent the executor of the fetch_papers tool.
user_proxy.register_for_execution(name="fetch_papers")(fetch_papers)

In [None]:
# from autogen import register_function

# # Register the fetch_papers function to the two agents.
# register_function(
#     fetch_papers,
#     caller=article_fetcher,  # The assistant agent can suggest calls to fetch_papers.
#     executor=user_proxy,  # The user proxy agent can execute the fetch_papers calls.
#     name="fetch_papers",  # By default, the function name is used as the tool name.
#     description="arXiV paper fetcher",  # A description of the tool.
# )

**This runs the agentic flow**

In [20]:
# Start the conversation: user_proxy sends the opening request to
# article_fetcher, and the value returned by initiate_chat is kept in chat_result.
chat_result = user_proxy.initiate_chat(article_fetcher, message="Give me the latest Gen AI Articles? summarize it for an associate")



User (to article_fetcher):

Give me the latest Gen AI Articles? summarize it for an associate

--------------------------------------------------------------------------------
Replying as article_fetcher. Provide feedback to User. Press enter to skip and use auto-reply, or type 'exit' to end the conversation: 

>>>>>>>> NO HUMAN INPUT RECEIVED.

>>>>>>>> USING AUTO REPLY...
article_fetcher (to User):

***** Suggested tool call (call_fzQ3OtRoauzpXgv4fcK7NiIv): fetch_papers *****
Arguments: 
{
  "query": "Latest General AI Articles"
}
*****************************************************************************

--------------------------------------------------------------------------------

>>>>>>>> EXECUTING FUNCTION fetch_papers...
searching...
Loading papers...
extracting abstract in page 0 of article On the Combination of AI and Wireless Technologies: 3GPP Standardization Progress
extracting abstract in page 0 of article Landscape of Generative AI in Global News: Topics, Sentiments

**SCRATCH CODE, PLEASE IGNORE**

In [None]:
# Scratch variant of the fetching agent: an AssistantAgent whose system prompt
# asks for an "Original Articles" / "Article Summary" layout and a final
# TERMINATE reply.
research_paper_fetcher = AssistantAgent(
    name="research_paper_fetcher",
    llm_config=llm_config,
    human_input_mode="NEVER",
    system_message=
    """Assistant to fetch papers from the registered funtion fetch_papers and summarize the articles based on the role mentioned in the question.
    Roles can be of the type associate, and a consultant.
    If you cant find any of the roles in the question, ask the user for the role and mention that if the user has already shared the role, highlight that you can only summarize articles for role mentioned in this prompt.
    While returning the results, use the format -
    Original Articles: Name of the article in bold followed by the orignal content returned by the registered function fetch_papers in a separate line
    Article Summary: Name of the article in bold follwoed by the summary of the article
    Reply TERMINATE when the task is done.""",
)


In [None]:
# Scratch: register fetch_papers as a tool in one call, with
# research_paper_fetcher as the caller and user_proxy as the executor.
autogen.agentchat.register_function(
    fetch_papers,
    caller=research_paper_fetcher,
    executor=user_proxy,
    name="fetch_papers",
    description="Function to fetch data from arXiv",
)



In [None]:
# Scratch: start a chat from user_proxy to the research_paper_fetcher agent.
user_proxy.initiate_chat(research_paper_fetcher, message = "Gen AI Articles for a consultant")

user_proxy (to research_paper_fetcher):

Gen AI Articles for a consultant

--------------------------------------------------------------------------------
research_paper_fetcher (to user_proxy):

Sure. Let me fetch the articles for the role "consultant" and summarize them.

--------------------------------------------------------------------------------
Replying as user_proxy. Provide feedback to research_paper_fetcher. Press enter to skip and use auto-reply, or type 'exit' to end the conversation: 

>>>>>>>> NO HUMAN INPUT RECEIVED.

>>>>>>>> USING AUTO REPLY...
user_proxy (to research_paper_fetcher):



--------------------------------------------------------------------------------
research_paper_fetcher (to user_proxy):

I am sorry but I seem to have encountered a problem in fetching the articles. Could you please try again later?

--------------------------------------------------------------------------------


KeyboardInterrupt: Interrupted by user

In [None]:
# Summarize the latest Gen AI papers for a principal/ consultant
# User Proxy -> Paper Fetcher -> Summarizer (Regex Magic) -> Result

In [None]:
# summarizer = AssistantAgent(
#     name="summarizer",
#     llm_config=llm_config,
#     system_message = """
#         You are a professional writer of summaries of articles, known for
#         your insightful and engaging summaries.
#         You transform complex concepts into compelling narratives.
#         Reply "TERMINATE" in the end when everything is done.
#     """,
# )

In [None]:
# chat_results = user_proxy.initiate_chats(
#     [
#         {
#             "recipient": research_paper_fetcher,
#             "message": summary_tasks[0],
#             "clear_history": True,
#             "silent": False,
#             "summary_method": "last_msg",
#         },
#         {
#             "recipient": summarizer,
#             "message": summary_tasks[1],
#             "summary_method": "reflection_with_llm",
#         },
#         {
#             "recipient": writer,
#             "message": writing_tasks[0],
#             "carryover": "I want to include a figure or a table of data in the blogpost.",
#         },
#     ]
# )

In [None]:
# summary_tasks = [
#     """What are the latest articles in Gen AI?""",
#     """Summarize the content for a consultant"""

# ]

# writing_task = [
#     """Develop a personalized summary of the articles which are relavant to the competancy of the role mentioned"""
# ]

In [None]:
# chat_results = user_proxy.initiate_chats(
#     [
#         {
#             "recipient": research_paper_fetcher,
#             "message": summary_tasks[0],
#         },
#         {
#             "recipient": summarizer,
#             "message": summary_tasks[1],
#             "summary_method": "reflection_with_llm",
#         },
#         {
#             "recipient": writer,
#             "message": writing_tasks[0],
#             "carryover": "I want to include a figure or a table of data in the blogpost.",
#         },
#     ]
# )