In [47]:
!pip -q install openai arxiv PyMuPDF

In [48]:
import os
import openai
from google.colab import userdata
from enum import Enum
from typing import Union

from pydantic import BaseModel
from openai import OpenAI
import arxiv
import fitz

# Copy the Colab secret named 'OPENAI_KEY' into the process environment so the
# key is never hardcoded in the notebook. OpenAI() is later constructed without
# an explicit key, so it presumably picks this variable up — confirm.
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_KEY')


In [50]:


def download_single_paper(query: str):
    """
    Download the most relevant arXiv paper for a query as a PDF.

    The PDF is saved under ``./data/<query with spaces removed>/``; the
    directory is created if it does not exist.

    Args:
        query: Search string sent to arXiv.

    Returns:
        The path of the downloaded PDF as returned by ``download_pdf``, or
        ``None`` when the search yields no result.
    """
    search = arxiv.Search(
        query=query,
        max_results=1,  # We only want to retrieve one paper
        sort_by=arxiv.SortCriterion.Relevance
    )
    # Search.results() is deprecated; iterate the search through a Client.
    result = next(arxiv.Client().results(search), None)

    if result is None:
        print("No papers found for this query.")
        # Single None (not a tuple) so the return shape matches the success
        # path and a caller's `if file_path:` check behaves as expected.
        return None

    dir_path = os.path.join('./data', query.replace(' ', ''))
    os.makedirs(dir_path, exist_ok=True)
    file_path = result.download_pdf(dirpath=dir_path)
    print(f"Downloaded paper: {result.title}")
    return file_path


In [51]:
# Fetch the top arXiv hit for the query and keep the local path of its PDF.
file_path = download_single_paper("LLM")

  result = next(search.results(), None)


Downloaded paper: Large Language Models as Software Components: A Taxonomy for LLM-Integrated Applications


In [52]:

def parse_pdf(file_path: str) -> str:
    """
    Extract the text of every page of a PDF and return it as one string.

    Args:
        file_path: Path of the PDF to open.

    Returns:
        The text of all pages, concatenated in page order with no separator.
    """
    # The context manager closes the document even if extraction fails; the
    # original left the handle open.
    with fitz.open(file_path) as doc:
        # join() is consumed inside the `with`, while the document is still open.
        return "".join(page.get_text() for page in doc)

# Full text of the downloaded paper; sent to the model as the user message.
article = parse_pdf(file_path)



In [53]:
from enum import Enum
from pydantic import BaseModel
from typing import List, Optional

class SectionType(BaseModel):
    """
    Summaries of the main sections of an academic paper.

    Each field holds the summary text of the section it is named after.
    """
    abstract: str
    introduction: str
    methodology: str
    results: str
    conclusion: str

class PaperType(str, Enum):
    """
    Enumeration of academic paper types.

    This enum represents different categories of academic papers.
    """
    # The `str` mixin makes every member a string equal to its value; each
    # value deliberately mirrors the member name.
    research_article = "research_article"
    review_article = "review_article"
    conference_paper = "conference_paper"
    thesis = "thesis"
    dissertation = "dissertation"

class Author(BaseModel):
    """
    Represents an author of an academic paper.

    This model captures basic information about an author, including their name
    and affiliation.
    """
    name: str
    # Required field: no default is declared, so a value must always be supplied.
    affiliation: str

class PaperMetadata(BaseModel):
    """
    Represents metadata of an academic paper.

    This model captures basic metadata about an academic paper, including its title,
    authors, publication date, and journal.
    """
    title: str
    authors: List[Author]  # one Author entry per listed author
    publication_date: str  # plain string, not a parsed date (e.g. '13 Jun 2024')
    journal: str
    paper_type: PaperType  # must be one of the PaperType enum members

class AcademicPaperResponse(BaseModel):
    """
    Represents the structured response for an academic paper analysis.

    This model aggregates information about the paper's metadata and its section's summary.
    """
    # This class is passed as `response_format` to the chat completion call.
    # NOTE(review): docstrings of this model and its nested models presumably
    # end up in the schema sent to the API — confirm before rewording them.
    metadata: PaperMetadata
    sections: SectionType  # per-section summaries


In [54]:
# Ask the model for the paper's metadata and per-section summaries, with the
# reply constrained to the AcademicPaperResponse schema.
client = OpenAI()

completion = client.beta.chat.completions.parse(
    model="gpt-4o-2024-08-06",
    messages=[
        {
            "role": "system",
            # Adjacent literals instead of a backslash continuation inside the
            # string, which embedded the next line's indentation in the prompt.
            "content": (
                "You are a helpful assistant that scans for "
                "different sections of a research paper."
            ),
        },
        {"role": "user", "content": article},
    ],
    response_format=AcademicPaperResponse,
)

message = completion.choices[0].message

# Later cells dereference message.parsed; fail here with a clear message rather
# than with an AttributeError on None when no structured output came back
# (e.g. the model refused).
if message.parsed is None:
    raise RuntimeError(f"Model returned no structured output: {message.refusal}")



In [55]:
# Bare last expression so the notebook displays the extracted metadata.
message.parsed.metadata

PaperMetadata(title='Large Language Models as Software Components: A Taxonomy for LLM-Integrated Applications', authors=[Author(name='Irene Weber', affiliation='Kempten University of Applied Sciences, Germany')], publication_date='13 Jun 2024', journal='arXiv', paper_type=<PaperType.research_article: 'research_article'>)

In [56]:
# Iterating the sections model yields (field name, summary) pairs; print each
# section name as a heading followed by its summary.
for section_name, section_summary in message.parsed.sections:
    heading = section_name + ': \n'
    body = section_summary + '\n'
    print(heading)
    print(body)

abstract: 

Large Language Models (LLMs) have become widely adopted as tools for software engineering. This study presents a taxonomy to describe LLM-integrated applications, identifying essential dimensions for characterizing these systems. By analyzing recent applications, the taxonomy reveals the diverse integration methods and suggests a framework using feature vectors for visualization, aiming to advance the nascent field of LLM-integrated application engineering.

introduction: 

Large Language Models (LLMs) have significantly impacted fields such as medicine, law, marketing, and education, due to their proficiency in tasks like text understanding and code writing. The study explores LLMs as components in software systems, contrasting with traditional views of LLMs as software development tools, offering a taxonomy to frame LLM-integrated application engineering as an emerging research area.

methodology: 

The taxonomy was developed using a Design Science Research approach, appl