学术评价网站
输入一篇文章后，列出所有引用这篇文章的文献，以及这些文献评价该文章的话。
使用大模型总结文献对该文章的评价。

In [1]:
import os
import requests
import re
import pandas as pd
from typing import Generator, Union
# Semantic Scholar API key, read from the S2_API_KEY environment variable.
# Falls back to an empty string when the variable is not set.
S2_API_KEY = os.environ.get('S2_API_KEY', '')

# Transfer PaperID
按标题检索论文，并取得 Semantic Scholar API 可直接使用的 paperId。

In [2]:
def print_papers(papers):
    """Print one numbered line per paper: index, title, authors and URL.

    Each paper is a mapping with 'title', 'authors' (a list of mappings that
    have a 'name') and 'url'. Author names are joined with commas.
    """
    for position, entry in enumerate(papers):
        title = entry['title']
        author_names = ','.join(person['name'] for person in entry['authors'])
        link = entry['url']
        print(f"{position}  {title} || {author_names} || {link}")

def find_paper_by_title(query=None, result_limit=10):
    """Search Semantic Scholar for papers and let the user pick one result.

    Args:
        query: Search text. When empty or None the user is prompted for it.
        result_limit: Maximum number of search hits to request and list.

    Returns:
        The selected entry of the response's 'data' list (requested fields:
        authors, title, url).

    Raises:
        requests.HTTPError: If the search request returns an error status.
    """
    papers = None
    while not papers:
        if not query:
            query = input('Find papers about what: ')
        if not query:
            continue

        rsp = requests.get('https://api.semanticscholar.org/graph/v1/paper/search',
                           headers={'X-API-KEY': S2_API_KEY},
                           params={'query': query, 'limit': result_limit, 'fields': 'authors,title,url'})
        rsp.raise_for_status()
        results = rsp.json()
        total = results.get('total')
        papers = results.get('data')
        if not total or not papers:
            print('No matches found. Please try another query.')
            # Forget the failed query so the next iteration prompts again
            # instead of re-sending the same request forever.
            query = None
            papers = None
            continue

        print(f'Found {total} results. Showing up to {result_limit}.')
        print_papers(papers)

    # Keep asking until the answer is a non-negative integer that indexes an
    # actually listed paper; an out-of-range number used to raise IndexError.
    while True:
        selection = input('Select a paper # to base recommendations on: ')
        if re.fullmatch(r'\d+', selection) and int(selection) < len(papers):
            return papers[int(selection)]
        print(f'Please enter a number between 0 and {len(papers) - 1}.')

# Interactive demo: prompts for a query and a selection. Being the last
# expression of the cell, the returned paper record is shown as cell output.
find_paper_by_title()

Found 2 results. Showing up to 10.
0  Evaluating GPT and BERT models for protein–protein interaction identification in biomedical text || Hasin Rehana,Nur Bengisu Çam,Mert Basmaci,Jie Zheng,Christianah Jemiyo,Yongqun He,Arzucan Özgür,J. Hur || https://www.semanticscholar.org/paper/50a3f0dd12114fb2ca90a5511a6325524c3f6013
1  Evaluation of GPT and BERT-based models on identifying proteinprotein interactions in biomedical text || Hasin Rehana,Nur Bengisu Çam,Mert Basmaci,Y. He,Arzucan Özgür,J. Hur || https://www.semanticscholar.org/paper/1ad9a295ea841599383e5ae3e88381438d4a7db3


{'paperId': '50a3f0dd12114fb2ca90a5511a6325524c3f6013',
 'url': 'https://www.semanticscholar.org/paper/50a3f0dd12114fb2ca90a5511a6325524c3f6013',
 'title': 'Evaluating GPT and BERT models for protein–protein interaction identification in biomedical text',
 'authors': [{'authorId': '35668704', 'name': 'Hasin Rehana'},
  {'authorId': '2214609905', 'name': 'Nur Bengisu Çam'},
  {'authorId': '2180883959', 'name': 'Mert Basmaci'},
  {'authorId': '2315466736', 'name': 'Jie Zheng'},
  {'authorId': '2321307305', 'name': 'Christianah Jemiyo'},
  {'authorId': '2249526854', 'name': 'Yongqun He'},
  {'authorId': '2113900183', 'name': 'Arzucan Özgür'},
  {'authorId': '1747690', 'name': 'J. Hur'}]}

# Get Citations
返回引用了指定论文的所有文献。

In [3]:
def get_citation_edges(**req_kwargs):
    """Yield every element of a paged API listing, fetching page by page.

    All keyword arguments are forwarded to ``requests.get``. The 'limit' and
    'offset' query parameters are set here and override any values supplied
    in ``params``.

    Yields:
        Each element of the 'data' list of every page, in order.

    Raises:
        requests.HTTPError: If any page request returns an error status.
    """
    page_size = 1000
    # Work on a private copy so the caller's own ``params`` dict is never
    # modified; this also tolerates ``params=None``.
    params = dict(req_kwargs.get('params') or {})
    params['limit'] = page_size
    req_kwargs['params'] = params

    offset = 0
    while True:
        params['offset'] = offset
        rsp = requests.get(**req_kwargs)
        rsp.raise_for_status()

        page = rsp.json()["data"]
        yield from page

        if len(page) < page_size:
            break  # a short page means there are no more pages
        offset += page_size

def get_paper(paper_id):
    """Fetch the title and authors of one paper from the Semantic Scholar Graph API.

    Args:
        paper_id: Identifier placed in the request path (the notebook passes
            a 'DOI:...' string).

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the request returns an error status.
    """
    response = requests.get(
        f'https://api.semanticscholar.org/graph/v1/paper/{paper_id}',
        headers={'X-API-KEY': S2_API_KEY},
        params={'fields': 'title,authors'},
    )
    response.raise_for_status()
    return response.json()


def get_citations(paper_id):
    """Return the 'citingPaper' record of every citation edge of ``paper_id``.

    Requests the fields title, authors, isOpenAccess and openAccessPdf and
    collects all pages into a single list.
    """
    request_args = {
        'url': f'https://api.semanticscholar.org/graph/v1/paper/{paper_id}/citations',
        'headers': {'X-API-KEY': S2_API_KEY},
        'params': {'fields': 'title,authors,isOpenAccess,openAccessPdf'},
    }
    return [edge['citingPaper'] for edge in get_citation_edges(**request_args)]

In [10]:
# Look up a single paper by DOI and print its title/author record.
print(get_paper('DOI:10.18653/v1/N18-3011'))

{'paperId': '649def34f8be52c8b66281af98ae884c09aef38b', 'title': 'Construction of the Literature Graph in Semantic Scholar', 'authors': [{'authorId': '145585097', 'name': 'Bridger Waleed Ammar'}, {'authorId': '3458736', 'name': 'Dirk Groeneveld'}, {'authorId': '1857797', 'name': 'Chandra Bhagavatula'}, {'authorId': '46181066', 'name': 'Iz Beltagy'}, {'authorId': '46230609', 'name': 'Miles Crawford'}, {'authorId': '145612610', 'name': 'Doug Downey'}, {'authorId': '38092776', 'name': 'Jason Dunkelberger'}, {'authorId': '143718836', 'name': 'Ahmed Elgohary'}, {'authorId': '46411828', 'name': 'Sergey Feldman'}, {'authorId': '4480314', 'name': 'Vu A. Ha'}, {'authorId': '143967880', 'name': 'Rodney Michael Kinney'}, {'authorId': '41018147', 'name': 'Sebastian Kohlmeier'}, {'authorId': '46258841', 'name': 'Kyle Lo'}, {'authorId': '144240185', 'name': 'Tyler C. Murray'}, {'authorId': '46256862', 'name': 'Hsu-Han Ooi'}, {'authorId': '39139825', 'name': 'Matthew E. Peters'}, {'authorId': '395613

In [4]:
# Fetch every paper citing the given DOI and show the raw list of records,
# then load the records into a DataFrame and show that as well.
citation1 = get_citations('DOI:10.18653/v1/N18-3011')
display(citation1)
citation_df = pd.DataFrame(citation1)
display(citation_df)

[{'paperId': 'db4e8d662dbe80f3ddf78e69b1c1053500894d25',
  'title': 'A Database of Stress-Strain Properties Auto-generated from the Scientific Literature using ChemDataExtractor',
  'isOpenAccess': False,
  'openAccessPdf': None,
  'authors': [{'authorId': '2193266278', 'name': 'Pankaj Kumar'},
   {'authorId': '2331845884', 'name': 'Saurabh Kabra'},
   {'authorId': '2164374024', 'name': 'Jacqueline M. Cole'}]},
 {'paperId': '50a3f0dd12114fb2ca90a5511a6325524c3f6013',
  'title': 'Evaluating GPT and BERT models for protein–protein interaction identification in biomedical text',
  'isOpenAccess': True,
  'openAccessPdf': None,
  'authors': [{'authorId': '35668704', 'name': 'Hasin Rehana'},
   {'authorId': '2214609905', 'name': 'Nur Bengisu Çam'},
   {'authorId': '2180883959', 'name': 'Mert Basmaci'},
   {'authorId': '2315466736', 'name': 'Jie Zheng'},
   {'authorId': '2321307305', 'name': 'Christianah Jemiyo'},
   {'authorId': '2249526854', 'name': 'Yongqun He'},
   {'authorId': '21139001

Unnamed: 0,paperId,title,isOpenAccess,openAccessPdf,authors
0,db4e8d662dbe80f3ddf78e69b1c1053500894d25,A Database of Stress-Strain Properties Auto-ge...,False,,"[{'authorId': '2193266278', 'name': 'Pankaj Ku..."
1,50a3f0dd12114fb2ca90a5511a6325524c3f6013,Evaluating GPT and BERT models for protein–pro...,True,,"[{'authorId': '35668704', 'name': 'Hasin Rehan..."
2,5eafc35cedbb28b033009947ca73b40007b2b407,Paper Copilot: A Self-Evolving and Efficient L...,False,,"[{'authorId': '2301577462', 'name': 'Guanyu Li..."
3,4772df95a893061e0fedc9a09c56f95d8926fb9d,Temporal Graph Neural Network-Powered Paper Re...,False,,"[{'authorId': '2284397773', 'name': 'Junhao Sh..."
4,036fd8b92722023742dc7fceb19a2ea1d56828de,Harvesting Textual and Structured Data from th...,False,,"[{'authorId': '2313732350', 'name': 'Francis K..."
...,...,...,...,...,...
374,463e73debcc2c8d4ee3e79e60be0d9b75e59fc84,Gender-Fair (Machine) Translation,False,,"[{'authorId': '2184245750', 'name': 'Manuel La..."
375,22e57a57f113852cba770f20ee87cbbb01b27111,The changing landscape of text mining - a revi...,False,,"[{'authorId': '8035829', 'name': 'M. Farrell'}..."
376,1dea91742e80f2f3cb2a42224ec300ceac359f4a,The Development and Deployment of Large Langua...,False,,"[{'authorId': '2265945051', 'name': 'Hongjian ..."
377,4a4f4277936b35ef033ab64c57f7cc848ca90aae,Leveraging multiple control codes for aspect-c...,False,,"[{'authorId': '2186557380', 'name': 'Kehan Lon..."


In [5]:
# Drop citing papers without an 'openAccessPdf' value and preview the first five.
citation_df.dropna(subset=['openAccessPdf']).head(5)

Unnamed: 0,paperId,title,isOpenAccess,openAccessPdf,authors
11,e11c67919830bb0a29a86b05936d467227accc81,openalexR: An R-Tool for Collecting Bibliometr...,True,{'url': 'https://journal.r-project.org/article...,"[{'authorId': '2248919040', 'name': 'Massimo A..."
12,c5a528afd98274902b4987b887f19ecd282ca8bd,BioBBC: a multi-feature model that enhances th...,True,{'url': 'https://www.nature.com/articles/s4159...,"[{'authorId': '1753416672', 'name': 'Hind Alam..."
13,ce2beb2ad9efa56abdc59f4c613bda52cc11803c,Surveying biomedical relation extraction: a cr...,True,{'url': 'https://academic.oup.com/bib/article-...,"[{'authorId': '47451980', 'name': 'Ming-Siang ..."
16,1571d11e5c22d0bdefce3f6ffa333f040a2e409d,MetaTron: advancing biomedical annotation empo...,True,{'url': 'https://bmcbioinformatics.biomedcentr...,"[{'authorId': '2051750606', 'name': 'Ornella I..."
17,61313651e06abb093dfa3fd4430a1f5b146d5b35,The geography of eco-innovations and sustainab...,True,{'url': 'https://www.degruyter.com/document/do...,"[{'authorId': '2085072010', 'name': 'Hendrik H..."


# Download Papers

In [20]:
def get_paper_forPdf(session: requests.Session, paper_id: str, fields: str = 'paperId,title', **kwargs) -> dict:
    """Fetch one paper record from the Semantic Scholar Graph API via ``session``.

    Args:
        session: Session used to send the request.
        paper_id: Identifier placed in the request path.
        fields: Comma-separated field list sent as the 'fields' query parameter.
        **kwargs: Extra query parameters merged into the request.

    Returns:
        The decoded JSON body, which is also printed to stdout.

    Raises:
        requests.HTTPError: If the request returns an error status.
    """
    query = {'fields': fields}
    query.update(kwargs)
    auth_headers = {'X-API-KEY': S2_API_KEY}
    endpoint = f'https://api.semanticscholar.org/graph/v1/paper/{paper_id}'

    with session.get(endpoint, params=query, headers=auth_headers) as response:
        response.raise_for_status()
        payload = response.json()
        print(payload)
        return payload

def download_pdf(session: requests.Session, url: str, path: str, user_agent: str = 'requests/2.0.0', timeout: float = 30.0):
    """Stream the resource at ``url`` into the file ``path``.

    The body is written to ``<path>.part`` first and moved onto ``path`` only
    after the whole stream has been received, so an interrupted download
    never leaves a truncated file at ``path``.

    Args:
        session: Session used to send the request.
        url: Address of the file to download.
        path: Destination file path.
        user_agent: Value of the 'user-agent' request header.
        timeout: Timeout in seconds passed to the request, so that an
            unresponsive server cannot block forever.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        Any exception raised while streaming or writing is re-raised after
        the partial file has been removed.
    """
    # send a user-agent to avoid server error
    headers = {
        'user-agent': user_agent,
    }
    partial_path = f'{path}.part'

    # NOTE(review): verify=False disables TLS certificate verification. It is
    # kept to preserve existing behaviour, but should be revisited.
    # stream the response to avoid downloading the entire file into memory
    with session.get(url, headers=headers, stream=True, verify=False, timeout=timeout) as response:
        # check if the request was successful
        response.raise_for_status()

        try:
            with open(partial_path, 'wb') as f:
                # write the response to the file, chunk_size bytes at a time
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
            os.replace(partial_path, path)
        except BaseException:
            # Also covers KeyboardInterrupt: never leave a half-written file.
            try:
                os.remove(partial_path)
            except FileNotFoundError:
                pass
            raise


def download_paper(session: requests.Session, paper_id: str, directory: str = 'papers', user_agent: str = 'requests/2.0.0') -> Union[str, None]:
    """Download the open-access PDF of one paper into ``directory``.

    Args:
        session: Session used for the metadata lookup and the download.
        paper_id: Identifier of the paper to look up.
        directory: Folder for the PDF; created if it does not exist.
        user_agent: User-agent string forwarded to ``download_pdf``.

    Returns:
        The path ``<directory>/<paperId>.pdf``, or None (after printing a
        message) when the paper is not open access or has no open-access
        PDF entry. An already existing file is not downloaded again.
    """
    metadata = get_paper_forPdf(session, paper_id, fields='paperId,isOpenAccess,openAccessPdf')

    # Guard clauses: nothing to download without an open-access PDF.
    if not metadata['isOpenAccess']:
        print(f'Paper {paper_id} is not open access.')
        return None

    if metadata['openAccessPdf'] is None:
        print(f'Paper {paper_id} does not have an open access pdf.')
        return None

    resolved_id: str = metadata['paperId']
    source_url: str = metadata['openAccessPdf']['url']
    target_path = os.path.join(directory, f'{resolved_id}.pdf')

    os.makedirs(directory, exist_ok=True)

    # Skip the network transfer when the file is already on disk.
    already_there = os.path.exists(target_path)
    if not already_there:
        download_pdf(session, source_url, target_path, user_agent=user_agent)

    return target_path


def download_papers(paper_ids: list[str], directory: str = 'papers', user_agent: str = 'requests/2.0.0') -> Generator[tuple[str, Union[str, None, Exception]], None, None]:
    """Download several papers lazily, yielding one result per paper ID.

    Yields:
        ``(paper_id, outcome)`` pairs, where ``outcome`` is the value returned
        by ``download_paper`` or the exception that the attempt raised.
    """
    # One shared session for all downloads (reuses the same TCP connection).
    with requests.Session() as http:
        for current_id in paper_ids:
            try:
                outcome = download_paper(http, current_id, directory=directory, user_agent=user_agent)
                yield current_id, outcome
            except Exception as exc:
                yield current_id, exc


# Demo: download the open-access PDF of one paper by its paper ID.
paperID = 'd449ec41cd3222f9f8b325adfacc8f4b9e7b95d3'
# The context manager closes the session once the download attempt is done,
# instead of leaving it open.
with requests.Session() as session:
    print(download_paper(session=session, paper_id=paperID))

{'paperId': 'd449ec41cd3222f9f8b325adfacc8f4b9e7b95d3', 'isOpenAccess': True, 'openAccessPdf': {'url': 'https://www.aclweb.org/anthology/W18-2306.pdf', 'status': 'HYBRID'}}
papers/d449ec41cd3222f9f8b325adfacc8f4b9e7b95d3.pdf


In [None]:
# Load the URLs: read the CSV file and take its 'url' column as a plain list.
urlDf = pd.read_csv('/Users/Stansfield/Library/CloudStorage/OneDrive-sjtu.edu.cn/24春季课程/深度学习/深度学习大作业/papers/citationURLs.csv')
urls = urlDf['url'].tolist()
display(urlDf)

# Download every PDF in a list of URLs.
def download_papers_from_urls(urls: list[str], directory: str, user_agent: str = 'requests/2.0.0', timeout: int = 10) -> Generator[tuple[str, Union[str, None], Union[str, Exception, None]], None, None]:
    """Download each URL to ``<directory>/paper_<n>.pdf`` and report the outcome.

    Args:
        urls: Addresses to download; files are numbered from 1 in this order.
        directory: Target folder, created if it does not exist.
        user_agent: Value of the session-wide 'user-agent' header.
        timeout: Timeout in seconds passed to ``download_pdf``.

    Yields:
        ``(url, filename, None)`` on success, ``(url, None, "Timeout")`` when
        the request timed out, and ``(url, None, exception)`` for any other
        failure.
    """
    # Create the target directory if needed (no separate existence check).
    os.makedirs(directory, exist_ok=True)

    # Use one Session to reuse the TCP connection.
    with requests.Session() as session:
        session.headers.update({'user-agent': user_agent})  # set the User-Agent
        for idx, url in enumerate(urls, 1):
            # File names are generated automatically from the position.
            filename = os.path.join(directory, f"paper_{idx}.pdf")
            try:
                print(f"Downloading {url} to {filename}...")
                # Pass timeout by keyword so it cannot land in another
                # positional slot of download_pdf.
                download_pdf(session, url, filename, timeout=timeout)
            except requests.exceptions.Timeout:
                print(f"Timeout occurred while downloading {url}. Skipping...")
                outcome = (url, None, "Timeout")
            except Exception as e:
                print(f"Failed to download {url}: {e}")
                outcome = (url, None, e)
            else:
                outcome = (url, filename, None)
            # Yield outside the try block so an exception thrown into the
            # generator by its consumer is not reported as a download failure.
            yield outcome

# Download a single PDF.
def download_pdf(session: requests.Session, url: str, filepath: str, timeout: Union[float, None] = 10, user_agent: Union[str, None] = None):
    """Stream the resource at ``url`` into the file ``filepath``.

    This definition replaces the earlier ``download_pdf`` in the notebook, so
    it also accepts the ``user_agent`` keyword that ``download_paper`` passes.
    The body is written to ``<filepath>.part`` and moved into place only after
    the full stream has been received, so a failed or interrupted download
    leaves no truncated file behind.

    Args:
        session: Session used to send the request.
        url: Address of the file to download.
        filepath: Destination file path.
        timeout: Timeout in seconds passed to the request.
        user_agent: Optional 'user-agent' header for this request; when None
            the session's own headers are used unchanged.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        Any exception raised while streaming or writing is re-raised after
        the partial file has been removed.
    """
    headers = {'user-agent': user_agent} if user_agent is not None else None
    partial_path = f'{filepath}.part'

    # Stream the response to avoid loading the whole file into memory.
    with session.get(url, headers=headers, stream=True, verify=True, timeout=timeout) as response:
        response.raise_for_status()  # check that the request succeeded
        try:
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):  # write in chunks
                    f.write(chunk)
            os.replace(partial_path, filepath)
        except BaseException:
            # Also covers KeyboardInterrupt: never leave a half-written file.
            try:
                os.remove(partial_path)
            except FileNotFoundError:
                pass
            raise

# Run the downloads and print one status line per URL.
for result in download_papers_from_urls(urls, directory='/Users/Stansfield/Library/CloudStorage/OneDrive-sjtu.edu.cn/24春季课程/深度学习/深度学习大作业/papers/', timeout=10):  # timeout of 10 seconds
    url, filepath, error = result
    print(f"Error downloading {url}: {error}" if error else f"Downloaded {url} to {filepath}")

    Unnamed: 0                                                url  status
0            0                     https://doi.org/10.3386/w32540     NaN
1            1  https://dl.acm.org/doi/pdf/10.1145/3677052.369...     NaN
2            2  https://www.econstor.eu/bitstream/10419/282537...     NaN
3            3                   https://arxiv.org/pdf/2403.06150     NaN
4            4  https://www.mdpi.com/2227-7099/12/3/59/pdf?ver...     NaN
..         ...                                                ...     ...
83          83                    http://arxiv.org/pdf/2205.04619     NaN
84          84                   https://arxiv.org/pdf/2208.06308     NaN
85          85                   https://arxiv.org/pdf/2212.03152     NaN
86          86  https://www.econstor.eu/bitstream/10419/246229...     NaN
87          87  https://www.coll.mpg.de/pdf_dat/2021_11online.pdf     NaN

[88 rows x 3 columns]
Downloading https://doi.org/10.3386/w32540 to /Users/Stansfield/Library/CloudStorage/OneD

KeyboardInterrupt: 

# Get Comments

In [7]:
# read pdf
import fitz

def read_pdf(pdf_path: str) -> str:
    """Return the text of every page of the PDF at ``pdf_path``, concatenated in page order."""
    with fitz.open(pdf_path) as document:
        return ''.join(page.get_text() for page in document)

# Extract the text of a PDF from the papers/ folder. Being the last
# expression of the cell, the resulting string is shown as cell output.
read_pdf('papers/d449ec41cd3222f9f8b325adfacc8f4b9e7b95d3.pdf')


'Proceedings of the BioNLP 2018 workshop, pages 47–55\nMelbourne, Australia, July 19, 2018. c⃝2018 Association for Computational Linguistics\n47\nOntology Alignment in the Biomedical Domain\nUsing Entity Deﬁnitions and Context\nLucy Lu Wang†, Chandra Bhagavatula, Mark Neumann,\nKyle Lo, Chris Wilhelm, and Waleed Ammar\nAllen Institute for Artiﬁcial Intelligence\n†Department of Biomedical Informatics and Medical Education, University of Washington\nSeattle, Washington, USA\nlucylw@uw.edu\nAbstract\nOntology alignment is the task of identi-\nfying semantically equivalent entities from\ntwo given ontologies. Different ontologies\nhave different representations of the same\nentity, resulting in a need to de-duplicate\nentities when merging ontologies. We pro-\npose a method for enriching entities in an\nontology with external deﬁnition and con-\ntext information, and use this additional\ninformation for ontology alignment. We\ndevelop a neural architecture capable of\nencoding the addition