## Setup

In [None]:
!pip install S2query

Collecting S2query
  Downloading S2query-1.0.3.2-py3-none-any.whl (13 kB)
Installing collected packages: S2query
Successfully installed S2query-1.0.3.2


In [None]:
import urllib.error
import urllib.parse
import urllib.request

import numpy as np
import pandas as pd
import ipywidgets as widgets
from ipywidgets import Layout
from IPython.display import display, clear_output

## Semantic Scholar API

In [None]:
# @title SearcherAPI
from S2query import S2paperAPI
from multiprocessing import Pool

# Search interface using SS API
class SearcherAPI:
    """Notebook search UI on top of the Semantic Scholar API client ``S2paperAPI``.

    Calling the instance renders a query box, a paper-count field and a Search
    button. Once a search has run, two export buttons are displayed as well:

    * "Export Articles" looks up the work type of every DOI, fetches its BibTeX
      and writes the journal/proceedings articles to ``data_API_Articles.csv``.
    * "Export All" writes every result to ``data_API.csv``.

    Attributes:
        obj: the ``S2paperAPI`` client used to run queries.
        find: True once a search has completed; controls whether the export
            buttons are displayed.
        data: DataFrame of results that carry a DOI (set by ``search_button``).
        data_article: article-only subset (set by ``post_processing``).
    """

    # Paper fields requested from the API on every search.
    FIELDS = (
        "externalIds", "paperId", "url", "title", "abstract", "venue", "year",
        "referenceCount", "citationCount", "influentialCitationCount",
        "isOpenAccess", "fieldsOfStudy", "authors",
    )
    # Work types kept by "Export Articles".
    ARTICLE_TYPES = ("journal-article", "proceedings-article")
    # Seconds to wait for the DOI resolver before giving up on one record.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Create the API client and build all widgets (nothing is displayed yet)."""
        # Data fetching object
        self.obj = S2paperAPI()
        self.find = False

        # Export buttons (only displayed after a successful search)
        self.save_b = widgets.Button(description="Export Articles")
        self.save_b.on_click(self.post_processing)
        self.save_all_b = widgets.Button(description="Export All")
        self.save_all_b.on_click(self.save_data)

        # Query widgets
        self.search_b = widgets.Button(description="Search")
        self.search_text = widgets.Textarea(
            description=" : Query",
            value="'artificial intelligence'+'Deep Learning'-'Biology'",
            placeholder="Type something",
            layout=Layout(
                width="50%", height="60px", display="flex", flex_flow="row-reverse"
            ),
            disabled=False,
        )

        # NOTE(review): IntText does not appear to declare a `min` trait, so the
        # lower bound below is probably not enforced. BoundedIntText would
        # enforce it but also needs an explicit `max` - confirm the intended range.
        self.number_papers = widgets.IntText(
            value=100,
            min=2,
            style={"description_width": "initial"},
            layout=Layout(width="180px", display="flex", flex_flow="row-reverse"),
            description=" Number of papers",
            disabled=False,
        )

        # Layout containers
        self.rightbox = widgets.VBox(
            [self.search_text, self.number_papers, self.search_b]
        )
        self.save_h = widgets.HBox([self.save_b, self.save_all_b])

        self.search_b.on_click(self.search_button)

    def search_button(self, p):
        """Run the query typed in the text box and redraw the UI.

        Args:
            p: the clicked button, passed by ipywidgets (unused).
        """
        query = self.search_text.value
        print(query)

        # A fresh list is passed on every call so the client cannot alter FIELDS.
        self.obj.get(query, n=self.number_papers.value, fields=list(self.FIELDS))

        self.data = self.obj.all
        self.extract_info()

        self.find = True
        self()

    @staticmethod
    def _author_names(authors):
        """Return the ``name`` of every author record; pass non-lists through unchanged."""
        if not isinstance(authors, (list, tuple)):
            return authors
        return [author["name"] for author in authors]

    def extract_info(self):
        """Normalise ``self.data`` in place of the raw API result.

        * ``externalIds`` is renamed to ``doi`` and reduced to its ``"DOI"`` entry.
        * Rows whose DOI is missing or empty are dropped (the index is not reset).
        * ``authors`` is reduced from a list of records to a list of names.
        """
        frame = self.data.rename(columns={"externalIds": "doi"})
        # A paper may come back without any external ids at all; treat that as "no DOI".
        frame["doi"] = [
            ids.get("DOI") if isinstance(ids, dict) else None for ids in frame["doi"]
        ]
        has_doi = frame["doi"].notna() & (frame["doi"] != "")
        # Copy so the column assignment below does not write into a filtered view.
        frame = frame.loc[has_doi].copy()
        frame["authors"] = frame["authors"].map(self._author_names)
        self.data = frame

    def doi2bib(self, doi):
        """Return a BibTeX string of metadata for a given DOI.

        The record is requested from the DOI resolver with an
        ``Accept: application/x-bibtex`` header.

        Args:
            doi: the DOI, without any resolver prefix.

        Returns:
            The response body as text, or "" when the resolver answers with an
            HTTP error status (so an error page is never stored as BibTeX).

        Raises:
            urllib.error.URLError / OSError: if the resolver cannot be reached
                or the request times out.
        """
        url = "https://doi.org/" + urllib.parse.quote(doi)
        request = urllib.request.Request(
            url, headers={"Accept": "application/x-bibtex"}
        )
        try:
            with urllib.request.urlopen(
                request, timeout=self.REQUEST_TIMEOUT
            ) as response:
                charset = response.headers.get_content_charset() or "utf-8"
                return response.read().decode(charset, errors="replace")
        except urllib.error.HTTPError:
            return ""

    def post_processing(self, p):
        """Annotate results with work type and BibTeX, then export the articles.

        Adds ``type`` and ``bibtex`` columns to ``self.data``, keeps the rows
        whose type is in ``ARTICLE_TYPES`` as ``self.data_article`` and writes
        them to ``data_API_Articles.csv``. Lookups are best-effort: a DOI whose
        type cannot be resolved gets "" for both columns, and a failed BibTeX
        download only blanks the BibTeX (the resolved type is kept).

        Args:
            p: the clicked button, passed by ipywidgets (unused).
        """
        # NOTE(review): `Works` is not imported anywhere in the visible notebook;
        # presumably crossref.restful.Works from the crossrefapi package - confirm
        # and import it in the setup cell, otherwise this line raises NameError.
        self.works = Works()

        print("processing ...")

        docs_type = []
        bibtex = []
        for doi in self.data.doi.to_list():
            work_type = self.multiprocessing(doi)
            try:
                # BibTeX is only requested for DOIs whose type lookup succeeded.
                entry = self.doi2bib(doi) if work_type else ""
            except Exception:
                entry = ""
            docs_type.append(work_type)
            bibtex.append(entry)

        self.data["type"] = docs_type
        self.data["bibtex"] = bibtex

        is_article = self.data["type"].isin(list(self.ARTICLE_TYPES))
        self.data_article = self.data[is_article].reset_index(drop=True)
        self.data_article.to_csv("data_API_Articles.csv")
        self()
        print("Saved as data_API_Articles.csv")

    def save_data(self, p):
        """Write every result to ``data_API.csv`` and redraw the UI.

        Args:
            p: the clicked button, passed by ipywidgets (unused).
        """
        self.data.to_csv("data_API.csv")
        self()
        print("Saved as data_API.csv")

    def multiprocessing(self, data):
        """Return the ``"type"`` that ``self.works`` reports for a DOI, or "" on any failure.

        Args:
            data: the DOI to look up.
        """
        try:
            return self.works.doi(data)["type"]
        except Exception:
            # Best-effort lookup: unknown DOIs and lookup errors all map to "".
            return ""

    def __call__(self):
        """Clear the cell output and display the search form (plus export buttons after a search)."""
        clear_output()
        display(self.rightbox)
        if self.find:
            display(self.save_h)

In [None]:
# Build and render the API search UI. After a search, the export buttons write
# data_API_Articles.csv ("Export Articles") and data_API.csv ("Export All").
resultAPI = SearcherAPI()
resultAPI()

VBox(children=(Textarea(value="'handwritten text recognition'+'handwriting recognition'+'htr'+'ocr'+'historica…

HBox(children=(Button(description='Export Articles', style=ButtonStyle()), Button(description='Export All', st…

In [None]:
# `data` is only set by the Search button callback: run a search in the UI above first.
resultAPI.data.shape

(176, 13)

In [None]:
# Columns after extract_info: `externalIds` has been renamed to `doi`.
resultAPI.data.columns

Index(['paperId', 'doi', 'url', 'title', 'abstract', 'venue', 'year',
       'referenceCount', 'citationCount', 'influentialCitationCount',
       'isOpenAccess', 'fieldsOfStudy', 'authors'],
      dtype='object')

In [None]:
# Preview. The index has gaps because rows without a DOI were dropped without resetting it.
resultAPI.data.head()

Unnamed: 0,paperId,doi,url,title,abstract,venue,year,referenceCount,citationCount,influentialCitationCount,isOpenAccess,fieldsOfStudy,authors
0,04431d4d16159c13d398519d17167b64b003d0b3,10.1145/3519306,https://www.semanticscholar.org/paper/04431d4d...,A User Perspective on HTR Methods for the Auto...,Recent breakthroughs in Artificial Intelligenc...,ACM Journal on Computing and Cultural Heritage,2022.0,30,2,0,False,[Computer Science],"[Mohamed Ali Souibgui, Asma Bensalah, Jialuo C..."
1,94b43dd0fc8026031afed35acc9ad68263dc9d4e,10.1109/INDICON56171.2022.10039882,https://www.semanticscholar.org/paper/94b43dd0...,A Novel Segmentation Algorithm For Handwritten...,"In the present world, the financial system, po...",IEEE India Conference,2022.0,9,0,0,False,,"[Amitha Mary Benny, D. Sudarsan]"
2,aaa55f93b7905c8bd5377be99e8efca06f8b2752,10.1007/s12046-022-01864-9,https://www.semanticscholar.org/paper/aaa55f93...,Identification of handwritten Gujarati alphanu...,,Sādhanā,2022.0,23,0,0,False,,"[Krishn Limbachiya, Ankit Sharma, P. Thakkar, ..."
4,c50764ff7a6d52f59dd631dd41900b1295d25f65,10.1109/ICFHR-2018.2018.00054,https://www.semanticscholar.org/paper/c50764ff...,Neural Text Line Segmentation of Multilingual ...,We present a novel method for detecting text l...,International Conference on Frontiers in Handw...,2018.0,24,6,1,False,[Computer Science],"[Patrick Schone, Christian Hargraves, Jon Morr..."
6,85a75d558420589466cc3bb93886874b49d25206,10.4995/THESIS/10251/37978,https://www.semanticscholar.org/paper/85a75d55...,Bernoulli HMMs for Handwritten Text Recognition,In last years Hidden Markov Models (HMMs) have...,,2014.0,113,0,0,True,[Computer Science],[Adrián Giménez Pastor]


In [None]:
# resultAPI.data.iloc[4][4]

In [None]:
# Most recent publication year in the results. Series.max skips missing years,
# whereas .unique().max() (ndarray.max) would return NaN if any year were missing.
resultAPI.data['year'].max()

2023.0

In [None]:
# Keep only the papers published in 2023.
recent_papers = resultAPI.data.loc[resultAPI.data["year"].eq(2023.0)]

In [None]:
# Preview of the 2023 subset (index labels are those of resultAPI.data).
recent_papers.head()

Unnamed: 0,paperId,doi,url,title,abstract,venue,year,referenceCount,citationCount,influentialCitationCount,isOpenAccess,fieldsOfStudy,authors
13,493b55c1806e2c79facb4f37463e8b78cec162e5,10.3390/app13074584,https://www.semanticscholar.org/paper/493b55c1...,A Survey of OCR in Arabic Language: Applicatio...,Optical character recognition (OCR) is the pro...,Applied Sciences,2023.0,67,6,0,True,,"[S. Faizullah, Muhammad Sohaib Ayub, Sajid Hus..."
18,5acd922dc28e5d74332d260640770110440bb4a1,10.1109/AIC57670.2023.10263877,https://www.semanticscholar.org/paper/5acd922d...,Marwari (Heritage Script) OCR Using Attention ...,The Marwari language is home to a vast collect...,International Workshop on Artificial Intellige...,2023.0,19,0,0,False,,"[Manish Kumar Gupta, Siddharth Dhawan, Surya V..."
30,2a62d85ac4640d701612f08f8e5d6d14adc1dd3c,10.1109/ICEARS56392.2023.10085174,https://www.semanticscholar.org/paper/2a62d85a...,An Enhanced Machine Learning Technique for Tex...,Optical Character Recognition (OCR) has become...,2023 Second International Conference on Electr...,2023.0,24,0,0,False,,"[R. Deepa, S. Gayathri, P. Chitra, J. Jasmine,..."
35,1b97f78cc3056859949aa2c48e995df03ccf7a4f,10.48550/arXiv.2310.16809,https://www.semanticscholar.org/paper/1b97f78c...,Exploring OCR Capabilities of GPT-4V(ision) : ...,This paper presents a comprehensive evaluation...,arXiv.org,2023.0,82,1,0,False,[Computer Science],"[Yongxin Shi, Dezhi Peng, Wenhui Liao, Zening ..."
45,3e0f185430d6aa15ba5df0ebda976e51e8042d63,10.1007/s10032-023-00428-9,https://www.semanticscholar.org/paper/3e0f1854...,An end-to-end pipeline for historical censuses...,,International Journal on Document Analysis and...,2023.0,63,0,0,True,[Computer Science],"[Rémi Petitpierre, Marion Kramer, Lucas Rappo]"


In [None]:
# Number of 2023 papers (rows) and columns.
recent_papers.shape

(28, 13)

## Semantic Scholar website (scraping)

In [None]:
#@title SearcherWeb
from S2query import S2paperWeb

# Search interface using the Semantic Scholar website
class SearcherWeb():
    """Notebook search UI on top of the Semantic Scholar web client ``S2paperWeb``.

    Calling the instance renders the query box together with the sort, year,
    venue, publication-type and field-of-study filters and a Search button.
    Once a search has run, an "Export Data" button is displayed that writes
    the results to ``data_WEB.csv``.

    Attributes:
        obj: the ``S2paperWeb`` client used to run queries.
        find: True once a search has completed; controls whether the export
            button is displayed.
        data: DataFrame of the results (set by ``search_button``).
    """

    # Keys copied from every raw paper record by `subset`.
    PAPER_KEYS = (
        'title', 'paperAbstract', 'year', 'primaryPaperLink',
        'entities', 'fieldsOfStudy', 'authors', 'citationStats',
    )

    def __init__(self):
        """Create the web client and build all widgets (nothing is displayed yet)."""
        # Object used to fetch the data
        self.obj = S2paperWeb()
        self.find = False

        # Search button and query text box
        self.search_b = widgets.Button(description="Search")
        self.search_text = widgets.Textarea(
            value="'artificial intelligence'+'Deep Learning'-'Biology'",
            placeholder='Type something',
            layout=Layout(width='50%', height='60px',
                          display='flex', flex_flow='row-reverse'),
            disabled=False,
        )
        # Callback for the search button
        self.search_b.on_click(self.search_button)

        self.save_all_b = widgets.Button(description="Export Data")
        self.save_all_b.on_click(self.save_data)

        # Possible values of the `sort` parameter
        self.sort = widgets.Dropdown(
            options=["total-citations", "influence", "pub-date", "relevance"],
            value='relevance',
            description='Sort by',
            layout=Layout(width='180px',
                          display='flex', flex_flow='row-reverse'),
            disabled=False,
        )

        # Possible values of the `venue` parameter.
        # "Expert Syst. Appl." and "ICML" are two separate venues: a missing comma
        # used to concatenate them into one bogus option.
        self.venue = widgets.SelectMultiple(
            options=["PloS one", "AAAI", "Scientific reports", "IEEE Access",
                     "ArXiv", "Expert Syst. Appl.", "ICML", "Neurocomputing",
                     "Sensors", "Remote. Sens."],
            value=[],
            layout=Layout(width='300px'),
            description='venue',
            disabled=False,
        )

        # Select-all button for the venues
        self.but_venue = widgets.Button(description='All venues')
        self.but_venue.on_click(self.venue_button)

        # Bounds of the `yearFilter` parameter
        self.min = widgets.IntText(
            description='Min. year: ',
            value=2000,
            disabled=False,
        )
        self.max = widgets.IntText(
            description='Max. year: ',
            value=2023,
            disabled=False,
        )

        # Possible values of the `publicationTypes` parameter
        self.pubtype = widgets.SelectMultiple(
            options=["ClinicalTrial", "CaseReport", "Editorial", "Study", "Book",
                     "News", "Review", "Conference", "LettersAndComments",
                     "JournalArticle"],
            value=["JournalArticle"],
            description='Publication',
            disabled=False,
        )

        # Select-all button for the publication types
        self.but_pub = widgets.Button(description='All pub. types')
        self.but_pub.on_click(self.pub_button)

        # Possible values of the `fieldsOfStudy` parameter
        self.fos = widgets.SelectMultiple(
            options=["agricultural-and-food-sciences", "art", "biology",
                     "business", "computer-science", "chemistry", "economics",
                     "education", "engineering", "environmental-science",
                     "geography", "geology", "history", "law", "linguistics",
                     "materials-science", "mathematics", "medicine", "philosophy",
                     "physics", "political-science", "sociology", "psychology"],
            value=[],
            description='fieldsOfStudy',
            disabled=False,
        )

        # Select-all button for the fields of study
        self.but_fos = widgets.Button(description='All fields')
        self.but_fos.on_click(self.fos_button)

        # NOTE(review): IntText does not appear to declare a `min` trait, so the
        # lower bound below is probably not enforced - confirm the intended range.
        self.number_papers = widgets.IntText(
            value=100,
            min=2,
            style={'description_width': 'initial'},
            layout=Layout(width='180px',
                          display='flex', flex_flow='row-reverse'),
            description=' Number of papers',
            disabled=False,
        )

        # Layout containers
        self.vbox1 = widgets.VBox([self.sort, self.number_papers])
        self.hbox1 = widgets.HBox([self.search_text, self.vbox1])
        self.hbox2 = widgets.HBox([self.min, self.max])
        self.hbox3 = widgets.HBox([self.venue, self.but_venue])
        self.hbox4 = widgets.HBox([self.pubtype, self.but_pub])
        self.hbox5 = widgets.HBox([self.fos, self.but_fos])

        self.Vbox = widgets.VBox([
            self.hbox1,
            self.hbox2,
            self.hbox3,
            self.hbox4,
            self.hbox5,
            self.search_b,
        ])

    def search_button(self, p):
        """Run the query with the selected filters and redraw the UI.

        Args:
            p: the clicked button, passed by ipywidgets (unused).
        """
        print(self.search_text.value)

        # Fetch the data
        self.yearfilter = {"min": self.min.value, "max": self.max.value}
        self.obj.get(
            self.search_text.value,
            n=self.number_papers.value,
            sort=self.sort.value,
            venue=self.venue.value,
            yearFilter=self.yearfilter,
            publicationTypes=self.pubtype.value,
            fieldsOfStudy=self.fos.value,
        )

        # Keep the results that were found
        self.data = self.prepare_data(self.obj.all)
        self.find = True
        self()

    def fos_button(self, p):
        """Select every field of study (callback of the 'All fields' button)."""
        self.fos.value = self.fos.options

    def venue_button(self, p):
        """Select every venue (callback of the 'All venues' button)."""
        self.venue.value = self.venue.options

    def pub_button(self, p):
        """Select every publication type (callback of the 'All pub. types' button)."""
        self.pubtype.value = self.pubtype.options

    def subset(self, dicttosub):
        """Reduce one raw paper record to the fields kept in the results table.

        Args:
            dicttosub: raw paper record; must contain every key in ``PAPER_KEYS``.

        Returns:
            A new dict with the keys title, year, entities, fieldsOfStudy,
            authors, abstract, numReferences, numCitations and link. ``link`` is
            NaN when the record has no usable ``primaryPaperLink['url']``.

        Raises:
            KeyError: if one of ``PAPER_KEYS`` or a citation count is missing.
        """
        # Keep only the wanted keys
        subdict = {key: dicttosub[key] for key in self.PAPER_KEYS}

        # Rename / flatten keys
        subdict['abstract'] = subdict.pop('paperAbstract')
        stats = subdict.pop('citationStats')
        subdict['numReferences'] = stats['numReferences']
        subdict['numCitations'] = stats['numCitations']

        # Keep a single reference link; the link entry may be None or lack 'url'.
        try:
            subdict['link'] = subdict['primaryPaperLink']['url']
        except (KeyError, TypeError):
            subdict['link'] = np.nan
        subdict.pop('primaryPaperLink')

        return subdict

    def save_data(self, p):
        """Write the results to ``data_WEB.csv`` and redraw the UI.

        Args:
            p: the clicked button, passed by ipywidgets (unused).
        """
        self.data.to_csv("data_WEB.csv")
        self()
        print("Saved as data_WEB.csv")

    def prepare_data(self, data):
        """Convert the raw JSON-like result into a pandas DataFrame.

        Args:
            data: mapping whose ``'Results'`` entry is a list of pages, each
                holding its paper records under ``['Page']['Papers']``.

        Returns:
            DataFrame with one row per paper, in page order, with the columns
            produced by ``subset``.
        """
        # Flatten the papers of every page into a single list of records
        papers = [
            self.subset(paper)
            for result in data['Results']
            for paper in result['Page']['Papers']
        ]
        return pd.DataFrame(papers)

    def __call__(self):
        """Clear the cell output and display the search form (plus the export button after a search)."""
        clear_output()
        display(self.Vbox)
        if self.find:
            display(self.save_all_b)

In [None]:
# Build and render the web search UI. After a search, "Export Data" writes data_WEB.csv.
resultWEB = SearcherWeb()
resultWEB()

In [None]:
# `data` is only set by the Search button callback: run a search in the UI above first.
resultWEB.data.shape

In [None]:
# Preview of the first two scraped papers.
resultWEB.data.head(2)