Skip to content

Commit

Permalink
Merge pull request #254 from scholarly-python-package/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
programize-admin committed Feb 3, 2021
2 parents bc0ab17 + 952fc4f commit 41ec4bb
Show file tree
Hide file tree
Showing 6 changed files with 111 additions and 19 deletions.
19 changes: 12 additions & 7 deletions scholarly/_navigator.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
from .publication_parser import _SearchScholarIterator
from .author_parser import AuthorParser
from .publication_parser import PublicationParser
from .data_types import Author
from .data_types import Author, PublicationSource

class DOSException(Exception):
"""DOS attack was detected."""
Expand Down Expand Up @@ -246,10 +246,11 @@ def search_publication(self, url: str,
:rtype: {Publication}
"""
soup = self._get_soup(url)
res = PublicationParser(self, soup.find_all('div', 'gs_or')[0], 'scholar')
publication_parser = PublicationParser(self)
pub = publication_parser.get_publication(soup.find_all('div', 'gs_or')[0], PublicationSource.PUBLICATION_SEARCH_SNIPPET)
if filled:
res.fill()
return res
pub = publication_parser.fill(pub)
return pub

def search_publications(self, url: str) -> _SearchScholarIterator:
"""Returns a Publication Generator given a url
Expand All @@ -261,21 +262,25 @@ def search_publications(self, url: str) -> _SearchScholarIterator:
"""
return _SearchScholarIterator(self, url)

def search_author_id(self, id: str, filled: bool = False) -> Author:
def search_author_id(self, id: str, filled: bool = False, sortby: str = "citedby", publication_limit: int = 0) -> Author:
"""Search by author ID and return an Author object
:param id: the Google Scholar id of a particular author
:type id: str
:param filled: If the returned Author object should be filled
:type filled: bool, optional
:param sortby: if the object is an author, select the order of the citations in the author page. Either by 'citedby' or 'year'. Defaults to 'citedby'.
:type sortby: string
:param publication_limit: Select the max number of publications you want to fill for the author. Defaults to no limit.
:type publication_limit: int
:returns: an Author object
:rtype: {Author}
"""
author_parser = AuthorParser(self)
res = author_parser.get_author(id)
if filled:
res = author_parser.fill(res)
res = author_parser.fill(res, sortby=sortby, publication_limit=publication_limit)
else:
res = author_parser.fill(res, sections=['basics'])
res = author_parser.fill(res, sections=['basics'], sortby=sortby, publication_limit=publication_limit)
return res

def search_organization(self, url: str, fromauthor: bool) -> list:
Expand Down
78 changes: 72 additions & 6 deletions scholarly/_scholarly.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import os
import copy
import pprint
from typing import Callable
from typing import Callable, List
from ._navigator import Navigator
from ._proxy_generator import ProxyGenerator
from dotenv import find_dotenv, load_dotenv
Expand All @@ -14,6 +14,7 @@

_AUTHSEARCH = '/citations?hl=en&view_op=search_authors&mauthors={0}'
_KEYWORDSEARCH = '/citations?hl=en&view_op=search_authors&mauthors=label:{0}'
_KEYWORDSEARCHBASE = '/citations?hl=en&view_op=search_authors&mauthors={}'
_PUBSEARCH = '/scholar?hl=en&q={0}'


Expand Down Expand Up @@ -181,7 +182,7 @@ def search_author(self, name: str):
url = _AUTHSEARCH.format(requests.utils.quote(name))
return self.__nav.search_authors(url)

def fill(self, object: dict, sections=[]) -> Author or Publication:
def fill(self, object: dict, sections=[], sortby: str = "citedby", publication_limit: int = 0) -> Author or Publication:
"""Fills the object according to its type.
If the container type is Author it will fill the additional author fields
If it is Publication it will fill it accordingly.
Expand All @@ -190,11 +191,15 @@ def fill(self, object: dict, sections=[]) -> Author or Publication:
:type object: Author or Publication
:param sections: the sections that the user wants filled for an Author object. This can be: ['basics', 'indices', 'counts', 'coauthors', 'publications']
:type sections: list
:param sortby: if the object is an author, select the order of the citations in the author page. Either by 'citedby' or 'year'. Defaults to 'citedby'.
:type sortby: string
:param publication_limit: if the object is an author, select the max number of publications you want to fill for the author. Defaults to no limit.
:type publication_limit: int
"""

if object['container_type'] == "Author":
author_parser = AuthorParser(self.__nav)
object = author_parser.fill(object, sections)
object = author_parser.fill(object, sections, sortby, publication_limit)
if object is False:
raise ValueError("Incorrect input")
elif object['container_type'] == "Publication":
Expand Down Expand Up @@ -231,8 +236,12 @@ def citedby(self, object: Publication)->_SearchScholarIterator:
return


def search_author_id(self, id: str, filled: bool = False)->Author:
def search_author_id(self, id: str, filled: bool = False, sortby: str = "citedby", publication_limit: int = 0)->Author:
"""Search by author id and return a single Author object
:param sortby: select the order of the citations in the author page. Either by 'citedby' or 'year'. Defaults to 'citedby'.
:type sortby: string
:param publication_limit: if the object is an author, select the max number of publications you want to fill for the author. Defaults to no limit.
:type publication_limit: int
:Example::
Expand All @@ -252,7 +261,7 @@ def search_author_id(self, id: str, filled: bool = False)->Author:
'scholar_id': 'EmD_lTEAAAAJ',
'source': 'AUTHOR_PROFILE_PAGE'}
"""
return self.__nav.search_author_id(id, filled)
return self.__nav.search_author_id(id, filled, sortby, publication_limit)

def search_keyword(self, keyword: str):
"""Search by keyword and return a generator of Author objects
Expand Down Expand Up @@ -287,6 +296,45 @@ def search_keyword(self, keyword: str):
url = _KEYWORDSEARCH.format(requests.utils.quote(keyword))
return self.__nav.search_authors(url)

def search_keywords(self, keywords: List[str]):
    """Search by keywords and return a generator of Author objects

    Every keyword is URL-quoted and prefixed with ``label:``; the resulting
    terms are joined with ``+`` and substituted into the author-search URL
    template before the search is delegated to the navigator.

    :param keywords: a list of keywords to be searched
    :type keywords: List[str]

    :Example::

        .. testcode::

            search_query = scholarly.search_keywords(['crowdsourcing', 'privacy'])
            scholarly.pprint(next(search_query))

    :Output::

        .. testoutput::

            {'affiliation': 'Cornell University',
             'citedby': 40976,
             'email_domain': '',
             'filled': False,
             'interests': ['Crowdsourcing',
                           'privacy',
                           'social computing',
                           'game theory',
                           'user-generated content'],
             'name': 'Arpita Ghosh',
             'scholar_id': '_cMw1IUAAAAJ',
             'source': 'SEARCH_AUTHOR_SNIPPETS',
             'url_picture': 'https://scholar.google.com/citations?view_op=medium_photo&user=_cMw1IUAAAAJ'}
    """
    label_terms = []
    for keyword in keywords:
        # Each term becomes "label:<quoted keyword>".
        label_terms.append('label:' + requests.utils.quote(keyword))
    query = '+'.join(label_terms)
    return self.__nav.search_authors(_KEYWORDSEARCHBASE.format(query))



def search_pubs_custom_url(self, url: str)->_SearchScholarIterator:
"""Search by custom URL and return a generator of Publication objects
URL should be of the form '/scholar?q=...'
Expand All @@ -304,7 +352,25 @@ def search_author_custom_url(self, url: str)->Author:
:type url: string
"""
return self.__nav.search_authors(url)


def get_related_articles(self, object: Publication)->_SearchScholarIterator:
    """
    Search google scholar for related articles to a specific publication.

    A publication taken from an author's publication list is filled first
    when it does not yet carry a 'url_related_articles' entry; a publication
    taken from a search snippet is used as-is.

    :param object: Publication object used to get the related articles
    :type object: Publication
    :returns: the navigator's publication search over the related-articles
        url, or None when the object is not a Publication container or its
        source is neither AUTHOR_PUBLICATION_ENTRY nor PUBLICATION_SEARCH_SNIPPET
    """
    if object['container_type'] != 'Publication':
        print("Not a publication object")
        return

    source = object['source']
    if source == PublicationSource.AUTHOR_PUBLICATION_ENTRY:
        # Author-page entries only get the link after being filled.
        if 'url_related_articles' not in object:
            object = self.fill(object)
        related_url = object['url_related_articles']
    elif source == PublicationSource.PUBLICATION_SEARCH_SNIPPET:
        related_url = object['url_related_articles']
    else:
        return None
    return self.__nav.search_publications(related_url)

def pprint(self, object: Author or Publication)->None:
"""Pretty print an Author or Publication container object
Expand Down
20 changes: 16 additions & 4 deletions scholarly/author_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ def _fill_counts(self, soup, author):
for c in soup.find_all('span', class_='gsc_g_al')]
author['cites_per_year'] = dict(zip(years, cites))

def _fill_publications(self, soup, author):
def _fill_publications(self, soup, author, publication_limit: int = 0):
author['publications'] = list()
pubstart = 0
url_citations = _CITATIONAUTH.format(author['scholar_id'])
Expand All @@ -118,6 +118,8 @@ def _fill_publications(self, soup, author):
for row in soup.find_all('tr', class_='gsc_a_tr'):
new_pub = pub_parser.get_publication(row, PublicationSource.AUTHOR_PUBLICATION_ENTRY)
author['publications'].append(new_pub)
if (publication_limit) and (len(author['publications']) >= publication_limit):
break
if 'disabled' not in soup.find('button', id='gsc_bpf_more').attrs:
pubstart += _PAGESIZE
url = '{0}&cstart={1}&pagesize={2}'.format(
Expand All @@ -137,7 +139,7 @@ def _fill_coauthors(self, soup, author):
new_coauthor['source'] = AuthorSource.CO_AUTHORS_LIST
author['coauthors'].append(new_coauthor)

def fill(self, author, sections: list = []):
def fill(self, author, sections: list = [], sortby="citedby", publication_limit: int = 0):
"""Populate the Author with information from their profile
The `sections` argument allows for finer granularity of the profile
Expand All @@ -152,6 +154,10 @@ def fill(self, author, sections: list = []):
* ``publications``: fills publications;
* ``[]``: fills all of the above
:type sections: ['basics','citations','counts','coauthors','publications',[]] list, optional
:param sortby: Select the order of the citations in the author page. Either by 'citedby' or 'year'. Defaults to 'citedby'.
:type sortby: string
:param publication_limit: Select the max number of publications you want to fill for the author. Defaults to no limit.
:type publication_limit: int
:returns: The filled object if fill was successful, False otherwise.
:rtype: Author or bool
Expand Down Expand Up @@ -296,19 +302,25 @@ def fill(self, author, sections: list = []):
"""
try:
sections = [section.lower() for section in sections]
sortby_str = ''
if sortby == "year":
sortby_str = '&view_op=list_works&sortby=pubdate'
elif sortby != "citedby":
raise Exception("Please enter a valid sortby parameter. Options: 'year', 'citedby'")
url_citations = _CITATIONAUTH.format(author['scholar_id'])
url_citations += sortby_str
url = '{0}&pagesize={1}'.format(url_citations, _PAGESIZE)
soup = self.nav._get_soup(url)

if sections == []:
for i in self._sections:
if i not in author['filled']:
getattr(self, f'_fill_{i}')(soup, author)
(getattr(self, f'_fill_{i}')(soup, author) if i != 'publications' else getattr(self, f'_fill_{i}')(soup, author, publication_limit))
author['filled'].add(i)
else:
for i in sections:
if i in self._sections and i not in author['filled']:
getattr(self, f'_fill_{i}')(soup, author)
(getattr(self, f'_fill_{i}')(soup, author) if i != 'publications' else getattr(self, f'_fill_{i}')(soup, author, publication_limit))
author['filled'].add(i)
except Exception as e:
raise(e)
Expand Down
2 changes: 2 additions & 0 deletions scholarly/data_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ class Publication(TypedDict, total=False):
of multiple publications, and therefore may have multiple "citedby_id"
values.
(source: AUTHOR_PUBLICATION_ENTRY)
:param url_related_articles: the url containing link for related articles of a publication (needs fill() for AUTHOR_PUBLICATION_ENTRIES)
:param url_add_sclib: (source: PUBLICATION_SEARCH_SNIPPET)
:param url_scholarbib: the url containing links for
the BibTeX entry, EndNote, RefMan and RefWorks (source: PUBLICATION_SEARCH_SNIPPET)
Expand All @@ -169,6 +170,7 @@ class Publication(TypedDict, total=False):
eprint_url: str
pub_url: str
url_add_sclib: str
url_related_articles: str
url_scholarbib: str
filled: bool
source: PublicationSource
Expand Down
9 changes: 8 additions & 1 deletion scholarly/publication_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,9 @@ def _scholar_pub(self, __data, publication: Publication):
if 'Cited by' in link.text:
publication['num_citations'] = int(re.findall(r'\d+', link.text)[0].strip())
publication['citedby_url'] = link['href']

if 'Related articles' in link.text:
publication['url_related_articles'] = link['href']

if __data.find('div', class_='gs_ggs gs_fl'):
publication['eprint_url'] = __data.find(
Expand All @@ -257,7 +260,7 @@ def fill(self, publication: Publication)->Publication:
for item in soup.find_all('div', class_='gs_scl'):
key = item.find(class_='gsc_vcd_field').text.strip().lower()
val = item.find(class_='gsc_vcd_value')
if key == 'authors':
if key == 'authors' or key == 'inventors':
publication['bib']['author'] = ' and '.join(
[i.strip() for i in val.text.split(',')])
elif key == 'journal':
Expand Down Expand Up @@ -306,6 +309,10 @@ def fill(self, publication: Publication)->Publication:
publication['cites_id'] = re.findall(
_SCHOLARPUBRE, val.a['href'])[0]
publication['citedby_url'] = _CITEDBYLINK.format(publication['cites_id'])
elif key == 'scholar articles':
for entry in val.find_all('a'):
if entry.text.lower() == 'related articles':
publication['url_related_articles'] = entry.get('href')[26:]
# number of citation per year
years = [int(y.text) for y in soup.find_all(class_='gsc_vcd_g_t')]
cites = [int(c.text) for c in soup.find_all(class_='gsc_vcd_g_al')]
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setuptools.setup(
name='scholarly',
version='1.0.5',
version='1.0.6',
author='Steven A. Cholewiak, Panos Ipeirotis, Victor Silva',
author_email='steven@cholewiak.com, panos@stern.nyu.edu, vsilva@ualberta.ca',
description='Simple access to Google Scholar authors and citations',
Expand Down

0 comments on commit 41ec4bb

Please sign in to comment.