Skip to content

Commit

Permalink
Merge pull request #240 from scholarly-python-package/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
programize-admin committed Jan 19, 2021
2 parents 226e772 + ba10e0e commit d6c95a7
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 4 deletions.
33 changes: 32 additions & 1 deletion scholarly/_navigator.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@
class DOSException(Exception):
    """Exception signalling that a DOS attack was detected."""

class MaxTriesExceededException(Exception):
    """Exception raised when a page cannot be fetched from Google Scholar."""

class Singleton(type):
_instances = {}
Expand Down Expand Up @@ -65,6 +67,10 @@ def set_logger(self, enable: bool):

self.logger.setLevel((logging.INFO if enable else logging.CRITICAL))

def set_timeout(self, timeout: int):
    """Set the timeout period, in seconds, used by scholarly.

    A value that is not ``>= 0`` is ignored and the stored timeout is
    left unchanged.

    :param timeout: new timeout in seconds
    :type timeout: int
    """
    accepted = timeout >= 0
    if not accepted:
        # Invalid (negative) value: keep whatever timeout is already set.
        return
    self._TIMEOUT = timeout

def use_proxy(self, pg: ProxyGenerator):
if pg is not None:
Expand Down Expand Up @@ -149,7 +155,7 @@ def _get_page(self, pagerequest: str) -> str:

tries += 1
self._session, timeout = self.pm.get_next_proxy(num_tries = tries, old_timeout = timeout)
raise Exception("Cannot fetch the page from Google Scholar.")
raise MaxTriesExceededException("Cannot Fetch from Google Scholar.")


def _set_retries(self, num_retries: int) -> None:
Expand Down Expand Up @@ -271,3 +277,28 @@ def search_author_id(self, id: str, filled: bool = False) -> Author:
else:
res = author_parser.fill(res, sections=['basics'])
return res

def search_organization(self, url: str, fromauthor: bool) -> list:
    """Generate institution objects from an author search page.

    Every ``h3`` element with class ``gsc_inst_res`` on the page yields one
    dict with the keys ``'Organization'`` (the text of the element's link)
    and ``'id'`` (the part of the link's ``href`` that follows ``org=``).

    If no results are found and `fromauthor` is True, the first author
    returned by the search is used to get the institution/organization;
    that value is tagged with ``'fromauthor': True`` before being returned.

    :param url: URL of the author search page
    :type url: str
    :param fromauthor: fall back to the first author's organization when
        the page lists no institution
    :type fromauthor: bool
    :returns: list of organization entries; empty when nothing is found or
        when the author-based fallback fails
    :rtype: list
    """
    soup = self._get_soup(url)
    rows = soup.find_all('h3', 'gsc_inst_res')
    if rows:
        self.logger.info("Found institution")

    res = [
        {'Organization': row.a.text, 'id': row.a['href'].split('org=', 1)[1]}
        for row in rows
    ]

    if not rows and fromauthor is True:
        try:
            auth = next(self.search_authors(url))
            authorg = self.search_author_id(auth.id).organization
            authorg['fromauthor'] = True
            res.append(authorg)
        except Exception as exc:
            # The fallback is best-effort: callers still get an empty list,
            # but the reason is logged instead of being silently discarded.
            self.logger.info(
                "Could not get organization from first author: %r", exc)
            res = []

    return res
33 changes: 31 additions & 2 deletions scholarly/_scholarly.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,15 @@ def set_logger(self, enable: bool):
"""
self.__nav.set_logger(enable)

def set_timeout(self, timeout: int):
    """Set the timeout period, in seconds, for scholarly.

    The value is passed unchanged to the navigator's ``set_timeout``.

    :param timeout: timeout in seconds
    :type timeout: int
    """
    navigator = self.__nav
    navigator.set_timeout(timeout)


def search_pubs(self,
query: str, patents: bool = True,
citations: bool = True, year_low: int = None,
year_high: int = None)->_SearchScholarIterator:
year_high: int = None, sortby_date: str = None)->_SearchScholarIterator:
"""Searches by query and returns a generator of Publication objects
:param query: terms to be searched
Expand All @@ -68,6 +72,8 @@ def search_pubs(self,
:type year_low: int, optional
:param year_high: maximum year of publication, defaults to None
:type year_high: int, optional
:param sortby_date: 'abstract' for abstracts, 'everything' for all results
:type sortby_date: string, optional
:returns: Generator of Publication objects
:rtype: Iterator[:class:`Publication`]
Expand Down Expand Up @@ -116,8 +122,14 @@ def search_pubs(self,
yr_hi = '&as_yhi={0}'.format(year_high) if year_high is not None else ''
citations = '&as_vis={0}'.format(1 - int(citations))
patents = '&as_sdt={0},33'.format(1 - int(patents))
sortby = ''

if sortby_date == 'abstract':
sortby = '&scisbd=1'
elif sortby_date == 'everything':
sortby = '&scisbd=2'
# improve str below
url = url + yr_lo + yr_hi + citations + patents
url = url + yr_lo + yr_hi + citations + patents + sortby
return self.__nav.search_publications(url)

def search_single_pub(self, pub_title: str, filled: bool = False)->PublicationParser:
Expand Down Expand Up @@ -317,3 +329,20 @@ def pprint(self, object: Author or Publication)->None:
del to_print['container_type']
print(pprint.pformat(to_print))

def search_org(self, name: str, fromauthor: bool = False) -> list:
    """Search by organization name and return a list of possible disambiguations.

    The name is URL-quoted, inserted into the author-search URL template and
    handed to the navigator's ``search_organization``.

    :param name: organization name to search for
    :type name: str
    :param fromauthor: forwarded to ``search_organization``; defaults to False
    :type fromauthor: bool, optional
    :returns: list of matching organizations
    :rtype: list

    :Example::

        .. testcode::

            search_query = scholarly.search_org('ucla')
            print(search_query)

    :Output::

        .. testoutput::

            [{'Organization': 'University of California, Los Angeles',
              'id': '14108176128635076915'},
             {'Organization': 'Universidad Centroccidental Lisandro Alvarado',
              'id': '9670678584336165373'}
            ]
    """
    quoted_name = requests.utils.quote(name)
    search_url = _AUTHSEARCH.format(quoted_name)
    return self.__nav.search_organization(search_url, fromauthor)
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setuptools.setup(
name='scholarly',
version='1.0.3',
version='1.0.4',
author='Steven A. Cholewiak, Panos Ipeirotis, Victor Silva',
author_email='steven@cholewiak.com, panos@stern.nyu.edu, vsilva@ualberta.ca',
description='Simple access to Google Scholar authors and citations',
Expand Down

0 comments on commit d6c95a7

Please sign in to comment.