"""Superclass to access all Scopus search APIs and dump the results."""
from hashlib import md5
from json import dumps, loads
from os.path import exists, join
from warnings import warn
from pybliometrics.scopus.exception import ScopusQueryError
from pybliometrics.scopus.utils import SEARCH_URL, cache_file, get_content,\
get_folder, print_progress
class Search:
    def __init__(self, query, api, refresh, view='STANDARD', count=200,
                 max_entries=5000, cursor=False, download=True,
                 verbose=False, **kwds):
        """Class intended as superclass to perform a search query.

        Parameters
        ----------
        query : str
            A string of the query.
        api : str
            The name of the Scopus API to be accessed.  Allowed values:
            AffiliationSearch, AuthorSearch, ScopusSearch.
        refresh : bool
            Whether to refresh the cached file if it exists or not.
        view : str (optional, default='STANDARD')
            The view of the file that should be downloaded.
        count : int (optional, default=200)
            The number of entries to be retrieved at once.  A smaller
            number means more queries, each of which returns fewer results.
        max_entries : int (optional, default=5000)
            Raise an error when the number of results exceeds this number.
            To skip this check, set `max_entries` to `None`.
        cursor : bool (optional, default=False)
            Whether to use a cursor in order to iterate over all search
            results without a limit on the number of results.  In contrast
            to the `start` parameter, the `cursor` parameter does not
            allow users to obtain partial results.
        download : bool (optional, default=True)
            Whether to download results (if they have not been cached)
            or not.
        verbose : bool (optional, default=False)
            Whether to print a progress bar while downloading the results
            or not.
        kwds : key-value pairings, optional
            Keywords passed on to the requests header.  Must contain
            fields and values specified in the respective API
            specification.

        Raises
        ------
        ScopusQueryError
            If the number of search results exceeds `max_entries`.
        ValueError
            If the `api` parameter is an invalid entry.
        """
        # Guard against invalid api values (the docstring documents the
        # ValueError); SEARCH_URL maps each allowed API name to its URL
        if api not in SEARCH_URL:
            raise ValueError('api parameter must be one of ' +
                             ', '.join(SEARCH_URL.keys()))
        # Read the file contents if the file exists and we are not
        # refreshing, otherwise download the query anew and cache the file
        fname = md5(query.encode('utf8')).hexdigest()
        qfile = join(get_folder(api, view), fname)
        if not refresh and exists(qfile):
            with open(qfile, "rb") as f:
                self._json = [loads(line) for line in f.readlines()]
            self._n = len(self._json)
        else:
            # Set query parameters
            params = {'query': query, 'count': count, 'view': view}
            if cursor:
                params.update({'cursor': '*'})
            else:
                params.update({'start': 0})
            # Download the first chunk of results
            res = cache_file(url=SEARCH_URL[api], params=params, **kwds).json()
            n = int(res['search-results'].get('opensearch:totalResults', 0))
            self._n = n
            # Stop if there are too many results (skipped if max_entries
            # is None, as documented above)
            if not cursor and max_entries and n > max_entries:
                text = ('Found {} matches. Set max_entries to a higher '
                        'number, change your query ({}) or set '
                        'subscription=True'.format(n, query))
                raise ScopusQueryError(text)
            if download:
                self._json = _parse(res, params, n, api, verbose, **kwds)
                # Finally write out the file
                with open(qfile, 'wb') as f:
                    for item in self._json:
                        f.write('{}\n'.format(dumps(item)).encode('utf-8'))
            else:
                # Ensure that accessing properties will not raise an error
                self._json = []
        self._view = view

    def get_results_size(self):
        """Return the number of results (works even if download=False)."""
        return self._n
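
# A minimal subclass sketch (hypothetical names and query; the real
# subclasses, e.g. pybliometrics.scopus.ScopusSearch, add result parsing
# on top of Search):
#
#     class MySearch(Search):
#         def __init__(self, query, refresh=False):
#             Search.__init__(self, query, api='ScopusSearch',
#                             refresh=refresh)
#
#     s = MySearch('AU-ID(12345678900)')
#     print(s.get_results_size())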


def _parse(res, params, n, api, verbose, **kwds):
    """Auxiliary function to download results and parse json."""
    cursor = "cursor" in params
    if not cursor:
        start = params["start"]
    if n == 0:
        return []
    _json = res.get('search-results', {}).get('entry', [])
    if verbose:
        chunk = 1
        # Roundup (ceiling division), plus 1 because the loop below always
        # issues one final request past the last result
        chunks = int(n/params['count']) + (n % params['count'] > 0) + 1
        print('Downloading results for query "{}":'.format(params['query']))
        print_progress(chunk, chunks)
    # Download the remaining information in chunks
    while n > 0:
        n -= params["count"]
        if cursor:
            # Advance to the token Scopus returns for the next page
            pointer = res['search-results']['cursor'].get('@next')
            params.update({'cursor': pointer})
        else:
            # Advance the start index by one page
            start += params["count"]
            params.update({'start': start})
        res = cache_file(url=SEARCH_URL[api], params=params, **kwds).json()
        _json.extend(res.get('search-results', {}).get('entry', []))
        if verbose:
            chunk += 1
            print_progress(chunk, chunks)
    return _json
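
# Worked example of the chunk arithmetic above (hypothetical numbers): with
# n = 450 total results and count = 200, the constructor fetches start=0,
# and _parse then fetches start=200, start=400 and an empty page at
# start=600, so chunks = int(450/200) + (450 % 200 > 0) + 1 = 4.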