-
Notifications
You must be signed in to change notification settings - Fork 3
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Web of Science queries #223
Changes from 8 commits
4212aff
ccf56cf
a9d8375
1edf337
526f7c5
1513d2d
4576912
0097a62
90889e2
b2702a7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,252 @@ | ||
|
||
# Queries on the Web of Science (or Web of Knowledge) | ||
class WosQueries | ||
|
||
# this is the maximum number that can be returned in a single query by WoS | ||
MAX_RECORDS = 100 | ||
|
||
QUERY_LANGUAGE = 'en'.freeze | ||
|
||
# limit the start date when searching for publications, format: YYYY-MM-DD | ||
START_DATE = '1970-01-01'.freeze | ||
|
||
attr_reader :wos_client | ||
attr_reader :database | ||
|
||
# Stores the client and the database identifier used by the query methods.
# @param wos_client [WosClient] a Web Of Science client
# @param database [String] a WOS database identifier (default 'WOK')
def initialize(wos_client, database = 'WOK')
  @database = database
  @wos_client = wos_client
end
|
||
# Runs the WoS :cited_references operation for a record.
# @param uid [String] a WOS UID
# @return [WosRecords]
def cited_references(uid)
  retrieve_records(:cited_references, cited_references_params(uid))
end
|
||
# Runs the WoS :citing_articles operation for a record.
# @param uid [String] a WOS UID
# @return [WosRecords]
def citing_articles(uid)
  retrieve_records(:citing_articles, citing_articles_params(uid))
end
|
||
# Runs the WoS :related_records operation for a record.
# @param uid [String] a WOS UID
# @return [WosRecords]
def related_records(uid)
  retrieve_records(:related_records, related_records_params(uid))
end
|
||
# Runs the WoS :retrieve_by_id operation for a list of records.
# @param uids [Array<String>] a list of WOS UIDs
# @return [WosRecords]
def retrieve_by_id(uids)
  retrieve_records(:retrieve_by_id, retrieve_by_id_params(uids))
end
|
||
# Runs the WoS :search operation with an author-name query.
# @param name [String] a CSV name pattern: {last name}, {first_name} [{middle_name} | {middle initial}]
# @return [WosRecords]
def search_by_name(name)
  retrieve_records(:search, search_by_name_params(name))
end
|
||
private | ||
|
||
################################################################### | ||
# WoS Query Record Collators | ||
|
||
# Calls a WoS search operation, then hands the response to
# retrieve_additional_records under the "<operation>_response" key.
# @param operation [Symbol] a WoS search operation, e.g. :search
# @param message [Hash] the request parameters for the operation
# @return [WosRecords]
def retrieve_records(operation, message)
  first_response = wos_client.search.call(operation, message: message)
  retrieve_additional_records(first_response, :"#{operation}_response")
end
|
||
# Collects all the records for a query, requesting further pages when the
# response reports more than MAX_RECORDS matches.
# @param response [Savon::Response] the first response for a query
# @param response_type [Symbol] key of the response body, e.g. :search_response
# @return [WosRecords] records of the first response merged with any further pages
def retrieve_additional_records(response, response_type)
  collected = records(response, response_type)
  total = records_found(response, response_type)
  return collected unless total > MAX_RECORDS

  # Cited-reference results are paged with their own retrieve operation.
  operation = response_type == :cited_references_response ? :cited_references_retrieve : :retrieve
  page_type = :"#{operation}_response"
  id = query_id(response, response_type)
  # The first MAX_RECORDS records are already in hand, so the remaining pages
  # start at MAX_RECORDS + 1 and advance by MAX_RECORDS up to the total.
  (MAX_RECORDS + 1).step(total, MAX_RECORDS) do |first_record|
    message = { queryId: id, retrieveParameters: retrieve_parameters(first_record: first_record) }
    page = wos_client.search.call(operation, message: message)
    collected = collected.merge_records(records(page, page_type))
  end
  collected
end
|
||
################################################################### | ||
# WoS SOAP Response Parsers | ||
|
||
# Extracts the :return element stored under the response type key.
# @param response [Savon::Response] a WoS SOAP response
# @param response_type [Symbol] a WoS SOAP response type
# @return [Hash] return data
def response_return(response, response_type)
  envelope = response.body[response_type]
  envelope[:return]
end
|
||
# Reads the :query_id of a response as an Integer.
# @param response [Savon::Response] a WoS SOAP response
# @param response_type [Symbol] a WoS SOAP response type
# @return [Integer]
def query_id(response, response_type)
  payload = response_return(response, response_type)
  payload[:query_id].to_i
end
|
||
# Reads the :records_found count of a response as an Integer.
# @param response [Savon::Response] a WoS SOAP response
# @param response_type [Symbol] a WoS SOAP response type
# @return [Integer]
def records_found(response, response_type)
  payload = response_return(response, response_type)
  payload[:records_found].to_i
end
|
||
# Wraps the :records element of a response in a WosRecords.
# @param response [Savon::Response] a WoS SOAP response
# @param response_type [Symbol] a WoS SOAP response type
# @return [WosRecords]
def records(response, response_type)
  payload = response_return(response, response_type)
  WosRecords.new(records: payload[:records])
end
|
||
################################################################### | ||
# Search User Query Helpers | ||
|
||
# Constructs a WoS name query: the last name combined with the first name,
# the first initial and, when a middle name is given, the initials variants.
# @param name [String] a CSV name pattern: {last name}, {first_name} [{middle_name} | {middle initial}]
# @return [String] the name variants joined with ' OR '
# @raise [ArgumentError] when the last name or the first name is missing
def name_query(name)
  last_name, given_names = name.to_s.split(',').map(&:strip)
  first_name, middle_name = given_names.to_s.split
  # Fail with a clear message rather than a NoMethodError on nil.
  if last_name.to_s.empty? || first_name.nil?
    raise ArgumentError, "name must be '{last name}, {first name} [{middle name}]', got #{name.inspect}"
  end

  first_initial = first_name[0]
  variants = ["#{last_name} #{first_name}", "#{last_name} #{first_initial}"]
  if middle_name
    middle_initial = middle_name[0]
    variants << "#{last_name} #{first_initial}#{middle_initial}"
    variants << "#{last_name} #{first_name} #{middle_initial}"
  end
  variants.join(' OR ')
end
|
||
# Institution names used to restrict author searches.
# @return [Array<String>] institution names
def institutions
  Array('Stanford University')
end
|
||
################################################################### | ||
# WoS Query Parameters | ||
|
||
# Builds the request message for the citedReferences operation; its
# retrieve parameters carry a single 'Hot' => 'On' option.
# @param uid [String] a WOS UID
# @return [Hash] citedReferences parameters
def cited_references_params(uid)
  hot_option = { key: 'Hot', value: 'On' }
  paging = retrieve_parameters(options: [hot_option])
  { databaseId: database, uid: uid, queryLanguage: QUERY_LANGUAGE, retrieveParameters: paging }
end
|
||
# Builds the request message for the citingArticles operation.
# @param uid [String] a WOS UID
# @return [Hash] citingArticles parameters
def citing_articles_params(uid)
  span = time_span
  paging = retrieve_parameters
  { databaseId: database, uid: uid, timeSpan: span,
    queryLanguage: QUERY_LANGUAGE, retrieveParameters: paging }
end
|
||
# Builds the request message for the relatedRecords operation.
# @param uid [String] a WOS UID
# @return [Hash] relatedRecords parameters
def related_records_params(uid)
  span = time_span
  paging = retrieve_parameters
  # The 'WOS' database is the only option for this query
  { databaseId: 'WOS', uid: uid, timeSpan: span,
    queryLanguage: QUERY_LANGUAGE, retrieveParameters: paging }
end
|
||
# @param uids [Array<String>] a list of WOS UIDs | ||
# @return [Hash] retrieveById parameters | ||
def retrieve_by_id_params(uids) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. A ton of repetition here in the *_params methods. How about a shared base_params hash, then base_params.merge(different_key: different_val) in the places where individual key/val pairs need to be added or overridden? This would put the focus on what is unique to each set of params rather than repeating all the things that are in common. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If you look closely at it, there are slight differences in each set of params. Given that we know the request XML is order-sensitive (right? why? weird, if so, but yea ...), doesn't this make it difficult to use a Hash.merge strategy with any confidence? I think we should be glad that savon allows us to use Hash params. These methods have already been refactored a bit to allow for some existing use-case flexibility; we might discover more as we go. I'm open to exploring whether or not a Hash.merge strategy might work and will try it out soon. If it's not a blocker right now, maybe we can move on and adapt as we go? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, I acknowledged those slight differences in my comment. My intent is to have a common starting point, then merge/reject from there. This would make it obvious what each of those differences is, i.e., what is special about this method. The ordered XML thing does not apply since we are still giving the Savon gem a Hash. It figures out the order now, and it would figure out the order the same way, since we would be passing it exactly the same param. This is just about how we build the hash and how much we repeat ourselves.
In general, this codebase has suffered from a lack of facility in manipulating Ruby Hashes, so I would like to start repairing that. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'll take another look at this. |
||
{ | ||
databaseId: database, | ||
uid: uids, | ||
queryLanguage: QUERY_LANGUAGE, | ||
retrieveParameters: retrieve_parameters | ||
} | ||
end | ||
|
||
# Builds the retrieveParameters element of a request.
# @param count [Integer] the number of records to retrieve (defaults to MAX_RECORDS)
# @param first_record [Integer] the record number offset (defaults to 1)
# @param options [Array<Hash>] key/value options (defaults to retrieve_options)
# @return [Hash] retrieve parameters
def retrieve_parameters(count: MAX_RECORDS, first_record: 1, options: retrieve_options)
  { firstRecord: first_record, count: count, option: options }
end
|
||
# Default key/value options sent with the retrieve parameters.
# @return [Array<Hash>] retrieve parameter options
def retrieve_options
  record_ids = { key: 'RecordIDs', value: 'On' }
  namespace = {
    key: 'targetNamespace',
    value: 'http://scientific.thomsonreuters.com/schema/wok5.4/public/FullRecord'
  }
  [record_ids, namespace]
end
|
||
# Builds the request message for the search operation.
# @param user_query [String]
# @return [Hash] search query parameters
def search_params(user_query)
  query = {
    databaseId: database, userQuery: user_query,
    timeSpan: time_span, queryLanguage: QUERY_LANGUAGE
  }
  { queryParameters: query, retrieveParameters: retrieve_parameters }
end
|
||
# Builds search parameters with an AU=(...) clause from the name query and
# an AD=(...) clause from the institutions.
# @param name [String] a CSV name pattern: {last name}, {first_name} [{middle_name} | {middle initial}]
# @return [Hash] search query parameters
def search_by_name_params(name)
  authors = name_query(name)
  addresses = institutions.join(' OR ')
  search_params("AU=(#{authors}) AND AD=(#{addresses})")
end
|
||
# Time span from START_DATE to the current date in Time.zone (YYYY-MM-DD).
# @return [Hash] time span dates
def time_span
  today = Time.zone.now.strftime('%Y-%m-%d')
  { begin: START_DATE, end: today }
end
end |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why do you have these 5 public methods that are effectively aliases for private ones? Put the body of the private methods here and remove the "collator" methods.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'll try to move some stuff where this is done in #242 already, into this PR. This pattern of using a collator emerged from doing one request first and then another and another and then noticing that the collators themselves could be refactored, until they became so small that they, well, don't need to exist any more.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done