Skip to content

Commit

Permalink
Merge pull request #314 from ror-community/upgrade-es7
Browse files Browse the repository at this point in the history
Upgrade es7
  • Loading branch information
lizkrznarich committed Apr 18, 2023
2 parents 5320354 + 4cc08fc commit b9c0456
Show file tree
Hide file tree
Showing 22 changed files with 4,875 additions and 161 deletions.
18 changes: 17 additions & 1 deletion .github/workflows/dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,15 @@ jobs:
runs-on: ubuntu-latest
env:
ELASTIC_HOST: "localhost"
ELASTIC_PORT: "9200"
ELASTIC7_HOST: "localhost"
ELASTIC7_PORT: "9201"
ELASTIC_PASSWORD: "changeme"
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AWS_REGION: ${{ secrets.AWS_REGION }}
GITHUB_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }}
LAUNCH_DARKLY_KEY: ${{ secrets.LAUNCH_DARKLY_KEY_DEV }}
services:
elasticsearch:
image: docker.elastic.co/elasticsearch/elasticsearch:6.8.23
Expand All @@ -27,6 +31,18 @@ jobs:
http.cors.allow-origin: "*"
ports:
- 9200:9200
# Elasticsearch 7 service container, published on host port 9201.
# The key must be unique within `services`: the Elasticsearch 6 container is
# already registered as `elasticsearch`, and a repeated mapping key would be
# rejected or would silently replace that definition. Steps reach this
# container through localhost:9201, so the label itself is not referenced.
elasticsearch7:
  image: docker.elastic.co/elasticsearch/elasticsearch:7.10.0
  env:
    discovery.type: single-node
    ES_JAVA_OPTS: -Xms512m -Xmx512m
    ELASTIC_PASSWORD: changeme
    xpack.security.enabled: "false"
    http.cors.enabled: "true"
    http.cors.allow-origin: "*"
    # Listen on 9201 inside the container so it matches the published port.
    http.port: 9201
  ports:
    - 9201:9201
steps:
- name: Checkout ror-api code
uses: actions/checkout@v2
Expand Down Expand Up @@ -66,7 +82,7 @@ jobs:
- name: Setup
working-directory: ./ror-api
run: |
python manage.py setup $LATEST_DUMP_FILE 6
python manage.py setup $LATEST_DUMP_FILE 7
- name: Test
working-directory: ./ror-api
run: |
Expand Down
5 changes: 3 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
Django==2.2.28
elasticsearch_dsl==6.4.0
elasticsearch_dsl==7.4.1
geonamescache==1.3.0
requests==2.22.0
requests-aws4auth==0.9
mock==3.0.5
base32_crockford==0.3.0
elasticsearch==6.4.0
elasticsearch==7.10.1
djangorestframework==3.11.2
coreapi==2.3.3
django-prometheus==1.0.15
Expand All @@ -19,3 +19,4 @@ statsmodels==0.10.2
boto3
titlecase==2.3
update_address @ git+https://github.com/ror-community/update_address.git
launchdarkly-server-sdk
14 changes: 11 additions & 3 deletions rorapi/es_utils.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,22 @@
from .settings import ES, ES_VARS
from .settings import ES, ES7, ES_VARS

from elasticsearch_dsl import Search, Q


class ESQueryBuilder():
"""Elasticsearch query builder class"""
def __init__(self):
self.search = Search(using=ES, index=ES_VARS['INDEX'])
def __init__(self, enable_es_7):
if enable_es_7:
print("using ES7")
self.search = Search(using=ES7, index=ES_VARS['INDEX'])
self.search = self.search.extra(track_total_hits=True)
else:
print("using ES6")
self.search = Search(using=ES, index=ES_VARS['INDEX'])

self.search = self.search.params(search_type='dfs_query_then_fetch')


def add_id_query(self, id):
self.search = self.search.query('match',
id={
Expand Down
6 changes: 6 additions & 0 deletions rorapi/features.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
import ldclient
from ldclient.config import Config
from .settings import LAUNCH_DARKLY_KEY

# Build the SDK configuration from the key exposed by settings, register it
# with the ldclient module, then fetch the client object that other modules
# import from here.
# NOTE(review): settings reads LAUNCH_DARKLY_KEY with os.environ.get, so the
# key may be None when the variable is unset — confirm how the SDK handles it.
_ld_config = Config(LAUNCH_DARKLY_KEY)
ldclient.set_config(_ld_config)
launch_darkly_client = ldclient.get()
1 change: 0 additions & 1 deletion rorapi/management/commands/indexror.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,6 @@ def index(dataset):
body.append({
'index': {
'_index': index,
'_type': 'org',
'_id': org['id']
}
})
Expand Down
28 changes: 14 additions & 14 deletions rorapi/matching.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ def match_by_query(text, matching_type, query, countries):
]
return chosen, all_matched

def match_by_type(text, matching_type, countries):
def match_by_type(text, matching_type, countries, enable_es_7):
'''Match affiliation text using specific matching mode/type.'''

fields = ['name.norm', 'aliases.norm', 'labels.label.norm']
Expand All @@ -228,7 +228,7 @@ def match_by_type(text, matching_type, countries):
else:
substrings.append(text)

queries = [ESQueryBuilder() for _ in substrings]
queries = [ESQueryBuilder(enable_es_7) for _ in substrings]

for s, q in zip(substrings, queries):
if matching_type == MATCHING_TYPE_PHRASE:
Expand Down Expand Up @@ -263,10 +263,10 @@ def __init__(self, text):
self.matched = None
self.all_matched = []

def match(self, countries, min_score):
def match(self, countries, min_score, enable_es_7):
for matching_type in NODE_MATCHING_TYPES:
chosen, all_matched = match_by_type(self.text, matching_type,
countries)
countries, enable_es_7)
self.all_matched.extend(all_matched)
if self.matched is None:
self.matched = chosen
Expand Down Expand Up @@ -328,9 +328,9 @@ def remove_low_scores(self, min_score):
if node.matched is not None and node.matched.score < min_score:
node.matched = None

def match(self, countries, min_score):
def match(self, countries, min_score, enable_es_7):
for node in self.nodes:
node.match(countries, min_score)
node.match(countries, min_score, enable_es_7)
self.remove_low_scores(min_score)
chosen = []
all_matched = []
Expand All @@ -342,7 +342,7 @@ def match(self, countries, min_score):
chosen.append(node.matched)
acr_chosen, acr_all_matched = match_by_type(self.affiliation,
MATCHING_TYPE_ACRONYM,
countries)
countries, enable_es_7)
all_matched.extend(acr_all_matched)
return chosen, all_matched

Expand Down Expand Up @@ -399,27 +399,27 @@ def get_output(chosen, all_matched, active_only):
output.append(best)
return sorted(output, key=lambda x: x.score, reverse=True)[:100]

def check_exact_match(affiliation, countries):
qb = ESQueryBuilder()
def check_exact_match(affiliation, countries, enable_es_7):
qb = ESQueryBuilder(enable_es_7)
qb.add_string_query('"' + affiliation + '"')
return match_by_query(affiliation, MATCHING_TYPE_EXACT, qb.get_query(), countries)

def match_affiliation(affiliation, active_only):
def match_affiliation(affiliation, active_only, enable_es_7):
countries = get_countries(affiliation)
exact_chosen, exact_all_matched = check_exact_match(affiliation, countries)
exact_chosen, exact_all_matched = check_exact_match(affiliation, countries, enable_es_7)
if exact_chosen.score == 1.0:
return get_output(exact_chosen, exact_all_matched, active_only)
else:
graph = MatchingGraph(affiliation)
chosen, all_matched = graph.match(countries, MIN_CHOSEN_SCORE)
chosen, all_matched = graph.match(countries, MIN_CHOSEN_SCORE, enable_es_7)
return get_output(chosen, all_matched, active_only)

def match_organizations(params):
def match_organizations(params, enable_es_7):
if 'affiliation' in params:
active_only = True
if 'all_status' in params:
if params['all_status'] == '' or params['all_status'].lower() == "true":
active_only = False
matched = match_affiliation(params.get('affiliation'), active_only)
matched = match_affiliation(params.get('affiliation'), active_only, enable_es_7)
return None, MatchingResult(matched)
return Errors('"affiliation" parameter missing'), None
9 changes: 7 additions & 2 deletions rorapi/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,8 @@ def __init__(self, data):
class Organization(Entity):
"""Organization model class"""
def __init__(self, data):
if "_source" in data:
data = data["_source"]
super(Organization, self).__init__(data, [
'id', 'name', 'types', 'links', 'aliases', 'acronyms', 'status',
'wikipedia_url', 'established', 'relationships', 'addresses'
Expand Down Expand Up @@ -163,8 +165,11 @@ def __init__(self, data):

class ListResult:
"""A model class for the list of organizations returned from the search"""
def __init__(self, data):
self.number_of_results = data.hits.total
def __init__(self, data, enable_es_7):
if enable_es_7:
self.number_of_results = data.hits.total.get('value')
else:
self.number_of_results = data.hits.total
self.time_taken = data.took
self.items = [Organization(x) for x in data]
self.meta = Aggregations(data.aggregations)
Expand Down
27 changes: 16 additions & 11 deletions rorapi/queries.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import re
import json
from titlecase import titlecase
from collections import defaultdict

Expand Down Expand Up @@ -145,10 +146,10 @@ def validate(params):
return Errors(errors) if errors else None


def build_search_query(params):
def build_search_query(params, enable_es_7):
"""Builds search query from API parameters"""

qb = ESQueryBuilder()
qb = ESQueryBuilder(enable_es_7)
ror_id = None

if 'all_status' in params:
Expand Down Expand Up @@ -201,37 +202,41 @@ def build_search_query(params):
('countries', 'country.country_code'), ('statuses', 'status')])

qb.paginate(int(params.get('page', 1)))

return qb.get_query()


def build_retrieve_query(ror_id):
def build_retrieve_query(ror_id, enable_es_7):
"""Builds retrieval query"""

qb = ESQueryBuilder()
qb = ESQueryBuilder(enable_es_7)
qb.add_id_query(ror_id)
return qb.get_query()


def search_organizations(params):
def search_organizations(params, enable_es_7):
"""Searches for organizations according to the parameters"""

error = validate(params)
if error is not None:
return error, None

search = build_search_query(params)
return None, ListResult(search.execute())
search = build_search_query(params, enable_es_7)
return None, ListResult(search.execute(), enable_es_7)


def retrieve_organization(ror_id):
def retrieve_organization(ror_id, enable_es_7):
"""Retrieves the organization of the given ROR ID"""
if any(ror_id in ror_id_url for ror_id_url in GRID_REMOVED_IDS):
return Errors(["ROR ID \'{}\' was removed by GRID during the time period (Jan 2019-Mar 2022) "
"that ROR was synced with GRID. We are currently working with the ROR Curation Advisory Board "
"to restore these records and expect to complete this work in 2022".format(ror_id)]), None
search = build_retrieve_query(ror_id)
search = build_retrieve_query(ror_id, enable_es_7)
results = search.execute()
if results.hits.total > 0:
total = None
if enable_es_7:
total = results.hits.total.get('value')
else:
total = results.hits.total
if total > 0:
return None, Organization(results[0])
return Errors(['ROR ID \'{}\' does not exist'.format(ror_id)]), None
2 changes: 2 additions & 0 deletions rorapi/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,3 +272,5 @@
ROR_API = {'PAGE_SIZE': 20, 'ID_PREFIX': 'https://ror.org/'}

GRID_REMOVED_IDS = []

LAUNCH_DARKLY_KEY = os.environ.get('LAUNCH_DARKLY_KEY')
31 changes: 0 additions & 31 deletions rorapi/tests/data/test_data_empty.json

This file was deleted.

31 changes: 31 additions & 0 deletions rorapi/tests/data/test_data_empty_es6.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
{
"took": 7,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 0,
"hits": []
},
"aggregations": {
"types": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": []
},
"countries": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": []
},
"statuses": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": []
}
}
}
35 changes: 35 additions & 0 deletions rorapi/tests/data/test_data_empty_es7.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
{
"took" : 32,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 0,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"types" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [ ]
},
"statuses" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [ ]
},
"countries" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [ ]
}
}
}
File renamed without changes.
Loading

0 comments on commit b9c0456

Please sign in to comment.