This repository has been archived by the owner on Sep 7, 2023. It is now read-only.

Commit

Merge pull request #2102 from MarcAbonce/wikipedia-api-cleanup
Fix Wikipedia's paragraph extraction
asciimoo committed Jul 27, 2020
2 parents 6d18769 + 77b9faa commit 1185c06
Showing 1 changed file with 1 addition and 27 deletions.
28 changes: 1 addition & 27 deletions searx/engines/wikipedia.py
@@ -49,29 +49,6 @@ def request(query, params):
     return params
 
 
-# get first meaningful paragraph
-# this should filter out disambiguation pages and notes above first paragraph
-# "magic numbers" were obtained by fine tuning
-def extract_first_paragraph(content, title, image):
-    first_paragraph = None
-
-    failed_attempts = 0
-    for paragraph in content.split('\n'):
-
-        starts_with_title = paragraph.lower().find(title.lower(), 0, len(title) + 35)
-        length = len(paragraph)
-
-        if length >= 200 or (starts_with_title >= 0 and (image or length >= 150)):
-            first_paragraph = paragraph
-            break
-
-        failed_attempts += 1
-        if failed_attempts > 3:
-            return None
-
-    return first_paragraph
-
-
 # get response from search-request
 def response(resp):
     results = []
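For context (not part of the diff): the removed helper inspects at most the first few newline-separated chunks of the extract and returns None when none of them passes its length/title checks, while the old call site in the next hunk ran .replace() on the result unconditionally. A standalone sketch of that None path, using a made-up extract string:

# Standalone sketch of the removed heuristic; the sample extract below is
# hypothetical and only meant to trigger the None return path.
def extract_first_paragraph(content, title, image):
    first_paragraph = None
    failed_attempts = 0
    for paragraph in content.split('\n'):
        starts_with_title = paragraph.lower().find(title.lower(), 0, len(title) + 35)
        length = len(paragraph)
        if length >= 200 or (starts_with_title >= 0 and (image or length >= 150)):
            first_paragraph = paragraph
            break
        failed_attempts += 1
        if failed_attempts > 3:
            return None
    return first_paragraph

# Four short notes before the lead paragraph exhaust the attempt budget:
extract = 'Note 1.\nNote 2.\nNote 3.\nNote 4.\nExample City is a town.'
print(extract_first_paragraph(extract, 'Example City', None))  # prints None

On such input the old call site's summary.replace('() ', '') raised AttributeError; the replacement in the next hunk cannot return None.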
@@ -97,10 +74,7 @@ def response(resp):
     if image:
         image = image.get('source')
 
-    extract = page.get('extract')
-
-    summary = extract_first_paragraph(extract, title, image)
-    summary = summary.replace('() ', '')
+    summary = page.get('extract', '').split('\n')[0].replace('()', '')
 
     # link to wikipedia article
     wikipedia_link = base_url.format(language=url_lang(resp.search_params['language'])) \
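The new call site is a single expression: keep only the first newline-separated chunk of the extract (defaulting to an empty string when the key is missing) and drop the empty '()' presumably left behind where inline markup was stripped. A minimal sketch with a hypothetical payload:

# Minimal sketch of the new call site's behaviour; the page dict is a
# hypothetical payload, not a real API response.
page = {'extract': 'Example City () is a town in Exampleland.\nIt was founded in 1900.'}
summary = page.get('extract', '').split('\n')[0].replace('()', '')
print(summary)  # first line only, with the empty parentheses removed

Unlike the removed helper, this never yields None: a page without an extract simply produces an empty summary.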
