Skip to content

Commit

Permalink
Completing scraping data.
Browse files Browse the repository at this point in the history
  • Loading branch information
rendicahya committed Jul 8, 2022
1 parent 2477a66 commit beeb5d4
Showing 1 changed file with 31 additions and 10 deletions.
41 changes: 31 additions & 10 deletions sinta/affiliation_authors.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,34 @@
from pprint import pprint

from bs4 import BeautifulSoup
from requests import get
from pprint import pprint

from util.config import get_config
from util.utils import format_output, cast, listify, run_thread, compact_list, singlify
from util.utils import format_output, cast, run_thread, singlify


def affiliation_authors(affiliation_id, output_format='dict'):
    """Scrape every author listed for an affiliation.

    The first listing page is fetched and parsed here. Its pagination text
    supplies the page count, and pages 2..n are handed to ``worker`` through
    ``run_thread``; whatever ``run_thread`` returns is appended to the
    first page's records.

    Args:
        affiliation_id: Affiliation identifier. It is passed through
            ``singlify`` before being interpolated into the URL.
        output_format: Forwarded unchanged to ``format_output``.
            Defaults to ``'dict'``.

    Returns:
        The value produced by ``format_output`` for the combined records.

    Raises:
        IndexError: If the page has no ``.pagination-text`` element or its
            text has fewer than four space-separated tokens.
    """
    affiliation_id = singlify(affiliation_id)
    domain = get_config()['domain']
    url = f'{domain}/affiliations/authors/{affiliation_id}'
    html = BeautifulSoup(get(url).content, 'html.parser')

    # The fourth space-separated token of the pagination text is used as the
    # page count. It is scraped as text, so it goes through ``cast`` before
    # the ``n_pages + 1`` arithmetic below (presumably cast yields an int --
    # confirm against util.utils).
    n_pages = cast(html.select('.pagination-text')[0].text.split(' ')[3])
    result = parse(html)

    # Page 1 is already parsed above; only the remaining pages are fanned out.
    thread_result = run_thread(worker, list(range(2, n_pages + 1)), affiliation_id=affiliation_id)

    result.extend(thread_result)

    return format_output(result, output_format)


def worker(page, result, **kwargs):
    """Fetch one listing page of an affiliation and append its authors.

    Args:
        page: Page number placed in the ``?page=`` query parameter.
        result: List that is extended in place with the parsed records
            (presumably the collector supplied by ``run_thread`` -- confirm
            against util.utils).
        **kwargs: Must contain ``affiliation_id``.

    Returns:
        None. Output is delivered by mutating ``result``.
    """
    domain = get_config()['domain']
    affiliation_id = kwargs['affiliation_id']
    response = get(f'{domain}/affiliations/authors/{affiliation_id}?page={page}')
    soup = BeautifulSoup(response.content, 'html.parser')

    result.extend(parse(soup))


def parse(html):
Expand All @@ -34,6 +50,10 @@ def parse(html):
h_index_numbers = [cast(h_index_row[i].text.split(':')[1].strip()) for i in (0, 1)]
h_index = dict(zip(('scopus', 'scholar'), h_index_numbers))

score_names = 'sinta_3_years', 'sinta', 'affil_3_years', 'affil'
score_numbers = [cast(row.select('.stat-num')[i].text.replace('.', '')) for i in range(4)]
scores = dict(zip(score_names, score_numbers))

result.append({
'profile_picture': profile_picture,
'id': profile_id,
Expand All @@ -43,13 +63,14 @@ def parse(html):
'url': department_url,
'name': department_name
},
'h_index': h_index
'h_index': h_index,
'scores': scores
})

pprint(result)

break
return result


if __name__ == '__main__':
    # Manual smoke run: scrape affiliation 404 once and pretty-print the
    # formatted result.
    data = affiliation_authors(404)

    pprint(data)

0 comments on commit beeb5d4

Please sign in to comment.