Skip to content

Commit

Permalink
preparing to add imperial college london research software director s…
Browse files Browse the repository at this point in the history
…craper (#58)

* preparing to add imperial college london research software directory scraper
Signed-off-by: vsoch <vsoch@users.noreply.github.com>
  • Loading branch information
vsoch committed May 12, 2022
1 parent 9bdf82b commit 8b5b3aa
Show file tree
Hide file tree
Showing 8 changed files with 139 additions and 11 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ jobs:
run: |
export PATH="/usr/share/miniconda/bin:$PATH"
source activate black
pip install black==20.8b1
pip install black
black --check rse
- name: Check imports with pyflakes
Expand Down
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ and **Merged pull requests**. Critical items to know are:
The versions coincide with releases on pip.

## [0.0.x](https://github.com/rseng/rse/tree/master) (0.0.x)
- adding imperial college london research software directory (0.0.36)
- adding molssi scraper (0.0.35)
- adding ropensci scraper (0.0.34)
- updating local index to be a data table (0.0.33)
Expand Down
27 changes: 26 additions & 1 deletion docs/_docs/getting-started/scrapers.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ and it includes links to repositories:
- [Research Software NL Dictionary](#researchsoftwarenl)
- [ROpenSci](#ropensci)
- [The Molecular Sciences Software Institute](#molssi)
- [The Imperial College London Research Software Directory](#imperial)


<a id="joss">
Expand Down Expand Up @@ -292,7 +293,7 @@ $ rse scrape --dry-run molssi
```

The [within python](#within-python) interaction is the same, except you need to
select the ropensci named parser.
select the molssi named parser.

```python
from rse.main.scrapers import get_named_scraper
Expand All @@ -302,3 +303,27 @@ scraper = get_named_scraper('molssi')
from rse.main.scrapers import MolssiScraper
scraper = MolssiScraper()
```

<a id="imperial">
### Imperial College London Research Software Directory

You can browse the directory [here](https://imperialcollegelondon.github.io/research-software-directory/)!
Note that if you want to use this as a Jekyll template (without relying on Algolia) @vsoch
has prepared a template [here](https://github.com/vsoch/search).

```bash
$ rse scrape imperial
$ rse scrape --dry-run imperial
```

The [within python](#within-python) interaction is the same, except you need to
select the right named parser.

```python
from rse.main.scrapers import get_named_scraper
scraper = get_named_scraper('imperial')

# or!
from rse.main.scrapers import ImperialCollegeLondonScraper
scraper = ImperialCollegeLondonScraper()
```
3 changes: 3 additions & 0 deletions rse/main/scrapers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from .molssi import MolssiScraper
from .rsnl import RSNLScraper
from .ropensci import ROpenSciScraper
from .imperial import ImperialCollegeLondonScraper
import re


Expand All @@ -32,6 +33,8 @@ def get_named_scraper(name, config=None):
scraper = ROpenSciScraper()
elif re.search("molssi", name, re.IGNORECASE):
scraper = MolssiScraper()
elif re.search("imperial", name, re.IGNORECASE):
scraper = ImperialCollegeLondonScraper()

if not scraper:
raise NotImplementedError(f"There is no matching scraper for {name}")
Expand Down
16 changes: 9 additions & 7 deletions rse/main/scrapers/hal.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ def __init__(self, query=None, **kwargs):
super().__init__(query)

def latest(self, paginate=False, delay=0.0):
"""populate self.results with some number of latest entries. Unlike
"""
populate self.results with some number of latest entries. Unlike
a search, a latest scraper does not by default paginate. Hal will by
default return all entries, so the user is required to define a number
for latest.
Expand All @@ -40,8 +41,9 @@ def latest(self, paginate=False, delay=0.0):
return self.scrape(url, delay=delay)

def search(self, query, paginate=True, delay=0.0):
"""populate self.results with a listing based on matching a search criteria.
we search the description.
"""
populate self.results with a listing based on matching a search criteria.
We search the description.
"""
url = (
"http://api.archives-ouvertes.fr/search/?q=%s&fq=docType_s:(SOFTWARE)&wt=json"
Expand All @@ -50,9 +52,8 @@ def search(self, query, paginate=True, delay=0.0):
return self.scrape(url, delay=delay)

def scrape(self, url, paginate=False, delay=0.0):
"""A shared function to scrape a set of repositories. Since the JoSS
pages for a search and the base are the same, we can use a shared
function.
"""
A shared function to scrape a set of repositories.
"""
# Api doesn't appear to have pagination
response = requests.get(url, headers={"User-Agent": get_user_agent()})
Expand All @@ -75,7 +76,8 @@ def scrape(self, url, paginate=False, delay=0.0):
return self.results

def create(self, database=None, config_file=None):
"""After a scrape (whether we obtain latest or a search query) we
"""
After a scrape (whether we obtain latest or a search query) we
run create to create software repositories based on results.
"""
from rse.main import Encyclopedia
Expand Down
95 changes: 95 additions & 0 deletions rse/main/scrapers/imperial.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
"""
Copyright (C) 2022 Vanessa Sochat.
This Source Code Form is subject to the terms of the
Mozilla Public License, v. 2.0. If a copy of the MPL was not distributed
with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""

from rse.logger import logger
from rse.utils.urls import get_user_agent, repository_regex
from rse.main.parsers import get_parser
import logging
import requests
import re
import csv

from .base import ScraperBase

bot = logging.getLogger("rse.main.scrapers.imperial")

# Allow the regex to have newline at end
repository_regex = repository_regex.strip("$")

csv_url = "https://raw.githubusercontent.com/ImperialCollegeLondon/research-software-directory/main/repos.csv"


class ImperialCollegeLondonScraper(ScraperBase):
    """
    Scraper for the Imperial College London Research Software Directory.

    The directory publishes its full dataset as a single CSV file on
    GitHub (see module-level csv_url), so both latest and search simply
    download and parse that CSV — there is no server-side query API.
    """

    name = "imperial"

    def __init__(self, query=None, **kwargs):
        super().__init__(query)

    def latest(self, paginate=False, delay=0.0):
        """
        Populate self.results with some number of latest entries.
        """
        return self.scrape(csv_url, delay=delay)

    def search(self, query, paginate=True, delay=0.0):
        """
        Populate self.results with a listing based on matching a search criteria.

        NOTE: the CSV endpoint has no search support, so the full
        directory is returned regardless of the query.
        """
        return self.scrape(csv_url, delay=delay)

    def scrape(self, url, paginate=False, delay=0.0):
        """
        A shared function to scrape a set of repositories.

        Downloads the directory CSV, keeps rows whose "url" column
        matches the repository regex, and appends each matching row
        (as a dict of its non-empty fields) to self.results.
        """
        import io

        # Api doesn't appear to have pagination
        response = requests.get(url, headers={"User-Agent": get_user_agent()})
        if response.status_code != 200:
            logger.exit("Could not retrieve data from %s" % url)

        # Parse from a file-like object so quoted fields that contain
        # newlines are handled correctly (naively splitting the text on
        # "\n" would break such rows apart)
        reader = csv.reader(io.StringIO(response.text))
        parsed = [row for row in reader if row]

        # Guard against an empty (or header-less) response
        if not parsed:
            return self.results

        # Lookup based on index: column name -> column position
        headers = {name: i for i, name in enumerate(parsed.pop(0))}
        for row in parsed:
            repo = row[headers["url"]]
            match = re.search(repository_regex, repo, re.IGNORECASE)
            if not match:
                continue

            # Keep only non-empty fields for this repository
            entry = {k: row[i] for k, i in headers.items() if row[i]}
            bot.info("Found repository: %s" % match.group())
            self.results.append(entry)

        return self.results

    def create(self, database=None, config_file=None):
        """
        After a scrape (whether we obtain latest or a search query) we
        run create to create software repositories based on results.
        """
        from rse.main import Encyclopedia

        client = Encyclopedia(config_file=config_file, database=database)
        for entry in self.results:
            repo = get_parser(entry["url"])

            # Add results that don't exist, labeling new entries with
            # all remaining non-empty CSV fields
            if not client.exists(repo.uid):
                client.add(repo.uid)
                for k, v in entry.items():
                    if k == "url" or not v:
                        continue
                    client.label(repo.uid, k, v, force=True)
2 changes: 1 addition & 1 deletion rse/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
"""

__version__ = "0.0.35"
__version__ = "0.0.36"
AUTHOR = "Vanessa Sochat"
AUTHOR_EMAIL = "vsoch@users.noreply.github.io"
NAME = "rse"
Expand Down
4 changes: 3 additions & 1 deletion tests/test_parser_github.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,12 @@ def test_parser_github(tmp_path):
data = parser.export()
for key in ["timestamp", "url", "html_url"]:
assert key in data



def test_org_repos(tmp_path):
"""Test the github parser to retrieve org repos."""
from rse.main.parsers import GitHubParser

parser = GitHubParser()
data = parser.get_org_repos("ropensci", paginate=False)
assert len(data) == 100

0 comments on commit 8b5b3aa

Please sign in to comment.