Add ropensci scraper (#55)
* work to add ropensci as new scraper
Signed-off-by: vsoch <vsoch@users.noreply.github.com>
vsoch committed Jan 29, 2022
1 parent 2580b0d commit 16a3474
Showing 16 changed files with 270 additions and 25 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -14,6 +14,7 @@ and **Merged pull requests**. Critical items to know are:
The versions coincide with releases on pip.

## [0.0.x](https://github.com/rseng/rse/tree/master) (0.0.x)
- adding ropensci scraper (0.0.34)
- updating local index to be a data table (0.0.33)
- move repository link to be part of card (0.0.32)
- ipython should not be required for shell (0.0.31)
41 changes: 41 additions & 0 deletions docs/_docs/getting-started/scrapers.md
@@ -33,6 +33,7 @@ and it includes links to repositories:
- [bio.tools](#biotools)
- [Hal Research Software Database](#hal)
- [Research Software NL Dictionary](#researchsoftwarenl)
- [ROpenSci](#ropensci)


<a id="joss">
@@ -235,3 +236,43 @@ scraper = get_named_scraper('rsnl')
from rse.main.scrapers import RSNLScraper
scraper = RSNLScraper()
```


<a id="ropensci">
### ROpenSci

The [ROpenSci](https://github.com/ropensci/) GitHub organization includes peer-reviewed
software that is rendered at [https://docs.ropensci.org](https://docs.ropensci.org).
We scrape it by parsing the ROpenSci GitHub organization listing (either the latest
page or the full listing) and comparing it against the [registry.json](https://github.com/ropensci/roregistry/blob/gh-pages/registry.json)
file. This means that we:

1. Start with the GitHub organization listing and skip any repository not in the registry.
2. Update the metadata with topics (and a description, if not already defined) from the registry.
3. For a full parsing (not just the latest) add any registry entries not seen on GitHub (e.g., repositories in other organizations).
4. For a "latest" parsing, skip this extra registry-only list.

This seems to do a fairly good job of capturing the bulk of ROpenSci repos!
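
For illustration, here is a rough sketch of the matching step described above, condensed
from the scraper added in this commit (the `merge_registry_metadata` helper name is
hypothetical):

```python
import requests

REGISTRY_URL = (
    "https://raw.githubusercontent.com/ropensci/roregistry/gh-pages/registry.json"
)


def read_registry():
    """Build a lookup of registry entries keyed by their GitHub URL."""
    packages = requests.get(REGISTRY_URL).json().get("packages", [])
    return {entry["github"]: entry for entry in packages if entry.get("github")}


def merge_registry_metadata(repo, lookup):
    """Skip repos not in the registry; otherwise merge in topics and description."""
    meta = lookup.get(repo["html_url"])
    if meta is None:
        return None
    topics = [x.strip() for x in meta.get("keywords", "").split(",") if x.strip()]
    if meta.get("ropensci_category"):
        topics.append(meta["ropensci_category"])
    repo.setdefault("topics", [])
    repo["topics"] += [t for t in topics if t not in repo["topics"]]
    if not repo.get("description") and meta.get("description"):
        repo["description"] = meta["description"]
    return repo
```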
The "latest" scrape looks like this:

```bash
$ rse scrape ropensci
```

To do a dry run:

```bash
$ rse scrape --dry-run ropensci
```

The [within python](#within-python) interaction is the same, except that you need to
select the ropensci named scraper.

```python
from rse.main.scrapers import get_named_scraper
scraper = get_named_scraper('ropensci')
```
```python
from rse.main.scrapers import ROpenSciScraper
scraper = ROpenSciScraper()
```
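
If you want to run the whole flow from Python, the scraper methods added in this commit
(`latest`, `search`, and `create`) can be chained; the `paginate` and `delay` values
below are only illustrative:

```python
from rse.main.scrapers import ROpenSciScraper

scraper = ROpenSciScraper()

# Scrape only the top page of the ropensci organization listing...
scraper.latest()

# ...or do a full scrape that paginates and also adds registry-only repositories
# scraper.search(paginate=True, delay=0.2)

# Create software repositories from the scraped results (existing entries are skipped)
scraper.create()
```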
2 changes: 1 addition & 1 deletion rse/app/templates/topics/index.html
@@ -29,8 +29,8 @@ <h1 style="color:white">Research Software Encyclopedia</h1>
<tr>
<th>Repository</th>
<th>Description</th>
<th>✏️ Taxonomy</th>
<th>✏️ Criteria</th>
<th>✏️ Taxonomy</th>
<th>Topics</th>
</tr>
</thead>
1 change: 0 additions & 1 deletion rse/client/shell.py
@@ -10,7 +10,6 @@

from rse.main import Encyclopedia
from rse.defaults import RSE_SHELL
import sys


def main(args, extra):
1 change: 0 additions & 1 deletion rse/client/topics.py
@@ -9,7 +9,6 @@
"""

from rse.main import Encyclopedia
from rse.logger import bot


def main(args, extra):
4 changes: 2 additions & 2 deletions rse/main/__init__.py
@@ -127,10 +127,10 @@ def bulk_update(self, filename, rewrite=False):
pass
return repos

def add(self, uid, quiet=False):
def add(self, uid, quiet=False, data=None):
"""A wrapper to add a repository to the software database."""
if not self.exists(uid):
repo = self.db.add(uid)
repo = self.db.add(uid, data=data)
return repo
if not quiet:
bot.error(f"{uid} already exists in the database.")
8 changes: 6 additions & 2 deletions rse/main/database/filesystem.py
@@ -77,11 +77,15 @@ def exists(self, uid):
except:
return False

def add(self, uid):
def add(self, uid, data=None):
"""Add a new software repository to the database."""
if uid:
parser = get_parser(uid, config=self.config)
data = parser.get_metadata()

if not data:
data = parser.get_metadata()
else:
parser.data = data

# If it's a parser handoff
if isinstance(data, ParserBase):
8 changes: 6 additions & 2 deletions rse/main/database/relational.py
@@ -97,13 +97,17 @@ def clear(self):

# Add or Update requires executor

def add(self, uid):
def add(self, uid, data=None):
"""Create a new repo based on a uid that matches to a parser."""
from rse.main.database.models import SoftwareRepository

parser = get_parser(uid, config=self.config)
if not self.exists(parser.uid):
data = parser.get_metadata()

if not data:
data = parser.get_metadata()
else:
parser.data = data

# If it's a parser handoff
if isinstance(data, ParserBase):
75 changes: 65 additions & 10 deletions rse/main/parsers/github.py
@@ -9,8 +9,10 @@
"""

import logging
import random
import requests
from rse.utils.urls import check_response
from time import sleep
from rse.utils.urls import get_user_agent, check_response

from .base import ParserBase

@@ -52,6 +54,39 @@ def get_description(self, data=None):
data = data or self.data
return data.get("description")

def get_org_repos(self, org, paginate=True, delay=None):
"""
A helper function to get a listing of org repos.
"""
self.load_secrets()
url = "https://api.github.com/orgs/%s/repos?per_page=100" % (org)
headers = {
"Accept": "application/vnd.github.symmetra-preview+json",
"User-Agent": get_user_agent(),
}
if self.token:
headers["Authorization"] = "token %s" % self.token

repos = []

# Start at 2, as 1 is implied to be the first
page = 2
original_url = url
while url is not None:
response = requests.get(url, headers=headers)
data = check_response(response)

# Reset the url to be None
url = None
if data and paginate:
url = original_url + "&page=%s" % page

repos = repos + data
page += 1
# Sleep for a random amount of time to give a rest!
sleep(delay or random.choice(range(1, 10)) * 0.1)
return repos

def get_metadata(self, uri=None):
"""Retrieve repository metadata. The common metadata (timestamp) is
added by the software repository parser, and here we need to
@@ -78,8 +113,22 @@ def get_metadata(self, uri=None):
if data is None:
return None

self.data = self.parse_github_repo(data)
return self.data

def parse_github_repo(self, repo):
"""
Given an API response for a GitHub repository, parse a minimal set.
"""
self.load_secrets()
headers = {
"Accept": "application/vnd.github.symmetra-preview+json",
}
if self.token:
headers["Authorization"] = "token %s" % self.token

# Only save minimal set
self.data = {}
data = {}
for key in [
"name",
"url",
@@ -99,20 +148,26 @@ def get_metadata(self, uri=None):
"license",
"subscribers_count",
]:
if key in data:
self.data[key] = data[key]
self.data["owner"] = {}
if key in repo:
data[key] = repo[key]
data["owner"] = {}
for key in ["html_url", "avatar_url", "login", "type"]:
self.data["owner"][key] = data["owner"][key]
data["owner"][key] = repo["owner"][key]

# Also try to get topics
headers.update({"Accept": "application/vnd.github.mercy-preview+json"})
url = "%s/topics" % url
url = "%s/topics" % repo["url"]
response = requests.get(url, headers=headers)

# Successful query!
# Add topics on successful query
topics = check_response(response)
if topics is not None:
self.data["topics"] = topics.get("names", [])
data["topics"] = topics.get("names", [])

return self.data
# Add topics from another source
if "topics" not in data and "topics" in repo:
data["topics"] = repo["topics"]
elif "topics" in data and "topics" in repo:
data["topics"] += [x for x in repo["topics"] if x not in data["topics"]]

return data
3 changes: 3 additions & 0 deletions rse/main/scrapers/__init__.py
@@ -12,6 +12,7 @@
from .hal import HalScraper
from .joss import JossScraper
from .rsnl import RSNLScraper
from .ropensci import ROpenSciScraper
import re


@@ -26,6 +27,8 @@ def get_named_scraper(name, config=None):
scraper = HalScraper()
elif re.search("(researchsoftwarenl|rsnl)", name, re.IGNORECASE):
scraper = RSNLScraper()
elif re.search("ropensci", name, re.IGNORECASE):
scraper = ROpenSciScraper()

if not scraper:
raise NotImplementedError(f"There is no matching scraper for {name}")
132 changes: 132 additions & 0 deletions rse/main/scrapers/ropensci.py
@@ -0,0 +1,132 @@
"""
Copyright (C) 2022 Vanessa Sochat.
This Source Code Form is subject to the terms of the
Mozilla Public License, v. 2.0. If a copy of the MPL was not distributed
with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""

from rse.main.parsers import GitHubParser
import logging
import requests

from .base import ScraperBase

bot = logging.getLogger("rse.main.scrapers.ropensci")


class ROpenSciScraper(ScraperBase):

name = "ropensci"

def __init__(self, query=None, **kwargs):
super().__init__(query)

def latest(self, paginate=False, delay=0.0):
"""
Only return latest (top page) of results (default is sort by created)
"""
return self.scrape(paginate=paginate, delay=delay)

def search(self, query=None, paginate=True, delay=0.0):
"""
Return all paginated results with search (no query accepted for now)
"""
return self.scrape(paginate=paginate, delay=delay)

def read_registry(self):
"""
Read the registry file to create a lookup of repos based on GitHub URL.
"""
lookup = {}
response = requests.get(
"https://raw.githubusercontent.com/ropensci/roregistry/gh-pages/registry.json"
).json()
for entry in response.get("packages", []):
# This is the GitHub URL
url = entry.get("github")
if not url:
continue
lookup[url] = entry
return lookup

def get_registry_topics(self, meta):
"""
Given a metadata entry from the registry lookup, parse and return topics
"""
topics = [x.strip() for x in meta.get("keywords", "").split(",") if x.strip()]
if meta.get("ropensci_category"):
topics += [meta.get("ropensci_category")]
return topics

def scrape(self, paginate=False, delay=None):
"""A shared function to scrape a set of repositories. Since the JoSS
pages for a search and the base are the same, we can use a shared
function.
"""
# This serves as 1: a lookup, and 2: to find non ropensci repos!
lookup = self.read_registry()

# Full GitHub urls that we expect to find!
names = set(lookup.keys())

parser = GitHubParser()
repos = parser.get_org_repos("ropensci", paginate=paginate, delay=delay)

for entry in repos:

# We determine belonging based on the github url
if entry["html_url"] not in names:
bot.info("Skipping repository: %s" % entry["html_url"])
continue

meta = lookup[entry["html_url"]]

# Get topics from R metadata
topics = self.get_registry_topics(meta)
names.remove(entry["html_url"])
if "topics" not in entry:
entry["topics"] = []
entry["topics"] += topics

# Add a description if missing
if not entry.get("description") and meta.get("description"):
entry["description"] = meta["description"]

bot.info("Adding repository: %s" % entry["html_url"])
self.results.append(parser.parse_github_repo(entry))

# If paginate is True, we intend to add ALL repos, so check those still remaining
# E.g., there are repos in other orgs that won't be found above
if paginate and names:
for name in names:

parser = GitHubParser(uid=name)
# Topics will be added here!
entry = parser.get_metadata()
meta = lookup[name]

# Add a description if missing
if not entry.get("description") and meta.get("description"):
entry["description"] = meta["description"]
entry["topics"] += self.get_registry_topics(meta)
bot.info("Adding repository: %s" % entry["html_url"])
self.results.append(entry)

return self.results

def create(self, database=None, config_file=None):
"""After a scrape (whether we obtain latest or a search query) we
run create to create software repositories based on results.
"""
from rse.main import Encyclopedia

client = Encyclopedia(config_file=config_file, database=database)
for result in self.results:
uid = result["html_url"].split("//")[-1]

# Add results that don't exist
if not client.exists(uid):
client.add(uid, data=result)
