Add ropensci scraper (#55)
* work to add ropensci as new scraper
Signed-off-by: vsoch <vsoch@users.noreply.github.com>
vsoch committed Jan 29, 2022
1 parent 2580b0d commit 16a3474
Showing 16 changed files with 270 additions and 25 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -14,6 +14,7 @@ and **Merged pull requests**. Critical items to know are:
The versions coincide with releases on pip.

## [0.0.x](https://github.com/rseng/rse/tree/master) (0.0.x)
- adding ropensci scraper (0.0.34)
- updating local index to be a data table (0.0.33)
- move repository link to be part of card (0.0.32)
- ipython should not be required for shell (0.0.31)
41 changes: 41 additions & 0 deletions docs/_docs/getting-started/scrapers.md
@@ -33,6 +33,7 @@ and it includes links to repositories:
- [bio.tools](#biotools)
- [Hal Research Software Database](#hal)
- [Research Software NL Dictionary](#researchsoftwarenl)
- [ROpenSci](#ropensci)


<a id="joss">
@@ -235,3 +236,43 @@ scraper = get_named_scraper('rsnl')
from rse.main.scrapers import RSNLScraper
scraper = RSNLScraper()
```


<a id="ropensci">
### ROpenSci

The [ROpenSci](https://github.com/ropensci/) GitHub organization includes peer-reviewed
software that is rendered at [https://docs.ropensci.org](https://docs.ropensci.org).
We scrape it by parsing the ROpenSci GitHub organization listing (either the latest
page or the full listing) and comparing it against the [registry.json](https://github.com/ropensci/roregistry/blob/gh-pages/registry.json)
file. This means that we:

1. Start with the GitHub organization listing and skip any repository not in the registry.
2. Update the metadata with topics (and a description, if not already defined) from the registry.
3. For a full parsing (not just the latest) add any registry entries not seen on GitHub (e.g., repositories in other organizations).
4. For a "latest" parsing, skip this extra registry-only list.

This seems to do a fairly good job of capturing the bulk of ROpenSci repos!
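
For illustration, here is a rough sketch of the matching step described above, condensed
from the scraper added in this commit (the `merge_registry_metadata` helper name is
hypothetical):

```python
import requests

REGISTRY_URL = (
    "https://raw.githubusercontent.com/ropensci/roregistry/gh-pages/registry.json"
)


def read_registry():
    """Build a lookup of registry entries keyed by their GitHub URL."""
    packages = requests.get(REGISTRY_URL).json().get("packages", [])
    return {entry["github"]: entry for entry in packages if entry.get("github")}


def merge_registry_metadata(repo, lookup):
    """Skip repos not in the registry; otherwise merge in topics and description."""
    meta = lookup.get(repo["html_url"])
    if meta is None:
        return None
    topics = [x.strip() for x in meta.get("keywords", "").split(",") if x.strip()]
    if meta.get("ropensci_category"):
        topics.append(meta["ropensci_category"])
    repo.setdefault("topics", [])
    repo["topics"] += [t for t in topics if t not in repo["topics"]]
    if not repo.get("description") and meta.get("description"):
        repo["description"] = meta["description"]
    return repo
```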
The "latest" scrape looks like this:

```bash
$ rse scrape ropensci
```

To do a dry run:

```bash
$ rse scrape --dry-run ropensci
```

The [within python](#within-python) interaction is the same, except that you need to
select the ropensci named scraper.

```python
from rse.main.scrapers import get_named_scraper
scraper = get_named_scraper('ropensci')
```
```python
from rse.main.scrapers import ROpenSciScraper
scraper = ROpenSciScraper()
```
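
If you want to run the whole flow from Python, the scraper methods added in this commit
(`latest`, `search`, and `create`) can be chained; the `paginate` and `delay` values
below are only illustrative:

```python
from rse.main.scrapers import ROpenSciScraper

scraper = ROpenSciScraper()

# Scrape only the top page of the ropensci organization listing...
scraper.latest()

# ...or do a full scrape that paginates and also adds registry-only repositories
# scraper.search(paginate=True, delay=0.2)

# Create software repositories from the scraped results (existing entries are skipped)
scraper.create()
```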
2 changes: 1 addition & 1 deletion rse/app/templates/topics/index.html
@@ -29,8 +29,8 @@ <h1 style="color:white">Research Software Encyclopedia</h1>
<tr>
<th>Repository</th>
<th>Description</th>
<th>✏️ Taxonomy</th>
<th>✏️ Criteria</th>
<th>✏️ Taxonomy</th>
<th>Topics</th>
</tr>
</thead>
1 change: 0 additions & 1 deletion rse/client/shell.py
@@ -10,7 +10,6 @@

from rse.main import Encyclopedia
from rse.defaults import RSE_SHELL
import sys


def main(args, extra):
1 change: 0 additions & 1 deletion rse/client/topics.py
@@ -9,7 +9,6 @@
"""

from rse.main import Encyclopedia
from rse.logger import bot


def main(args, extra):
4 changes: 2 additions & 2 deletions rse/main/__init__.py
@@ -127,10 +127,10 @@ def bulk_update(self, filename, rewrite=False):
pass
return repos

def add(self, uid, quiet=False):
def add(self, uid, quiet=False, data=None):
"""A wrapper to add a repository to the software database."""
if not self.exists(uid):
repo = self.db.add(uid)
repo = self.db.add(uid, data=data)
return repo
if not quiet:
bot.error(f"{uid} already exists in the database.")
8 changes: 6 additions & 2 deletions rse/main/database/filesystem.py
@@ -77,11 +77,15 @@ def exists(self, uid):
except:
return False

def add(self, uid):
def add(self, uid, data=None):
"""Add a new software repository to the database."""
if uid:
parser = get_parser(uid, config=self.config)
data = parser.get_metadata()

if not data:
data = parser.get_metadata()
else:
parser.data = data

# If it's a parser handoff
if isinstance(data, ParserBase):
8 changes: 6 additions & 2 deletions rse/main/database/relational.py
@@ -97,13 +97,17 @@ def clear(self):

# Add or Update requires executor

def add(self, uid):
def add(self, uid, data=None):
"""Create a new repo based on a uid that matches to a parser."""
from rse.main.database.models import SoftwareRepository

parser = get_parser(uid, config=self.config)
if not self.exists(parser.uid):
data = parser.get_metadata()

if not data:
data = parser.get_metadata()
else:
parser.data = data

# If it's a parser handoff
if isinstance(data, ParserBase):
75 changes: 65 additions & 10 deletions rse/main/parsers/github.py
@@ -9,8 +9,10 @@
"""

import logging
import random
import requests
from rse.utils.urls import check_response
from time import sleep
from rse.utils.urls import get_user_agent, check_response

from .base import ParserBase

@@ -52,6 +54,39 @@ def get_description(self, data=None):
data = data or self.data
return data.get("description")

def get_org_repos(self, org, paginate=True, delay=None):
"""
A helper function to get a listing of org repos.
"""
self.load_secrets()
url = "https://api.github.com/orgs/%s/repos?per_page=100" % (org)
headers = {
"Accept": "application/vnd.github.symmetra-preview+json",
"User-Agent": get_user_agent(),
}
if self.token:
headers["Authorization"] = "token %s" % self.token

repos = []

# Start at 2, as 1 is implied to be the first
page = 2
original_url = url
while url is not None:
response = requests.get(url, headers=headers)
data = check_response(response)

# Reset the url to be None
url = None
if data and paginate:
url = original_url + "&page=%s" % page

repos = repos + data
page += 1
# Sleep for a random amount of time to give a rest!
sleep(delay or random.choice(range(1, 10)) * 0.1)
return repos

def get_metadata(self, uri=None):
"""Retrieve repository metadata. The common metadata (timestamp) is
added by the software repository parser, and here we need to
@@ -78,8 +113,22 @@ def get_metadata(self, uri=None):
if data is None:
return None

self.data = self.parse_github_repo(data)
return self.data

def parse_github_repo(self, repo):
"""
Given an API response for a GitHub repository, parse a minimal set.
"""
self.load_secrets()
headers = {
"Accept": "application/vnd.github.symmetra-preview+json",
}
if self.token:
headers["Authorization"] = "token %s" % self.token

# Only save minimal set
self.data = {}
data = {}
for key in [
"name",
"url",
@@ -99,20 +148,26 @@ def get_metadata(self, uri=None):
"license",
"subscribers_count",
]:
if key in data:
self.data[key] = data[key]
self.data["owner"] = {}
if key in repo:
data[key] = repo[key]
data["owner"] = {}
for key in ["html_url", "avatar_url", "login", "type"]:
self.data["owner"][key] = data["owner"][key]
data["owner"][key] = repo["owner"][key]

# Also try to get topics
headers.update({"Accept": "application/vnd.github.mercy-preview+json"})
url = "%s/topics" % url
url = "%s/topics" % repo["url"]
response = requests.get(url, headers=headers)

# Successful query!
# Add topics on successful query
topics = check_response(response)
if topics is not None:
self.data["topics"] = topics.get("names", [])
data["topics"] = topics.get("names", [])

return self.data
# Add topics from another source
if "topics" not in data and "topics" in repo:
data["topics"] = repo["topics"]
elif "topics" in data and "topics" in repo:
data["topics"] += [x for x in repo["topics"] if x not in data["topics"]]

return data
3 changes: 3 additions & 0 deletions rse/main/scrapers/__init__.py
@@ -12,6 +12,7 @@
from .hal import HalScraper
from .joss import JossScraper
from .rsnl import RSNLScraper
from .ropensci import ROpenSciScraper
import re


@@ -26,6 +27,8 @@ def get_named_scraper(name, config=None):
scraper = HalScraper()
elif re.search("(researchsoftwarenl|rsnl)", name, re.IGNORECASE):
scraper = RSNLScraper()
elif re.search("ropensci", name, re.IGNORECASE):
scraper = ROpenSciScraper()

if not scraper:
raise NotImplementedError(f"There is no matching scraper for {name}")
132 changes: 132 additions & 0 deletions rse/main/scrapers/ropensci.py
@@ -0,0 +1,132 @@
"""
Copyright (C) 2022 Vanessa Sochat.
This Source Code Form is subject to the terms of the
Mozilla Public License, v. 2.0. If a copy of the MPL was not distributed
with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""

from rse.main.parsers import GitHubParser
import logging
import requests

from .base import ScraperBase

bot = logging.getLogger("rse.main.scrapers.ropensci")


class ROpenSciScraper(ScraperBase):

name = "ropensci"

def __init__(self, query=None, **kwargs):
super().__init__(query)

def latest(self, paginate=False, delay=0.0):
"""
Only return latest (top page) of results (default is sort by created)
"""
return self.scrape(paginate=paginate, delay=delay)

def search(self, query=None, paginate=True, delay=0.0):
"""
Return all paginated results with search (no query accepted for now)
"""
return self.scrape(paginate=paginate, delay=delay)

def read_registry(self):
"""
Read the registry file to create a lookup of repos based on GitHub URL.
"""
lookup = {}
response = requests.get(
"https://raw.githubusercontent.com/ropensci/roregistry/gh-pages/registry.json"
).json()
for entry in response.get("packages", []):
# This is the GitHub URL
url = entry.get("github")
if not url:
continue
lookup[url] = entry
return lookup

def get_registry_topics(self, meta):
"""
Given a metadata entry from the registry lookup, parse and return topics
"""
topics = [x.strip() for x in meta.get("keywords", "").split(",") if x.strip()]
if meta.get("ropensci_category"):
topics += [meta.get("ropensci_category")]
return topics

def scrape(self, paginate=False, delay=None):
"""A shared function to scrape a set of repositories. Since the JoSS
pages for a search and the base are the same, we can use a shared
function.
"""
# This serves as 1: a lookup, and 2: to find non ropensci repos!
lookup = self.read_registry()

# Full GitHub urls that we expect to find!
names = set(lookup.keys())

parser = GitHubParser()
repos = parser.get_org_repos("ropensci", paginate=paginate, delay=delay)

for entry in repos:

# We determine belonging based on the github url
if entry["html_url"] not in names:
bot.info("Skipping repository: %s" % entry["html_url"])
continue

meta = lookup[entry["html_url"]]

# Get topics from R metadata
topics = self.get_registry_topics(meta)
names.remove(entry["html_url"])
if "topics" not in entry:
entry["topics"] = []
entry["topics"] += topics

# Add a description if missing
if not entry.get("description") and meta.get("description"):
entry["description"] = meta["description"]

bot.info("Adding repository: %s" % entry["html_url"])
self.results.append(parser.parse_github_repo(entry))

# If paginate is True, we intend to add ALL repos, so check those still remaining
# E.g., there are repos in other orgs that won't be found above
if paginate and names:
for name in names:

parser = GitHubParser(uid=name)
# Topics will be added here!
entry = parser.get_metadata()
meta = lookup[name]

# Add a description if missing
if not entry.get("description") and meta.get("description"):
entry["description"] = meta["description"]
entry["topics"] += self.get_registry_topics(meta)
bot.info("Adding repository: %s" % entry["html_url"])
self.results.append(entry)

return self.results

def create(self, database=None, config_file=None):
"""After a scrape (whether we obtain latest or a search query) we
run create to create software repositories based on results.
"""
from rse.main import Encyclopedia

client = Encyclopedia(config_file=config_file, database=database)
for result in self.results:
uid = result["html_url"].split("//")[-1]

# Add results that don't exist
if not client.exists(uid):
client.add(uid, data=result)
