Skip to content

Commit

Permalink
preparing to add imperial college london research software director s…
Browse files Browse the repository at this point in the history
…craper (#58)

* preparing to add imperial college london research software directory scraper
Signed-off-by: vsoch <vsoch@users.noreply.github.com>
  • Loading branch information
vsoch committed May 12, 2022
1 parent 9bdf82b commit 8b5b3aa
Show file tree
Hide file tree
Showing 8 changed files with 139 additions and 11 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ jobs:
run: |
export PATH="/usr/share/miniconda/bin:$PATH"
source activate black
pip install black==20.8b1
pip install black
black --check rse
- name: Check imports with pyflakes
Expand Down
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ and **Merged pull requests**. Critical items to know are:
The versions coincide with releases on pip.

## [0.0.x](https://github.com/rseng/rse/tree/master) (0.0.x)
- adding imperial college london research software directory (0.0.36)
- adding molssi scraper (0.0.35)
- adding ropensci scraper (0.0.34)
- updating local index to be a data table (0.0.33)
Expand Down
27 changes: 26 additions & 1 deletion docs/_docs/getting-started/scrapers.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ and it includes links to repositories:
- [Research Software NL Dictionary](#researchsoftwarenl)
- [ROpenSci](#ropensci)
- [The Molecular Sciences Software Institute](#molssi)
- [The Imperial College London Research Software Directory](#imperial)


<a id="joss">
Expand Down Expand Up @@ -292,7 +293,7 @@ $ rse scrape --dry-run molssi
```

The [within python](#within-python) interaction is the same, except you need to
select the ropensci named parser.
select the molssi named parser.

```python
from rse.main.scrapers import get_named_scraper
Expand All @@ -302,3 +303,27 @@ scraper = get_named_scraper('molssi')
from rse.main.scrapers import MolssiScraper
scraper = MolssiScraper()
```

<a id="imperial">
### Imperial College London Research Software Directory

You can browse the directory [here](https://imperialcollegelondon.github.io/research-software-directory/)!
Note that if you want to use this as a Jekyll template (without relying on Algolia) @vsoch
has prepared a template [here](https://github.com/vsoch/search).

```bash
$ rse scrape imperial
$ rse scrape --dry-run imperial
```

The [within python](#within-python) interaction is the same, except you need to
select the right named parser.

```python
from rse.main.scrapers import get_named_scraper
scraper = get_named_scraper('imperial')

# or!
from rse.main.scrapers import ImperialCollegeLondonScraper
scraper = ImperialCollegeLondonScraper()
```
3 changes: 3 additions & 0 deletions rse/main/scrapers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from .molssi import MolssiScraper
from .rsnl import RSNLScraper
from .ropensci import ROpenSciScraper
from .imperial import ImperialCollegeLondonScraper
import re


Expand All @@ -32,6 +33,8 @@ def get_named_scraper(name, config=None):
scraper = ROpenSciScraper()
elif re.search("molssi", name, re.IGNORECASE):
scraper = MolssiScraper()
elif re.search("imperial", name, re.IGNORECASE):
scraper = ImperialCollegeLondonScraper()

if not scraper:
raise NotImplementedError(f"There is no matching scraper for {name}")
Expand Down
16 changes: 9 additions & 7 deletions rse/main/scrapers/hal.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ def __init__(self, query=None, **kwargs):
super().__init__(query)

def latest(self, paginate=False, delay=0.0):
"""populate self.results with some number of latest entries. Unlike
"""
populate self.results with some number of latest entries. Unlike
a search, a latest scraper does not by default paginate. Hal will by
default return all entries, so the user is required to define a number
for latest.
Expand All @@ -40,8 +41,9 @@ def latest(self, paginate=False, delay=0.0):
return self.scrape(url, delay=delay)

def search(self, query, paginate=True, delay=0.0):
"""populate self.results with a listing based on matching a search criteria.
we search the description.
"""
populate self.results with a listing based on matching a search criteria.
We search the description.
"""
url = (
"http://api.archives-ouvertes.fr/search/?q=%s&fq=docType_s:(SOFTWARE)&wt=json"
Expand All @@ -50,9 +52,8 @@ def search(self, query, paginate=True, delay=0.0):
return self.scrape(url, delay=delay)

def scrape(self, url, paginate=False, delay=0.0):
"""A shared function to scrape a set of repositories. Since the JoSS
pages for a search and the base are the same, we can use a shared
function.
"""
A shared function to scrape a set of repositories.
"""
# Api doesn't appear to have pagination
response = requests.get(url, headers={"User-Agent": get_user_agent()})
Expand All @@ -75,7 +76,8 @@ def scrape(self, url, paginate=False, delay=0.0):
return self.results

def create(self, database=None, config_file=None):
"""After a scrape (whether we obtain latest or a search query) we
"""
After a scrape (whether we obtain latest or a search query) we
run create to create software repositories based on results.
"""
from rse.main import Encyclopedia
Expand Down
95 changes: 95 additions & 0 deletions rse/main/scrapers/imperial.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
"""
Copyright (C) 2022 Vanessa Sochat.
This Source Code Form is subject to the terms of the
Mozilla Public License, v. 2.0. If a copy of the MPL was not distributed
with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""

from rse.logger import logger
from rse.utils.urls import get_user_agent, repository_regex
from rse.main.parsers import get_parser
import logging
import requests
import re
import csv

from .base import ScraperBase

bot = logging.getLogger("rse.main.scrapers.imperial")

# Allow the regex to have newline at end
repository_regex = repository_regex.strip("$")

csv_url = "https://raw.githubusercontent.com/ImperialCollegeLondon/research-software-directory/main/repos.csv"


class ImperialCollegeLondonScraper(ScraperBase):
    """
    Scraper for the Imperial College London Research Software Directory.

    The directory publishes its full dataset as a single CSV file on
    GitHub (see module-level csv_url), so both latest and search simply
    download and parse that CSV — there is no server-side query API.
    """

    name = "imperial"

    def __init__(self, query=None, **kwargs):
        super().__init__(query)

    def latest(self, paginate=False, delay=0.0):
        """
        Populate self.results with some number of latest entries.
        """
        return self.scrape(csv_url, delay=delay)

    def search(self, query, paginate=True, delay=0.0):
        """
        Populate self.results with a listing based on matching a search criteria.

        NOTE: the CSV endpoint has no search support, so the full
        directory is returned regardless of the query.
        """
        return self.scrape(csv_url, delay=delay)

    def scrape(self, url, paginate=False, delay=0.0):
        """
        A shared function to scrape a set of repositories.

        Downloads the directory CSV, keeps rows whose "url" column
        matches the repository regex, and appends each matching row
        (as a dict of its non-empty fields) to self.results.
        """
        import io

        # Api doesn't appear to have pagination
        response = requests.get(url, headers={"User-Agent": get_user_agent()})
        if response.status_code != 200:
            logger.exit("Could not retrieve data from %s" % url)

        # Parse from a file-like object so quoted fields that contain
        # newlines are handled correctly (naively splitting the text on
        # "\n" would break such rows apart)
        reader = csv.reader(io.StringIO(response.text))
        parsed = [row for row in reader if row]

        # Guard against an empty (or header-less) response
        if not parsed:
            return self.results

        # Lookup based on index: column name -> column position
        headers = {name: i for i, name in enumerate(parsed.pop(0))}
        for row in parsed:
            repo = row[headers["url"]]
            match = re.search(repository_regex, repo, re.IGNORECASE)
            if not match:
                continue

            # Keep only non-empty fields for this repository
            entry = {k: row[i] for k, i in headers.items() if row[i]}
            bot.info("Found repository: %s" % match.group())
            self.results.append(entry)

        return self.results

    def create(self, database=None, config_file=None):
        """
        After a scrape (whether we obtain latest or a search query) we
        run create to create software repositories based on results.
        """
        from rse.main import Encyclopedia

        client = Encyclopedia(config_file=config_file, database=database)
        for entry in self.results:
            repo = get_parser(entry["url"])

            # Add results that don't exist, labeling new entries with
            # all remaining non-empty CSV fields
            if not client.exists(repo.uid):
                client.add(repo.uid)
                for k, v in entry.items():
                    if k == "url" or not v:
                        continue
                    client.label(repo.uid, k, v, force=True)
2 changes: 1 addition & 1 deletion rse/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
"""

__version__ = "0.0.35"
__version__ = "0.0.36"
AUTHOR = "Vanessa Sochat"
AUTHOR_EMAIL = "vsoch@users.noreply.github.io"
NAME = "rse"
Expand Down
4 changes: 3 additions & 1 deletion tests/test_parser_github.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,12 @@ def test_parser_github(tmp_path):
data = parser.export()
for key in ["timestamp", "url", "html_url"]:
assert key in data



def test_org_repos(tmp_path):
"""Test the github parser to retrieve org repos."""
from rse.main.parsers import GitHubParser

parser = GitHubParser()
data = parser.get_org_repos("ropensci", paginate=False)
assert len(data) == 100

0 comments on commit 8b5b3aa

Please sign in to comment.