
Adding molssi scraper (#56)
* adding molssi scraper
Signed-off-by: vsoch <vsoch@users.noreply.github.com>
vsoch committed Mar 11, 2022
1 parent 16a3474 commit 9bdf82b
Showing 5 changed files with 188 additions and 1 deletion.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -14,6 +14,7 @@ and **Merged pull requests**. Critical items to know are:
The versions coincide with releases on pip.

## [0.0.x](https://github.com/rseng/rse/tree/master) (0.0.x)
- adding molssi scraper (0.0.35)
- adding ropensci scraper (0.0.34)
- updating local index to be a data table (0.0.33)
- move repository link to be part of card (0.0.32)
26 changes: 26 additions & 0 deletions docs/_docs/getting-started/scrapers.md
@@ -34,6 +34,7 @@ and it includes links to repositories:
- [Hal Research Software Database](#hal)
- [Research Software NL Dictionary](#researchsoftwarenl)
- [ROpenSci](#ropensci)
- [The Molecular Sciences Software Institute](#molssi)


<a id="joss">
@@ -276,3 +277,28 @@ scraper = get_named_scraper('ropensci')
from rse.main.scrapers import ROpenSciScraper
scraper = ROpenSciScraper()
```

<a id="molssi">
### MolSSI

The Molecular Sciences Software Institute (MolSSI) maintains a paginated listing of software
for computational chemistry and, more generally, molecular science at
[https://molssi.org/software-search/](https://molssi.org/software-search/), and the scraper
here makes that listing accessible. Examples of command line usage include:


```bash
$ rse scrape molssi
$ rse scrape --dry-run molssi
```

The [within python](#within-python) interaction is the same, except you need to
select the molssi named scraper.

```python
from rse.main.scrapers import get_named_scraper
scraper = get_named_scraper('molssi')

# or!
from rse.main.scrapers import MolssiScraper
scraper = MolssiScraper()
```
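
For a complete run, here is a minimal sketch (method names are taken from the
scraper class added in this commit, and the search term is just an example):

```python
from rse.main.scrapers import get_named_scraper

scraper = get_named_scraper('molssi')

# Scrape the full listing; each result is a dictionary with a
# repository "url" and, when one was found, a "doi" citation
results = scraper.latest()

# Or scrape only results that match a search term
results = scraper.search("dynamics")

# Save any repositories not already known to the local database
scraper.create()
```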
3 changes: 3 additions & 0 deletions rse/main/scrapers/__init__.py
@@ -11,6 +11,7 @@
from .biotools import BioToolsScraper
from .hal import HalScraper
from .joss import JossScraper
from .molssi import MolssiScraper
from .rsnl import RSNLScraper
from .ropensci import ROpenSciScraper
import re
@@ -29,6 +30,8 @@ def get_named_scraper(name, config=None):
scraper = RSNLScraper()
elif re.search("ropensci", name, re.IGNORECASE):
scraper = ROpenSciScraper()
elif re.search("molssi", name, re.IGNORECASE):
scraper = MolssiScraper()

if not scraper:
raise NotImplementedError(f"There is no matching scraper for {name}")
157 changes: 157 additions & 0 deletions rse/main/scrapers/molssi.py
@@ -0,0 +1,157 @@
"""
Copyright (C) 2022 Vanessa Sochat.
This Source Code Form is subject to the terms of the
Mozilla Public License, v. 2.0. If a copy of the MPL was not distributed
with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""

from rse.utils.urls import get_user_agent
from rse.main.parsers import get_parser
import logging
import requests
import random
import sys
from time import sleep

from .base import ScraperBase

bot = logging.getLogger("rse.main.scrapers.molssi")


class MolssiScraper(ScraperBase):

name = "molssi"

def __init__(self, query=None, **kwargs):
super().__init__(query)
self.baseurl = "https://api.molssi.org"

def latest(self, paginate=False, delay=0.0):
return self.scrape(paginate=paginate, delay=delay)

def search(self, query, paginate=True, delay=0.0):
return self.scrape(paginate=paginate, delay=delay, query=query)

def scrape(self, paginate=False, delay=None, query=""):
"""A shared function to scrape software from molssi."""
url = "%s/search" % self.baseurl
try:
from bs4 import BeautifulSoup
import bs4
except ImportError:
sys.exit("BeautifulSoup is required. pip install rse[scraper].")

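        # Query parameters for the MolSSI search endpoint; an empty
        # query_text (as used by latest) returns the complete listing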
data = {
"domain": "",
"languages": "[]",
"mm_filters": None,
"price": "",
"qm_filters": "{}",
"query_text": query,
}

response = requests.get(
url, params=data, headers={"User-Agent": get_user_agent()}
)
soup = BeautifulSoup(response.text, "html.parser")
contenders = soup.find_all("a", {"class": "card-link"}, href=True)
for contender in contenders:
href = contender.attrs.get("href")
if "software_detail" not in href:
continue

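            # Detail page links are relative, so prefix the API base url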
href = "%s/%s" % (self.baseurl, href)

# Sleep for a random amount of time to give a rest!
sleep(delay or random.choice(range(1, 10)) * 0.1)
response = requests.get(href, headers={"User-Agent": get_user_agent()})
meta_soup = BeautifulSoup(response.text, "html5lib")
links = meta_soup.find_all("a", href=True)

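            # Look for an explicitly labeled "Source Code" link among the
            # detail page links, keeping only GitHub or GitLab urls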
source_code = None
citation = None
            for link in links:
                if "source code" in link.text.lower():
                    source_code = link.attrs.get("href")

                # Parentheses matter here: "and" binds more tightly than
                # "or", so an unparenthesized "or" clause would test a
                # possibly-None source_code with "in" and raise a TypeError
                if source_code and ("github" in source_code or "gitlab" in source_code):
                    break

                # A labeled source code link that is not on GitHub or
                # GitLab is not one we can parse, so discard it
                if source_code and "github" not in source_code and "gitlab" not in source_code:
                    source_code = None

            # If no labeled source code link was found, fall back to the
            # first GitHub or GitLab link anywhere on the page
            if not source_code:
                for link in links:
                    href = link.attrs.get("href")
                    if href and ("github" in href or "gitlab" in href):
                        source_code = href
                        break

# We couldn't find the source code
if not source_code:
continue

# Do we have a citation?
for bolded in meta_soup.find_all("b"):
if "citation" in bolded.text.lower():
if "row" in bolded.parent.parent.attrs.get("class"):
nexts = list(bolded.parent.parent.parent.next_elements)
for next_element in nexts:

# The citation is the first link under this bolded section
if isinstance(
next_element, bs4.element.Tag
) and next_element.find_next("a"):
citation = next_element.find_next("a")
citation = citation.attrs.get("href")
break

# These are not citations!
if citation and ("mailto" in citation or citation == source_code):
citation = None

repo = None
if source_code and citation:
bot.info("Found repository %s and doi %s" % (source_code, citation))
repo = {"url": source_code, "doi": citation}
elif source_code:
bot.info("Found repository %s" % source_code)
repo = {"url": source_code}

if repo:
self.results.append(repo)

return self.results

def create(self, database=None, config_file=None):
"""After a scrape (whether we obtain latest or a search query) we
run create to create software repositories based on results.
"""
from rse.main import Encyclopedia

client = Encyclopedia(config_file=config_file, database=database)
for result in self.results:
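            # The uid is the repository path without the scheme, e.g.
            # "https://github.com/owner/repo" -> "github.com/owner/repo"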
uid = result["url"].split("//")[-1]

            # Skip any repository that we don't have a parser for
try:
repo = get_parser(uid)
except NotImplementedError as exc:
bot.warning(exc)
continue

# Add results that don't exist
if not client.exists(repo.uid):
client.add(repo.uid)
client.label(repo.uid, key="doi", value=result.get("doi"))
2 changes: 1 addition & 1 deletion rse/version.py
@@ -8,7 +8,7 @@
"""

-__version__ = "0.0.34"
+__version__ = "0.0.35"
AUTHOR = "Vanessa Sochat"
AUTHOR_EMAIL = "vsoch@users.noreply.github.io"
NAME = "rse"
