
Adding molssi scraper (#56)
* adding molssi scraper
Signed-off-by: vsoch <vsoch@users.noreply.github.com>
vsoch committed Mar 11, 2022
1 parent 16a3474 commit 9bdf82b
Showing 5 changed files with 188 additions and 1 deletion.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -14,6 +14,7 @@ and **Merged pull requests**. Critical items to know are:
The versions coincide with releases on pip.

## [0.0.x](https://github.com/rseng/rse/tree/master) (0.0.x)
- adding molssi scraper (0.0.35)
- adding ropensci scraper (0.0.34)
- updating local index to be a data table (0.0.33)
- move repository link to be part of card (0.0.32)
26 changes: 26 additions & 0 deletions docs/_docs/getting-started/scrapers.md
@@ -34,6 +34,7 @@ and it includes links to repositories:
- [Hal Research Software Database](#hal)
- [Research Software NL Dictionary](#researchsoftwarenl)
- [ROpenSci](#ropensci)
- [The Molecular Sciences Software Institute](#molssi)


<a id="joss">
@@ -276,3 +277,28 @@ scraper = get_named_scraper('ropensci')
from rse.main.scrapers import ROpenSciScraper
scraper = ROpenSciScraper()
```

<a id="molssi">
### MolSSI

The Molecular Sciences Software Institute (MolSSI) maintains a paginated listing of software
for computational chemistry and, more generally, molecular science at
[https://molssi.org/software-search/](https://molssi.org/software-search/), and the scraper
here makes that listing accessible. Examples of command line usage include:


```bash
$ rse scrape molssi
$ rse scrape --dry-run molssi
```

The [within python](#within-python) interaction is the same, except you need to
select the molssi named scraper.

```python
from rse.main.scrapers import get_named_scraper
scraper = get_named_scraper('molssi')

# or!
from rse.main.scrapers import MolssiScraper
scraper = MolssiScraper()
```
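
For a complete run, here is a minimal sketch (method names are taken from the
scraper class added in this commit, and the search term is just an example):

```python
from rse.main.scrapers import get_named_scraper

scraper = get_named_scraper('molssi')

# Scrape the full listing; each result is a dictionary with a
# repository "url" and, when one was found, a "doi" citation
results = scraper.latest()

# Or scrape only results that match a search term
results = scraper.search("dynamics")

# Save any repositories not already known to the local database
scraper.create()
```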
3 changes: 3 additions & 0 deletions rse/main/scrapers/__init__.py
@@ -11,6 +11,7 @@
from .biotools import BioToolsScraper
from .hal import HalScraper
from .joss import JossScraper
from .molssi import MolssiScraper
from .rsnl import RSNLScraper
from .ropensci import ROpenSciScraper
import re
@@ -29,6 +30,8 @@ def get_named_scraper(name, config=None):
scraper = RSNLScraper()
elif re.search("ropensci", name, re.IGNORECASE):
scraper = ROpenSciScraper()
elif re.search("molssi", name, re.IGNORECASE):
scraper = MolssiScraper()

if not scraper:
raise NotImplementedError(f"There is no matching scraper for {name}")
157 changes: 157 additions & 0 deletions rse/main/scrapers/molssi.py
@@ -0,0 +1,157 @@
"""
Copyright (C) 2022 Vanessa Sochat.
This Source Code Form is subject to the terms of the
Mozilla Public License, v. 2.0. If a copy of the MPL was not distributed
with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""

from rse.utils.urls import get_user_agent
from rse.main.parsers import get_parser
import logging
import requests
import random
import sys
from time import sleep

from .base import ScraperBase

bot = logging.getLogger("rse.main.scrapers.molssi")


class MolssiScraper(ScraperBase):

name = "molssi"

def __init__(self, query=None, **kwargs):
super().__init__(query)
self.baseurl = "https://api.molssi.org"

def latest(self, paginate=False, delay=0.0):
return self.scrape(paginate=paginate, delay=delay)

def search(self, query, paginate=True, delay=0.0):
return self.scrape(paginate=paginate, delay=delay, query=query)

def scrape(self, paginate=False, delay=None, query=""):
"""A shared function to scrape software from molssi."""
url = "%s/search" % self.baseurl
try:
from bs4 import BeautifulSoup
import bs4
except ImportError:
sys.exit("BeautifulSoup is required. pip install rse[scraper].")

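        # Query parameters for the MolSSI search endpoint; an empty
        # query_text (as used by latest) returns the complete listing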
data = {
"domain": "",
"languages": "[]",
"mm_filters": None,
"price": "",
"qm_filters": "{}",
"query_text": query,
}

response = requests.get(
url, params=data, headers={"User-Agent": get_user_agent()}
)
soup = BeautifulSoup(response.text, "html.parser")
contenders = soup.find_all("a", {"class": "card-link"}, href=True)
for contender in contenders:
href = contender.attrs.get("href")
if "software_detail" not in href:
continue

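            # Detail page links are relative, so prefix the API base url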
href = "%s/%s" % (self.baseurl, href)

# Sleep for a random amount of time to give a rest!
sleep(delay or random.choice(range(1, 10)) * 0.1)
response = requests.get(href, headers={"User-Agent": get_user_agent()})
meta_soup = BeautifulSoup(response.text, "html5lib")
links = meta_soup.find_all("a", href=True)

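            # Look for an explicitly labeled "Source Code" link among the
            # detail page links, keeping only GitHub or GitLab urls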
source_code = None
citation = None
            for link in links:
                if "source code" in link.text.lower():
                    source_code = link.attrs.get("href")

                # Parentheses matter here: "and" binds more tightly than
                # "or", so an unparenthesized "or" clause would test a
                # possibly-None source_code with "in" and raise a TypeError
                if source_code and ("github" in source_code or "gitlab" in source_code):
                    break

                # A labeled source code link that is not on GitHub or
                # GitLab is not one we can parse, so discard it
                if source_code and "github" not in source_code and "gitlab" not in source_code:
                    source_code = None

            # If no labeled source code link was found, fall back to the
            # first GitHub or GitLab link anywhere on the page
            if not source_code:
                for link in links:
                    href = link.attrs.get("href")
                    if href and ("github" in href or "gitlab" in href):
                        source_code = href
                        break

# We couldn't find the source code
if not source_code:
continue

# Do we have a citation?
for bolded in meta_soup.find_all("b"):
if "citation" in bolded.text.lower():
if "row" in bolded.parent.parent.attrs.get("class"):
nexts = list(bolded.parent.parent.parent.next_elements)
for next_element in nexts:

# The citation is the first link under this bolded section
if isinstance(
next_element, bs4.element.Tag
) and next_element.find_next("a"):
citation = next_element.find_next("a")
citation = citation.attrs.get("href")
break

# These are not citations!
if citation and ("mailto" in citation or citation == source_code):
citation = None

repo = None
if source_code and citation:
bot.info("Found repository %s and doi %s" % (source_code, citation))
repo = {"url": source_code, "doi": citation}
elif source_code:
bot.info("Found repository %s" % source_code)
repo = {"url": source_code}

if repo:
self.results.append(repo)

return self.results

def create(self, database=None, config_file=None):
"""After a scrape (whether we obtain latest or a search query) we
run create to create software repositories based on results.
"""
from rse.main import Encyclopedia

client = Encyclopedia(config_file=config_file, database=database)
for result in self.results:
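            # The uid is the repository path without the scheme, e.g.
            # "https://github.com/owner/repo" -> "github.com/owner/repo"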
uid = result["url"].split("//")[-1]

            # Skip any repository that we don't have a parser for
try:
repo = get_parser(uid)
except NotImplementedError as exc:
bot.warning(exc)
continue

# Add results that don't exist
if not client.exists(repo.uid):
client.add(repo.uid)
client.label(repo.uid, key="doi", value=result.get("doi"))
2 changes: 1 addition & 1 deletion rse/version.py
@@ -8,7 +8,7 @@
"""

-__version__ = "0.0.34"
+__version__ = "0.0.35"
AUTHOR = "Vanessa Sochat"
AUTHOR_EMAIL = "vsoch@users.noreply.github.io"
NAME = "rse"
