In [None]:
import random
import time
import bs4
import json
from tqdm import tqdm
from pymongo import MongoClient
import copy
from playwright.sync_api import sync_playwright

The current crawler does not work in Google Colab because it uses `playwright.sync_api`. It could be rewritten as a profile scraper that uses plain GET requests, but in practice that approach gets blocked very quickly. Several ways of avoiding blocking were tried; using Playwright is the one that did not get blocked at all.

The crawler saves its results to MongoDB, but it can be modified to use a different storage backend.

Currently we wait a random amount of time between requests so that we do not get blocked by the server.


In [None]:
class GoogleScholarScrapper:
    """Fill in missing publication abstracts by scraping Google Scholar.

    Each citation page is rendered in a headless Chromium browser through
    Playwright, the abstract text is extracted with BeautifulSoup, and the
    updated author record is written to a MongoDB collection.
    """

    # Extra HTTP headers sent with every page request.
    USER_AGENT = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    }

    # Marker stored in the "abstract" field when no abstract is available.
    # It is the *string* "None" (what ``str(None)`` produces), not ``None``.
    MISSING_ABSTRACT = "None"

    def __init__(self, db_name, collection_name, host="127.0.0.1", port=27017):
        """Connect to MongoDB and select the target collection.

        Args:
            db_name: Name of the MongoDB database.
            collection_name: Name of the collection that receives the records.
            host: MongoDB host; defaults to the local machine.
            port: MongoDB port; defaults to 27017.
        """
        self.client = MongoClient(host, port)
        self.db = self.client[db_name]
        self.collection = self.db[collection_name]

    def __get_html_content(self, url):
        """Render ``url`` in headless Chromium and return the page HTML.

        Returns:
            The HTML of the loaded page, or ``None`` if anything went wrong
            (the error is printed, not raised).
        """
        try:
            with sync_playwright() as p:
                browser = p.chromium.launch(headless=True)
                try:
                    page = browser.new_page()
                    page.set_extra_http_headers(self.USER_AGENT)
                    page.goto(url)
                    page.wait_for_load_state("load")
                    page.wait_for_timeout(
                        random.randint(5000, 10000)
                    )  # Random delay between 5 to 10 seconds
                    html_content = page.content()
                finally:
                    # Always release the browser, even when navigation fails.
                    browser.close()
        except Exception as e:
            print(f"Error while fetching {url}: {e}")
            html_content = None

        return html_content

    @staticmethod
    def __extract_abstract(html_content):
        """Return the abstract text found in ``html_content``, or ``None``.

        The candidate ``div`` elements are tried in order and the first one
        present in the document wins.
        """
        soup = bs4.BeautifulSoup(html_content, "lxml")
        candidates = (
            {"class": "gsh_csp"},
            {"class": "gsh_small"},
            {"id": "gsc_oci_descr"},
        )
        for attrs in candidates:
            node = soup.find("div", attrs)
            if node is not None:
                return node.get_text()
        return None

    def __get_abstract_from_gs(self, gs_url):
        """Fetch the abstract for one Google Scholar citation URL.

        Returns:
            The abstract text, or the string ``"None"`` when the page could
            not be fetched or contains no abstract element.
        """
        html_content = self.__get_html_content(gs_url)

        # A failed fetch yields None; parsing it would raise and abort the
        # whole run, so treat it as "no abstract found" instead.
        description = None
        if html_content is not None:
            description = self.__extract_abstract(html_content)

        # Pause between requests (also after a failure) to avoid being blocked.
        time.sleep(random.randint(20, 35))

        if description is None:
            return self.MISSING_ABSTRACT
        return str(description)

    def process_publications(self, publications):
        """Fill in the abstract of every publication that lacks one.

        A publication counts as lacking an abstract when its ``"abstract"``
        value is the string ``"None"`` or ``None`` (JSON ``null``).

        Args:
            publications: List of dicts with at least the keys ``"abstract"``
                and ``"gs_url"``. The list is updated in place.

        Returns:
            The same ``publications`` list.
        """
        for publication in publications:
            abstract = publication["abstract"]
            if abstract is None or abstract == self.MISSING_ABSTRACT:
                publication["abstract"] = self.__get_abstract_from_gs(
                    publication["gs_url"]
                )
        return publications

    def update_abstracts(self, members_info):
        """Scrape missing abstracts for every member and store the result.

        Args:
            members_info: Iterable of dicts, each with the keys
                ``"authorID"``, ``"publications"`` and
                ``"publications_pubdate"``. The input is not modified; a deep
                copy of each record is processed and saved.
        """
        for member_info in tqdm(members_info):
            print(member_info["authorID"])
            copy_member_info = copy.deepcopy(member_info)

            copy_member_info["publications"] = self.process_publications(
                copy_member_info["publications"]
            )
            copy_member_info["publications_pubdate"] = self.process_publications(
                copy_member_info["publications_pubdate"]
            )

            # Replace any previously stored record for this author, so that
            # re-running the scraper does not create duplicates.
            self.collection.delete_one({"authorID": copy_member_info["authorID"]})
            self.collection.insert_one(copy_member_info)

Example of the JSON file content (a missing abstract is stored as the string `"None"`):

```
[
  {
    "authorID": "T4wUsIMAAAAJ",
    "publications": [
      {
        "title": "TARDBP mutations in individuals with sporadic and familial amyotrophic lateral sclerosis",
        "gs_url": "https://scholar.google.com/citations?view_op=view_citation&hl=en&user=T4wUsIMAAAAJ&citation_for_view=T4wUsIMAAAAJ:u5HHmVD_uO8C",
        "abstract": "None",
        "doi": "https://doi.org/10.1038/ng.132"
      }
    ],
    "publications_pubdate": [
      {
        "title": "TARDBP mutations in individuals with sporadic and familial amyotrophic lateral sclerosis",
        "gs_url": "https://scholar.google.com/citations?view_op=view_citation&hl=en&user=T4wUsIMAAAAJ&citation_for_view=T4wUsIMAAAAJ:u5HHmVD_uO8C",
        "abstract": "None",
        "doi": null
      }
    ]
  }
]
```


In [None]:
# Scraper that writes to the "abstracts" collection of the "waterloo_ai" database.
scraper = GoogleScholarScrapper("waterloo_ai", "abstracts")

# JSON is UTF-8 by specification; state the encoding explicitly so decoding
# does not depend on the platform's default locale encoding.
with open("../gs_scrapped/waterloo_ai.abstracts_all.json", encoding="utf-8") as f:
    members_info = json.load(f)

# Long-running: every missing abstract triggers a page load plus a pause.
scraper.update_abstracts(members_info)