In [None]:
import requests
import json
from os import listdir
from os.path import isfile, join
import pandas as pd
from tqdm import tqdm

In [None]:
# helper functions
def write_to_json(filename, obj):
    """Serialize ``obj`` as JSON into the file located at ``filename``.

    The file is opened in text write mode, so an existing file with the same
    name is overwritten. Nothing is returned.
    """
    with open(filename, mode="w") as out_file:
        json.dump(obj, fp=out_file)


def get_files_in_dir(dir_path):
    """Return the paths of the regular files directly inside ``dir_path``.

    Sub-directories are skipped and the listing is not recursive. Each returned
    path is ``dir_path + "/" + <file name>``.

    ``listdir`` yields entries in arbitrary order, so the names are sorted
    first; this keeps the result (and anything derived from it) reproducible
    from one run to the next.
    """
    return [
        dir_path + "/" + name
        for name in sorted(listdir(dir_path))
        if isfile(join(dir_path, name))
    ]


class ExtensibleArray:
    """List wrapper whose ``set_value`` grows the underlying list on demand.

    Slots created while growing are filled with empty strings.
    """

    def __init__(self):
        self.array = []

    def set_value(self, index, value):
        """Store ``value`` at ``index``, padding with ``""`` if ``index`` is past the end."""
        missing_slots = index - len(self.array) + 1
        if missing_slots > 0:
            self.array.extend([""] * missing_slots)

        self.array[index] = value

    def get_value(self, index):
        """Return the element stored at ``index`` (plain list indexing)."""
        return self.array[index]

    def get_array(self):
        """Return the underlying list itself, not a copy."""
        return self.array

In [None]:
class PublicationAbstractGetter:
    """Look up publications on OpenAlex by title and collect their abstracts."""

    # Seconds to wait on a single OpenAlex request before giving up, so one
    # stalled connection cannot hang the whole run.
    REQUEST_TIMEOUT_SECONDS = 30

    def get_data_by_title_openalex(self, title):
        """Return the first OpenAlex work whose title matches ``title``.

        Best-effort: any request, decoding, or "no result" failure yields an
        empty dict instead of raising.
        """
        try:
            response = requests.get(
                "https://api.openalex.org/works",
                # Passing the filter through ``params`` percent-encodes the
                # title, so characters such as "&" or "#" cannot cut the query
                # short.
                params={"filter": f'title.search:"{title}"'},
                timeout=self.REQUEST_TIMEOUT_SECONDS,
            )
            return response.json()["results"][0]
        except Exception:
            # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt
            # and SystemExit still stop the run.
            return {}

    def get_abstract(self, abstract_inverted_index):
        """Rebuild the abstract text from a ``{word: [positions]}`` mapping.

        Every word is placed at each of its positions; positions that no word
        claims stay as empty strings. The slots are joined with single spaces.
        """
        abstract_list = ExtensibleArray()
        for word, positions in abstract_inverted_index.items():
            for position in positions:
                abstract_list.set_value(position, word)
        return " ".join(abstract_list.get_array())

    def __get_publications_abstracts(self, publications):
        """Return one ``title``/``gs_url``/``abstract``/``doi`` dict per publication.

        Each publication must provide ``"title"`` and ``"url"``. When no
        abstract can be obtained, "Error" and the title are printed and
        ``abstract`` is ``None``; ``doi`` is ``None`` when the OpenAlex record
        has none.
        """
        publications_abstracts = []
        for publication in publications:
            title = publication["title"]
            publication_data = self.get_data_by_title_openalex(title)

            abstract = None
            abstract_inverted_index = publication_data.get("abstract_inverted_index")
            if abstract_inverted_index is not None:
                try:
                    abstract = self.get_abstract(abstract_inverted_index)
                except Exception:
                    # A malformed index is treated like a missing abstract.
                    abstract = None

            if abstract is None:
                print("Error")
                print(title)

            publications_abstracts.append(
                {
                    "title": title,
                    "gs_url": f"https://scholar.google.com/citations?{publication['url']}",
                    "abstract": abstract,
                    # ``.get`` so a record without a DOI keeps its abstract.
                    "doi": publication_data.get("doi", None),
                }
            )
        return publications_abstracts

    def update_abstracts(self, gs_members_info_file_paths):
        """Load each member JSON file and attach abstracts to its publications.

        Every file must contain ``authorID``, ``publications`` and
        ``publications_pubdate``. Returns a list with one dict per file holding
        those three keys, the two publication lists replaced by the abstract
        records.
        """
        members_publications_abstracts = []

        for file_path in tqdm(gs_members_info_file_paths):
            # Context manager so the handle is closed even if parsing fails.
            with open(file_path) as f:
                member_info_gs = json.load(f)

            members_publications_abstracts.append(
                {
                    "authorID": member_info_gs["authorID"],
                    "publications": self.__get_publications_abstracts(
                        member_info_gs["publications"]
                    ),
                    "publications_pubdate": self.__get_publications_abstracts(
                        member_info_gs["publications_pubdate"]
                    ),
                }
            )

        return members_publications_abstracts

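The `../AI_lab_data` folder should contain one JSON file per person. Each file must provide the keys `authorID`, `publications`, and `publications_pubdate`; every entry in the two publication lists needs a `title` and a `url`.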


In [None]:
# PublicationAbstractGetter defines no __init__, so it has to be constructed
# without arguments; passing any raises TypeError.
scraper = PublicationAbstractGetter()

# Every regular file in ../AI_lab_data is treated as one member's JSON record.
gs_members_info_file_paths = get_files_in_dir("../AI_lab_data")
members_publications_abstracts = scraper.update_abstracts(gs_members_info_file_paths)
write_to_json("all_data.json", members_publications_abstracts)