In [1]:
from pyalex import Works, config
import time
import json

# Contact address stored on pyalex's shared `config` object.
# NOTE(review): presumably sent along with API requests so OpenAlex can
# identify the caller (its "polite pool") — confirm against the pyalex docs.
# NOTE(review): this is a personal address hardcoded in the notebook; consider
# reading it from an environment variable before sharing or committing.
config.email = "sunglassesguy05@gmail.com"

# Getting first 20 entries

In [2]:
# Request one page of 20 works from the unfiltered Works listing.
ds = Works().get(per_page=20)

# Show which fields a work record carries, using the first result.
first_work = ds[0]
print(f"Columns: {first_work.keys()}")

# Print a short, human-readable summary of every work (numbered from 1).
for number, work in enumerate(ds, start=1):
    print(f"-- Work {number}:")
    print(f"Title: {work['title']}")

    author_names = [entry['author']['display_name'] for entry in work['authorships']]
    print(f"Authors: {author_names}")

    print(f"Date: {work['publication_date']}")
    print(f"Abstract: {work['abstract']}")

    concept_names = [concept['display_name'] for concept in work['concepts']]
    print(f"Tags: {concept_names}")

    print(f"Citations: {work['cited_by_count']}")


# Dump the first record in full for reference.
print(first_work)

Columns: dict_keys(['id', 'doi', 'title', 'display_name', 'publication_year', 'publication_date', 'ids', 'language', 'primary_location', 'type', 'indexed_in', 'open_access', 'authorships', 'institutions', 'countries_distinct_count', 'institutions_distinct_count', 'corresponding_author_ids', 'corresponding_institution_ids', 'apc_list', 'apc_paid', 'fwci', 'has_fulltext', 'cited_by_count', 'citation_normalized_percentile', 'cited_by_percentile_year', 'biblio', 'is_retracted', 'is_paratext', 'is_xpac', 'primary_topic', 'topics', 'keywords', 'concepts', 'mesh', 'locations_count', 'locations', 'best_oa_location', 'sustainable_development_goals', 'grants', 'awards', 'funders', 'has_content', 'referenced_works_count', 'referenced_works', 'related_works', 'abstract_inverted_index', 'counts_by_year', 'updated_date', 'created_date'])
-- Work 1:
Title: Radiation Resistant Camera System for Monitoring Deuterium Plasma Discharges in the Large Helical Device
Authors: ['M. Shoji']
Date: 2020-06-08
Ab

# Sample 20 entries

In [3]:
# Draw a random sample of 20 works.
ds = Works().sample(20).get()

# Show which fields a sampled work record carries, using the first result.
first_work = ds[0]
print(f"Columns: {first_work.keys()}")

# Print a short, human-readable summary of every work (numbered from 1).
for number, work in enumerate(ds, start=1):
    print(f"-- Work {number}:")
    print(f"Title: {work['title']}")

    author_names = [entry['author']['display_name'] for entry in work['authorships']]
    print(f"Authors: {author_names}")

    print(f"Date: {work['publication_date']}")
    print(f"Abstract: {work['abstract']}")

    concept_names = [concept['display_name'] for concept in work['concepts']]
    print(f"Tags: {concept_names}")

    print(f"Citations: {work['cited_by_count']}")


# Dump the first record in full for reference.
print(first_work)

Columns: dict_keys(['id', 'doi', 'title', 'display_name', 'relevance_score', 'publication_year', 'publication_date', 'ids', 'language', 'primary_location', 'type', 'indexed_in', 'open_access', 'authorships', 'institutions', 'countries_distinct_count', 'institutions_distinct_count', 'corresponding_author_ids', 'corresponding_institution_ids', 'apc_list', 'apc_paid', 'fwci', 'has_fulltext', 'cited_by_count', 'citation_normalized_percentile', 'cited_by_percentile_year', 'biblio', 'is_retracted', 'is_paratext', 'is_xpac', 'primary_topic', 'topics', 'keywords', 'concepts', 'mesh', 'locations_count', 'locations', 'best_oa_location', 'sustainable_development_goals', 'grants', 'awards', 'funders', 'has_content', 'referenced_works_count', 'referenced_works', 'related_works', 'abstract_inverted_index', 'counts_by_year', 'updated_date', 'created_date'])
-- Work 1:
Title: The capital of culture and the culture of capital
Authors: ['Peter Bearder']
Date: 2021-03-08
Abstract: Most poetry is not 'poe

# Sample more than 200 entries

Since the API returns at most 200 entries per request, we have to issue several requests and page through the results.

With a fixed random seed, a sample is limited to about 10k results. If you need more, draw additional samples with different seeds.

In [4]:
target_count = 1000  # how many unique works to collect
seed = 67  # sample seed, passed unchanged with every page request
cur_page = 1
ds = {}  # works keyed by their 'id'

# Keep requesting 200-item pages of the seeded sample until enough unique
# works are stored or a page comes back empty.
while len(ds) < target_count:
    sample_query = Works().sample(target_count, seed=seed)
    batch = sample_query.get(per_page=200, page=cur_page)

    if not batch:
        print("No more works returned, stopping.")
        break

    for work in batch:
        # Keying by id means a repeated work replaces its earlier copy
        # instead of being stored twice.
        ds[work['id']] = work

        if len(ds) >= target_count:
            break

    print(f"Collected {len(ds)} unique works so far...")

    cur_page = cur_page + 1
    time.sleep(0.5)  # pause between requests to avoid rate limiting

# From here on `ds` is a plain list of works, like in the earlier cells.
ds = list(ds.values())

# Preview: summarise the first ten collected works (numbered from 1).
for number, work in enumerate(ds[:10], start=1):
    print(f"-- Work {number}:")
    print(f"Title: {work['title']}")

    author_names = [entry['author']['display_name'] for entry in work['authorships']]
    print(f"Authors: {author_names}")

    print(f"Date: {work['publication_date']}")
    print(f"Abstract: {work['abstract']}")

    concept_names = [concept['display_name'] for concept in work['concepts']]
    print(f"Tags: {concept_names}")

    print(f"Citations: {work['cited_by_count']}")


# Dump the first record in full for reference.
print(ds[0])

Collected 200 unique works so far...
Collected 400 unique works so far...
Collected 600 unique works so far...
Collected 800 unique works so far...
Collected 1000 unique works so far...
-- Work 1:
Title: CCDC 1411099: Experimental Crystal Structure Determination
Authors: ['Truong Giang', 'Emanuel Hupf', 'Andreas Nordheider', 'Enno Lork', 'Alexandra M. Z. Slawin', 'Sergey G. Makarov', 'Sergey Yu. Ketkov', 'Stefan Mebs', 'J. Derek Woollins', 'Jens Beckmann']
Date: 2015-01-01
Abstract: An entry from the Cambridge Structural Database, the world’s repository for small molecule crystal structures. The entry contains experimental data from a crystal diffraction study. The deposited dataset for this entry is freely available from the CCDC and typically includes 3D coordinates, cell parameters, space group, experimental conditions and quality measures.
Tags: ['Crystallography', 'Crystal (programming language)', 'Crystal structure', 'Space (punctuation)', 'Diffraction', 'Molecule', 'Group (perio

In [6]:
# Cache the collected works on disk so later runs can load them instead of
# waiting for the API requests again.
# NOTE(review): the key listings printed earlier show 'abstract_inverted_index'
# but no 'abstract' key, so the plain-text abstract is presumably derived on
# access by pyalex and will not be part of the saved JSON — confirm before
# relying on work['abstract'] after reloading this file.
print(len(ds))
with open("openalex_works_dataset.json", "w", encoding="utf-8") as f:
    # json.dump streams straight into the file rather than first building the
    # whole JSON document as a single in-memory string.
    json.dump(ds, f)

1000
