# ORKG data download

This notebook downloads all papers from the ORKG API and stores it in a SQLite database.

### JSON helper functions

In [22]:
import json
import os


# Read name.json data in directory
def read_json(filename: str):
    json_file = os.path.join(filename)
    with open(json_file) as f:
        img_annotations = json.load(f)

    return img_annotations


# Pretty print json data to console
def print_json(data: any, tag=""):
    json_str = json.dumps(data, indent=2, sort_keys=True)
    if tag:
        print(tag, json_str)
    else:
        print(json_str)


# Pretty print json data to file
def write_json(filename: str, data: any):
    with open(filename, "w") as file:
        json.dump(data, file, indent=2, sort_keys=True)

### Initialize SQLite database

In [23]:
import sqlite3

base_dir = "data/"
database_path = os.path.join(base_dir, "datalake.db")

# Open a connection to the SQLite database file
db = sqlite3.connect(database_path)

# Create a cursor object to execute SQL statements
cursor = db.cursor()
cursor.execute("CREATE TABLE IF NOT EXISTS orkg_statements (id TEXT PRIMARY KEY, data JSON)")

<sqlite3.Cursor at 0x7f30ab3097a0>

In [24]:
#cursor.execute("DROP TABLE IF EXISTS orkg_statements")
#print(cursor.fetchall())

### Functions to store data in a SQLite database

In [25]:
def entry_exists(id: str):
    cursor.execute("SELECT * FROM orkg_statements WHERE id = ?", (id,))
    rows = cursor.fetchall()
    return len(rows) > 0


def store_data(id: str, data: dict):
    cursor.execute("INSERT INTO orkg_statements (id, data) VALUES (?, ?)", (id, json.dumps(data)))
    db.commit()

### Download all paper ids

The predicate P30 is used to get all paper ids.

In [None]:
import requests


def get_predicates(predicate: str, page=int(0), size=int(100)):
    url = "http://orkg.org/api/statements/predicate/" + predicate

    params = {
        "page": page,
        "size": size,
        "sort": True
    }

    response = requests.get(url, params=params, timeout=6)
    return response.json()

In [27]:
page = 0
pages = 1

In [28]:
import time

# Download all predicate P30 data, this contains all paper ids
# These data is then used to download the corresponding paper data by subject

paper_ids = set()

while True:
    print(f"Download page {page + 1} of {pages}")

    try:
        data = get_predicates("P30", page=page, size=100)
    except:
        # The ORKG API is not very stable, so we need to retry if it fails
        print(f"Error: page={page} --> retrying in 20 seconds...")
        time.sleep(20)
        continue

    content = data['content']

    for entry in content:
        id = entry["subject"]["id"]
        paper_ids.add(id)

    page += 1
    pages = data['totalPages']
    if "last" not in data or data['last']:
        break

Download page 1 of 1
Download page 2 of 271
Download page 3 of 271
Download page 4 of 271
Download page 5 of 271
Download page 6 of 271
Download page 7 of 271
Download page 8 of 271
Download page 9 of 271
Download page 10 of 271
Download page 11 of 271
Download page 12 of 271
Download page 13 of 271
Download page 14 of 271
Download page 15 of 271
Download page 16 of 271
Download page 17 of 271
Download page 18 of 271
Download page 19 of 271
Download page 20 of 271
Download page 21 of 271
Download page 22 of 271
Download page 23 of 271
Download page 24 of 271
Download page 25 of 271
Download page 26 of 271
Download page 27 of 271
Download page 28 of 271
Download page 29 of 271
Download page 30 of 271
Download page 31 of 271
Download page 32 of 271
Download page 33 of 271
Download page 34 of 271
Download page 35 of 271
Download page 36 of 271
Download page 37 of 271
Download page 38 of 271
Download page 39 of 271
Download page 40 of 271
Download page 41 of 271
Download page 42 of 271
Dow

In [29]:
paper_ids = list(paper_ids)
print("Papers", len(paper_ids))

Papers 26737


### Download paper data

After downloading all paper ids, we can download the corresponding paper data.

In [30]:
def get_statement(paper_id: str):
    url = "https://orkg.org/api/statements/" + paper_id + "/bundle/"
    response = requests.get(url, timeout=8)
    data = response.json()

    return data

In [31]:
from tqdm import tqdm

progress_bar = tqdm(
    leave=False,
    ncols=80,  # Limit the progress bar to a fixed width
    total=len(paper_ids)
)

index = 0

while index < len(paper_ids):
    paper_id = paper_ids[index]
    if entry_exists(paper_id):
        progress_bar.update(1)
        index += 1
        continue

    try:
        data = get_statement(paper_id)
    except:
        # The ORKG API is not very stable, so we need to retry if it fails
        print(f"Error: index={index} paper_id={paper_id} --> retrying in 60 seconds...")
        time.sleep(60)
        continue

    if "error" in data:
        print(f"Error: index={index} paper_id={paper_id}")
        print_json(data)
        break

    store_data(paper_id, data)
    progress_bar.update(1)
    index += 1

progress_bar.close()

  0%|                                                 | 0/26737 [00:00<?, ?it/s]
 96%|█████████████████████████████████▋ | 25742/26737 [00:26<00:00, 7969.10it/s][A

Error: index=25742 paper_id=R164478 --> retrying in 60 seconds...


 96%|███████████████████████████████████▋ | 25746/26737 [01:29<00:29, 34.00it/s]

Error: index=25763 paper_id=R110705 --> retrying in 60 seconds...


 96%|███████████████████████████████████▋ | 25764/26737 [02:40<01:20, 12.14it/s]

Error: index=25764 paper_id=R73159 --> retrying in 60 seconds...
Error: index=25764 paper_id=R73159 --> retrying in 60 seconds...


 96%|███████████████████████████████████▋ | 25773/26737 [04:57<03:38,  4.42it/s]

Error: index=25778 paper_id=R395242 --> retrying in 60 seconds...


 96%|███████████████████████████████████▋ | 25780/26737 [06:06<05:53,  2.71it/s]

Error: index=25795 paper_id=R171568 --> retrying in 60 seconds...


 96%|███████████████████████████████████▋ | 25797/26737 [07:16<09:51,  1.59it/s]

Error: index=25807 paper_id=R27291 --> retrying in 60 seconds...


 97%|███████████████████████████████████▋ | 25812/26737 [08:31<16:52,  1.09s/it]

Error: index=25822 paper_id=R201930 --> retrying in 60 seconds...


 97%|███████████████████████████████████▋ | 25828/26737 [09:43<25:50,  1.71s/it]

Error: index=25831 paper_id=R28517 --> retrying in 60 seconds...


 97%|███████████████████████████████████▋ | 25831/26737 [09:56<25:45,  1.71s/it]

Error: index=25831 paper_id=R28517 --> retrying in 60 seconds...


 97%|███████████████████████████████████▊ | 25902/26737 [12:15<17:35,  1.26s/it]

Error: index=25906 paper_id=R214065 --> retrying in 60 seconds...


 97%|███████████████████████████████████▊ | 25909/26737 [13:25<33:14,  2.41s/it]

Error: index=25922 paper_id=R574596 --> retrying in 60 seconds...


 97%|███████████████████████████████████▊ | 25923/26737 [14:36<45:07,  3.33s/it]

Error: index=25924 paper_id=R213157 --> retrying in 60 seconds...


 97%|████████████████████████████████████ | 26042/26737 [16:13<03:09,  3.67it/s]

Error: index=26043 paper_id=R31745 --> retrying in 60 seconds...


 98%|████████████████████████████████████ | 26086/26737 [17:31<02:22,  4.56it/s]

Error: index=26090 paper_id=R203739 --> retrying in 60 seconds...


 98%|████████████████████████████████████▏| 26111/26737 [18:42<10:12,  1.02it/s]

Error: index=26111 paper_id=R569702 --> retrying in 60 seconds...


 98%|████████████████████████████████████▏| 26115/26737 [19:51<58:20,  5.63s/it]

Error: index=26116 paper_id=R538585 --> retrying in 60 seconds...


 98%|████████████████████████████████████▏| 26116/26737 [20:06<58:14,  5.63s/it]

Error: index=26116 paper_id=R538585 --> retrying in 60 seconds...


 98%|████████████████████████████████████▏| 26190/26737 [22:23<04:28,  2.04it/s]

Error: index=26191 paper_id=R458545 --> retrying in 60 seconds...


 98%|██████████████████████████████████▎| 26192/26737 [23:31<2:09:56, 14.30s/it]

Error: index=26192 paper_id=R507948 --> retrying in 60 seconds...


 98%|██████████████████████████████████▎| 26196/26737 [24:40<2:02:08, 13.55s/it]

Error: index=26196 paper_id=R169421 --> retrying in 60 seconds...


 98%|████████████████████████████████████▎| 26254/26737 [25:56<00:56,  8.62it/s]

Error: index=26255 paper_id=R437110 --> retrying in 60 seconds...


 98%|████████████████████████████████████▍| 26286/26737 [27:09<02:14,  3.36it/s]

Error: index=26287 paper_id=R141003 --> retrying in 60 seconds...


 98%|████████████████████████████████████▍| 26332/26737 [28:28<00:57,  7.06it/s]

Error: index=26332 paper_id=R528812 --> retrying in 60 seconds...


 99%|████████████████████████████████████▌| 26394/26737 [29:50<02:12,  2.59it/s]

Error: index=26394 paper_id=R56084 --> retrying in 60 seconds...


 99%|████████████████████████████████████▌| 26404/26737 [30:58<16:53,  3.04s/it]

Error: index=26404 paper_id=R595160 --> retrying in 60 seconds...


 99%|████████████████████████████████████▌| 26411/26737 [32:07<23:23,  4.31s/it]

Error: index=26412 paper_id=R500538 --> retrying in 60 seconds...


 99%|████████████████████████████████████▌| 26426/26737 [33:20<06:34,  1.27s/it]

Error: index=26428 paper_id=R283587 --> retrying in 60 seconds...


 99%|████████████████████████████████████▌| 26436/26737 [34:31<15:46,  3.14s/it]

Error: index=26438 paper_id=R266349 --> retrying in 60 seconds...


 99%|████████████████████████████████████▋| 26531/26737 [35:59<00:27,  7.58it/s]

Error: index=26531 paper_id=R137404 --> retrying in 60 seconds...


 99%|████████████████████████████████████▋| 26535/26737 [37:07<20:54,  6.21s/it]

Error: index=26535 paper_id=R441714 --> retrying in 60 seconds...


 99%|████████████████████████████████████▋| 26550/26737 [38:17<08:31,  2.74s/it]

Error: index=26551 paper_id=R497096 --> retrying in 60 seconds...


 99%|████████████████████████████████████▊| 26558/26737 [39:26<11:56,  4.01s/it]

Error: index=26558 paper_id=R565868 --> retrying in 60 seconds...


100%|████████████████████████████████████▉| 26707/26737 [41:03<00:03,  9.48it/s]

Error: index=26708 paper_id=R574425 --> retrying in 60 seconds...


                                                                                

In [32]:
# Print count of database table orkg
cursor.execute("SELECT COUNT(*) FROM orkg_statements")
print(cursor.fetchone()[0])

26745


In [None]:
cursor.close()